1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
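/* Worked example of the assumption above: with COSTS_N_INSNS (N) == (N) * 4
   and an addition being 2 bytes long, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   so the size-tuned table below is expressed on the same scale as the
   speed-tuned tables that follow it.  */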
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
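/* Reading aid for the stringop descriptors used throughout the tables below
   (a sketch of the layout; struct stringop_algs in i386.h is the authoritative
   definition).  Each processor_costs table carries two pairs of descriptors,
   memcpy strategies followed by memset strategies; within a pair the first
   entry is used for 32-bit code and the second for 64-bit code, which is why
   32-bit-only processors use DUMMY_STRINGOP_ALGS in the second slot.  For
   example

     {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
      DUMMY_STRINGOP_ALGS},

   means: for blocks of unknown size use rep_prefix_1_byte, and for known
   sizes use rep_prefix_1_byte up to any size (max == -1 means unbounded);
   the trailing false is the noalign flag.  */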
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
  153   1,						/* cond_not_taken_branch_cost.  */
  154 };
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
  224   1,						/* cond_not_taken_branch_cost.  */
  225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
  296   1,						/* cond_not_taken_branch_cost.  */
  297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
  366   1,						/* cond_not_taken_branch_cost.  */
  367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks inline loop is still a
  424    noticeable win, for bigger blocks either rep movsl or rep movsb is the
  425    way to go.  Rep movsb apparently has a more expensive startup time in the CPU,
426 but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
  445   1,						/* cond_not_taken_branch_cost.  */
  446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
  516   1,						/* cond_not_taken_branch_cost.  */
  517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
  589   1,						/* cond_not_taken_branch_cost.  */
  590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with REP prefix (relative to loops)
646 compared to K8. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
  662   1,						/* cond_not_taken_branch_cost.  */
  663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set number of simultaneous prefetches
712 to a large constant to reflect this (it probably is not a good idea not
713 to limit number of prefetches at all, as their execution also takes some
714 time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has optimized REP instruction for medium sized blocks, but for very
724 small blocks it is better to use loop. For large blocks, libcall can
725 do nontemporary accesses and beat inline considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
  744   2,						/* cond_not_taken_branch_cost.  */
  745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set number of simultaneous prefetches
801 to a large constant to reflect this (it probably is not a good idea not
802 to limit number of prefetches at all, as their execution also takes some
803 time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
814 very small blocks it is better to use loop. For large blocks, libcall can
815 do nontemporary accesses and beat inline considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
  834   1,						/* cond_not_taken_branch_cost.  */
  835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set number of simultaneous prefetches
891 to a large constant to reflect this (it probably is not a good idea not
892 to limit number of prefetches at all, as their execution also takes some
893 time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
904 very small blocks it is better to use loop. For large blocks, libcall
905 can do nontemporary accesses and beat inline considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
  924   1,						/* cond_not_taken_branch_cost.  */
  925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set number of simultaneous prefetches
981 to a large constant to reflect this (it probably is not a good idea not
982 to limit number of prefetches at all, as their execution also takes some
983 time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
994 very small blocks it is better to use loop. For large blocks, libcall
995 can do nontemporary accesses and beat inline considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
 1014   1,						/* cond_not_taken_branch_cost.  */
 1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set number of simultaneous prefetches
1063 to a large constant to reflect this (it probably is not a good idea not
1064 to limit number of prefetches at all, as their execution also takes some
1065 time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1076 very small blocks it is better to use loop. For large blocks, libcall
1077 can do nontemporary accesses and beat inline considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
 1096   1,						/* cond_not_taken_branch_cost.  */
 1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1161 very small blocks it is better to use loop. For large blocks, libcall can
1162 do nontemporary accesses and beat inline considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
 1181   1,						/* cond_not_taken_branch_cost.  */
 1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
 1263   1,						/* cond_not_taken_branch_cost.  */
 1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
 1334   1,						/* cond_not_taken_branch_cost.  */
 1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration lea takes 2 cycles or more.  With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing regressions on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration lea takes 2 cycles or more.  With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing regressions on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME perhaps more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
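Just before the vectorizer costs, each cost table above carries two stringop descriptor lists, one for memcpy and one for memset.  Every {max, alg, noalign} triple names the inline strategy (loop, unrolled_loop, rep_prefix_4_byte, libcall, and so on, the same enum stringop_alg values used in the initializers) to use for block sizes up to max bytes, with max == -1 acting as the catch-all last entry.  Below is a minimal sketch of how such a list can be scanned for a known size; the entry layout and the chooser loop are simplified assumptions, not the selection logic the backend actually uses.

/* Simplified view of one descriptor list: each entry covers block sizes
   up to MAX bytes; MAX == -1 terminates the list.  */
struct stringop_entry
{
  int max;                   /* upper size bound, -1 = unbounded */
  enum stringop_alg alg;     /* strategy for sizes up to MAX */
  int noalign;               /* nonzero: skip the alignment prologue */
};

static enum stringop_alg
example_pick_stringop_alg (const struct stringop_entry *entries, int size)
{
  int i;

  /* Return the first entry whose bound covers SIZE.  */
  for (i = 0; ; i++)
    if (entries[i].max == -1 || size <= entries[i].max)
      return entries[i].alg;
}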
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
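ix86_tune_cost and ix86_cost are re-pointed at one of the tables above during option processing (see the two comments just above), and the rest of the backend reads costs through these pointers.  A rough sketch of the usage pattern follows; example_mult_cost is illustrative only, with field names taken from the comments in the tables above.

/* Illustrative query of the active cost table: cost of starting a
   multiply in MODE plus one accumulating add.  */
static int
example_mult_cost (enum machine_mode mode)
{
  /* mult_init is the "cost of starting multiply" row; MODE_INDEX maps
     QImode/HImode/SImode/DImode to its columns and anything else to
     the "other" slot.  */
  return ix86_cost->mult_init[MODE_INDEX (mode)] + ix86_cost->add;
}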
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be a common subset of the supported CPUs
1757 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
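Each m_* constant is simply the bit for one PROCESSOR_* value, so every entry in the tables below is the set of processors for which that feature is enabled.  Once the -mtune processor is known, the per-feature byte array is filled by testing that processor's bit against each mask, roughly as in this sketch of the option-override logic:

/* Sketch: expand the per-processor masks below into ix86_tune_features
   once ix86_tune has been chosen.  */
static void
example_fill_tune_features (void)
{
  unsigned int tune_mask = 1u << ix86_tune;
  int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & tune_mask);
}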
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767 negatively, so enabling it for Generic64 seems like a good code size
1768 tradeoff.  We can't enable it for 32bit generic because it does not
1769 work well with PPro based chips. */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1782 on simulation results.  But after P4 was made, no performance benefit
1783 was observed with branch hints. It also increases the code size.
1784 As a result, icc never generates branch hints. */
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798 register stalls on the Generic32 compilation setting as well.  However,
1799 in the current implementation the partial register stalls are not eliminated
1800 very well - they can be introduced via subregs synthesized by combine
1801 and can happen in caller/callee saving sequences. Because this option
1802 pays back little on PPro based chips and is in conflict with partial reg
1803 dependencies used by Athlon/P4 based chips, it is better to leave it off
1804 for generic32 for now. */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811 * on 16-bit immediate moves into memory on Core2 and Corei7. */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886 conflict here between PPro/Pentium4 based chips that treat 128bit
1887 SSE registers as single units versus K8 based chips that divide SSE
1888 registers into two 64bit halves.  This knob promotes all store destinations
1889 to be 128bit to allow register renaming on 128bit SSE units, but usually
1890 results in one extra microop on 64bit SSE units.  Experimental results
1891 show that disabling this option on P4 brings over 20% SPECfp regression,
1892 while enabling it on K8 brings roughly 2.4% regression that can be partly
1893 masked by careful scheduling of moves. */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903 m_BDVER,
1905 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906 are resolved on SSE register parts instead of whole registers, so we may
1907 maintain just lower part of scalar values in proper format leaving the
1908 upper part undefined. */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936 ~(m_AMDFAM10 | m_BDVER),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1964 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1968 vector path on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1972 machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1976 than a MOV. */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1984 operand that cannot be represented using a modRM byte. The XOR
1985 replacement is long decoded, so this split helps here as well. */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010 at -O3. For the moment, the prefetching seems badly tuned for Intel
2011 chips. */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM | m_HASWELL,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
2069 /* In case the average insn count for a single function invocation is
2070 lower than this constant, emit fast (but longer) prologue and
2071 epilogue code. */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
2074 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8,9,10,11,12,13,14,15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
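A quick worked example of using this map: gcc's %esi is register number 4 internally, and the table entry at index 4 is 6, matching the "6 for %esi" line in the comment above; likewise index 17 (%eflags) yields DWARF register 9.  The helper below is purely illustrative.

/* Illustrative lookup into the map above.  */
static int
example_dwarf_regno_for_esi (void)
{
  return svr4_dbx_register_map[4];   /* == 6, matching "6 for %esi" above */
}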
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2209 CX_REG, DX_REG, R8_REG, R9_REG
2212 static int const x86_64_int_return_registers[4] =
2214 AX_REG, DX_REG, DI_REG, SI_REG
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2252 struct ix86_frame
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2273 /* Which cpu are we scheduling for. */
2274 enum attr_cpu ix86_schedule;
2276 /* Which cpu are we optimizing for. */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits specified at
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling abi specific va_list type nodes. */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register class used for passing given 64bit part of the argument.
2327 These represent classes as documented by the psABI, with the exception
2328 of the SSESF and SSEDF classes, which are basically the SSE class, except that
2329 gcc will use SF or DFmode moves instead of DImode to avoid reformatting penalties.
2331 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332 whenever possible (upper half does contain padding). */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
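To make the classes concrete, here is a sketch of what the psABI rules behind them mean for a few argument types; the structs are hypothetical examples, not types used by this file.  An aggregate of up to 16 bytes is split into eightbytes, each eightbyte is given one of the classes above, and that class decides whether the part travels in a general register, an SSE register, or memory (register names shown are for a first argument in a call).

/* Hypothetical examples of eightbyte classification (64-bit ABI).  */
struct two_doubles { double x, y; };      /* SSEDF + SSEDF: %xmm0, %xmm1 */
struct int_and_dbl { long l; double d; }; /* INTEGER + SSEDF: %rdi, %xmm0 */
struct three_longs { long a, b, c; };     /* over 16 bytes: MEMORY (stack) */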
2350 /* Table of constants used by fldpi, fldln2, etc.... */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2412 const struct processor_costs *cost; /* Processor costs */
2413 const int align_loop; /* Default alignments. */
2414 const int align_loop_max_skip;
2415 const int align_jump;
2416 const int align_jump_max_skip;
2417 const int align_func;
2420 static const struct ptt processor_target_table[PROCESSOR_max] =
2422 {&i386_cost, 4, 3, 4, 3, 4},
2423 {&i486_cost, 16, 15, 16, 15, 16},
2424 {&pentium_cost, 16, 7, 16, 7, 16},
2425 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2426 {&geode_cost, 0, 0, 0, 0, 0},
2427 {&k6_cost, 32, 7, 32, 7, 32},
2428 {&athlon_cost, 16, 7, 16, 7, 16},
2429 {&pentium4_cost, 0, 0, 0, 0, 0},
2430 {&k8_cost, 16, 7, 16, 7, 16},
2431 {&nocona_cost, 0, 0, 0, 0, 0},
2432 /* Core 2 */
2433 {&core_cost, 16, 10, 16, 10, 16},
2434 /* Core i7 */
2435 {&core_cost, 16, 10, 16, 10, 16},
2436 /* Core avx2 */
2437 {&core_cost, 16, 10, 16, 10, 16},
2438 {&generic32_cost, 16, 7, 16, 7, 16},
2439 {&generic64_cost, 16, 10, 16, 10, 16},
2440 {&amdfam10_cost, 32, 24, 32, 7, 32},
2441 {&bdver1_cost, 32, 24, 32, 7, 32},
2442 {&bdver2_cost, 32, 24, 32, 7, 32},
2443 {&bdver3_cost, 32, 24, 32, 7, 32},
2444 {&btver1_cost, 32, 24, 32, 7, 32},
2445 {&btver2_cost, 32, 24, 32, 7, 32},
2446 {&atom_cost, 16, 15, 16, 7, 16}
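processor_target_table is indexed by the PROCESSOR_* value chosen for -mtune; the option-override code falls back to it for any alignment the user left at its default of 0, roughly as sketched below (the align_* globals are GCC's generic alignment settings, and the helper is illustrative only).

/* Sketch: take default alignments from the tuned processor's entry
   when no -falign-* values were given.  */
static void
example_apply_alignment_defaults (void)
{
  const struct ptt *tune = &processor_target_table[ix86_tune];

  if (align_loops == 0)
    {
      align_loops = tune->align_loop;
      align_loops_max_skip = tune->align_loop_max_skip;
    }
  if (align_functions == 0)
    align_functions = tune->align_func;
}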
2449 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2451 "generic",
2452 "i386",
2453 "i486",
2454 "pentium",
2455 "pentium-mmx",
2456 "pentiumpro",
2457 "pentium2",
2458 "pentium3",
2459 "pentium4",
2460 "pentium-m",
2461 "prescott",
2462 "nocona",
2463 "core2",
2464 "corei7",
2465 "core-avx2",
2466 "atom",
2467 "geode",
2468 "k6",
2469 "k6-2",
2470 "k6-3",
2471 "athlon",
2472 "athlon-4",
2473 "k8",
2474 "amdfam10",
2475 "bdver1",
2476 "bdver2",
2477 "bdver3",
2478 "btver1",
2479 "btver2"
2482 static bool
2483 gate_insert_vzeroupper (void)
2485 return TARGET_VZEROUPPER;
2488 static unsigned int
2489 rest_of_handle_insert_vzeroupper (void)
2491 int i;
2493 /* vzeroupper instructions are inserted immediately after reload to
2494 account for possible spills from 256bit registers. The pass
2495 reuses the mode switching infrastructure by re-running the mode insertion
2496 pass, so disable entities that have already been processed. */
2497 for (i = 0; i < MAX_386_ENTITIES; i++)
2498 ix86_optimize_mode_switching[i] = 0;
2500 ix86_optimize_mode_switching[AVX_U128] = 1;
2502 /* Call optimize_mode_switching. */
2503 pass_mode_switching.pass.execute ();
2504 return 0;
2507 struct rtl_opt_pass pass_insert_vzeroupper =
2510 RTL_PASS,
2511 "vzeroupper", /* name */
2512 OPTGROUP_NONE, /* optinfo_flags */
2513 gate_insert_vzeroupper, /* gate */
2514 rest_of_handle_insert_vzeroupper, /* execute */
2515 NULL, /* sub */
2516 NULL, /* next */
2517 0, /* static_pass_number */
2518 TV_NONE, /* tv_id */
2519 0, /* properties_required */
2520 0, /* properties_provided */
2521 0, /* properties_destroyed */
2522 0, /* todo_flags_start */
2523 TODO_df_finish | TODO_verify_rtl_sharing |
2524 0, /* todo_flags_finish */
2528 /* Return true if a red-zone is in use. */
2530 static inline bool
2531 ix86_using_red_zone (void)
2533 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2536 /* Return a string that documents the current -m options. The caller is
2537 responsible for freeing the string. */
2539 static char *
2540 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2541 const char *tune, enum fpmath_unit fpmath,
2542 bool add_nl_p)
2544 struct ix86_target_opts
2546 const char *option; /* option string */
2547 HOST_WIDE_INT mask; /* isa mask options */
2550 /* This table is ordered so that options like -msse4.2 that imply
2551 preceding options are matched first. */
2552 static struct ix86_target_opts isa_opts[] =
2554 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2555 { "-mfma", OPTION_MASK_ISA_FMA },
2556 { "-mxop", OPTION_MASK_ISA_XOP },
2557 { "-mlwp", OPTION_MASK_ISA_LWP },
2558 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2559 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2560 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2561 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2562 { "-msse3", OPTION_MASK_ISA_SSE3 },
2563 { "-msse2", OPTION_MASK_ISA_SSE2 },
2564 { "-msse", OPTION_MASK_ISA_SSE },
2565 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2566 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2567 { "-mmmx", OPTION_MASK_ISA_MMX },
2568 { "-mabm", OPTION_MASK_ISA_ABM },
2569 { "-mbmi", OPTION_MASK_ISA_BMI },
2570 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2571 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2572 { "-mhle", OPTION_MASK_ISA_HLE },
2573 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2574 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2575 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2576 { "-madx", OPTION_MASK_ISA_ADX },
2577 { "-mtbm", OPTION_MASK_ISA_TBM },
2578 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2579 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2580 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2581 { "-maes", OPTION_MASK_ISA_AES },
2582 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2583 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2584 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2585 { "-mf16c", OPTION_MASK_ISA_F16C },
2586 { "-mrtm", OPTION_MASK_ISA_RTM },
2587 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2588 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2591 /* Flag options. */
2592 static struct ix86_target_opts flag_opts[] =
2594 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2595 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2596 { "-m80387", MASK_80387 },
2597 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2598 { "-malign-double", MASK_ALIGN_DOUBLE },
2599 { "-mcld", MASK_CLD },
2600 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2601 { "-mieee-fp", MASK_IEEE_FP },
2602 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2603 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2604 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2605 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2606 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2607 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2608 { "-mno-red-zone", MASK_NO_RED_ZONE },
2609 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2610 { "-mrecip", MASK_RECIP },
2611 { "-mrtd", MASK_RTD },
2612 { "-msseregparm", MASK_SSEREGPARM },
2613 { "-mstack-arg-probe", MASK_STACK_PROBE },
2614 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2615 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2616 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2617 { "-mvzeroupper", MASK_VZEROUPPER },
2618 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2619 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2620 { "-mprefer-avx128", MASK_PREFER_AVX128},
2623 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2625 char isa_other[40];
2626 char target_other[40];
2627 unsigned num = 0;
2628 unsigned i, j;
2629 char *ret;
2630 char *ptr;
2631 size_t len;
2632 size_t line_len;
2633 size_t sep_len;
2634 const char *abi;
2636 memset (opts, '\0', sizeof (opts));
2638 /* Add -march= option. */
2639 if (arch)
2641 opts[num][0] = "-march=";
2642 opts[num++][1] = arch;
2645 /* Add -mtune= option. */
2646 if (tune)
2648 opts[num][0] = "-mtune=";
2649 opts[num++][1] = tune;
2652 /* Add -m32/-m64/-mx32. */
2653 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2655 if ((isa & OPTION_MASK_ABI_64) != 0)
2656 abi = "-m64";
2657 else
2658 abi = "-mx32";
2659 isa &= ~ (OPTION_MASK_ISA_64BIT
2660 | OPTION_MASK_ABI_64
2661 | OPTION_MASK_ABI_X32);
2663 else
2664 abi = "-m32";
2665 opts[num++][0] = abi;
2667 /* Pick out the options in isa options. */
2668 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2670 if ((isa & isa_opts[i].mask) != 0)
2672 opts[num++][0] = isa_opts[i].option;
2673 isa &= ~ isa_opts[i].mask;
2677 if (isa && add_nl_p)
2679 opts[num++][0] = isa_other;
2680 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2681 isa);
2684 /* Add flag options. */
2685 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2687 if ((flags & flag_opts[i].mask) != 0)
2689 opts[num++][0] = flag_opts[i].option;
2690 flags &= ~ flag_opts[i].mask;
2694 if (flags && add_nl_p)
2696 opts[num++][0] = target_other;
2697 sprintf (target_other, "(other flags: %#x)", flags);
2700 /* Add -fpmath= option. */
2701 if (fpmath)
2703 opts[num][0] = "-mfpmath=";
2704 switch ((int) fpmath)
2706 case FPMATH_387:
2707 opts[num++][1] = "387";
2708 break;
2710 case FPMATH_SSE:
2711 opts[num++][1] = "sse";
2712 break;
2714 case FPMATH_387 | FPMATH_SSE:
2715 opts[num++][1] = "sse+387";
2716 break;
2718 default:
2719 gcc_unreachable ();
2723 /* Any options? */
2724 if (num == 0)
2725 return NULL;
2727 gcc_assert (num < ARRAY_SIZE (opts));
2729 /* Size the string. */
2730 len = 0;
2731 sep_len = (add_nl_p) ? 3 : 1;
2732 for (i = 0; i < num; i++)
2734 len += sep_len;
2735 for (j = 0; j < 2; j++)
2736 if (opts[i][j])
2737 len += strlen (opts[i][j]);
2740 /* Build the string. */
2741 ret = ptr = (char *) xmalloc (len);
2742 line_len = 0;
2744 for (i = 0; i < num; i++)
2746 size_t len2[2];
2748 for (j = 0; j < 2; j++)
2749 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2751 if (i != 0)
2753 *ptr++ = ' ';
2754 line_len++;
2756 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2758 *ptr++ = '\\';
2759 *ptr++ = '\n';
2760 line_len = 0;
2764 for (j = 0; j < 2; j++)
2765 if (opts[i][j])
2767 memcpy (ptr, opts[i][j], len2[j]);
2768 ptr += len2[j];
2769 line_len += len2[j];
2773 *ptr = '\0';
2774 gcc_assert (ret + len >= ptr);
2776 return ret;
2779 /* Return true if profiling code should be emitted before the
2780 prologue, and false otherwise.
2781 Note: For x86 with "hotfix" (-mfentry) it is sorried. */
2782 static bool
2783 ix86_profile_before_prologue (void)
2785 return flag_fentry != 0;
2788 /* Function that is callable from the debugger to print the current
2789 options. */
2790 void
2791 ix86_debug_options (void)
2793 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2794 ix86_arch_string, ix86_tune_string,
2795 ix86_fpmath, true);
2797 if (opts)
2799 fprintf (stderr, "%s\n\n", opts);
2800 free (opts);
2802 else
2803 fputs ("<no options>\n\n", stderr);
2805 return;
2808 /* Override various settings based on options. If MAIN_ARGS_P, the
2809 options are from the command line, otherwise they are from
2810 attributes. */
2812 static void
2813 ix86_option_override_internal (bool main_args_p)
2815 int i;
2816 unsigned int ix86_arch_mask, ix86_tune_mask;
2817 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2818 const char *prefix;
2819 const char *suffix;
2820 const char *sw;
2822 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2823 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2824 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2825 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2826 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2827 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2828 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2829 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2830 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2831 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2832 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2833 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2834 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2835 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2836 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2837 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2838 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2839 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2840 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2841 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2842 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2843 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2844 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2845 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2846 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2847 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2848 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2849 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2850 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2851 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2852 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2853 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2854 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2855 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2856 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2857 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2858 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2859 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2860 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2861 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2863 /* if this reaches 64, need to widen struct pta flags below */
2865 static struct pta
2867 const char *const name; /* processor name or nickname. */
2868 const enum processor_type processor;
2869 const enum attr_cpu schedule;
2870 const unsigned HOST_WIDE_INT flags;
2872 const processor_alias_table[] =
2874 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2875 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2876 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2877 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2878 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2879 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2880 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2881 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2882 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2883 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2884 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2885 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2886 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_FXSR},
2888 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2889 PTA_MMX | PTA_SSE | PTA_FXSR},
2890 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2893 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2894 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2895 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2896 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2897 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2898 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2899 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2900 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2901 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2902 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2903 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2904 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2905 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2906 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2907 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2908 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2909 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2910 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2911 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2912 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2913 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2914 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2915 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2916 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2917 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2918 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2919 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2920 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2921 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2922 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2923 | PTA_XSAVEOPT},
2924 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2926 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2927 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2928 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2929 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2930 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2931 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2932 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2934 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2936 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2939 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2940 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2941 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2942 {"x86-64", PROCESSOR_K8, CPU_K8,
2943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2944 {"k8", PROCESSOR_K8, CPU_K8,
2945 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2946 | PTA_SSE2 | PTA_NO_SAHF},
2947 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2948 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2949 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2950 {"opteron", PROCESSOR_K8, CPU_K8,
2951 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2952 | PTA_SSE2 | PTA_NO_SAHF},
2953 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2954 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2955 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2956 {"athlon64", PROCESSOR_K8, CPU_K8,
2957 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2958 | PTA_SSE2 | PTA_NO_SAHF},
2959 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2961 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2962 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2964 | PTA_SSE2 | PTA_NO_SAHF},
2965 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2967 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2968 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2970 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2971 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2974 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2975 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2976 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2977 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2978 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2979 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2980 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2981 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2982 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2983 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2984 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2985 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2986 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2987 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2988 | PTA_XSAVEOPT},
2989 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2992 | PTA_FXSR | PTA_XSAVE},
2993 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2996 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2997 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2998 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3000 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3001 PTA_HLE /* flags are only used for -march switch. */ },
3002 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3003 PTA_64BIT
3004 | PTA_HLE /* flags are only used for -march switch. */ },
3007 /* -mrecip options. */
3008 static struct
3010 const char *string; /* option name */
3011 unsigned int mask; /* mask bits to set */
3013 const recip_options[] =
3015 { "all", RECIP_MASK_ALL },
3016 { "none", RECIP_MASK_NONE },
3017 { "div", RECIP_MASK_DIV },
3018 { "sqrt", RECIP_MASK_SQRT },
3019 { "vec-div", RECIP_MASK_VEC_DIV },
3020 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3023 int const pta_size = ARRAY_SIZE (processor_alias_table);
3025 /* Set up prefix/suffix so the error messages refer to either the command
3026 line argument, or the attribute(target). */
3027 if (main_args_p)
3029 prefix = "-m";
3030 suffix = "";
3031 sw = "switch";
3033 else
3035 prefix = "option(\"";
3036 suffix = "\")";
3037 sw = "attribute";
3040 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3041 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3042 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3043 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3044 #ifdef TARGET_BI_ARCH
3045 else
3047 #if TARGET_BI_ARCH == 1
3048 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3049 is on and OPTION_MASK_ABI_X32 is off. We turn off
3050 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3051 -mx32. */
3052 if (TARGET_X32)
3053 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3054 #else
3055 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3056 on and OPTION_MASK_ABI_64 is off. We turn off
3057 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3058 -m64. */
3059 if (TARGET_LP64)
3060 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3061 #endif
3063 #endif
3065 if (TARGET_X32)
3067 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3068 OPTION_MASK_ABI_64 for TARGET_X32. */
3069 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3070 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3072 else if (TARGET_LP64)
3074 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3075 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3076 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3077 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3080 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3081 SUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3084 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3085 SUBSUBTARGET_OVERRIDE_OPTIONS;
3086 #endif
3088 /* -fPIC is the default for x86_64. */
3089 if (TARGET_MACHO && TARGET_64BIT)
3090 flag_pic = 2;
3092 /* Need to check -mtune=generic first. */
3093 if (ix86_tune_string)
3095 if (!strcmp (ix86_tune_string, "generic")
3096 || !strcmp (ix86_tune_string, "i686")
3097 /* As special support for cross compilers we read -mtune=native
3098 as -mtune=generic. With native compilers we won't see the
3099 -mtune=native, as it was changed by the driver. */
3100 || !strcmp (ix86_tune_string, "native"))
3102 if (TARGET_64BIT)
3103 ix86_tune_string = "generic64";
3104 else
3105 ix86_tune_string = "generic32";
3107 /* If this call is for setting the option attribute, allow the
3108 generic32/generic64 that was previously set. */
3109 else if (!main_args_p
3110 && (!strcmp (ix86_tune_string, "generic32")
3111 || !strcmp (ix86_tune_string, "generic64")))
3113 else if (!strncmp (ix86_tune_string, "generic", 7))
3114 error ("bad value (%s) for %stune=%s %s",
3115 ix86_tune_string, prefix, suffix, sw);
3116 else if (!strcmp (ix86_tune_string, "x86-64"))
3117 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3118 "%stune=k8%s or %stune=generic%s instead as appropriate",
3119 prefix, suffix, prefix, suffix, prefix, suffix);
3121 else
3123 if (ix86_arch_string)
3124 ix86_tune_string = ix86_arch_string;
3125 if (!ix86_tune_string)
3127 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3128 ix86_tune_defaulted = 1;
3131 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3132 need to use a sensible tune option. */
3133 if (!strcmp (ix86_tune_string, "generic")
3134 || !strcmp (ix86_tune_string, "x86-64")
3135 || !strcmp (ix86_tune_string, "i686"))
3137 if (TARGET_64BIT)
3138 ix86_tune_string = "generic64";
3139 else
3140 ix86_tune_string = "generic32";
3144 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3146 /* rep; movq isn't available in 32-bit code. */
3147 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3148 ix86_stringop_alg = no_stringop;
3151 if (!ix86_arch_string)
3152 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3153 else
3154 ix86_arch_specified = 1;
3156 if (global_options_set.x_ix86_pmode)
3158 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3159 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3160 error ("address mode %qs not supported in the %s bit mode",
3161 TARGET_64BIT ? "short" : "long",
3162 TARGET_64BIT ? "64" : "32");
3164 else
3165 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3167 if (!global_options_set.x_ix86_abi)
3168 ix86_abi = DEFAULT_ABI;
3170 if (global_options_set.x_ix86_cmodel)
3172 switch (ix86_cmodel)
3174 case CM_SMALL:
3175 case CM_SMALL_PIC:
3176 if (flag_pic)
3177 ix86_cmodel = CM_SMALL_PIC;
3178 if (!TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "small", "32");
3181 break;
3183 case CM_MEDIUM:
3184 case CM_MEDIUM_PIC:
3185 if (flag_pic)
3186 ix86_cmodel = CM_MEDIUM_PIC;
3187 if (!TARGET_64BIT)
3188 error ("code model %qs not supported in the %s bit mode",
3189 "medium", "32");
3190 else if (TARGET_X32)
3191 error ("code model %qs not supported in x32 mode",
3192 "medium");
3193 break;
3195 case CM_LARGE:
3196 case CM_LARGE_PIC:
3197 if (flag_pic)
3198 ix86_cmodel = CM_LARGE_PIC;
3199 if (!TARGET_64BIT)
3200 error ("code model %qs not supported in the %s bit mode",
3201 "large", "32");
3202 else if (TARGET_X32)
3203 error ("code model %qs not supported in x32 mode",
3204 "large");
3205 break;
3207 case CM_32:
3208 if (flag_pic)
3209 error ("code model %s does not support PIC mode", "32");
3210 if (TARGET_64BIT)
3211 error ("code model %qs not supported in the %s bit mode",
3212 "32", "64");
3213 break;
3215 case CM_KERNEL:
3216 if (flag_pic)
3218 error ("code model %s does not support PIC mode", "kernel");
3219 ix86_cmodel = CM_32;
3221 if (!TARGET_64BIT)
3222 error ("code model %qs not supported in the %s bit mode",
3223 "kernel", "32");
3224 break;
3226 default:
3227 gcc_unreachable ();
3230 else
3232 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3233 use of rip-relative addressing. This eliminates fixups that
3234 would otherwise be needed if this object is to be placed in a
3235 DLL, and is essentially just as efficient as direct addressing. */
3236 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3237 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3238 else if (TARGET_64BIT && TARGET_RDOS)
3239 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3240 else if (TARGET_64BIT)
3241 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3242 else
3243 ix86_cmodel = CM_32;
3245 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3247 error ("-masm=intel not supported in this configuration");
3248 ix86_asm_dialect = ASM_ATT;
3250 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3251 sorry ("%i-bit mode not compiled in",
3252 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3254 for (i = 0; i < pta_size; i++)
3255 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3257 ix86_schedule = processor_alias_table[i].schedule;
3258 ix86_arch = processor_alias_table[i].processor;
3259 /* Default cpu tuning to the architecture. */
3260 ix86_tune = ix86_arch;
3262 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3263 error ("CPU you selected does not support x86-64 "
3264 "instruction set");
3266 if (processor_alias_table[i].flags & PTA_MMX
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3268 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3269 if (processor_alias_table[i].flags & PTA_3DNOW
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3271 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3272 if (processor_alias_table[i].flags & PTA_3DNOW_A
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3274 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3275 if (processor_alias_table[i].flags & PTA_SSE
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3278 if (processor_alias_table[i].flags & PTA_SSE2
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3281 if (processor_alias_table[i].flags & PTA_SSE3
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3284 if (processor_alias_table[i].flags & PTA_SSSE3
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3287 if (processor_alias_table[i].flags & PTA_SSE4_1
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3289 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3290 if (processor_alias_table[i].flags & PTA_SSE4_2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3293 if (processor_alias_table[i].flags & PTA_AVX
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3295 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3296 if (processor_alias_table[i].flags & PTA_AVX2
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3298 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3299 if (processor_alias_table[i].flags & PTA_FMA
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3302 if (processor_alias_table[i].flags & PTA_SSE4A
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3304 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3305 if (processor_alias_table[i].flags & PTA_FMA4
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3307 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3308 if (processor_alias_table[i].flags & PTA_XOP
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3310 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3311 if (processor_alias_table[i].flags & PTA_LWP
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3313 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3314 if (processor_alias_table[i].flags & PTA_ABM
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3316 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3317 if (processor_alias_table[i].flags & PTA_BMI
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3319 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3320 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3322 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3323 if (processor_alias_table[i].flags & PTA_TBM
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3325 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3326 if (processor_alias_table[i].flags & PTA_BMI2
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3328 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3329 if (processor_alias_table[i].flags & PTA_CX16
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3331 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3332 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3334 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3335 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3337 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3338 if (processor_alias_table[i].flags & PTA_MOVBE
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3340 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3341 if (processor_alias_table[i].flags & PTA_AES
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3343 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3344 if (processor_alias_table[i].flags & PTA_PCLMUL
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3346 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3347 if (processor_alias_table[i].flags & PTA_FSGSBASE
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3349 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3350 if (processor_alias_table[i].flags & PTA_RDRND
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3353 if (processor_alias_table[i].flags & PTA_F16C
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3355 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3356 if (processor_alias_table[i].flags & PTA_RTM
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3358 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3359 if (processor_alias_table[i].flags & PTA_HLE
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3361 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3362 if (processor_alias_table[i].flags & PTA_PRFCHW
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3364 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3365 if (processor_alias_table[i].flags & PTA_RDSEED
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3367 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3368 if (processor_alias_table[i].flags & PTA_ADX
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3370 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3371 if (processor_alias_table[i].flags & PTA_FXSR
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3373 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3374 if (processor_alias_table[i].flags & PTA_XSAVE
3375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3376 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3377 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3378 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3379 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3380 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3381 x86_prefetch_sse = true;
3383 break;
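/* Editor's illustrative sketch (standalone; guarded out of the build):
   the pattern used in the block above -- "enable a feature implied by
   -march= unless the user already decided with an explicit -m/-mno-
   flag" -- reduced to plain bit masks.  The EX_* names are hypothetical
   stand-ins for the PTA_* / OPTION_MASK_ISA_* macros.  */
#if 0
#include <stdio.h>

#define EX_PTA_SSE (1u << 0)	/* alias-table entry implies SSE */
#define EX_ISA_SSE (1u << 0)	/* SSE bit in the ISA flag word  */

static unsigned int
ex_merge_isa (unsigned int table_flags, unsigned int isa,
	      unsigned int isa_explicit)
{
  /* Turn the bit on only when the table requests it and the user has
     not already chosen either way on the command line.  */
  if ((table_flags & EX_PTA_SSE) && !(isa_explicit & EX_ISA_SSE))
    isa |= EX_ISA_SSE;
  return isa;
}

int
main (void)
{
  /* -march implies SSE and the user said nothing: SSE is enabled.  */
  printf ("%u\n", ex_merge_isa (EX_PTA_SSE, 0, 0));		/* 1 */
  /* -march implies SSE but the user passed -mno-sse: left alone.  */
  printf ("%u\n", ex_merge_isa (EX_PTA_SSE, 0, EX_ISA_SSE));	/* 0 */
  return 0;
}
#endif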
3386 if (!strcmp (ix86_arch_string, "generic"))
3387 error ("generic CPU can be used only for %stune=%s %s",
3388 prefix, suffix, sw);
3389 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3390 error ("bad value (%s) for %sarch=%s %s",
3391 ix86_arch_string, prefix, suffix, sw);
3393 ix86_arch_mask = 1u << ix86_arch;
3394 for (i = 0; i < X86_ARCH_LAST; ++i)
3395 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
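/* Editor's illustrative sketch (standalone; guarded out of the build):
   each entry of the feature tables consulted above is a bit mask over
   processor ids, so selecting the current architecture is a single bit
   test.  The values below are hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* A tuning/arch feature present only on (hypothetical) cpus 2 and 5.  */
  unsigned int feature_mask = (1u << 2) | (1u << 5);
  unsigned int arch = 5;		/* the architecture in use */
  unsigned int arch_mask = 1u << arch;

  printf ("feature enabled: %d\n", !!(feature_mask & arch_mask));  /* 1 */
  return 0;
}
#endif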
3397 for (i = 0; i < pta_size; i++)
3398 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3400 ix86_schedule = processor_alias_table[i].schedule;
3401 ix86_tune = processor_alias_table[i].processor;
3402 if (TARGET_64BIT)
3404 if (!(processor_alias_table[i].flags & PTA_64BIT))
3406 if (ix86_tune_defaulted)
3408 ix86_tune_string = "x86-64";
3409 for (i = 0; i < pta_size; i++)
3410 if (! strcmp (ix86_tune_string,
3411 processor_alias_table[i].name))
3412 break;
3413 ix86_schedule = processor_alias_table[i].schedule;
3414 ix86_tune = processor_alias_table[i].processor;
3416 else
3417 error ("CPU you selected does not support x86-64 "
3418 "instruction set");
3421 else
3423 /* Adjust tuning when compiling for 32-bit ABI. */
3424 switch (ix86_tune)
3426 case PROCESSOR_GENERIC64:
3427 ix86_tune = PROCESSOR_GENERIC32;
3428 ix86_schedule = CPU_PENTIUMPRO;
3429 break;
3431 default:
3432 break;
3435 /* Intel CPUs have always interpreted SSE prefetch instructions as
3436 NOPs; so, we can enable SSE prefetch instructions even when
3437 -mtune (rather than -march) points us to a processor that has them.
3438 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3439 higher processors. */
3440 if (TARGET_CMOV
3441 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3442 x86_prefetch_sse = true;
3443 break;
3446 if (ix86_tune_specified && i == pta_size)
3447 error ("bad value (%s) for %stune=%s %s",
3448 ix86_tune_string, prefix, suffix, sw);
3450 ix86_tune_mask = 1u << ix86_tune;
3451 for (i = 0; i < X86_TUNE_LAST; ++i)
3452 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3454 #ifndef USE_IX86_FRAME_POINTER
3455 #define USE_IX86_FRAME_POINTER 0
3456 #endif
3458 #ifndef USE_X86_64_FRAME_POINTER
3459 #define USE_X86_64_FRAME_POINTER 0
3460 #endif
3462 /* Set the default values for switches whose default depends on TARGET_64BIT
3463 in case they weren't overwritten by command line options. */
3464 if (TARGET_64BIT)
3466 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3467 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3468 if (flag_asynchronous_unwind_tables == 2)
3469 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3470 if (flag_pcc_struct_return == 2)
3471 flag_pcc_struct_return = 0;
3473 else
3475 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3476 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3477 if (flag_asynchronous_unwind_tables == 2)
3478 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3479 if (flag_pcc_struct_return == 2)
3480 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3483 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3484 if (optimize_size)
3485 ix86_cost = &ix86_size_cost;
3486 else
3487 ix86_cost = ix86_tune_cost;
3489 /* Arrange to set up i386_stack_locals for all functions. */
3490 init_machine_status = ix86_init_machine_status;
3492 /* Validate -mregparm= value. */
3493 if (global_options_set.x_ix86_regparm)
3495 if (TARGET_64BIT)
3496 warning (0, "-mregparm is ignored in 64-bit mode");
3497 if (ix86_regparm > REGPARM_MAX)
3499 error ("-mregparm=%d is not between 0 and %d",
3500 ix86_regparm, REGPARM_MAX);
3501 ix86_regparm = 0;
3504 if (TARGET_64BIT)
3505 ix86_regparm = REGPARM_MAX;
3507 /* Default align_* from the processor table. */
3508 if (align_loops == 0)
3510 align_loops = processor_target_table[ix86_tune].align_loop;
3511 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3513 if (align_jumps == 0)
3515 align_jumps = processor_target_table[ix86_tune].align_jump;
3516 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3518 if (align_functions == 0)
3520 align_functions = processor_target_table[ix86_tune].align_func;
3523 /* Provide default for -mbranch-cost= value. */
3524 if (!global_options_set.x_ix86_branch_cost)
3525 ix86_branch_cost = ix86_cost->branch_cost;
3527 if (TARGET_64BIT)
3529 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3531 /* Enable by default the SSE and MMX builtins. Do allow the user to
3532 explicitly disable any of these. In particular, disabling SSE and
3533 MMX for kernel code is extremely useful. */
3534 if (!ix86_arch_specified)
3535 ix86_isa_flags
3536 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3537 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3539 if (TARGET_RTD)
3540 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3542 else
3544 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3546 if (!ix86_arch_specified)
3547 ix86_isa_flags
3548 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3550 /* The i386 ABI does not specify a red zone.  It still makes sense to use
3551 one when the programmer takes care to keep the stack from being destroyed. */
3552 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3553 target_flags |= MASK_NO_RED_ZONE;
3556 /* Keep nonleaf frame pointers. */
3557 if (flag_omit_frame_pointer)
3558 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3559 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3560 flag_omit_frame_pointer = 1;
3562 /* If we're doing fast math, we don't care about comparison order
3563 wrt NaNs. This lets us use a shorter comparison sequence. */
3564 if (flag_finite_math_only)
3565 target_flags &= ~MASK_IEEE_FP;
3567 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3568 since the insns won't need emulation. */
3569 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3570 target_flags &= ~MASK_NO_FANCY_MATH_387;
3572 /* Likewise, if the target doesn't have a 387, or we've specified
3573 software floating point, don't use 387 inline intrinsics. */
3574 if (!TARGET_80387)
3575 target_flags |= MASK_NO_FANCY_MATH_387;
3577 /* Turn on MMX builtins for -msse. */
3578 if (TARGET_SSE)
3579 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3581 /* Enable SSE prefetch. */
3582 if (TARGET_SSE || TARGET_PRFCHW)
3583 x86_prefetch_sse = true;
3585 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3586 if (TARGET_SSE4_2 || TARGET_ABM)
3587 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3589 /* Turn on lzcnt instruction for -mabm. */
3590 if (TARGET_ABM)
3591 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3593 /* Validate -mpreferred-stack-boundary= value or default it to
3594 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3595 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3596 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3598 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3599 int max = (TARGET_SEH ? 4 : 12);
3601 if (ix86_preferred_stack_boundary_arg < min
3602 || ix86_preferred_stack_boundary_arg > max)
3604 if (min == max)
3605 error ("-mpreferred-stack-boundary is not supported "
3606 "for this target");
3607 else
3608 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3609 ix86_preferred_stack_boundary_arg, min, max);
3611 else
3612 ix86_preferred_stack_boundary
3613 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
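/* Editor's illustrative sketch (standalone; guarded out of the build):
   the -mpreferred-stack-boundary= argument is log2 of the boundary in
   bytes and is converted to bits above; e.g. an argument of 4 yields
   (1 << 4) * 8 = 128 bits, a 16-byte aligned stack.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const int bits_per_unit = 8;		/* BITS_PER_UNIT on this target */
  int arg;

  for (arg = 2; arg <= 4; arg++)
    printf ("-mpreferred-stack-boundary=%d -> %d bits\n",
	    arg, (1 << arg) * bits_per_unit);
  return 0;
}
#endif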
3616 /* Set the default value for -mstackrealign. */
3617 if (ix86_force_align_arg_pointer == -1)
3618 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3620 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3622 /* Validate -mincoming-stack-boundary= value or default it to
3623 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3624 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3625 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3627 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3628 || ix86_incoming_stack_boundary_arg > 12)
3629 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3630 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3631 else
3633 ix86_user_incoming_stack_boundary
3634 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3635 ix86_incoming_stack_boundary
3636 = ix86_user_incoming_stack_boundary;
3640 /* Accept -msseregparm only if at least SSE support is enabled. */
3641 if (TARGET_SSEREGPARM
3642 && ! TARGET_SSE)
3643 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3645 if (global_options_set.x_ix86_fpmath)
3647 if (ix86_fpmath & FPMATH_SSE)
3649 if (!TARGET_SSE)
3651 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3652 ix86_fpmath = FPMATH_387;
3654 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3656 warning (0, "387 instruction set disabled, using SSE arithmetics");
3657 ix86_fpmath = FPMATH_SSE;
3661 else
3662 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3664 /* If the i387 is disabled, then do not return values in it. */
3665 if (!TARGET_80387)
3666 target_flags &= ~MASK_FLOAT_RETURNS;
3668 /* Use an external vectorized library when vectorizing intrinsics. */
3669 if (global_options_set.x_ix86_veclibabi_type)
3670 switch (ix86_veclibabi_type)
3672 case ix86_veclibabi_type_svml:
3673 ix86_veclib_handler = ix86_veclibabi_svml;
3674 break;
3676 case ix86_veclibabi_type_acml:
3677 ix86_veclib_handler = ix86_veclibabi_acml;
3678 break;
3680 default:
3681 gcc_unreachable ();
3684 if ((!USE_IX86_FRAME_POINTER
3685 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3686 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3687 && !optimize_size)
3688 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* ??? Unwind info is not correct around the CFG unless either a frame
3691 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3692 unwind info generation to be aware of the CFG and propagating states
3693 around edges. */
3694 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3695 || flag_exceptions || flag_non_call_exceptions)
3696 && flag_omit_frame_pointer
3697 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3699 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3700 warning (0, "unwind tables currently require either a frame pointer "
3701 "or %saccumulate-outgoing-args%s for correctness",
3702 prefix, suffix);
3703 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 /* If stack probes are required, the space used for large function
3707 arguments on the stack must also be probed, so enable
3708 -maccumulate-outgoing-args so this happens in the prologue. */
3709 if (TARGET_STACK_PROBE
3710 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3712 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3713 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3714 "for correctness", prefix, suffix);
3715 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3720 char *p;
3721 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3722 p = strchr (internal_label_prefix, 'X');
3723 internal_label_prefix_len = p - internal_label_prefix;
3724 *p = '\0';
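/* Editor's illustrative sketch (standalone; guarded out of the build):
   how the prefix length is recovered above.  The label text ".LX0" is
   an assumption standing in for whatever ASM_GENERATE_INTERNAL_LABEL
   expands to on a given target; everything before the 'X' of the "LX"
   request is the assembler-internal label prefix.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char label[16] = ".LX0";		/* hypothetical expansion */
  char *p = strchr (label, 'X');
  size_t prefix_len = (size_t) (p - label);

  *p = '\0';				/* truncate at the 'X' */
  printf ("prefix \"%s\", length %zu\n", label, prefix_len);	/* ".L", 2 */
  return 0;
}
#endif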
3727 /* When no scheduling description is available, disable the scheduler pass
3728 so it won't slow down compilation and make x87 code slower. */
3729 if (!TARGET_SCHEDULE)
3730 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3732 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3733 ix86_tune_cost->simultaneous_prefetches,
3734 global_options.x_param_values,
3735 global_options_set.x_param_values);
3736 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3737 ix86_tune_cost->prefetch_block,
3738 global_options.x_param_values,
3739 global_options_set.x_param_values);
3740 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3741 ix86_tune_cost->l1_cache_size,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3745 ix86_tune_cost->l2_cache_size,
3746 global_options.x_param_values,
3747 global_options_set.x_param_values);
3749 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3750 if (flag_prefetch_loop_arrays < 0
3751 && HAVE_prefetch
3752 && (optimize >= 3 || flag_profile_use)
3753 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3754 flag_prefetch_loop_arrays = 1;
3756 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3757 can be optimized to ap = __builtin_next_arg (0). */
3758 if (!TARGET_64BIT && !flag_split_stack)
3759 targetm.expand_builtin_va_start = NULL;
3761 if (TARGET_64BIT)
3763 ix86_gen_leave = gen_leave_rex64;
3764 if (Pmode == DImode)
3766 ix86_gen_monitor = gen_sse3_monitor64_di;
3767 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3768 ix86_gen_tls_local_dynamic_base_64
3769 = gen_tls_local_dynamic_base_64_di;
3771 else
3773 ix86_gen_monitor = gen_sse3_monitor64_si;
3774 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3775 ix86_gen_tls_local_dynamic_base_64
3776 = gen_tls_local_dynamic_base_64_si;
3779 else
3781 ix86_gen_leave = gen_leave;
3782 ix86_gen_monitor = gen_sse3_monitor;
3785 if (Pmode == DImode)
3787 ix86_gen_add3 = gen_adddi3;
3788 ix86_gen_sub3 = gen_subdi3;
3789 ix86_gen_sub3_carry = gen_subdi3_carry;
3790 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3791 ix86_gen_andsp = gen_anddi3;
3792 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3793 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3794 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3796 else
3798 ix86_gen_add3 = gen_addsi3;
3799 ix86_gen_sub3 = gen_subsi3;
3800 ix86_gen_sub3_carry = gen_subsi3_carry;
3801 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3802 ix86_gen_andsp = gen_andsi3;
3803 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3804 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3805 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3808 #ifdef USE_IX86_CLD
3809 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3810 if (!TARGET_64BIT)
3811 target_flags |= MASK_CLD & ~target_flags_explicit;
3812 #endif
3814 if (!TARGET_64BIT && flag_pic)
3816 if (flag_fentry > 0)
3817 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3818 "with -fpic");
3819 flag_fentry = 0;
3821 else if (TARGET_SEH)
3823 if (flag_fentry == 0)
3824 sorry ("-mno-fentry isn%'t compatible with SEH");
3825 flag_fentry = 1;
3827 else if (flag_fentry < 0)
3829 #if defined(PROFILE_BEFORE_PROLOGUE)
3830 flag_fentry = 1;
3831 #else
3832 flag_fentry = 0;
3833 #endif
3836 if (TARGET_AVX)
3838 /* When not optimizing for size, enable the vzeroupper optimization for
3839 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3840 AVX unaligned load/store. */
3841 if (!optimize_size)
3843 if (flag_expensive_optimizations
3844 && !(target_flags_explicit & MASK_VZEROUPPER))
3845 target_flags |= MASK_VZEROUPPER;
3846 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3847 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3848 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3849 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3850 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3851 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3852 /* Enable 128-bit AVX instruction generation
3853 for the auto-vectorizer. */
3854 if (TARGET_AVX128_OPTIMAL
3855 && !(target_flags_explicit & MASK_PREFER_AVX128))
3856 target_flags |= MASK_PREFER_AVX128;
3859 else
3861 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3862 target_flags &= ~MASK_VZEROUPPER;
3865 if (ix86_recip_name)
3867 char *p = ASTRDUP (ix86_recip_name);
3868 char *q;
3869 unsigned int mask, i;
3870 bool invert;
3872 while ((q = strtok (p, ",")) != NULL)
3874 p = NULL;
3875 if (*q == '!')
3877 invert = true;
3878 q++;
3880 else
3881 invert = false;
3883 if (!strcmp (q, "default"))
3884 mask = RECIP_MASK_ALL;
3885 else
3887 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3888 if (!strcmp (q, recip_options[i].string))
3890 mask = recip_options[i].mask;
3891 break;
3894 if (i == ARRAY_SIZE (recip_options))
3896 error ("unknown option for -mrecip=%s", q);
3897 invert = false;
3898 mask = RECIP_MASK_NONE;
3902 recip_mask_explicit |= mask;
3903 if (invert)
3904 recip_mask &= ~mask;
3905 else
3906 recip_mask |= mask;
3910 if (TARGET_RECIP)
3911 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3912 else if (target_flags_explicit & MASK_RECIP)
3913 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
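/* Editor's illustrative sketch (standalone; guarded out of the build):
   the -mrecip= parser above in miniature -- split on commas, honor a
   leading '!' as "turn this bit off", and accumulate a mask.  The
   table and mask values are hypothetical stand-ins for recip_options
   and RECIP_MASK_*.  */
#if 0
#include <stdio.h>
#include <string.h>

struct ex_opt { const char *name; unsigned int mask; };
static const struct ex_opt ex_opts[] = {
  { "div",  1u << 0 },
  { "sqrt", 1u << 1 }
};

int
main (void)
{
  char arg[] = "sqrt,!div";
  unsigned int mask = 1u << 0;		/* assume "div" was already on */
  char *q;
  size_t i;

  for (q = strtok (arg, ","); q; q = strtok (NULL, ","))
    {
      int invert = (*q == '!');
      if (invert)
	q++;
      for (i = 0; i < sizeof ex_opts / sizeof ex_opts[0]; i++)
	if (!strcmp (q, ex_opts[i].name))
	  {
	    if (invert)
	      mask &= ~ex_opts[i].mask;
	    else
	      mask |= ex_opts[i].mask;
	    break;
	  }
    }
  printf ("mask = %#x\n", mask);	/* 0x2: sqrt on, div off */
  return 0;
}
#endif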
3915 /* Default long double to 64-bit for Bionic. */
3916 if (TARGET_HAS_BIONIC
3917 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3918 target_flags |= MASK_LONG_DOUBLE_64;
3920 /* Save the initial options in case the user uses function-specific
3921 options. */
3922 if (main_args_p)
3923 target_option_default_node = target_option_current_node
3924 = build_target_option_node ();
3927 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3929 static void
3930 ix86_option_override (void)
3932 static struct register_pass_info insert_vzeroupper_info
3933 = { &pass_insert_vzeroupper.pass, "reload",
3934 1, PASS_POS_INSERT_AFTER
3937 ix86_option_override_internal (true);
3940 /* This needs to be done at start-up.  It's convenient to do it here. */
3941 register_pass (&insert_vzeroupper_info);
3944 /* Update register usage after having seen the compiler flags. */
3946 static void
3947 ix86_conditional_register_usage (void)
3949 int i, c_mask;
3950 unsigned int j;
3952 /* The PIC register, if it exists, is fixed. */
3953 j = PIC_OFFSET_TABLE_REGNUM;
3954 if (j != INVALID_REGNUM)
3955 fixed_regs[j] = call_used_regs[j] = 1;
3957 /* For 32-bit targets, squash the REX registers. */
3958 if (! TARGET_64BIT)
3960 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3961 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3963 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3966 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3967 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3968 : TARGET_64BIT ? (1 << 2)
3969 : (1 << 1));
3971 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3973 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 /* Set/reset conditionally defined registers from
3976 CALL_USED_REGISTERS initializer. */
3977 if (call_used_regs[i] > 1)
3978 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3980 /* Compute the CLOBBERED_REGS register set as the call-used
3981 registers of the GENERAL_REGS register set. */
3982 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3983 && call_used_regs[i])
3984 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3987 /* If MMX is disabled, squash the registers. */
3988 if (! TARGET_MMX)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3993 /* If SSE is disabled, squash the registers. */
3994 if (! TARGET_SSE)
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3999 /* If the FPU is disabled, squash the registers. */
4000 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4001 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4002 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4003 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
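/* Editor's illustrative sketch (standalone; guarded out of the build):
   how a conditional entry in the CALL_USED_REGISTERS initializer is
   resolved above.  An entry greater than 1 is a per-ABI bit mask and
   c_mask (bit 1 = 32-bit, bit 2 = 64-bit SysV, bit 3 = 64-bit MS)
   selects the bit for the current configuration.  The entry value
   below is hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int entry = (1 << 2) | (1 << 3);	/* call-used in both 64-bit ABIs only */
  int c_mask_32 = 1 << 1;
  int c_mask_64 = 1 << 2;

  printf ("32-bit: %d\n", !!(entry & c_mask_32));	/* 0: call-saved */
  printf ("64-bit: %d\n", !!(entry & c_mask_64));	/* 1: call-used  */
  return 0;
}
#endif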
4007 /* Save the current options */
4009 static void
4010 ix86_function_specific_save (struct cl_target_option *ptr)
4012 ptr->arch = ix86_arch;
4013 ptr->schedule = ix86_schedule;
4014 ptr->tune = ix86_tune;
4015 ptr->branch_cost = ix86_branch_cost;
4016 ptr->tune_defaulted = ix86_tune_defaulted;
4017 ptr->arch_specified = ix86_arch_specified;
4018 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4019 ptr->ix86_target_flags_explicit = target_flags_explicit;
4020 ptr->x_recip_mask_explicit = recip_mask_explicit;
4022 /* The fields are char but the variables are not; make sure the
4023 values fit in the fields. */
4024 gcc_assert (ptr->arch == ix86_arch);
4025 gcc_assert (ptr->schedule == ix86_schedule);
4026 gcc_assert (ptr->tune == ix86_tune);
4027 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4030 /* Restore the current options */
4032 static void
4033 ix86_function_specific_restore (struct cl_target_option *ptr)
4035 enum processor_type old_tune = ix86_tune;
4036 enum processor_type old_arch = ix86_arch;
4037 unsigned int ix86_arch_mask, ix86_tune_mask;
4038 int i;
4040 ix86_arch = (enum processor_type) ptr->arch;
4041 ix86_schedule = (enum attr_cpu) ptr->schedule;
4042 ix86_tune = (enum processor_type) ptr->tune;
4043 ix86_branch_cost = ptr->branch_cost;
4044 ix86_tune_defaulted = ptr->tune_defaulted;
4045 ix86_arch_specified = ptr->arch_specified;
4046 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4047 target_flags_explicit = ptr->ix86_target_flags_explicit;
4048 recip_mask_explicit = ptr->x_recip_mask_explicit;
4050 /* Recreate the arch feature tests if the arch changed */
4051 if (old_arch != ix86_arch)
4053 ix86_arch_mask = 1u << ix86_arch;
4054 for (i = 0; i < X86_ARCH_LAST; ++i)
4055 ix86_arch_features[i]
4056 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4059 /* Recreate the tune optimization tests */
4060 if (old_tune != ix86_tune)
4062 ix86_tune_mask = 1u << ix86_tune;
4063 for (i = 0; i < X86_TUNE_LAST; ++i)
4064 ix86_tune_features[i]
4065 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4069 /* Print the current options */
4071 static void
4072 ix86_function_specific_print (FILE *file, int indent,
4073 struct cl_target_option *ptr)
4075 char *target_string
4076 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4077 NULL, NULL, ptr->x_ix86_fpmath, false);
4079 fprintf (file, "%*sarch = %d (%s)\n",
4080 indent, "",
4081 ptr->arch,
4082 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4083 ? cpu_names[ptr->arch]
4084 : "<unknown>"));
4086 fprintf (file, "%*stune = %d (%s)\n",
4087 indent, "",
4088 ptr->tune,
4089 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4090 ? cpu_names[ptr->tune]
4091 : "<unknown>"));
4093 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4095 if (target_string)
4097 fprintf (file, "%*s%s\n", indent, "", target_string);
4098 free (target_string);
4103 /* Inner function to process the attribute((target(...))), take an argument and
4104 set the current options from the argument. If we have a list, recursively go
4105 over the list. */
4107 static bool
4108 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4109 struct gcc_options *enum_opts_set)
4111 char *next_optstr;
4112 bool ret = true;
4114 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4115 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4116 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4117 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4118 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4120 enum ix86_opt_type
4122 ix86_opt_unknown,
4123 ix86_opt_yes,
4124 ix86_opt_no,
4125 ix86_opt_str,
4126 ix86_opt_enum,
4127 ix86_opt_isa
4130 static const struct
4132 const char *string;
4133 size_t len;
4134 enum ix86_opt_type type;
4135 int opt;
4136 int mask;
4137 } attrs[] = {
4138 /* isa options */
4139 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4140 IX86_ATTR_ISA ("abm", OPT_mabm),
4141 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4142 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4143 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4144 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4145 IX86_ATTR_ISA ("aes", OPT_maes),
4146 IX86_ATTR_ISA ("avx", OPT_mavx),
4147 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4148 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4149 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4150 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4151 IX86_ATTR_ISA ("sse", OPT_msse),
4152 IX86_ATTR_ISA ("sse2", OPT_msse2),
4153 IX86_ATTR_ISA ("sse3", OPT_msse3),
4154 IX86_ATTR_ISA ("sse4", OPT_msse4),
4155 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4156 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4157 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4158 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4159 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4160 IX86_ATTR_ISA ("fma", OPT_mfma),
4161 IX86_ATTR_ISA ("xop", OPT_mxop),
4162 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4163 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4164 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4165 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4166 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4167 IX86_ATTR_ISA ("hle", OPT_mhle),
4168 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4169 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4170 IX86_ATTR_ISA ("adx", OPT_madx),
4171 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4172 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4173 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4175 /* enum options */
4176 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4178 /* string options */
4179 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4180 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4182 /* flag options */
4183 IX86_ATTR_YES ("cld",
4184 OPT_mcld,
4185 MASK_CLD),
4187 IX86_ATTR_NO ("fancy-math-387",
4188 OPT_mfancy_math_387,
4189 MASK_NO_FANCY_MATH_387),
4191 IX86_ATTR_YES ("ieee-fp",
4192 OPT_mieee_fp,
4193 MASK_IEEE_FP),
4195 IX86_ATTR_YES ("inline-all-stringops",
4196 OPT_minline_all_stringops,
4197 MASK_INLINE_ALL_STRINGOPS),
4199 IX86_ATTR_YES ("inline-stringops-dynamically",
4200 OPT_minline_stringops_dynamically,
4201 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4203 IX86_ATTR_NO ("align-stringops",
4204 OPT_mno_align_stringops,
4205 MASK_NO_ALIGN_STRINGOPS),
4207 IX86_ATTR_YES ("recip",
4208 OPT_mrecip,
4209 MASK_RECIP),
4213 /* If this is a list, recurse to get the options. */
4214 if (TREE_CODE (args) == TREE_LIST)
4216 bool ret = true;
4218 for (; args; args = TREE_CHAIN (args))
4219 if (TREE_VALUE (args)
4220 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4221 p_strings, enum_opts_set))
4222 ret = false;
4224 return ret;
4227 else if (TREE_CODE (args) != STRING_CST)
4229 error ("attribute %<target%> argument not a string");
4230 return false;
4233 /* Handle multiple arguments separated by commas. */
4234 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4236 while (next_optstr && *next_optstr != '\0')
4238 char *p = next_optstr;
4239 char *orig_p = p;
4240 char *comma = strchr (next_optstr, ',');
4241 const char *opt_string;
4242 size_t len, opt_len;
4243 int opt;
4244 bool opt_set_p;
4245 char ch;
4246 unsigned i;
4247 enum ix86_opt_type type = ix86_opt_unknown;
4248 int mask = 0;
4250 if (comma)
4252 *comma = '\0';
4253 len = comma - next_optstr;
4254 next_optstr = comma + 1;
4256 else
4258 len = strlen (p);
4259 next_optstr = NULL;
4262 /* Recognize no-xxx. */
4263 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4265 opt_set_p = false;
4266 p += 3;
4267 len -= 3;
4269 else
4270 opt_set_p = true;
4272 /* Find the option. */
4273 ch = *p;
4274 opt = N_OPTS;
4275 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4277 type = attrs[i].type;
4278 opt_len = attrs[i].len;
4279 if (ch == attrs[i].string[0]
4280 && ((type != ix86_opt_str && type != ix86_opt_enum)
4281 ? len == opt_len
4282 : len > opt_len)
4283 && memcmp (p, attrs[i].string, opt_len) == 0)
4285 opt = attrs[i].opt;
4286 mask = attrs[i].mask;
4287 opt_string = attrs[i].string;
4288 break;
4292 /* Process the option. */
4293 if (opt == N_OPTS)
4295 error ("attribute(target(\"%s\")) is unknown", orig_p);
4296 ret = false;
4299 else if (type == ix86_opt_isa)
4301 struct cl_decoded_option decoded;
4303 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4304 ix86_handle_option (&global_options, &global_options_set,
4305 &decoded, input_location);
4308 else if (type == ix86_opt_yes || type == ix86_opt_no)
4310 if (type == ix86_opt_no)
4311 opt_set_p = !opt_set_p;
4313 if (opt_set_p)
4314 target_flags |= mask;
4315 else
4316 target_flags &= ~mask;
4319 else if (type == ix86_opt_str)
4321 if (p_strings[opt])
4323 error ("option(\"%s\") was already specified", opt_string);
4324 ret = false;
4326 else
4327 p_strings[opt] = xstrdup (p + opt_len);
4330 else if (type == ix86_opt_enum)
4332 bool arg_ok;
4333 int value;
4335 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4336 if (arg_ok)
4337 set_option (&global_options, enum_opts_set, opt, value,
4338 p + opt_len, DK_UNSPECIFIED, input_location,
4339 global_dc);
4340 else
4342 error ("attribute(target(\"%s\")) is unknown", orig_p);
4343 ret = false;
4347 else
4348 gcc_unreachable ();
4351 return ret;
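/* Editor's illustrative sketch (standalone; guarded out of the build):
   recognizing the "no-" prefix and matching an attribute token against
   a length-tagged table, as the loop above does.  The table contents
   are hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

struct ex_attr { const char *string; size_t len; };
static const struct ex_attr ex_attrs[] = {
  { "sse4.2", 6 },
  { "avx",    3 }
};

int
main (void)
{
  const char *token = "no-avx";
  size_t len = strlen (token);
  size_t i;
  int set_p = 1;

  /* A "no-" prefix negates the option.  */
  if (len > 3 && !memcmp (token, "no-", 3))
    {
      set_p = 0;
      token += 3;
      len -= 3;
    }

  for (i = 0; i < sizeof ex_attrs / sizeof ex_attrs[0]; i++)
    if (len == ex_attrs[i].len && !memcmp (token, ex_attrs[i].string, len))
      printf ("option \"%s\", enable = %d\n", ex_attrs[i].string, set_p);
  return 0;
}
#endif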
4354 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4356 tree
4357 ix86_valid_target_attribute_tree (tree args)
4359 const char *orig_arch_string = ix86_arch_string;
4360 const char *orig_tune_string = ix86_tune_string;
4361 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4362 int orig_tune_defaulted = ix86_tune_defaulted;
4363 int orig_arch_specified = ix86_arch_specified;
4364 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4365 tree t = NULL_TREE;
4366 int i;
4367 struct cl_target_option *def
4368 = TREE_TARGET_OPTION (target_option_default_node);
4369 struct gcc_options enum_opts_set;
4371 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4373 /* Process each of the options on the chain. */
4374 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4375 &enum_opts_set))
4376 return error_mark_node;
4378 /* If the changed options are different from the default, rerun
4379 ix86_option_override_internal, and then save the options away.
4380 The string options are attribute options, and will be undone
4381 when we copy the save structure. */
4382 if (ix86_isa_flags != def->x_ix86_isa_flags
4383 || target_flags != def->x_target_flags
4384 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4385 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4386 || enum_opts_set.x_ix86_fpmath)
4388 /* If we are using the default tune= or arch=, undo the string assigned,
4389 and use the default. */
4390 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4391 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4392 else if (!orig_arch_specified)
4393 ix86_arch_string = NULL;
4395 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4396 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4397 else if (orig_tune_defaulted)
4398 ix86_tune_string = NULL;
4400 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4401 if (enum_opts_set.x_ix86_fpmath)
4402 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4403 else if (!TARGET_64BIT && TARGET_SSE)
4405 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4406 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4409 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4410 ix86_option_override_internal (false);
4412 /* Add any builtin functions with the new isa if any. */
4413 ix86_add_new_builtins (ix86_isa_flags);
4415 /* Save the current options unless we are validating options for
4416 #pragma. */
4417 t = build_target_option_node ();
4419 ix86_arch_string = orig_arch_string;
4420 ix86_tune_string = orig_tune_string;
4421 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4423 /* Free up memory allocated to hold the strings */
4424 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4425 free (option_strings[i]);
4428 return t;
4431 /* Hook to validate attribute((target("string"))). */
4433 static bool
4434 ix86_valid_target_attribute_p (tree fndecl,
4435 tree ARG_UNUSED (name),
4436 tree args,
4437 int ARG_UNUSED (flags))
4439 struct cl_target_option cur_target;
4440 bool ret = true;
4442 /* attribute((target("default"))) does nothing, beyond
4443 affecting multi-versioning. */
4444 if (TREE_VALUE (args)
4445 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4446 && TREE_CHAIN (args) == NULL_TREE
4447 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4448 return true;
4450 tree old_optimize = build_optimization_node ();
4451 tree new_target, new_optimize;
4452 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4454 /* If the function changed the optimization levels as well as setting target
4455 options, start with the optimizations specified. */
4456 if (func_optimize && func_optimize != old_optimize)
4457 cl_optimization_restore (&global_options,
4458 TREE_OPTIMIZATION (func_optimize));
4460 /* The target attributes may also change some optimization flags, so update
4461 the optimization options if necessary. */
4462 cl_target_option_save (&cur_target, &global_options);
4463 new_target = ix86_valid_target_attribute_tree (args);
4464 new_optimize = build_optimization_node ();
4466 if (new_target == error_mark_node)
4467 ret = false;
4469 else if (fndecl && new_target)
4471 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4473 if (old_optimize != new_optimize)
4474 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4477 cl_target_option_restore (&global_options, &cur_target);
4479 if (old_optimize != new_optimize)
4480 cl_optimization_restore (&global_options,
4481 TREE_OPTIMIZATION (old_optimize));
4483 return ret;
4487 /* Hook to determine if one function can safely inline another. */
4489 static bool
4490 ix86_can_inline_p (tree caller, tree callee)
4492 bool ret = false;
4493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4496 /* If callee has no option attributes, then it is ok to inline. */
4497 if (!callee_tree)
4498 ret = true;
4500 /* If caller has no option attributes, but callee does then it is not ok to
4501 inline. */
4502 else if (!caller_tree)
4503 ret = false;
4505 else
4507 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4508 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4510 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4511 function can inline an SSE2 function but an SSE2 function can't inline
4512 an SSE4 function. */
4513 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4514 != callee_opts->x_ix86_isa_flags)
4515 ret = false;
4517 /* See if we have the same non-isa options. */
4518 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4519 ret = false;
4521 /* See if arch, tune, etc. are the same. */
4522 else if (caller_opts->arch != callee_opts->arch)
4523 ret = false;
4525 else if (caller_opts->tune != callee_opts->tune)
4526 ret = false;
4528 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4529 ret = false;
4531 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4532 ret = false;
4534 else
4535 ret = true;
4538 return ret;
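/* Editor's illustrative sketch (standalone; guarded out of the build):
   the heart of the test above.  The callee may be inlined only if
   every ISA bit it needs is also set for the caller, i.e.
   (caller & callee) == callee.  The bit values are hypothetical.  */
#if 0
#include <stdio.h>

#define EX_SSE2 (1u << 0)
#define EX_SSE4 (1u << 1)

static int
ex_isa_subset_p (unsigned int caller_isa, unsigned int callee_isa)
{
  return (caller_isa & callee_isa) == callee_isa;
}

int
main (void)
{
  /* An SSE4 caller may inline an SSE2 callee...  */
  printf ("%d\n", ex_isa_subset_p (EX_SSE2 | EX_SSE4, EX_SSE2));  /* 1 */
  /* ...but an SSE2 caller may not inline an SSE4 callee.  */
  printf ("%d\n", ex_isa_subset_p (EX_SSE2, EX_SSE2 | EX_SSE4));  /* 0 */
  return 0;
}
#endif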
4542 /* Remember the last target of ix86_set_current_function. */
4543 static GTY(()) tree ix86_previous_fndecl;
4545 /* Establish appropriate back-end context for processing the function
4546 FNDECL. The argument might be NULL to indicate processing at top
4547 level, outside of any function scope. */
4548 static void
4549 ix86_set_current_function (tree fndecl)
4551 /* Only change the context if the function changes. This hook is called
4552 several times in the course of compiling a function, and we don't want to
4553 slow things down too much or call target_reinit when it isn't safe. */
4554 if (fndecl && fndecl != ix86_previous_fndecl)
4556 tree old_tree = (ix86_previous_fndecl
4557 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4558 : NULL_TREE);
4560 tree new_tree = (fndecl
4561 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4562 : NULL_TREE);
4564 ix86_previous_fndecl = fndecl;
4565 if (old_tree == new_tree)
4568 else if (new_tree)
4570 cl_target_option_restore (&global_options,
4571 TREE_TARGET_OPTION (new_tree));
4572 target_reinit ();
4575 else if (old_tree)
4577 struct cl_target_option *def
4578 = TREE_TARGET_OPTION (target_option_current_node);
4580 cl_target_option_restore (&global_options, def);
4581 target_reinit ();
4587 /* Return true if this goes in large data/bss. */
4589 static bool
4590 ix86_in_large_data_p (tree exp)
4592 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4593 return false;
4595 /* Functions are never large data. */
4596 if (TREE_CODE (exp) == FUNCTION_DECL)
4597 return false;
4599 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4601 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4602 if (strcmp (section, ".ldata") == 0
4603 || strcmp (section, ".lbss") == 0)
4604 return true;
4605 return false;
4607 else
4609 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4611 /* If this is an incomplete type with size 0, then we can't put it
4612 in data because it might be too big when completed. */
4613 if (!size || size > ix86_section_threshold)
4614 return true;
4617 return false;
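/* Editor's illustrative sketch (standalone; guarded out of the build):
   the size test above in miniature.  A size of zero means "incomplete,
   could still grow", so it is treated like an over-threshold object.
   The threshold value is hypothetical.  */
#if 0
#include <stdio.h>

static int
ex_in_large_data_p (long size, long threshold)
{
  return size == 0 || size > threshold;
}

int
main (void)
{
  long threshold = 65536;		/* hypothetical section threshold */

  printf ("%d %d %d\n",
	  ex_in_large_data_p (0, threshold),		/* 1: incomplete */
	  ex_in_large_data_p (128, threshold),		/* 0: small      */
	  ex_in_large_data_p (1L << 20, threshold));	/* 1: large      */
  return 0;
}
#endif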
4620 /* Switch to the appropriate section for output of DECL.
4621 DECL is either a `VAR_DECL' node or a constant of some sort.
4622 RELOC indicates whether forming the initial value of DECL requires
4623 link-time relocations. */
4625 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4626 ATTRIBUTE_UNUSED;
4628 static section *
4629 x86_64_elf_select_section (tree decl, int reloc,
4630 unsigned HOST_WIDE_INT align)
4632 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4633 && ix86_in_large_data_p (decl))
4635 const char *sname = NULL;
4636 unsigned int flags = SECTION_WRITE;
4637 switch (categorize_decl_for_section (decl, reloc))
4639 case SECCAT_DATA:
4640 sname = ".ldata";
4641 break;
4642 case SECCAT_DATA_REL:
4643 sname = ".ldata.rel";
4644 break;
4645 case SECCAT_DATA_REL_LOCAL:
4646 sname = ".ldata.rel.local";
4647 break;
4648 case SECCAT_DATA_REL_RO:
4649 sname = ".ldata.rel.ro";
4650 break;
4651 case SECCAT_DATA_REL_RO_LOCAL:
4652 sname = ".ldata.rel.ro.local";
4653 break;
4654 case SECCAT_BSS:
4655 sname = ".lbss";
4656 flags |= SECTION_BSS;
4657 break;
4658 case SECCAT_RODATA:
4659 case SECCAT_RODATA_MERGE_STR:
4660 case SECCAT_RODATA_MERGE_STR_INIT:
4661 case SECCAT_RODATA_MERGE_CONST:
4662 sname = ".lrodata";
4663 flags = 0;
4664 break;
4665 case SECCAT_SRODATA:
4666 case SECCAT_SDATA:
4667 case SECCAT_SBSS:
4668 gcc_unreachable ();
4669 case SECCAT_TEXT:
4670 case SECCAT_TDATA:
4671 case SECCAT_TBSS:
4672 /* We don't split these for medium model. Place them into
4673 default sections and hope for the best. */
4674 break;
4676 if (sname)
4678 /* We might get called with string constants, but get_named_section
4679 doesn't like them as they are not DECLs. Also, we need to set
4680 flags in that case. */
4681 if (!DECL_P (decl))
4682 return get_section (sname, flags, NULL);
4683 return get_named_section (decl, sname, reloc);
4686 return default_elf_select_section (decl, reloc, align);
4689 /* Build up a unique section name, expressed as a
4690 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4691 RELOC indicates whether the initial value of EXP requires
4692 link-time relocations. */
4694 static void ATTRIBUTE_UNUSED
4695 x86_64_elf_unique_section (tree decl, int reloc)
4697 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4698 && ix86_in_large_data_p (decl))
4700 const char *prefix = NULL;
4701 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4702 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4704 switch (categorize_decl_for_section (decl, reloc))
4706 case SECCAT_DATA:
4707 case SECCAT_DATA_REL:
4708 case SECCAT_DATA_REL_LOCAL:
4709 case SECCAT_DATA_REL_RO:
4710 case SECCAT_DATA_REL_RO_LOCAL:
4711 prefix = one_only ? ".ld" : ".ldata";
4712 break;
4713 case SECCAT_BSS:
4714 prefix = one_only ? ".lb" : ".lbss";
4715 break;
4716 case SECCAT_RODATA:
4717 case SECCAT_RODATA_MERGE_STR:
4718 case SECCAT_RODATA_MERGE_STR_INIT:
4719 case SECCAT_RODATA_MERGE_CONST:
4720 prefix = one_only ? ".lr" : ".lrodata";
4721 break;
4722 case SECCAT_SRODATA:
4723 case SECCAT_SDATA:
4724 case SECCAT_SBSS:
4725 gcc_unreachable ();
4726 case SECCAT_TEXT:
4727 case SECCAT_TDATA:
4728 case SECCAT_TBSS:
4729 /* We don't split these for medium model. Place them into
4730 default sections and hope for the best. */
4731 break;
4733 if (prefix)
4735 const char *name, *linkonce;
4736 char *string;
4738 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4739 name = targetm.strip_name_encoding (name);
4741 /* If we're using one_only, then there needs to be a .gnu.linkonce
4742 prefix to the section name. */
4743 linkonce = one_only ? ".gnu.linkonce" : "";
4745 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4747 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4748 return;
4751 default_unique_section (decl, reloc);
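/* Editor's illustrative sketch (standalone; guarded out of the build):
   how the unique large-model section name is assembled above -- an
   optional ".gnu.linkonce" part, the category prefix and the stripped
   symbol name.  snprintf stands in for ACONCAT; the symbol name is
   hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int one_only = 1;
  const char *prefix = one_only ? ".lr" : ".lrodata";	/* SECCAT_RODATA */
  const char *linkonce = one_only ? ".gnu.linkonce" : "";
  const char *name = "my_big_table";
  char section[128];

  snprintf (section, sizeof section, "%s%s.%s", linkonce, prefix, name);
  printf ("%s\n", section);	/* .gnu.linkonce.lr.my_big_table */
  return 0;
}
#endif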
4754 #ifdef COMMON_ASM_OP
4755 /* This says how to output assembler code to declare an
4756 uninitialized external linkage data object.
4758 For medium model x86-64 we need to use the .largecomm directive for
4759 large objects. */
4760 void
4761 x86_elf_aligned_common (FILE *file,
4762 const char *name, unsigned HOST_WIDE_INT size,
4763 int align)
4765 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4766 && size > (unsigned int)ix86_section_threshold)
4767 fputs (".largecomm\t", file);
4768 else
4769 fputs (COMMON_ASM_OP, file);
4770 assemble_name (file, name);
4771 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4772 size, align / BITS_PER_UNIT);
4774 #endif
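/* Editor's illustrative sketch (standalone; guarded out of the build):
   the size test above chooses between the ordinary common directive
   and .largecomm.  The threshold and the ".comm" spelling are
   hypothetical stand-ins for ix86_section_threshold and COMMON_ASM_OP.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned long threshold = 65536;	/* hypothetical section threshold */
  unsigned long sizes[2] = { 4096, 1048576 };
  int i;

  for (i = 0; i < 2; i++)
    printf ("%s\tsym,%lu,8\n",
	    sizes[i] > threshold ? ".largecomm" : ".comm",
	    sizes[i]);
  return 0;
}
#endif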
4776 /* Utility function for targets to use in implementing
4777 ASM_OUTPUT_ALIGNED_BSS. */
4779 void
4780 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4781 const char *name, unsigned HOST_WIDE_INT size,
4782 int align)
4784 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4785 && size > (unsigned int)ix86_section_threshold)
4786 switch_to_section (get_named_section (decl, ".lbss", 0));
4787 else
4788 switch_to_section (bss_section);
4789 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4790 #ifdef ASM_DECLARE_OBJECT_NAME
4791 last_assemble_variable_decl = decl;
4792 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4793 #else
4794 /* Standard thing is just output label for the object. */
4795 ASM_OUTPUT_LABEL (file, name);
4796 #endif /* ASM_DECLARE_OBJECT_NAME */
4797 ASM_OUTPUT_SKIP (file, size ? size : 1);
4800 /* Decide whether we must probe the stack before any space allocation
4801 on this target. It's essentially TARGET_STACK_PROBE except when
4802 -fstack-check causes the stack to be already probed differently. */
4804 bool
4805 ix86_target_stack_probe (void)
4807 /* Do not probe the stack twice if static stack checking is enabled. */
4808 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4809 return false;
4811 return TARGET_STACK_PROBE;
4814 /* Decide whether we can make a sibling call to a function. DECL is the
4815 declaration of the function being targeted by the call and EXP is the
4816 CALL_EXPR representing the call. */
4818 static bool
4819 ix86_function_ok_for_sibcall (tree decl, tree exp)
4821 tree type, decl_or_type;
4822 rtx a, b;
4824 /* If we are generating position-independent code, we cannot sibcall
4825 optimize any indirect call, or a direct call to a global function,
4826 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4827 if (!TARGET_MACHO
4828 && !TARGET_64BIT
4829 && flag_pic
4830 && (!decl || !targetm.binds_local_p (decl)))
4831 return false;
4833 /* If we need to align the outgoing stack, then sibcalling would
4834 unalign the stack, which may break the called function. */
4835 if (ix86_minimum_incoming_stack_boundary (true)
4836 < PREFERRED_STACK_BOUNDARY)
4837 return false;
4839 if (decl)
4841 decl_or_type = decl;
4842 type = TREE_TYPE (decl);
4844 else
4846 /* We're looking at the CALL_EXPR, we need the type of the function. */
4847 type = CALL_EXPR_FN (exp); /* pointer expression */
4848 type = TREE_TYPE (type); /* pointer type */
4849 type = TREE_TYPE (type); /* function type */
4850 decl_or_type = type;
4853 /* Check that the return value locations are the same. Like
4854 if we are returning floats on the 80387 register stack, we cannot
4855 make a sibcall from a function that doesn't return a float to a
4856 function that does or, conversely, from a function that does return
4857 a float to a function that doesn't; the necessary stack adjustment
4858 would not be executed. This is also the place we notice
4859 differences in the return value ABI. Note that it is ok for one
4860 of the functions to have void return type as long as the return
4861 value of the other is passed in a register. */
4862 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4863 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4864 cfun->decl, false);
4865 if (STACK_REG_P (a) || STACK_REG_P (b))
4867 if (!rtx_equal_p (a, b))
4868 return false;
4870 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4872 else if (!rtx_equal_p (a, b))
4873 return false;
4875 if (TARGET_64BIT)
4877 /* The SYSV ABI has more call-clobbered registers;
4878 disallow sibcalls from MS to SYSV. */
4879 if (cfun->machine->call_abi == MS_ABI
4880 && ix86_function_type_abi (type) == SYSV_ABI)
4881 return false;
4883 else
4885 /* If this call is indirect, we'll need to be able to use a
4886 call-clobbered register for the address of the target function.
4887 Make sure that all such registers are not used for passing
4888 parameters. Note that DLLIMPORT functions are indirect. */
4889 if (!decl
4890 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4892 if (ix86_function_regparm (type, NULL) >= 3)
4894 /* ??? Need to count the actual number of registers to be used,
4895 not the possible number of registers. Fix later. */
4896 return false;
4901 /* Otherwise okay. That also includes certain types of indirect calls. */
4902 return true;
4905 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4906 and "sseregparm" calling convention attributes;
4907 arguments as in struct attribute_spec.handler. */
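/* Illustrative 32-bit usage (hypothetical declarations, not from this file):
     int __attribute__ ((fastcall)) f (int, int);             accepted
     int __attribute__ ((fastcall, regparm (2))) g (int);     rejected below
   The handler below diagnoses such incompatible combinations and, except
   when emulating the MS ABI, ignores these attributes on 64-bit targets.  */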
4909 static tree
4910 ix86_handle_cconv_attribute (tree *node, tree name,
4911 tree args,
4912 int flags ATTRIBUTE_UNUSED,
4913 bool *no_add_attrs)
4915 if (TREE_CODE (*node) != FUNCTION_TYPE
4916 && TREE_CODE (*node) != METHOD_TYPE
4917 && TREE_CODE (*node) != FIELD_DECL
4918 && TREE_CODE (*node) != TYPE_DECL)
4920 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4921 name);
4922 *no_add_attrs = true;
4923 return NULL_TREE;
4926 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4927 if (is_attribute_p ("regparm", name))
4929 tree cst;
4931 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4933 error ("fastcall and regparm attributes are not compatible");
4936 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4938 error ("regparm and thiscall attributes are not compatible");
4941 cst = TREE_VALUE (args);
4942 if (TREE_CODE (cst) != INTEGER_CST)
4944 warning (OPT_Wattributes,
4945 "%qE attribute requires an integer constant argument",
4946 name);
4947 *no_add_attrs = true;
4949 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4951 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4952 name, REGPARM_MAX);
4953 *no_add_attrs = true;
4956 return NULL_TREE;
4959 if (TARGET_64BIT)
4961 /* Do not warn when emulating the MS ABI. */
4962 if ((TREE_CODE (*node) != FUNCTION_TYPE
4963 && TREE_CODE (*node) != METHOD_TYPE)
4964 || ix86_function_type_abi (*node) != MS_ABI)
4965 warning (OPT_Wattributes, "%qE attribute ignored",
4966 name);
4967 *no_add_attrs = true;
4968 return NULL_TREE;
4971 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4972 if (is_attribute_p ("fastcall", name))
4974 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 error ("fastcall and cdecl attributes are not compatible");
4978 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4980 error ("fastcall and stdcall attributes are not compatible");
4982 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4984 error ("fastcall and regparm attributes are not compatible");
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4988 error ("fastcall and thiscall attributes are not compatible");
4992 /* Can combine stdcall with fastcall (redundant), regparm and
4993 sseregparm. */
4994 else if (is_attribute_p ("stdcall", name))
4996 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4998 error ("stdcall and cdecl attributes are not compatible");
5000 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and fastcall attributes are not compatible");
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 error ("stdcall and thiscall attributes are not compatible");
5010 /* Can combine cdecl with regparm and sseregparm. */
5011 else if (is_attribute_p ("cdecl", name))
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and cdecl attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("fastcall and cdecl attributes are not compatible");
5021 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 error ("cdecl and thiscall attributes are not compatible");
5026 else if (is_attribute_p ("thiscall", name))
5028 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5029 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5030 name);
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 error ("stdcall and thiscall attributes are not compatible");
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 error ("fastcall and thiscall attributes are not compatible");
5039 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5041 error ("cdecl and thiscall attributes are not compatible");
5045 /* Can combine sseregparm with all attributes. */
5047 return NULL_TREE;
5050 /* The transactional memory builtins are implicitly regparm or fastcall
5051 depending on the ABI. Override the generic do-nothing attribute that
5052 these builtins were declared with, and replace it with one of the two
5053 attributes that we expect elsewhere. */
5055 static tree
5056 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5057 tree args ATTRIBUTE_UNUSED,
5058 int flags ATTRIBUTE_UNUSED,
5059 bool *no_add_attrs)
5061 tree alt;
5063 /* In no case do we want to add the placeholder attribute. */
5064 *no_add_attrs = true;
5066 /* The 64-bit ABI is unchanged for transactional memory. */
5067 if (TARGET_64BIT)
5068 return NULL_TREE;
5070 /* ??? Is there a better way to validate 32-bit windows? We have
5071 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5072 if (CHECK_STACK_LIMIT > 0)
5073 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5074 else
5076 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5077 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5079 decl_attributes (node, alt, flags);
5081 return NULL_TREE;
5084 /* This function determines the calling convention from TYPE. */
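/* For example, with -mrtd (TARGET_RTD) a non-variadic 32-bit function that
   carries no explicit convention attribute is treated as stdcall below,
   while a variadic one keeps the default cdecl convention.  */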
5086 unsigned int
5087 ix86_get_callcvt (const_tree type)
5089 unsigned int ret = 0;
5090 bool is_stdarg;
5091 tree attrs;
5093 if (TARGET_64BIT)
5094 return IX86_CALLCVT_CDECL;
5096 attrs = TYPE_ATTRIBUTES (type);
5097 if (attrs != NULL_TREE)
5099 if (lookup_attribute ("cdecl", attrs))
5100 ret |= IX86_CALLCVT_CDECL;
5101 else if (lookup_attribute ("stdcall", attrs))
5102 ret |= IX86_CALLCVT_STDCALL;
5103 else if (lookup_attribute ("fastcall", attrs))
5104 ret |= IX86_CALLCVT_FASTCALL;
5105 else if (lookup_attribute ("thiscall", attrs))
5106 ret |= IX86_CALLCVT_THISCALL;
5108 /* Regparm isn't allowed for thiscall and fastcall. */
5109 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5111 if (lookup_attribute ("regparm", attrs))
5112 ret |= IX86_CALLCVT_REGPARM;
5113 if (lookup_attribute ("sseregparm", attrs))
5114 ret |= IX86_CALLCVT_SSEREGPARM;
5117 if (IX86_BASE_CALLCVT(ret) != 0)
5118 return ret;
5121 is_stdarg = stdarg_p (type);
5122 if (TARGET_RTD && !is_stdarg)
5123 return IX86_CALLCVT_STDCALL | ret;
5125 if (ret != 0
5126 || is_stdarg
5127 || TREE_CODE (type) != METHOD_TYPE
5128 || ix86_function_type_abi (type) != MS_ABI)
5129 return IX86_CALLCVT_CDECL | ret;
5131 return IX86_CALLCVT_THISCALL;
5134 /* Return 0 if the attributes for two types are incompatible, 1 if they
5135 are compatible, and 2 if they are nearly compatible (which causes a
5136 warning to be generated). */
5138 static int
5139 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5141 unsigned int ccvt1, ccvt2;
5143 if (TREE_CODE (type1) != FUNCTION_TYPE
5144 && TREE_CODE (type1) != METHOD_TYPE)
5145 return 1;
5147 ccvt1 = ix86_get_callcvt (type1);
5148 ccvt2 = ix86_get_callcvt (type2);
5149 if (ccvt1 != ccvt2)
5150 return 0;
5151 if (ix86_function_regparm (type1, NULL)
5152 != ix86_function_regparm (type2, NULL))
5153 return 0;
5155 return 1;
5158 /* Return the regparm value for a function with the indicated TYPE and DECL.
5159 DECL may be NULL when calling function indirectly
5160 or considering a libcall. */
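/* Illustrative example (hypothetical declaration): for
     int __attribute__ ((regparm (3))) f (int a, int b, int c);
   this returns 3, so the first three integer arguments are passed in
   EAX, EDX and ECX instead of on the stack.  */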
5162 static int
5163 ix86_function_regparm (const_tree type, const_tree decl)
5165 tree attr;
5166 int regparm;
5167 unsigned int ccvt;
5169 if (TARGET_64BIT)
5170 return (ix86_function_type_abi (type) == SYSV_ABI
5171 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5172 ccvt = ix86_get_callcvt (type);
5173 regparm = ix86_regparm;
5175 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5177 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5178 if (attr)
5180 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5181 return regparm;
5184 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5185 return 2;
5186 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5187 return 1;
5189 /* Use register calling convention for local functions when possible. */
5190 if (decl
5191 && TREE_CODE (decl) == FUNCTION_DECL
5192 && optimize
5193 && !(profile_flag && !flag_fentry))
5195 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5196 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5197 if (i && i->local && i->can_change_signature)
5199 int local_regparm, globals = 0, regno;
5201 /* Make sure no regparm register is taken by a
5202 fixed register variable. */
5203 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5204 if (fixed_regs[local_regparm])
5205 break;
5207 /* We don't want to use regparm(3) for nested functions as
5208 these use a static chain pointer in the third argument. */
5209 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5210 local_regparm = 2;
5212 /* In 32-bit mode save a register for the split stack. */
5213 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5214 local_regparm = 2;
5216 /* Each fixed register usage increases register pressure,
5217 so fewer registers should be used for argument passing.
5218 This functionality can be overridden by an explicit
5219 regparm value. */
5220 for (regno = AX_REG; regno <= DI_REG; regno++)
5221 if (fixed_regs[regno])
5222 globals++;
5224 local_regparm
5225 = globals < local_regparm ? local_regparm - globals : 0;
5227 if (local_regparm > regparm)
5228 regparm = local_regparm;
5232 return regparm;
5235 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5236 DFmode (2) arguments in SSE registers for a function with the
5237 indicated TYPE and DECL. DECL may be NULL when calling function
5238 indirectly or considering a libcall. Otherwise return 0. */
5240 static int
5241 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5243 gcc_assert (!TARGET_64BIT);
5245 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5246 by the sseregparm attribute. */
5247 if (TARGET_SSEREGPARM
5248 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5250 if (!TARGET_SSE)
5252 if (warn)
5254 if (decl)
5255 error ("calling %qD with attribute sseregparm without "
5256 "SSE/SSE2 enabled", decl);
5257 else
5258 error ("calling %qT with attribute sseregparm without "
5259 "SSE/SSE2 enabled", type);
5261 return 0;
5264 return 2;
5267 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5268 (and DFmode for SSE2) arguments in SSE registers. */
5269 if (decl && TARGET_SSE_MATH && optimize
5270 && !(profile_flag && !flag_fentry))
5272 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5273 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5274 if (i && i->local && i->can_change_signature)
5275 return TARGET_SSE2 ? 2 : 1;
5278 return 0;
5281 /* Return true if EAX is live at the start of the function. Used by
5282 ix86_expand_prologue to determine if we need special help before
5283 calling allocate_stack_worker. */
5285 static bool
5286 ix86_eax_live_at_start_p (void)
5288 /* Cheat. Don't bother working forward from ix86_function_regparm
5289 to the function type to whether an actual argument is located in
5290 eax. Instead just look at cfg info, which is still close enough
5291 to correct at this point. This gives false positives for broken
5292 functions that might use uninitialized data that happens to be
5293 allocated in eax, but who cares? */
5294 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5297 static bool
5298 ix86_keep_aggregate_return_pointer (tree fntype)
5300 tree attr;
5302 if (!TARGET_64BIT)
5304 attr = lookup_attribute ("callee_pop_aggregate_return",
5305 TYPE_ATTRIBUTES (fntype));
5306 if (attr)
5307 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5309 /* For 32-bit MS-ABI the default is to keep aggregate
5310 return pointer. */
5311 if (ix86_function_type_abi (fntype) == MS_ABI)
5312 return true;
5314 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5317 /* Value is the number of bytes of arguments automatically
5318 popped when returning from a subroutine call.
5319 FUNDECL is the declaration node of the function (as a tree),
5320 FUNTYPE is the data type of the function (as a tree),
5321 or for a library call it is an identifier node for the subroutine name.
5322 SIZE is the number of bytes of arguments passed on the stack.
5324 On the 80386, the RTD insn may be used to pop them if the number
5325 of args is fixed, but if the number is variable then the caller
5326 must pop them all. RTD can't be used for library calls now
5327 because the library is compiled with the Unix compiler.
5328 Use of RTD is a selectable option, since it is incompatible with
5329 standard Unix calling sequences. If the option is not selected,
5330 the caller must always pop the args.
5332 The attribute stdcall is equivalent to RTD on a per module basis. */
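/* Worked example: a 32-bit stdcall function taking two ints receives 8 bytes
   of stack arguments, so this hook returns 8 and the callee pops them
   (e.g. with "ret $8"); for a variadic or plain cdecl function it returns 0
   and the caller cleans up instead.  */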
5334 static int
5335 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5337 unsigned int ccvt;
5339 /* None of the 64-bit ABIs pop arguments. */
5340 if (TARGET_64BIT)
5341 return 0;
5343 ccvt = ix86_get_callcvt (funtype);
5345 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5346 | IX86_CALLCVT_THISCALL)) != 0
5347 && ! stdarg_p (funtype))
5348 return size;
5350 /* Lose any fake structure return argument if it is passed on the stack. */
5351 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5352 && !ix86_keep_aggregate_return_pointer (funtype))
5354 int nregs = ix86_function_regparm (funtype, fundecl);
5355 if (nregs == 0)
5356 return GET_MODE_SIZE (Pmode);
5359 return 0;
5362 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5364 static bool
5365 ix86_legitimate_combined_insn (rtx insn)
5367 /* Check operand constraints in case hard registers were propagated
5368 into insn pattern. This check prevents combine pass from
5369 generating insn patterns with invalid hard register operands.
5370 These invalid insns can eventually confuse reload to error out
5371 with a spill failure. See also PRs 46829 and 46843. */
5372 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5374 int i;
5376 extract_insn (insn);
5377 preprocess_constraints ();
5379 for (i = 0; i < recog_data.n_operands; i++)
5381 rtx op = recog_data.operand[i];
5382 enum machine_mode mode = GET_MODE (op);
5383 struct operand_alternative *op_alt;
5384 int offset = 0;
5385 bool win;
5386 int j;
5388 /* A unary operator may be accepted by the predicate, but it
5389 is irrelevant for matching constraints. */
5390 if (UNARY_P (op))
5391 op = XEXP (op, 0);
5393 if (GET_CODE (op) == SUBREG)
5395 if (REG_P (SUBREG_REG (op))
5396 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5397 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5398 GET_MODE (SUBREG_REG (op)),
5399 SUBREG_BYTE (op),
5400 GET_MODE (op));
5401 op = SUBREG_REG (op);
5404 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5405 continue;
5407 op_alt = recog_op_alt[i];
5409 /* Operand has no constraints, anything is OK. */
5410 win = !recog_data.n_alternatives;
5412 for (j = 0; j < recog_data.n_alternatives; j++)
5414 if (op_alt[j].anything_ok
5415 || (op_alt[j].matches != -1
5416 && operands_match_p
5417 (recog_data.operand[i],
5418 recog_data.operand[op_alt[j].matches]))
5419 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5421 win = true;
5422 break;
5426 if (!win)
5427 return false;
5431 return true;
5434 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5436 static unsigned HOST_WIDE_INT
5437 ix86_asan_shadow_offset (void)
5439 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5440 : HOST_WIDE_INT_C (0x7fff8000))
5441 : (HOST_WIDE_INT_1 << 29);
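/* Reference note (standard AddressSanitizer mapping, not something this
   function computes): the sanitizer addresses shadow memory roughly as
       shadow = (addr >> 3) + ix86_asan_shadow_offset ()
   which is why the offset above differs between LP64 Linux, LP64 Darwin
   and 32-bit targets.  */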
5444 /* Argument support functions. */
5446 /* Return true when register may be used to pass function parameters. */
5447 bool
5448 ix86_function_arg_regno_p (int regno)
5450 int i;
5451 const int *parm_regs;
5453 if (!TARGET_64BIT)
5455 if (TARGET_MACHO)
5456 return (regno < REGPARM_MAX
5457 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5458 else
5459 return (regno < REGPARM_MAX
5460 || (TARGET_MMX && MMX_REGNO_P (regno)
5461 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5462 || (TARGET_SSE && SSE_REGNO_P (regno)
5463 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5466 if (TARGET_MACHO)
5468 if (SSE_REGNO_P (regno) && TARGET_SSE)
5469 return true;
5471 else
5473 if (TARGET_SSE && SSE_REGNO_P (regno)
5474 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5475 return true;
5478 /* TODO: The function should depend on current function ABI but
5479 builtins.c would need updating then. Therefore we use the
5480 default ABI. */
5482 /* RAX is used as hidden argument to va_arg functions. */
5483 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5484 return true;
5486 if (ix86_abi == MS_ABI)
5487 parm_regs = x86_64_ms_abi_int_parameter_registers;
5488 else
5489 parm_regs = x86_64_int_parameter_registers;
5490 for (i = 0; i < (ix86_abi == MS_ABI
5491 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5492 if (regno == parm_regs[i])
5493 return true;
5494 return false;
5497 /* Return if we do not know how to pass TYPE solely in registers. */
5499 static bool
5500 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5502 if (must_pass_in_stack_var_size_or_pad (mode, type))
5503 return true;
5505 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5506 The layout_type routine is crafty and tries to trick us into passing
5507 currently unsupported vector types on the stack by using TImode. */
5508 return (!TARGET_64BIT && mode == TImode
5509 && type && TREE_CODE (type) != VECTOR_TYPE);
5512 /* Return the size, in bytes, of the area reserved for arguments passed
5513 in registers for the function represented by FNDECL, which depends on
5514 the ABI in use. */
5515 int
5516 ix86_reg_parm_stack_space (const_tree fndecl)
5518 enum calling_abi call_abi = SYSV_ABI;
5519 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5520 call_abi = ix86_function_abi (fndecl);
5521 else
5522 call_abi = ix86_function_type_abi (fndecl);
5523 if (TARGET_64BIT && call_abi == MS_ABI)
5524 return 32;
5525 return 0;
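/* Background note: the 32 bytes above correspond to the Win64 "home" (shadow)
   area that callers always reserve for the four register parameters
   (4 x 8 bytes); the SYSV ABI reserves no such area, hence 0.  */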
5528 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5529 call ABI used. */
5530 enum calling_abi
5531 ix86_function_type_abi (const_tree fntype)
5533 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5535 enum calling_abi abi = ix86_abi;
5536 if (abi == SYSV_ABI)
5538 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5539 abi = MS_ABI;
5541 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5542 abi = SYSV_ABI;
5543 return abi;
5545 return ix86_abi;
5548 static bool
5549 ix86_function_ms_hook_prologue (const_tree fn)
5551 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5553 if (decl_function_context (fn) != NULL_TREE)
5554 error_at (DECL_SOURCE_LOCATION (fn),
5555 "ms_hook_prologue is not compatible with nested function");
5556 else
5557 return true;
5559 return false;
5562 static enum calling_abi
5563 ix86_function_abi (const_tree fndecl)
5565 if (! fndecl)
5566 return ix86_abi;
5567 return ix86_function_type_abi (TREE_TYPE (fndecl));
5570 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5571 call ABI used. */
5572 enum calling_abi
5573 ix86_cfun_abi (void)
5575 if (! cfun)
5576 return ix86_abi;
5577 return cfun->machine->call_abi;
5580 /* Write the extra assembler code needed to declare a function properly. */
5582 void
5583 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5584 tree decl)
5586 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5588 if (is_ms_hook)
5590 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5591 unsigned int filler_cc = 0xcccccccc;
5593 for (i = 0; i < filler_count; i += 4)
5594 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5597 #ifdef SUBTARGET_ASM_UNWIND_INIT
5598 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5599 #endif
5601 ASM_OUTPUT_LABEL (asm_out_file, fname);
5603 /* Output magic byte marker, if hot-patch attribute is set. */
5604 if (is_ms_hook)
5606 if (TARGET_64BIT)
5608 /* leaq [%rsp + 0], %rsp */
5609 asm_fprintf (asm_out_file, ASM_BYTE
5610 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5612 else
5614 /* movl.s %edi, %edi
5615 push %ebp
5616 movl.s %esp, %ebp */
5617 asm_fprintf (asm_out_file, ASM_BYTE
5618 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5623 /* regclass.c */
5624 extern void init_regs (void);
5626 /* Implementation of the call ABI switching target hook. The call
5627 register sets specific to FNDECL are selected here. See also
5628 ix86_conditional_register_usage for more details. */
5629 void
5630 ix86_call_abi_override (const_tree fndecl)
5632 if (fndecl == NULL_TREE)
5633 cfun->machine->call_abi = ix86_abi;
5634 else
5635 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5638 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5639 expensive re-initialization of init_regs each time we switch function context,
5640 since this is needed only during RTL expansion. */
5641 static void
5642 ix86_maybe_switch_abi (void)
5644 if (TARGET_64BIT &&
5645 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5646 reinit_regs ();
5649 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5650 for a call to a function whose data type is FNTYPE.
5651 For a library call, FNTYPE is 0. */
5653 void
5654 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5655 tree fntype, /* tree ptr for function decl */
5656 rtx libname, /* SYMBOL_REF of library name or 0 */
5657 tree fndecl,
5658 int caller)
5660 struct cgraph_local_info *i;
5662 memset (cum, 0, sizeof (*cum));
5664 if (fndecl)
5666 i = cgraph_local_info (fndecl);
5667 cum->call_abi = ix86_function_abi (fndecl);
5669 else
5671 i = NULL;
5672 cum->call_abi = ix86_function_type_abi (fntype);
5675 cum->caller = caller;
5677 /* Set up the number of registers to use for passing arguments. */
5679 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5680 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5681 "or subtarget optimization implying it");
5682 cum->nregs = ix86_regparm;
5683 if (TARGET_64BIT)
5685 cum->nregs = (cum->call_abi == SYSV_ABI
5686 ? X86_64_REGPARM_MAX
5687 : X86_64_MS_REGPARM_MAX);
5689 if (TARGET_SSE)
5691 cum->sse_nregs = SSE_REGPARM_MAX;
5692 if (TARGET_64BIT)
5694 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5695 ? X86_64_SSE_REGPARM_MAX
5696 : X86_64_MS_SSE_REGPARM_MAX);
5699 if (TARGET_MMX)
5700 cum->mmx_nregs = MMX_REGPARM_MAX;
5701 cum->warn_avx = true;
5702 cum->warn_sse = true;
5703 cum->warn_mmx = true;
5705 /* Because the type might mismatch between caller and callee, we need to
5706 use the actual type of the function for local calls.
5707 FIXME: cgraph_analyze can be told to actually record whether a function uses
5708 va_start, so for local functions maybe_vaarg could be made more aggressive,
5709 helping K&R code.
5710 FIXME: once the type system is fixed, we won't need this code anymore. */
5711 if (i && i->local && i->can_change_signature)
5712 fntype = TREE_TYPE (fndecl);
5713 cum->maybe_vaarg = (fntype
5714 ? (!prototype_p (fntype) || stdarg_p (fntype))
5715 : !libname);
5717 if (!TARGET_64BIT)
5719 /* If there are variable arguments, then we won't pass anything
5720 in registers in 32-bit mode. */
5721 if (stdarg_p (fntype))
5723 cum->nregs = 0;
5724 cum->sse_nregs = 0;
5725 cum->mmx_nregs = 0;
5726 cum->warn_avx = 0;
5727 cum->warn_sse = 0;
5728 cum->warn_mmx = 0;
5729 return;
5732 /* Use ecx and edx registers if function has fastcall attribute,
5733 else look for regparm information. */
5734 if (fntype)
5736 unsigned int ccvt = ix86_get_callcvt (fntype);
5737 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5739 cum->nregs = 1;
5740 cum->fastcall = 1; /* Same first register as in fastcall. */
5742 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5744 cum->nregs = 2;
5745 cum->fastcall = 1;
5747 else
5748 cum->nregs = ix86_function_regparm (fntype, fndecl);
5751 /* Set up the number of SSE registers used for passing SFmode
5752 and DFmode arguments. Warn for mismatching ABI. */
5753 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5757 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5758 But in the case of vector types, it is some vector mode.
5760 When we have only some of our vector isa extensions enabled, then there
5761 are some modes for which vector_mode_supported_p is false. For these
5762 modes, the generic vector support in gcc will choose some non-vector mode
5763 in order to implement the type. By computing the natural mode, we'll
5764 select the proper ABI location for the operand and not depend on whatever
5765 the middle-end decides to do with these vector types.
5767 The middle-end can't deal with vector types > 16 bytes. In this
5768 case, we return the original mode and warn about the ABI change if CUM
5769 isn't NULL. */
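/* Illustrative example: a GNU C vector type such as
     typedef float v4sf __attribute__ ((vector_size (16)));
   may have been given a non-vector TYPE_MODE when SSE is disabled; the code
   below still maps it to V4SFmode so the argument is assigned the ABI
   location the psABI expects (warning about the ABI change if need be).  */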
5771 static enum machine_mode
5772 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5774 enum machine_mode mode = TYPE_MODE (type);
5776 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5778 HOST_WIDE_INT size = int_size_in_bytes (type);
5779 if ((size == 8 || size == 16 || size == 32)
5780 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5781 && TYPE_VECTOR_SUBPARTS (type) > 1)
5783 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5785 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5786 mode = MIN_MODE_VECTOR_FLOAT;
5787 else
5788 mode = MIN_MODE_VECTOR_INT;
5790 /* Get the mode which has this inner mode and number of units. */
5791 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5792 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5793 && GET_MODE_INNER (mode) == innermode)
5795 if (size == 32 && !TARGET_AVX)
5797 static bool warnedavx;
5799 if (cum
5800 && !warnedavx
5801 && cum->warn_avx)
5803 warnedavx = true;
5804 warning (0, "AVX vector argument without AVX "
5805 "enabled changes the ABI");
5807 return TYPE_MODE (type);
5809 else if ((size == 8 || size == 16) && !TARGET_SSE)
5811 static bool warnedsse;
5813 if (cum
5814 && !warnedsse
5815 && cum->warn_sse)
5817 warnedsse = true;
5818 warning (0, "SSE vector argument without SSE "
5819 "enabled changes the ABI");
5821 return mode;
5823 else
5824 return mode;
5827 gcc_unreachable ();
5831 return mode;
5834 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5835 this may not agree with the mode that the type system has chosen for the
5836 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5837 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5839 static rtx
5840 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5841 unsigned int regno)
5843 rtx tmp;
5845 if (orig_mode != BLKmode)
5846 tmp = gen_rtx_REG (orig_mode, regno);
5847 else
5849 tmp = gen_rtx_REG (mode, regno);
5850 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5851 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5854 return tmp;
5857 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5858 of this code is to classify each 8bytes of incoming argument by the register
5859 class and assign registers accordingly. */
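/* Worked example (illustrative aggregate, not from this file): for
     struct { double d; int i; };
   the first eightbyte is classified SSEDF and the second INTEGERSI, so the
   struct travels in one SSE and one integer register; an aggregate
   containing a long double is classified X87/X87UP and therefore ends up
   being passed in memory.  */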
5861 /* Return the union class of CLASS1 and CLASS2.
5862 See the x86-64 PS ABI for details. */
5864 static enum x86_64_reg_class
5865 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5867 /* Rule #1: If both classes are equal, this is the resulting class. */
5868 if (class1 == class2)
5869 return class1;
5871 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5872 the other class. */
5873 if (class1 == X86_64_NO_CLASS)
5874 return class2;
5875 if (class2 == X86_64_NO_CLASS)
5876 return class1;
5878 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5879 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5880 return X86_64_MEMORY_CLASS;
5882 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5883 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5884 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5885 return X86_64_INTEGERSI_CLASS;
5886 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5887 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5888 return X86_64_INTEGER_CLASS;
5890 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5891 MEMORY is used. */
5892 if (class1 == X86_64_X87_CLASS
5893 || class1 == X86_64_X87UP_CLASS
5894 || class1 == X86_64_COMPLEX_X87_CLASS
5895 || class2 == X86_64_X87_CLASS
5896 || class2 == X86_64_X87UP_CLASS
5897 || class2 == X86_64_COMPLEX_X87_CLASS)
5898 return X86_64_MEMORY_CLASS;
5900 /* Rule #6: Otherwise class SSE is used. */
5901 return X86_64_SSE_CLASS;
5904 /* Classify the argument of type TYPE and mode MODE.
5905 CLASSES will be filled by the register class used to pass each word
5906 of the operand. The number of words is returned. In case the parameter
5907 should be passed in memory, 0 is returned. As a special case for zero
5908 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5910 BIT_OFFSET is used internally for handling records and specifies offset
5911 of the offset in bits modulo 256 to avoid overflow cases.
5913 See the x86-64 PS ABI for details.
5916 static int
5917 classify_argument (enum machine_mode mode, const_tree type,
5918 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5920 HOST_WIDE_INT bytes =
5921 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5922 int words
5923 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5925 /* Variable sized entities are always passed/returned in memory. */
5926 if (bytes < 0)
5927 return 0;
5929 if (mode != VOIDmode
5930 && targetm.calls.must_pass_in_stack (mode, type))
5931 return 0;
5933 if (type && AGGREGATE_TYPE_P (type))
5935 int i;
5936 tree field;
5937 enum x86_64_reg_class subclasses[MAX_CLASSES];
5939 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5940 if (bytes > 32)
5941 return 0;
5943 for (i = 0; i < words; i++)
5944 classes[i] = X86_64_NO_CLASS;
5946 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5947 signal the memory class, so handle it as a special case. */
5948 if (!words)
5950 classes[0] = X86_64_NO_CLASS;
5951 return 1;
5954 /* Classify each field of record and merge classes. */
5955 switch (TREE_CODE (type))
5957 case RECORD_TYPE:
5958 /* And now merge the fields of structure. */
5959 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5961 if (TREE_CODE (field) == FIELD_DECL)
5963 int num;
5965 if (TREE_TYPE (field) == error_mark_node)
5966 continue;
5968 /* Bitfields are always classified as integer. Handle them
5969 early, since later code would consider them to be
5970 misaligned integers. */
5971 if (DECL_BIT_FIELD (field))
5973 for (i = (int_bit_position (field)
5974 + (bit_offset % 64)) / 8 / 8;
5975 i < ((int_bit_position (field) + (bit_offset % 64))
5976 + tree_low_cst (DECL_SIZE (field), 0)
5977 + 63) / 8 / 8; i++)
5978 classes[i] =
5979 merge_classes (X86_64_INTEGER_CLASS,
5980 classes[i]);
5982 else
5984 int pos;
5986 type = TREE_TYPE (field);
5988 /* Flexible array member is ignored. */
5989 if (TYPE_MODE (type) == BLKmode
5990 && TREE_CODE (type) == ARRAY_TYPE
5991 && TYPE_SIZE (type) == NULL_TREE
5992 && TYPE_DOMAIN (type) != NULL_TREE
5993 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5994 == NULL_TREE))
5996 static bool warned;
5998 if (!warned && warn_psabi)
6000 warned = true;
6001 inform (input_location,
6002 "the ABI of passing struct with"
6003 " a flexible array member has"
6004 " changed in GCC 4.4");
6006 continue;
6008 num = classify_argument (TYPE_MODE (type), type,
6009 subclasses,
6010 (int_bit_position (field)
6011 + bit_offset) % 256);
6012 if (!num)
6013 return 0;
6014 pos = (int_bit_position (field)
6015 + (bit_offset % 64)) / 8 / 8;
6016 for (i = 0; i < num && (i + pos) < words; i++)
6017 classes[i + pos] =
6018 merge_classes (subclasses[i], classes[i + pos]);
6022 break;
6024 case ARRAY_TYPE:
6025 /* Arrays are handled as small records. */
6027 int num;
6028 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6029 TREE_TYPE (type), subclasses, bit_offset);
6030 if (!num)
6031 return 0;
6033 /* The partial classes are now full classes. */
6034 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6035 subclasses[0] = X86_64_SSE_CLASS;
6036 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6037 && !((bit_offset % 64) == 0 && bytes == 4))
6038 subclasses[0] = X86_64_INTEGER_CLASS;
6040 for (i = 0; i < words; i++)
6041 classes[i] = subclasses[i % num];
6043 break;
6045 case UNION_TYPE:
6046 case QUAL_UNION_TYPE:
6047 /* Unions are similar to RECORD_TYPE but offset is always 0.
6049 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6051 if (TREE_CODE (field) == FIELD_DECL)
6053 int num;
6055 if (TREE_TYPE (field) == error_mark_node)
6056 continue;
6058 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6059 TREE_TYPE (field), subclasses,
6060 bit_offset);
6061 if (!num)
6062 return 0;
6063 for (i = 0; i < num; i++)
6064 classes[i] = merge_classes (subclasses[i], classes[i]);
6067 break;
6069 default:
6070 gcc_unreachable ();
6073 if (words > 2)
6075 /* When the size exceeds 16 bytes, if the first class isn't
6076 X86_64_SSE_CLASS or any of the remaining classes isn't
6077 X86_64_SSEUP_CLASS, everything should be passed in
6078 memory. */
6079 if (classes[0] != X86_64_SSE_CLASS)
6080 return 0;
6082 for (i = 1; i < words; i++)
6083 if (classes[i] != X86_64_SSEUP_CLASS)
6084 return 0;
6087 /* Final merger cleanup. */
6088 for (i = 0; i < words; i++)
6090 /* If one class is MEMORY, everything should be passed in
6091 memory. */
6092 if (classes[i] == X86_64_MEMORY_CLASS)
6093 return 0;
6095 /* The X86_64_SSEUP_CLASS should be always preceded by
6096 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6097 if (classes[i] == X86_64_SSEUP_CLASS
6098 && classes[i - 1] != X86_64_SSE_CLASS
6099 && classes[i - 1] != X86_64_SSEUP_CLASS)
6101 /* The first one should never be X86_64_SSEUP_CLASS. */
6102 gcc_assert (i != 0);
6103 classes[i] = X86_64_SSE_CLASS;
6106 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6107 everything should be passed in memory. */
6108 if (classes[i] == X86_64_X87UP_CLASS
6109 && (classes[i - 1] != X86_64_X87_CLASS))
6111 static bool warned;
6113 /* The first one should never be X86_64_X87UP_CLASS. */
6114 gcc_assert (i != 0);
6115 if (!warned && warn_psabi)
6117 warned = true;
6118 inform (input_location,
6119 "the ABI of passing union with long double"
6120 " has changed in GCC 4.4");
6122 return 0;
6125 return words;
6128 /* Compute alignment needed. We align all types to natural boundaries with
6129 exception of XFmode that is aligned to 64bits. */
6130 if (mode != VOIDmode && mode != BLKmode)
6132 int mode_alignment = GET_MODE_BITSIZE (mode);
6134 if (mode == XFmode)
6135 mode_alignment = 128;
6136 else if (mode == XCmode)
6137 mode_alignment = 256;
6138 if (COMPLEX_MODE_P (mode))
6139 mode_alignment /= 2;
6140 /* Misaligned fields are always returned in memory. */
6141 if (bit_offset % mode_alignment)
6142 return 0;
6145 /* for V1xx modes, just use the base mode */
6146 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6147 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6148 mode = GET_MODE_INNER (mode);
6150 /* Classification of atomic types. */
6151 switch (mode)
6153 case SDmode:
6154 case DDmode:
6155 classes[0] = X86_64_SSE_CLASS;
6156 return 1;
6157 case TDmode:
6158 classes[0] = X86_64_SSE_CLASS;
6159 classes[1] = X86_64_SSEUP_CLASS;
6160 return 2;
6161 case DImode:
6162 case SImode:
6163 case HImode:
6164 case QImode:
6165 case CSImode:
6166 case CHImode:
6167 case CQImode:
6169 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6171 if (size <= 32)
6173 classes[0] = X86_64_INTEGERSI_CLASS;
6174 return 1;
6176 else if (size <= 64)
6178 classes[0] = X86_64_INTEGER_CLASS;
6179 return 1;
6181 else if (size <= 64+32)
6183 classes[0] = X86_64_INTEGER_CLASS;
6184 classes[1] = X86_64_INTEGERSI_CLASS;
6185 return 2;
6187 else if (size <= 64+64)
6189 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6190 return 2;
6192 else
6193 gcc_unreachable ();
6195 case CDImode:
6196 case TImode:
6197 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6198 return 2;
6199 case COImode:
6200 case OImode:
6201 /* OImode shouldn't be used directly. */
6202 gcc_unreachable ();
6203 case CTImode:
6204 return 0;
6205 case SFmode:
6206 if (!(bit_offset % 64))
6207 classes[0] = X86_64_SSESF_CLASS;
6208 else
6209 classes[0] = X86_64_SSE_CLASS;
6210 return 1;
6211 case DFmode:
6212 classes[0] = X86_64_SSEDF_CLASS;
6213 return 1;
6214 case XFmode:
6215 classes[0] = X86_64_X87_CLASS;
6216 classes[1] = X86_64_X87UP_CLASS;
6217 return 2;
6218 case TFmode:
6219 classes[0] = X86_64_SSE_CLASS;
6220 classes[1] = X86_64_SSEUP_CLASS;
6221 return 2;
6222 case SCmode:
6223 classes[0] = X86_64_SSE_CLASS;
6224 if (!(bit_offset % 64))
6225 return 1;
6226 else
6228 static bool warned;
6230 if (!warned && warn_psabi)
6232 warned = true;
6233 inform (input_location,
6234 "the ABI of passing structure with complex float"
6235 " member has changed in GCC 4.4");
6237 classes[1] = X86_64_SSESF_CLASS;
6238 return 2;
6240 case DCmode:
6241 classes[0] = X86_64_SSEDF_CLASS;
6242 classes[1] = X86_64_SSEDF_CLASS;
6243 return 2;
6244 case XCmode:
6245 classes[0] = X86_64_COMPLEX_X87_CLASS;
6246 return 1;
6247 case TCmode:
6248 /* This mode is larger than 16 bytes. */
6249 return 0;
6250 case V8SFmode:
6251 case V8SImode:
6252 case V32QImode:
6253 case V16HImode:
6254 case V4DFmode:
6255 case V4DImode:
6256 classes[0] = X86_64_SSE_CLASS;
6257 classes[1] = X86_64_SSEUP_CLASS;
6258 classes[2] = X86_64_SSEUP_CLASS;
6259 classes[3] = X86_64_SSEUP_CLASS;
6260 return 4;
6261 case V4SFmode:
6262 case V4SImode:
6263 case V16QImode:
6264 case V8HImode:
6265 case V2DFmode:
6266 case V2DImode:
6267 classes[0] = X86_64_SSE_CLASS;
6268 classes[1] = X86_64_SSEUP_CLASS;
6269 return 2;
6270 case V1TImode:
6271 case V1DImode:
6272 case V2SFmode:
6273 case V2SImode:
6274 case V4HImode:
6275 case V8QImode:
6276 classes[0] = X86_64_SSE_CLASS;
6277 return 1;
6278 case BLKmode:
6279 case VOIDmode:
6280 return 0;
6281 default:
6282 gcc_assert (VECTOR_MODE_P (mode));
6284 if (bytes > 16)
6285 return 0;
6287 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6289 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6290 classes[0] = X86_64_INTEGERSI_CLASS;
6291 else
6292 classes[0] = X86_64_INTEGER_CLASS;
6293 classes[1] = X86_64_INTEGER_CLASS;
6294 return 1 + (bytes > 8);
6298 /* Examine the argument and return set number of register required in each
6299 class. Return 0 iff parameter should be passed in memory. */
6300 static int
6301 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6302 int *int_nregs, int *sse_nregs)
6304 enum x86_64_reg_class regclass[MAX_CLASSES];
6305 int n = classify_argument (mode, type, regclass, 0);
6307 *int_nregs = 0;
6308 *sse_nregs = 0;
6309 if (!n)
6310 return 0;
6311 for (n--; n >= 0; n--)
6312 switch (regclass[n])
6314 case X86_64_INTEGER_CLASS:
6315 case X86_64_INTEGERSI_CLASS:
6316 (*int_nregs)++;
6317 break;
6318 case X86_64_SSE_CLASS:
6319 case X86_64_SSESF_CLASS:
6320 case X86_64_SSEDF_CLASS:
6321 (*sse_nregs)++;
6322 break;
6323 case X86_64_NO_CLASS:
6324 case X86_64_SSEUP_CLASS:
6325 break;
6326 case X86_64_X87_CLASS:
6327 case X86_64_X87UP_CLASS:
6328 if (!in_return)
6329 return 0;
6330 break;
6331 case X86_64_COMPLEX_X87_CLASS:
6332 return in_return ? 2 : 0;
6333 case X86_64_MEMORY_CLASS:
6334 gcc_unreachable ();
6336 return 1;
6339 /* Construct container for the argument used by GCC interface. See
6340 FUNCTION_ARG for the detailed description. */
6342 static rtx
6343 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6344 const_tree type, int in_return, int nintregs, int nsseregs,
6345 const int *intreg, int sse_regno)
6347 /* The following variables hold the static issued_error state. */
6348 static bool issued_sse_arg_error;
6349 static bool issued_sse_ret_error;
6350 static bool issued_x87_ret_error;
6352 enum machine_mode tmpmode;
6353 int bytes =
6354 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6355 enum x86_64_reg_class regclass[MAX_CLASSES];
6356 int n;
6357 int i;
6358 int nexps = 0;
6359 int needed_sseregs, needed_intregs;
6360 rtx exp[MAX_CLASSES];
6361 rtx ret;
6363 n = classify_argument (mode, type, regclass, 0);
6364 if (!n)
6365 return NULL;
6366 if (!examine_argument (mode, type, in_return, &needed_intregs,
6367 &needed_sseregs))
6368 return NULL;
6369 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6370 return NULL;
6372 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6373 some less clueful developer tries to use floating-point anyway. */
6374 if (needed_sseregs && !TARGET_SSE)
6376 if (in_return)
6378 if (!issued_sse_ret_error)
6380 error ("SSE register return with SSE disabled");
6381 issued_sse_ret_error = true;
6384 else if (!issued_sse_arg_error)
6386 error ("SSE register argument with SSE disabled");
6387 issued_sse_arg_error = true;
6389 return NULL;
6392 /* Likewise, error if the ABI requires us to return values in the
6393 x87 registers and the user specified -mno-80387. */
6394 if (!TARGET_80387 && in_return)
6395 for (i = 0; i < n; i++)
6396 if (regclass[i] == X86_64_X87_CLASS
6397 || regclass[i] == X86_64_X87UP_CLASS
6398 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6400 if (!issued_x87_ret_error)
6402 error ("x87 register return with x87 disabled");
6403 issued_x87_ret_error = true;
6405 return NULL;
6408 /* First construct simple cases. Avoid SCmode, since we want to use
6409 single register to pass this type. */
6410 if (n == 1 && mode != SCmode)
6411 switch (regclass[0])
6413 case X86_64_INTEGER_CLASS:
6414 case X86_64_INTEGERSI_CLASS:
6415 return gen_rtx_REG (mode, intreg[0]);
6416 case X86_64_SSE_CLASS:
6417 case X86_64_SSESF_CLASS:
6418 case X86_64_SSEDF_CLASS:
6419 if (mode != BLKmode)
6420 return gen_reg_or_parallel (mode, orig_mode,
6421 SSE_REGNO (sse_regno));
6422 break;
6423 case X86_64_X87_CLASS:
6424 case X86_64_COMPLEX_X87_CLASS:
6425 return gen_rtx_REG (mode, FIRST_STACK_REG);
6426 case X86_64_NO_CLASS:
6427 /* Zero sized array, struct or class. */
6428 return NULL;
6429 default:
6430 gcc_unreachable ();
6432 if (n == 2
6433 && regclass[0] == X86_64_SSE_CLASS
6434 && regclass[1] == X86_64_SSEUP_CLASS
6435 && mode != BLKmode)
6436 return gen_reg_or_parallel (mode, orig_mode,
6437 SSE_REGNO (sse_regno));
6438 if (n == 4
6439 && regclass[0] == X86_64_SSE_CLASS
6440 && regclass[1] == X86_64_SSEUP_CLASS
6441 && regclass[2] == X86_64_SSEUP_CLASS
6442 && regclass[3] == X86_64_SSEUP_CLASS
6443 && mode != BLKmode)
6444 return gen_reg_or_parallel (mode, orig_mode,
6445 SSE_REGNO (sse_regno));
6446 if (n == 2
6447 && regclass[0] == X86_64_X87_CLASS
6448 && regclass[1] == X86_64_X87UP_CLASS)
6449 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6451 if (n == 2
6452 && regclass[0] == X86_64_INTEGER_CLASS
6453 && regclass[1] == X86_64_INTEGER_CLASS
6454 && (mode == CDImode || mode == TImode || mode == TFmode)
6455 && intreg[0] + 1 == intreg[1])
6456 return gen_rtx_REG (mode, intreg[0]);
6458 /* Otherwise figure out the entries of the PARALLEL. */
6459 for (i = 0; i < n; i++)
6461 int pos;
6463 switch (regclass[i])
6465 case X86_64_NO_CLASS:
6466 break;
6467 case X86_64_INTEGER_CLASS:
6468 case X86_64_INTEGERSI_CLASS:
6469 /* Merge TImodes on aligned occasions here too. */
6470 if (i * 8 + 8 > bytes)
6471 tmpmode
6472 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6473 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6474 tmpmode = SImode;
6475 else
6476 tmpmode = DImode;
6477 /* We've requested 24 bytes for which we
6478 don't have a mode. Use DImode. */
6479 if (tmpmode == BLKmode)
6480 tmpmode = DImode;
6481 exp [nexps++]
6482 = gen_rtx_EXPR_LIST (VOIDmode,
6483 gen_rtx_REG (tmpmode, *intreg),
6484 GEN_INT (i*8));
6485 intreg++;
6486 break;
6487 case X86_64_SSESF_CLASS:
6488 exp [nexps++]
6489 = gen_rtx_EXPR_LIST (VOIDmode,
6490 gen_rtx_REG (SFmode,
6491 SSE_REGNO (sse_regno)),
6492 GEN_INT (i*8));
6493 sse_regno++;
6494 break;
6495 case X86_64_SSEDF_CLASS:
6496 exp [nexps++]
6497 = gen_rtx_EXPR_LIST (VOIDmode,
6498 gen_rtx_REG (DFmode,
6499 SSE_REGNO (sse_regno)),
6500 GEN_INT (i*8));
6501 sse_regno++;
6502 break;
6503 case X86_64_SSE_CLASS:
6504 pos = i;
6505 switch (n)
6507 case 1:
6508 tmpmode = DImode;
6509 break;
6510 case 2:
6511 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6513 tmpmode = TImode;
6514 i++;
6516 else
6517 tmpmode = DImode;
6518 break;
6519 case 4:
6520 gcc_assert (i == 0
6521 && regclass[1] == X86_64_SSEUP_CLASS
6522 && regclass[2] == X86_64_SSEUP_CLASS
6523 && regclass[3] == X86_64_SSEUP_CLASS);
6524 tmpmode = OImode;
6525 i += 3;
6526 break;
6527 default:
6528 gcc_unreachable ();
6530 exp [nexps++]
6531 = gen_rtx_EXPR_LIST (VOIDmode,
6532 gen_rtx_REG (tmpmode,
6533 SSE_REGNO (sse_regno)),
6534 GEN_INT (pos*8));
6535 sse_regno++;
6536 break;
6537 default:
6538 gcc_unreachable ();
6542 /* Empty aligned struct, union or class. */
6543 if (nexps == 0)
6544 return NULL;
6546 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6547 for (i = 0; i < nexps; i++)
6548 XVECEXP (ret, 0, i) = exp [i];
6549 return ret;
6552 /* Update the data in CUM to advance over an argument of mode MODE
6553 and data type TYPE. (TYPE is null for libcalls where that information
6554 may not be available.) */
6556 static void
6557 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6558 const_tree type, HOST_WIDE_INT bytes,
6559 HOST_WIDE_INT words)
6561 switch (mode)
6563 default:
6564 break;
6566 case BLKmode:
6567 if (bytes < 0)
6568 break;
6569 /* FALLTHRU */
6571 case DImode:
6572 case SImode:
6573 case HImode:
6574 case QImode:
6575 cum->words += words;
6576 cum->nregs -= words;
6577 cum->regno += words;
6579 if (cum->nregs <= 0)
6581 cum->nregs = 0;
6582 cum->regno = 0;
6584 break;
6586 case OImode:
6587 /* OImode shouldn't be used directly. */
6588 gcc_unreachable ();
6590 case DFmode:
6591 if (cum->float_in_sse < 2)
6592 break;
6593 case SFmode:
6594 if (cum->float_in_sse < 1)
6595 break;
6596 /* FALLTHRU */
6598 case V8SFmode:
6599 case V8SImode:
6600 case V32QImode:
6601 case V16HImode:
6602 case V4DFmode:
6603 case V4DImode:
6604 case TImode:
6605 case V16QImode:
6606 case V8HImode:
6607 case V4SImode:
6608 case V2DImode:
6609 case V4SFmode:
6610 case V2DFmode:
6611 if (!type || !AGGREGATE_TYPE_P (type))
6613 cum->sse_words += words;
6614 cum->sse_nregs -= 1;
6615 cum->sse_regno += 1;
6616 if (cum->sse_nregs <= 0)
6618 cum->sse_nregs = 0;
6619 cum->sse_regno = 0;
6622 break;
6624 case V8QImode:
6625 case V4HImode:
6626 case V2SImode:
6627 case V2SFmode:
6628 case V1TImode:
6629 case V1DImode:
6630 if (!type || !AGGREGATE_TYPE_P (type))
6632 cum->mmx_words += words;
6633 cum->mmx_nregs -= 1;
6634 cum->mmx_regno += 1;
6635 if (cum->mmx_nregs <= 0)
6637 cum->mmx_nregs = 0;
6638 cum->mmx_regno = 0;
6641 break;
6645 static void
6646 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6647 const_tree type, HOST_WIDE_INT words, bool named)
6649 int int_nregs, sse_nregs;
6651 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6652 if (!named && VALID_AVX256_REG_MODE (mode))
6653 return;
6655 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6656 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6658 cum->nregs -= int_nregs;
6659 cum->sse_nregs -= sse_nregs;
6660 cum->regno += int_nregs;
6661 cum->sse_regno += sse_nregs;
6663 else
6665 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6666 cum->words = (cum->words + align - 1) & ~(align - 1);
6667 cum->words += words;
6671 static void
6672 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6673 HOST_WIDE_INT words)
6675 /* Otherwise, this should be passed indirectly. */
6676 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6678 cum->words += words;
6679 if (cum->nregs > 0)
6681 cum->nregs -= 1;
6682 cum->regno += 1;
6686 /* Update the data in CUM to advance over an argument of mode MODE and
6687 data type TYPE. (TYPE is null for libcalls where that information
6688 may not be available.) */
6690 static void
6691 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6692 const_tree type, bool named)
6694 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6695 HOST_WIDE_INT bytes, words;
6697 if (mode == BLKmode)
6698 bytes = int_size_in_bytes (type);
6699 else
6700 bytes = GET_MODE_SIZE (mode);
6701 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6703 if (type)
6704 mode = type_natural_mode (type, NULL);
6706 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6707 function_arg_advance_ms_64 (cum, bytes, words);
6708 else if (TARGET_64BIT)
6709 function_arg_advance_64 (cum, mode, type, words, named);
6710 else
6711 function_arg_advance_32 (cum, mode, type, bytes, words);
6714 /* Define where to put the arguments to a function.
6715 Value is zero to push the argument on the stack,
6716 or a hard register in which to store the argument.
6718 MODE is the argument's machine mode.
6719 TYPE is the data type of the argument (as a tree).
6720 This is null for libcalls where that information may
6721 not be available.
6722 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6723 the preceding args and about the function being called.
6724 NAMED is nonzero if this argument is a named parameter
6725 (otherwise it is an extra parameter matching an ellipsis). */
6727 static rtx
6728 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6729 enum machine_mode orig_mode, const_tree type,
6730 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6732 static bool warnedsse, warnedmmx;
6734 /* Avoid the AL settings for the Unix64 ABI. */
6735 if (mode == VOIDmode)
6736 return constm1_rtx;
6738 switch (mode)
6740 default:
6741 break;
6743 case BLKmode:
6744 if (bytes < 0)
6745 break;
6746 /* FALLTHRU */
6747 case DImode:
6748 case SImode:
6749 case HImode:
6750 case QImode:
6751 if (words <= cum->nregs)
6753 int regno = cum->regno;
6755 /* Fastcall allocates the first two DWORD (SImode) or
6756 smaller arguments to ECX and EDX if it isn't an
6757 aggregate type. */
6758 if (cum->fastcall)
6760 if (mode == BLKmode
6761 || mode == DImode
6762 || (type && AGGREGATE_TYPE_P (type)))
6763 break;
6765 /* ECX not EAX is the first allocated register. */
6766 if (regno == AX_REG)
6767 regno = CX_REG;
6769 return gen_rtx_REG (mode, regno);
6771 break;
6773 case DFmode:
6774 if (cum->float_in_sse < 2)
6775 break;
6776 case SFmode:
6777 if (cum->float_in_sse < 1)
6778 break;
6779 /* FALLTHRU */
6780 case TImode:
6781 /* In 32-bit mode, we pass TImode in xmm registers. */
6782 case V16QImode:
6783 case V8HImode:
6784 case V4SImode:
6785 case V2DImode:
6786 case V4SFmode:
6787 case V2DFmode:
6788 if (!type || !AGGREGATE_TYPE_P (type))
6790 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6792 warnedsse = true;
6793 warning (0, "SSE vector argument without SSE enabled "
6794 "changes the ABI");
6796 if (cum->sse_nregs)
6797 return gen_reg_or_parallel (mode, orig_mode,
6798 cum->sse_regno + FIRST_SSE_REG);
6800 break;
6802 case OImode:
6803 /* OImode shouldn't be used directly. */
6804 gcc_unreachable ();
6806 case V8SFmode:
6807 case V8SImode:
6808 case V32QImode:
6809 case V16HImode:
6810 case V4DFmode:
6811 case V4DImode:
6812 if (!type || !AGGREGATE_TYPE_P (type))
6814 if (cum->sse_nregs)
6815 return gen_reg_or_parallel (mode, orig_mode,
6816 cum->sse_regno + FIRST_SSE_REG);
6818 break;
6820 case V8QImode:
6821 case V4HImode:
6822 case V2SImode:
6823 case V2SFmode:
6824 case V1TImode:
6825 case V1DImode:
6826 if (!type || !AGGREGATE_TYPE_P (type))
6828 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6830 warnedmmx = true;
6831 warning (0, "MMX vector argument without MMX enabled "
6832 "changes the ABI");
6834 if (cum->mmx_nregs)
6835 return gen_reg_or_parallel (mode, orig_mode,
6836 cum->mmx_regno + FIRST_MMX_REG);
6838 break;
6841 return NULL_RTX;
6844 static rtx
6845 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6846 enum machine_mode orig_mode, const_tree type, bool named)
6848 /* Handle a hidden AL argument containing number of registers
6849 for varargs x86-64 functions. */
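/* (For reference: SYSV callers of variadic functions set %al to an upper
   bound on the number of vector registers actually used, so the callee's
   va_start code can skip saving unused SSE registers.)  */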
6850 if (mode == VOIDmode)
6851 return GEN_INT (cum->maybe_vaarg
6852 ? (cum->sse_nregs < 0
6853 ? X86_64_SSE_REGPARM_MAX
6854 : cum->sse_regno)
6855 : -1);
6857 switch (mode)
6859 default:
6860 break;
6862 case V8SFmode:
6863 case V8SImode:
6864 case V32QImode:
6865 case V16HImode:
6866 case V4DFmode:
6867 case V4DImode:
6868 /* Unnamed 256bit vector mode parameters are passed on stack. */
6869 if (!named)
6870 return NULL;
6871 break;
6874 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6875 cum->sse_nregs,
6876 &x86_64_int_parameter_registers [cum->regno],
6877 cum->sse_regno);
6880 static rtx
6881 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6882 enum machine_mode orig_mode, bool named,
6883 HOST_WIDE_INT bytes)
6885 unsigned int regno;
6887 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6888 We use the value -2 to specify that the current function call is MS ABI. */
6889 if (mode == VOIDmode)
6890 return GEN_INT (-2);
6892 /* If we've run out of registers, it goes on the stack. */
6893 if (cum->nregs == 0)
6894 return NULL_RTX;
6896 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6898 /* Only floating point modes are passed in anything but integer regs. */
6899 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6901 if (named)
6902 regno = cum->regno + FIRST_SSE_REG;
6903 else
6905 rtx t1, t2;
6907 /* Unnamed floating parameters are passed in both the
6908 SSE and integer registers. */
6909 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6910 t2 = gen_rtx_REG (mode, regno);
6911 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6912 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6913 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6916 /* Handle aggregate types passed in registers. */
6917 if (orig_mode == BLKmode)
6919 if (bytes > 0 && bytes <= 8)
6920 mode = (bytes > 4 ? DImode : SImode);
6921 if (mode == BLKmode)
6922 mode = DImode;
6925 return gen_reg_or_parallel (mode, orig_mode, regno);
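/* A minimal sketch of the MS x64 convention implemented above (the
   prototype is illustrative, not part of this file): each of the
   first four arguments occupies one slot, no matter which register
   file it ends up in:

     void f (int a, double b, int c, double d);
         a -> %ecx,  b -> %xmm1,  c -> %r8d,  d -> %xmm3

   Unnamed floating-point arguments are passed in both the SSE and the
   integer register of their slot, as built by the PARALLEL above.  */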
6928 /* Return where to put the arguments to a function.
6929 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6931 MODE is the argument's machine mode. TYPE is the data type of the
6932 argument. It is null for libcalls where that information may not be
6933 available. CUM gives information about the preceding args and about
6934 the function being called. NAMED is nonzero if this argument is a
6935 named parameter (otherwise it is an extra parameter matching an
6936 ellipsis). */
6938 static rtx
6939 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6940 const_tree type, bool named)
6942 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6943 enum machine_mode mode = omode;
6944 HOST_WIDE_INT bytes, words;
6945 rtx arg;
6947 if (mode == BLKmode)
6948 bytes = int_size_in_bytes (type);
6949 else
6950 bytes = GET_MODE_SIZE (mode);
6951 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6953 /* To simplify the code below, represent vector types with a vector mode
6954 even if MMX/SSE are not active. */
6955 if (type && TREE_CODE (type) == VECTOR_TYPE)
6956 mode = type_natural_mode (type, cum);
6958 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6959 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6960 else if (TARGET_64BIT)
6961 arg = function_arg_64 (cum, mode, omode, type, named);
6962 else
6963 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6965 return arg;
6968 /* A C expression that indicates when an argument must be passed by
6969 reference. If nonzero for an argument, a copy of that argument is
6970 made in memory and a pointer to the argument is passed instead of
6971 the argument itself. The pointer is passed in whatever way is
6972 appropriate for passing a pointer to that type. */
6974 static bool
6975 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6976 enum machine_mode mode ATTRIBUTE_UNUSED,
6977 const_tree type, bool named ATTRIBUTE_UNUSED)
6979 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6981 /* See Windows x64 Software Convention. */
6982 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6984 int msize = (int) GET_MODE_SIZE (mode);
6985 if (type)
6987 /* Arrays are passed by reference. */
6988 if (TREE_CODE (type) == ARRAY_TYPE)
6989 return true;
6991 if (AGGREGATE_TYPE_P (type))
6993 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6994 are passed by reference. */
6995 msize = int_size_in_bytes (type);
6999 /* __m128 is passed by reference. */
7000 switch (msize) {
7001 case 1: case 2: case 4: case 8:
7002 break;
7003 default:
7004 return true;
7007 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7008 return true;
7010 return false;
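/* A minimal sketch of the MS x64 by-reference rule checked above (the
   types are illustrative, not part of this file):

     struct s3 { char c[3]; };    3 bytes  -> passed by reference
     struct s8 { char c[8]; };    8 bytes  -> passed by value in a register
     __m128 v;                    16 bytes -> passed by reference

   Only values whose size is exactly 1, 2, 4 or 8 bytes are passed
   directly; everything else, including arrays, goes by reference.  */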
7013 /* Return true when TYPE should be 128bit aligned for 32bit argument
7014 passing ABI. XXX: This function is obsolete and is only used for
7015 checking psABI compatibility with previous versions of GCC. */
7017 static bool
7018 ix86_compat_aligned_value_p (const_tree type)
7020 enum machine_mode mode = TYPE_MODE (type);
7021 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7022 || mode == TDmode
7023 || mode == TFmode
7024 || mode == TCmode)
7025 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7026 return true;
7027 if (TYPE_ALIGN (type) < 128)
7028 return false;
7030 if (AGGREGATE_TYPE_P (type))
7032 /* Walk the aggregates recursively. */
7033 switch (TREE_CODE (type))
7035 case RECORD_TYPE:
7036 case UNION_TYPE:
7037 case QUAL_UNION_TYPE:
7039 tree field;
7041 /* Walk all the structure fields. */
7042 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7044 if (TREE_CODE (field) == FIELD_DECL
7045 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7046 return true;
7048 break;
7051 case ARRAY_TYPE:
7052 /* Just in case some languages pass arrays by value. */
7053 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7054 return true;
7055 break;
7057 default:
7058 gcc_unreachable ();
7061 return false;
7064 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7065 XXX: This function is obsolete and is only used for checking psABI
7066 compatibility with previous versions of GCC. */
7068 static unsigned int
7069 ix86_compat_function_arg_boundary (enum machine_mode mode,
7070 const_tree type, unsigned int align)
7072 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7073 natural boundaries. */
7074 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7076 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7077 make an exception for SSE modes since these require 128bit
7078 alignment.
7080 The handling here differs from field_alignment. ICC aligns MMX
7081 arguments to 4 byte boundaries, while structure fields are aligned
7082 to 8 byte boundaries. */
7083 if (!type)
7085 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7086 align = PARM_BOUNDARY;
7088 else
7090 if (!ix86_compat_aligned_value_p (type))
7091 align = PARM_BOUNDARY;
7094 if (align > BIGGEST_ALIGNMENT)
7095 align = BIGGEST_ALIGNMENT;
7096 return align;
7099 /* Return true when TYPE should be 128bit aligned for 32bit argument
7100 passing ABI. */
7102 static bool
7103 ix86_contains_aligned_value_p (const_tree type)
7105 enum machine_mode mode = TYPE_MODE (type);
7107 if (mode == XFmode || mode == XCmode)
7108 return false;
7110 if (TYPE_ALIGN (type) < 128)
7111 return false;
7113 if (AGGREGATE_TYPE_P (type))
7115 /* Walk the aggregates recursively. */
7116 switch (TREE_CODE (type))
7118 case RECORD_TYPE:
7119 case UNION_TYPE:
7120 case QUAL_UNION_TYPE:
7122 tree field;
7124 /* Walk all the structure fields. */
7125 for (field = TYPE_FIELDS (type);
7126 field;
7127 field = DECL_CHAIN (field))
7129 if (TREE_CODE (field) == FIELD_DECL
7130 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7131 return true;
7133 break;
7136 case ARRAY_TYPE:
7137 /* Just in case some languages pass arrays by value. */
7138 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7139 return true;
7140 break;
7142 default:
7143 gcc_unreachable ();
7146 else
7147 return TYPE_ALIGN (type) >= 128;
7149 return false;
7152 /* Gives the alignment boundary, in bits, of an argument with the
7153 specified mode and type. */
7155 static unsigned int
7156 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7158 unsigned int align;
7159 if (type)
7161 /* Since the main variant type is what is used for the call, convert
7162 the type to its main variant. */
7163 type = TYPE_MAIN_VARIANT (type);
7164 align = TYPE_ALIGN (type);
7166 else
7167 align = GET_MODE_ALIGNMENT (mode);
7168 if (align < PARM_BOUNDARY)
7169 align = PARM_BOUNDARY;
7170 else
7172 static bool warned;
7173 unsigned int saved_align = align;
7175 if (!TARGET_64BIT)
7177 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7178 if (!type)
7180 if (mode == XFmode || mode == XCmode)
7181 align = PARM_BOUNDARY;
7183 else if (!ix86_contains_aligned_value_p (type))
7184 align = PARM_BOUNDARY;
7186 if (align < 128)
7187 align = PARM_BOUNDARY;
7190 if (warn_psabi
7191 && !warned
7192 && align != ix86_compat_function_arg_boundary (mode, type,
7193 saved_align))
7195 warned = true;
7196 inform (input_location,
7197 "The ABI for passing parameters with %d-byte"
7198 " alignment has changed in GCC 4.6",
7199 align / BITS_PER_UNIT);
7203 return align;
7206 /* Return true if N is a possible register number of function value. */
7208 static bool
7209 ix86_function_value_regno_p (const unsigned int regno)
7211 switch (regno)
7213 case AX_REG:
7214 return true;
7216 case FIRST_FLOAT_REG:
7217 /* TODO: The function should depend on current function ABI but
7218 builtins.c would need updating then. Therefore we use the
7219 default ABI. */
7220 if (TARGET_64BIT && ix86_abi == MS_ABI)
7221 return false;
7222 return TARGET_FLOAT_RETURNS_IN_80387;
7224 case FIRST_SSE_REG:
7225 return TARGET_SSE;
7227 case FIRST_MMX_REG:
7228 if (TARGET_MACHO || TARGET_64BIT)
7229 return false;
7230 return TARGET_MMX;
7233 return false;
7236 /* Define how to find the value returned by a function.
7237 VALTYPE is the data type of the value (as a tree).
7238 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7239 otherwise, FUNC is 0. */
7241 static rtx
7242 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7243 const_tree fntype, const_tree fn)
7245 unsigned int regno;
7247 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7248 we normally prevent this case when mmx is not available. However
7249 some ABIs may require the result to be returned like DImode. */
7250 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7251 regno = FIRST_MMX_REG;
7253 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7254 we prevent this case when sse is not available. However some ABIs
7255 may require the result to be returned like integer TImode. */
7256 else if (mode == TImode
7257 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7258 regno = FIRST_SSE_REG;
7260 /* 32-byte vector modes in %ymm0. */
7261 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7262 regno = FIRST_SSE_REG;
7264 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7265 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7266 regno = FIRST_FLOAT_REG;
7267 else
7268 /* Most things go in %eax. */
7269 regno = AX_REG;
7271 /* Override FP return register with %xmm0 for local functions when
7272 SSE math is enabled or for functions with sseregparm attribute. */
7273 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7275 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7276 if ((sse_level >= 1 && mode == SFmode)
7277 || (sse_level == 2 && mode == DFmode))
7278 regno = FIRST_SSE_REG;
7281 /* OImode shouldn't be used directly. */
7282 gcc_assert (mode != OImode);
7284 return gen_rtx_REG (orig_mode, regno);
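/* A minimal sketch of the 32-bit return-value rules above (the
   prototypes are illustrative, not part of this file):

     double f (void);       returned in %st(0), unless -mno-fp-ret-in-387
     __m128 g (void);       returned in %xmm0
     long long h (void);    returned in %edx:%eax

   SFmode/DFmode results are redirected to %xmm0 only for sseregparm
   functions, or for local functions when SSE math is enabled.  */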
7287 static rtx
7288 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7289 const_tree valtype)
7291 rtx ret;
7293 /* Handle libcalls, which don't provide a type node. */
7294 if (valtype == NULL)
7296 unsigned int regno;
7298 switch (mode)
7300 case SFmode:
7301 case SCmode:
7302 case DFmode:
7303 case DCmode:
7304 case TFmode:
7305 case SDmode:
7306 case DDmode:
7307 case TDmode:
7308 regno = FIRST_SSE_REG;
7309 break;
7310 case XFmode:
7311 case XCmode:
7312 regno = FIRST_FLOAT_REG;
7313 break;
7314 case TCmode:
7315 return NULL;
7316 default:
7317 regno = AX_REG;
7320 return gen_rtx_REG (mode, regno);
7322 else if (POINTER_TYPE_P (valtype))
7324 /* Pointers are always returned in word_mode. */
7325 mode = word_mode;
7328 ret = construct_container (mode, orig_mode, valtype, 1,
7329 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7330 x86_64_int_return_registers, 0);
7332 /* For zero sized structures, construct_container returns NULL, but we
7333 need to keep the rest of the compiler happy by returning a meaningful value. */
7334 if (!ret)
7335 ret = gen_rtx_REG (orig_mode, AX_REG);
7337 return ret;
7340 static rtx
7341 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7342 const_tree valtype)
7344 unsigned int regno = AX_REG;
7346 if (TARGET_SSE)
7348 switch (GET_MODE_SIZE (mode))
7350 case 16:
7351 if (valtype != NULL_TREE
7352 && !VECTOR_INTEGER_TYPE_P (valtype)
7354 && !INTEGRAL_TYPE_P (valtype)
7355 && !VECTOR_FLOAT_TYPE_P (valtype))
7356 break;
7357 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7358 && !COMPLEX_MODE_P (mode))
7359 regno = FIRST_SSE_REG;
7360 break;
7361 case 8:
7362 case 4:
7363 if (mode == SFmode || mode == DFmode)
7364 regno = FIRST_SSE_REG;
7365 break;
7366 default:
7367 break;
7370 return gen_rtx_REG (orig_mode, regno);
7373 static rtx
7374 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7375 enum machine_mode orig_mode, enum machine_mode mode)
7377 const_tree fn, fntype;
7379 fn = NULL_TREE;
7380 if (fntype_or_decl && DECL_P (fntype_or_decl))
7381 fn = fntype_or_decl;
7382 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7384 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7385 return function_value_ms_64 (orig_mode, mode, valtype);
7386 else if (TARGET_64BIT)
7387 return function_value_64 (orig_mode, mode, valtype);
7388 else
7389 return function_value_32 (orig_mode, mode, fntype, fn);
7392 static rtx
7393 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7394 bool outgoing ATTRIBUTE_UNUSED)
7396 enum machine_mode mode, orig_mode;
7398 orig_mode = TYPE_MODE (valtype);
7399 mode = type_natural_mode (valtype, NULL);
7400 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7403 /* Pointer function arguments and return values are promoted to
7404 word_mode. */
7406 static enum machine_mode
7407 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7408 int *punsignedp, const_tree fntype,
7409 int for_return)
7411 if (type != NULL_TREE && POINTER_TYPE_P (type))
7413 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7414 return word_mode;
7416 return default_promote_function_mode (type, mode, punsignedp, fntype,
7417 for_return);
7420 /* Return true if a structure, union or array with MODE containing FIELD
7421 should be accessed using BLKmode. */
7423 static bool
7424 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7426 /* Union with XFmode must be in BLKmode. */
7427 return (mode == XFmode
7428 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7429 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7433 ix86_libcall_value (enum machine_mode mode)
7435 return ix86_function_value_1 (NULL, NULL, mode, mode);
7438 /* Return true iff type is returned in memory. */
7440 static bool ATTRIBUTE_UNUSED
7441 return_in_memory_32 (const_tree type, enum machine_mode mode)
7443 HOST_WIDE_INT size;
7445 if (mode == BLKmode)
7446 return true;
7448 size = int_size_in_bytes (type);
7450 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7451 return false;
7453 if (VECTOR_MODE_P (mode) || mode == TImode)
7455 /* User-created vectors small enough to fit in EAX. */
7456 if (size < 8)
7457 return false;
7459 /* MMX/3dNow values are returned in MM0,
7460 except when it doesn't exist or the ABI prescribes otherwise. */
7461 if (size == 8)
7462 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7464 /* SSE values are returned in XMM0, except when it doesn't exist. */
7465 if (size == 16)
7466 return !TARGET_SSE;
7468 /* AVX values are returned in YMM0, except when it doesn't exist. */
7469 if (size == 32)
7470 return !TARGET_AVX;
7473 if (mode == XFmode)
7474 return false;
7476 if (size > 12)
7477 return true;
7479 /* OImode shouldn't be used directly. */
7480 gcc_assert (mode != OImode);
7482 return false;
7485 static bool ATTRIBUTE_UNUSED
7486 return_in_memory_64 (const_tree type, enum machine_mode mode)
7488 int needed_intregs, needed_sseregs;
7489 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7492 static bool ATTRIBUTE_UNUSED
7493 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7495 HOST_WIDE_INT size = int_size_in_bytes (type);
7497 /* __m128 is returned in xmm0. */
7498 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7499 || VECTOR_FLOAT_TYPE_P (type))
7500 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7501 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7502 return false;
7504 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7505 return size != 1 && size != 2 && size != 4 && size != 8;
7508 static bool
7509 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7511 #ifdef SUBTARGET_RETURN_IN_MEMORY
7512 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7513 #else
7514 const enum machine_mode mode = type_natural_mode (type, NULL);
7516 if (TARGET_64BIT)
7518 if (ix86_function_type_abi (fntype) == MS_ABI)
7519 return return_in_memory_ms_64 (type, mode);
7520 else
7521 return return_in_memory_64 (type, mode);
7523 else
7524 return return_in_memory_32 (type, mode);
7525 #endif
7528 /* When returning SSE vector types, we have a choice of either
7529 (1) being abi incompatible with a -march switch, or
7530 (2) generating an error.
7531 Given no good solution, I think the safest thing is one warning.
7532 The user won't be able to use -Werror, but....
7534 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7535 called in response to actually generating a caller or callee that
7536 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7537 via aggregate_value_p for general type probing from tree-ssa. */
7539 static rtx
7540 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7542 static bool warnedsse, warnedmmx;
7544 if (!TARGET_64BIT && type)
7546 /* Look at the return type of the function, not the function type. */
7547 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7549 if (!TARGET_SSE && !warnedsse)
7551 if (mode == TImode
7552 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7554 warnedsse = true;
7555 warning (0, "SSE vector return without SSE enabled "
7556 "changes the ABI");
7560 if (!TARGET_MMX && !warnedmmx)
7562 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7564 warnedmmx = true;
7565 warning (0, "MMX vector return without MMX enabled "
7566 "changes the ABI");
7571 return NULL;
7575 /* Create the va_list data type. */
7577 /* Returns the calling convention specific va_list data type.
7578 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7580 static tree
7581 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7583 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7585 /* For i386 we use plain pointer to argument area. */
7586 if (!TARGET_64BIT || abi == MS_ABI)
7587 return build_pointer_type (char_type_node);
7589 record = lang_hooks.types.make_type (RECORD_TYPE);
7590 type_decl = build_decl (BUILTINS_LOCATION,
7591 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7593 f_gpr = build_decl (BUILTINS_LOCATION,
7594 FIELD_DECL, get_identifier ("gp_offset"),
7595 unsigned_type_node);
7596 f_fpr = build_decl (BUILTINS_LOCATION,
7597 FIELD_DECL, get_identifier ("fp_offset"),
7598 unsigned_type_node);
7599 f_ovf = build_decl (BUILTINS_LOCATION,
7600 FIELD_DECL, get_identifier ("overflow_arg_area"),
7601 ptr_type_node);
7602 f_sav = build_decl (BUILTINS_LOCATION,
7603 FIELD_DECL, get_identifier ("reg_save_area"),
7604 ptr_type_node);
7606 va_list_gpr_counter_field = f_gpr;
7607 va_list_fpr_counter_field = f_fpr;
7609 DECL_FIELD_CONTEXT (f_gpr) = record;
7610 DECL_FIELD_CONTEXT (f_fpr) = record;
7611 DECL_FIELD_CONTEXT (f_ovf) = record;
7612 DECL_FIELD_CONTEXT (f_sav) = record;
7614 TYPE_STUB_DECL (record) = type_decl;
7615 TYPE_NAME (record) = type_decl;
7616 TYPE_FIELDS (record) = f_gpr;
7617 DECL_CHAIN (f_gpr) = f_fpr;
7618 DECL_CHAIN (f_fpr) = f_ovf;
7619 DECL_CHAIN (f_ovf) = f_sav;
7621 layout_type (record);
7623 /* The correct type is an array type of one element. */
7624 return build_array_type (record, build_index_type (size_zero_node));
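/* The record built above corresponds to the va_list layout described
   in the x86-64 psABI; in plain C it is roughly (sketch only):

     typedef struct __va_list_tag {
       unsigned int gp_offset;      next general-purpose slot in reg_save_area
       unsigned int fp_offset;      next SSE slot in reg_save_area
       void *overflow_arg_area;     next stack-passed argument
       void *reg_save_area;         start of the register save area
     } va_list[1];
   */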
7627 /* Set up the builtin va_list data type and, for 64-bit, the additional
7628 calling convention specific va_list data types. */
7630 static tree
7631 ix86_build_builtin_va_list (void)
7633 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7635 /* Initialize abi specific va_list builtin types. */
7636 if (TARGET_64BIT)
7638 tree t;
7639 if (ix86_abi == MS_ABI)
7641 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7642 if (TREE_CODE (t) != RECORD_TYPE)
7643 t = build_variant_type_copy (t);
7644 sysv_va_list_type_node = t;
7646 else
7648 t = ret;
7649 if (TREE_CODE (t) != RECORD_TYPE)
7650 t = build_variant_type_copy (t);
7651 sysv_va_list_type_node = t;
7653 if (ix86_abi != MS_ABI)
7655 t = ix86_build_builtin_va_list_abi (MS_ABI);
7656 if (TREE_CODE (t) != RECORD_TYPE)
7657 t = build_variant_type_copy (t);
7658 ms_va_list_type_node = t;
7660 else
7662 t = ret;
7663 if (TREE_CODE (t) != RECORD_TYPE)
7664 t = build_variant_type_copy (t);
7665 ms_va_list_type_node = t;
7669 return ret;
7672 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7674 static void
7675 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7677 rtx save_area, mem;
7678 alias_set_type set;
7679 int i, max;
7681 /* GPR size of varargs save area. */
7682 if (cfun->va_list_gpr_size)
7683 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7684 else
7685 ix86_varargs_gpr_size = 0;
7687 /* FPR size of varargs save area. We don't need it if we don't pass
7688 anything in SSE registers. */
7689 if (TARGET_SSE && cfun->va_list_fpr_size)
7690 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7691 else
7692 ix86_varargs_fpr_size = 0;
7694 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7695 return;
7697 save_area = frame_pointer_rtx;
7698 set = get_varargs_alias_set ();
7700 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7701 if (max > X86_64_REGPARM_MAX)
7702 max = X86_64_REGPARM_MAX;
7704 for (i = cum->regno; i < max; i++)
7706 mem = gen_rtx_MEM (word_mode,
7707 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7708 MEM_NOTRAP_P (mem) = 1;
7709 set_mem_alias_set (mem, set);
7710 emit_move_insn (mem,
7711 gen_rtx_REG (word_mode,
7712 x86_64_int_parameter_registers[i]));
7715 if (ix86_varargs_fpr_size)
7717 enum machine_mode smode;
7718 rtx label, test;
7720 /* Now emit code to save SSE registers. The AX parameter contains number
7721 of SSE parameter registers used to call this function, though all we
7722 actually check here is the zero/non-zero status. */
7724 label = gen_label_rtx ();
7725 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7726 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7727 label));
7729 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7730 we used movdqa (i.e. TImode) instead? Perhaps even better would
7731 be if we could determine the real mode of the data, via a hook
7732 into pass_stdarg. Ignore all that for now. */
7733 smode = V4SFmode;
7734 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7735 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7737 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7738 if (max > X86_64_SSE_REGPARM_MAX)
7739 max = X86_64_SSE_REGPARM_MAX;
7741 for (i = cum->sse_regno; i < max; ++i)
7743 mem = plus_constant (Pmode, save_area,
7744 i * 16 + ix86_varargs_gpr_size);
7745 mem = gen_rtx_MEM (smode, mem);
7746 MEM_NOTRAP_P (mem) = 1;
7747 set_mem_alias_set (mem, set);
7748 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7750 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7753 emit_label (label);
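/* A sketch of the register save area laid out by the code above,
   which gp_offset and fp_offset later index into:

     offset   0 ..  47   %rdi %rsi %rdx %rcx %r8 %r9   (8 bytes each)
     offset  48 .. 175   %xmm0 .. %xmm7                (16 bytes each)

   Only the parts actually needed (per va_list_gpr_size and
   va_list_fpr_size) are written.  */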
7757 static void
7758 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7760 alias_set_type set = get_varargs_alias_set ();
7761 int i;
7763 /* Reset to zero, as there might be a sysv vaarg used
7764 before. */
7765 ix86_varargs_gpr_size = 0;
7766 ix86_varargs_fpr_size = 0;
7768 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7770 rtx reg, mem;
7772 mem = gen_rtx_MEM (Pmode,
7773 plus_constant (Pmode, virtual_incoming_args_rtx,
7774 i * UNITS_PER_WORD));
7775 MEM_NOTRAP_P (mem) = 1;
7776 set_mem_alias_set (mem, set);
7778 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7779 emit_move_insn (mem, reg);
7783 static void
7784 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7785 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7786 int no_rtl)
7788 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7789 CUMULATIVE_ARGS next_cum;
7790 tree fntype;
7792 /* This argument doesn't appear to be used anymore, which is good,
7793 because the old code here didn't suppress rtl generation. */
7794 gcc_assert (!no_rtl);
7796 if (!TARGET_64BIT)
7797 return;
7799 fntype = TREE_TYPE (current_function_decl);
7801 /* For varargs, we do not want to skip the dummy va_dcl argument.
7802 For stdargs, we do want to skip the last named argument. */
7803 next_cum = *cum;
7804 if (stdarg_p (fntype))
7805 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7806 true);
7808 if (cum->call_abi == MS_ABI)
7809 setup_incoming_varargs_ms_64 (&next_cum);
7810 else
7811 setup_incoming_varargs_64 (&next_cum);
7814 /* Checks if TYPE is of kind va_list char *. */
7816 static bool
7817 is_va_list_char_pointer (tree type)
7819 tree canonic;
7821 /* For 32-bit it is always true. */
7822 if (!TARGET_64BIT)
7823 return true;
7824 canonic = ix86_canonical_va_list_type (type);
7825 return (canonic == ms_va_list_type_node
7826 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7829 /* Implement va_start. */
7831 static void
7832 ix86_va_start (tree valist, rtx nextarg)
7834 HOST_WIDE_INT words, n_gpr, n_fpr;
7835 tree f_gpr, f_fpr, f_ovf, f_sav;
7836 tree gpr, fpr, ovf, sav, t;
7837 tree type;
7838 rtx ovf_rtx;
7840 if (flag_split_stack
7841 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7843 unsigned int scratch_regno;
7845 /* When we are splitting the stack, we can't refer to the stack
7846 arguments using internal_arg_pointer, because they may be on
7847 the old stack. The split stack prologue will arrange to
7848 leave a pointer to the old stack arguments in a scratch
7849 register, which we here copy to a pseudo-register. The split
7850 stack prologue can't set the pseudo-register directly because
7851 it (the prologue) runs before any registers have been saved. */
7853 scratch_regno = split_stack_prologue_scratch_regno ();
7854 if (scratch_regno != INVALID_REGNUM)
7856 rtx reg, seq;
7858 reg = gen_reg_rtx (Pmode);
7859 cfun->machine->split_stack_varargs_pointer = reg;
7861 start_sequence ();
7862 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7863 seq = get_insns ();
7864 end_sequence ();
7866 push_topmost_sequence ();
7867 emit_insn_after (seq, entry_of_function ());
7868 pop_topmost_sequence ();
7872 /* Only 64bit target needs something special. */
7873 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7875 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7876 std_expand_builtin_va_start (valist, nextarg);
7877 else
7879 rtx va_r, next;
7881 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7882 next = expand_binop (ptr_mode, add_optab,
7883 cfun->machine->split_stack_varargs_pointer,
7884 crtl->args.arg_offset_rtx,
7885 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7886 convert_move (va_r, next, 0);
7888 return;
7891 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7892 f_fpr = DECL_CHAIN (f_gpr);
7893 f_ovf = DECL_CHAIN (f_fpr);
7894 f_sav = DECL_CHAIN (f_ovf);
7896 valist = build_simple_mem_ref (valist);
7897 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7898 /* The following should be folded into the MEM_REF offset. */
7899 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7900 f_gpr, NULL_TREE);
7901 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7902 f_fpr, NULL_TREE);
7903 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7904 f_ovf, NULL_TREE);
7905 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7906 f_sav, NULL_TREE);
7908 /* Count number of gp and fp argument registers used. */
7909 words = crtl->args.info.words;
7910 n_gpr = crtl->args.info.regno;
7911 n_fpr = crtl->args.info.sse_regno;
7913 if (cfun->va_list_gpr_size)
7915 type = TREE_TYPE (gpr);
7916 t = build2 (MODIFY_EXPR, type,
7917 gpr, build_int_cst (type, n_gpr * 8));
7918 TREE_SIDE_EFFECTS (t) = 1;
7919 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7922 if (TARGET_SSE && cfun->va_list_fpr_size)
7924 type = TREE_TYPE (fpr);
7925 t = build2 (MODIFY_EXPR, type, fpr,
7926 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7927 TREE_SIDE_EFFECTS (t) = 1;
7928 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7931 /* Find the overflow area. */
7932 type = TREE_TYPE (ovf);
7933 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7934 ovf_rtx = crtl->args.internal_arg_pointer;
7935 else
7936 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7937 t = make_tree (type, ovf_rtx);
7938 if (words != 0)
7939 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7940 t = build2 (MODIFY_EXPR, type, ovf, t);
7941 TREE_SIDE_EFFECTS (t) = 1;
7942 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7944 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7946 /* Find the register save area.
7947 The function prologue saves it right above the stack frame. */
7948 type = TREE_TYPE (sav);
7949 t = make_tree (type, frame_pointer_rtx);
7950 if (!ix86_varargs_gpr_size)
7951 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7952 t = build2 (MODIFY_EXPR, type, sav, t);
7953 TREE_SIDE_EFFECTS (t) = 1;
7954 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
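/* A sketch of what va_start initializes for a function such as
   (illustrative prototype, not part of this file):

     void f (int a, double b, ...);

   One GP and one SSE register are consumed by the named arguments, so
   the assignments built above amount to

     ap->gp_offset = 1 * 8;          next free GP slot
     ap->fp_offset = 48 + 1 * 16;    next free SSE slot, after the 6 GP slots
     ap->overflow_arg_area = first stack-passed argument;
     ap->reg_save_area = start of the save area set up in the prologue;
   */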
7958 /* Implement va_arg. */
7960 static tree
7961 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7962 gimple_seq *post_p)
7964 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7965 tree f_gpr, f_fpr, f_ovf, f_sav;
7966 tree gpr, fpr, ovf, sav, t;
7967 int size, rsize;
7968 tree lab_false, lab_over = NULL_TREE;
7969 tree addr, t2;
7970 rtx container;
7971 int indirect_p = 0;
7972 tree ptrtype;
7973 enum machine_mode nat_mode;
7974 unsigned int arg_boundary;
7976 /* Only 64bit target needs something special. */
7977 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7978 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7980 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7981 f_fpr = DECL_CHAIN (f_gpr);
7982 f_ovf = DECL_CHAIN (f_fpr);
7983 f_sav = DECL_CHAIN (f_ovf);
7985 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7986 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7987 valist = build_va_arg_indirect_ref (valist);
7988 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7989 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7990 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7992 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7993 if (indirect_p)
7994 type = build_pointer_type (type);
7995 size = int_size_in_bytes (type);
7996 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7998 nat_mode = type_natural_mode (type, NULL);
7999 switch (nat_mode)
8001 case V8SFmode:
8002 case V8SImode:
8003 case V32QImode:
8004 case V16HImode:
8005 case V4DFmode:
8006 case V4DImode:
8007 /* Unnamed 256bit vector mode parameters are passed on stack. */
8008 if (!TARGET_64BIT_MS_ABI)
8010 container = NULL;
8011 break;
8014 default:
8015 container = construct_container (nat_mode, TYPE_MODE (type),
8016 type, 0, X86_64_REGPARM_MAX,
8017 X86_64_SSE_REGPARM_MAX, intreg,
8018 0);
8019 break;
8022 /* Pull the value out of the saved registers. */
8024 addr = create_tmp_var (ptr_type_node, "addr");
8026 if (container)
8028 int needed_intregs, needed_sseregs;
8029 bool need_temp;
8030 tree int_addr, sse_addr;
8032 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8033 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8035 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8037 need_temp = (!REG_P (container)
8038 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8039 || TYPE_ALIGN (type) > 128));
8041 /* In case we are passing a structure, verify that it is a consecutive
8042 block in the register save area. If not, we need to do moves. */
8043 if (!need_temp && !REG_P (container))
8045 /* Verify that all registers are strictly consecutive. */
8046 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8048 int i;
8050 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8052 rtx slot = XVECEXP (container, 0, i);
8053 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8054 || INTVAL (XEXP (slot, 1)) != i * 16)
8055 need_temp = 1;
8058 else
8060 int i;
8062 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8064 rtx slot = XVECEXP (container, 0, i);
8065 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8066 || INTVAL (XEXP (slot, 1)) != i * 8)
8067 need_temp = 1;
8071 if (!need_temp)
8073 int_addr = addr;
8074 sse_addr = addr;
8076 else
8078 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8079 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8082 /* First ensure that we fit completely in registers. */
8083 if (needed_intregs)
8085 t = build_int_cst (TREE_TYPE (gpr),
8086 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8087 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8088 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8089 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8090 gimplify_and_add (t, pre_p);
8092 if (needed_sseregs)
8094 t = build_int_cst (TREE_TYPE (fpr),
8095 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8096 + X86_64_REGPARM_MAX * 8);
8097 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8098 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8099 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8100 gimplify_and_add (t, pre_p);
8103 /* Compute index to start of area used for integer regs. */
8104 if (needed_intregs)
8106 /* int_addr = gpr + sav; */
8107 t = fold_build_pointer_plus (sav, gpr);
8108 gimplify_assign (int_addr, t, pre_p);
8110 if (needed_sseregs)
8112 /* sse_addr = fpr + sav; */
8113 t = fold_build_pointer_plus (sav, fpr);
8114 gimplify_assign (sse_addr, t, pre_p);
8116 if (need_temp)
8118 int i, prev_size = 0;
8119 tree temp = create_tmp_var (type, "va_arg_tmp");
8121 /* addr = &temp; */
8122 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8123 gimplify_assign (addr, t, pre_p);
8125 for (i = 0; i < XVECLEN (container, 0); i++)
8127 rtx slot = XVECEXP (container, 0, i);
8128 rtx reg = XEXP (slot, 0);
8129 enum machine_mode mode = GET_MODE (reg);
8130 tree piece_type;
8131 tree addr_type;
8132 tree daddr_type;
8133 tree src_addr, src;
8134 int src_offset;
8135 tree dest_addr, dest;
8136 int cur_size = GET_MODE_SIZE (mode);
8138 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8139 prev_size = INTVAL (XEXP (slot, 1));
8140 if (prev_size + cur_size > size)
8142 cur_size = size - prev_size;
8143 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8144 if (mode == BLKmode)
8145 mode = QImode;
8147 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8148 if (mode == GET_MODE (reg))
8149 addr_type = build_pointer_type (piece_type);
8150 else
8151 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8152 true);
8153 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8154 true);
8156 if (SSE_REGNO_P (REGNO (reg)))
8158 src_addr = sse_addr;
8159 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8161 else
8163 src_addr = int_addr;
8164 src_offset = REGNO (reg) * 8;
8166 src_addr = fold_convert (addr_type, src_addr);
8167 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8169 dest_addr = fold_convert (daddr_type, addr);
8170 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8171 if (cur_size == GET_MODE_SIZE (mode))
8173 src = build_va_arg_indirect_ref (src_addr);
8174 dest = build_va_arg_indirect_ref (dest_addr);
8176 gimplify_assign (dest, src, pre_p);
8178 else
8180 tree copy
8181 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8182 3, dest_addr, src_addr,
8183 size_int (cur_size));
8184 gimplify_and_add (copy, pre_p);
8186 prev_size += cur_size;
8190 if (needed_intregs)
8192 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8193 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8194 gimplify_assign (gpr, t, pre_p);
8197 if (needed_sseregs)
8199 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8200 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8201 gimplify_assign (fpr, t, pre_p);
8204 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8206 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8209 /* ... otherwise out of the overflow area. */
8211 /* When we align a parameter on the stack for the caller, if its
8212 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8213 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8214 here with the caller. */
8215 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8216 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8217 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8219 /* Care for on-stack alignment if needed. */
8220 if (arg_boundary <= 64 || size == 0)
8221 t = ovf;
8222 else
8224 HOST_WIDE_INT align = arg_boundary / 8;
8225 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8226 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8227 build_int_cst (TREE_TYPE (t), -align));
8230 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8231 gimplify_assign (addr, t, pre_p);
8233 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8234 gimplify_assign (unshare_expr (ovf), t, pre_p);
8236 if (container)
8237 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8239 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8240 addr = fold_convert (ptrtype, addr);
8242 if (indirect_p)
8243 addr = build_va_arg_indirect_ref (addr);
8244 return build_va_arg_indirect_ref (addr);
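/* A sketch of the code the function above generates for
   va_arg (ap, int), in C-like pseudo code (names illustrative):

     if (ap->gp_offset >= 48)                  no GP slot left
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     else
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     result = *(int *) addr;

   Aggregates needing both GP and SSE registers are first copied into a
   temporary when their pieces are not contiguous in the save area.  */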
8247 /* Return true if OPNUM's MEM should be matched
8248 in movabs* patterns. */
8250 bool
8251 ix86_check_movabs (rtx insn, int opnum)
8253 rtx set, mem;
8255 set = PATTERN (insn);
8256 if (GET_CODE (set) == PARALLEL)
8257 set = XVECEXP (set, 0, 0);
8258 gcc_assert (GET_CODE (set) == SET);
8259 mem = XEXP (set, opnum);
8260 while (GET_CODE (mem) == SUBREG)
8261 mem = SUBREG_REG (mem);
8262 gcc_assert (MEM_P (mem));
8263 return volatile_ok || !MEM_VOLATILE_P (mem);
8266 /* Initialize the table of extra 80387 mathematical constants. */
8268 static void
8269 init_ext_80387_constants (void)
8271 static const char * cst[5] =
8273 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8274 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8275 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8276 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8277 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8279 int i;
8281 for (i = 0; i < 5; i++)
8283 real_from_string (&ext_80387_constants_table[i], cst[i]);
8284 /* Ensure each constant is rounded to XFmode precision. */
8285 real_convert (&ext_80387_constants_table[i],
8286 XFmode, &ext_80387_constants_table[i]);
8289 ext_80387_constants_init = 1;
8292 /* Return non-zero if the constant is something that
8293 can be loaded with a special instruction. */
8296 standard_80387_constant_p (rtx x)
8298 enum machine_mode mode = GET_MODE (x);
8300 REAL_VALUE_TYPE r;
8302 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8303 return -1;
8305 if (x == CONST0_RTX (mode))
8306 return 1;
8307 if (x == CONST1_RTX (mode))
8308 return 2;
8310 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8312 /* For XFmode constants, try to find a special 80387 instruction when
8313 optimizing for size or on those CPUs that benefit from them. */
8314 if (mode == XFmode
8315 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8317 int i;
8319 if (! ext_80387_constants_init)
8320 init_ext_80387_constants ();
8322 for (i = 0; i < 5; i++)
8323 if (real_identical (&r, &ext_80387_constants_table[i]))
8324 return i + 3;
8327 /* Load of the constant -0.0 or -1.0 will be split as
8328 fldz;fchs or fld1;fchs sequence. */
8329 if (real_isnegzero (&r))
8330 return 8;
8331 if (real_identical (&r, &dconstm1))
8332 return 9;
8334 return 0;
8337 /* Return the opcode of the special instruction to be used to load
8338 the constant X. */
8340 const char *
8341 standard_80387_constant_opcode (rtx x)
8343 switch (standard_80387_constant_p (x))
8345 case 1:
8346 return "fldz";
8347 case 2:
8348 return "fld1";
8349 case 3:
8350 return "fldlg2";
8351 case 4:
8352 return "fldln2";
8353 case 5:
8354 return "fldl2e";
8355 case 6:
8356 return "fldl2t";
8357 case 7:
8358 return "fldpi";
8359 case 8:
8360 case 9:
8361 return "#";
8362 default:
8363 gcc_unreachable ();
8367 /* Return the CONST_DOUBLE representing the 80387 constant that is
8368 loaded by the specified special instruction. The argument IDX
8369 matches the return value from standard_80387_constant_p. */
8372 standard_80387_constant_rtx (int idx)
8374 int i;
8376 if (! ext_80387_constants_init)
8377 init_ext_80387_constants ();
8379 switch (idx)
8381 case 3:
8382 case 4:
8383 case 5:
8384 case 6:
8385 case 7:
8386 i = idx - 3;
8387 break;
8389 default:
8390 gcc_unreachable ();
8393 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8394 XFmode);
8397 /* Return 1 if X is all 0s and 2 if X is all 1s
8398 in a supported SSE/AVX vector mode. */
8401 standard_sse_constant_p (rtx x)
8403 enum machine_mode mode = GET_MODE (x);
8405 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8406 return 1;
8407 if (vector_all_ones_operand (x, mode))
8408 switch (mode)
8410 case V16QImode:
8411 case V8HImode:
8412 case V4SImode:
8413 case V2DImode:
8414 if (TARGET_SSE2)
8415 return 2;
8416 case V32QImode:
8417 case V16HImode:
8418 case V8SImode:
8419 case V4DImode:
8420 if (TARGET_AVX2)
8421 return 2;
8422 default:
8423 break;
8426 return 0;
8429 /* Return the opcode of the special instruction to be used to load
8430 the constant X. */
8432 const char *
8433 standard_sse_constant_opcode (rtx insn, rtx x)
8435 switch (standard_sse_constant_p (x))
8437 case 1:
8438 switch (get_attr_mode (insn))
8440 case MODE_TI:
8441 return "%vpxor\t%0, %d0";
8442 case MODE_V2DF:
8443 return "%vxorpd\t%0, %d0";
8444 case MODE_V4SF:
8445 return "%vxorps\t%0, %d0";
8447 case MODE_OI:
8448 return "vpxor\t%x0, %x0, %x0";
8449 case MODE_V4DF:
8450 return "vxorpd\t%x0, %x0, %x0";
8451 case MODE_V8SF:
8452 return "vxorps\t%x0, %x0, %x0";
8454 default:
8455 break;
8458 case 2:
8459 if (TARGET_AVX)
8460 return "vpcmpeqd\t%0, %0, %0";
8461 else
8462 return "pcmpeqd\t%0, %0";
8464 default:
8465 break;
8467 gcc_unreachable ();
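/* A sketch of why these two constants are "standard": they are never
   loaded from memory but materialized in place, e.g. for V4SImode

     pxor    %xmm0, %xmm0       { 0, 0, 0, 0 }
     pcmpeqd %xmm0, %xmm0       { -1, -1, -1, -1 }

   with the v-prefixed AVX forms used when AVX is enabled.  */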
8470 /* Returns true if OP contains a symbol reference. */
8472 bool
8473 symbolic_reference_mentioned_p (rtx op)
8475 const char *fmt;
8476 int i;
8478 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8479 return true;
8481 fmt = GET_RTX_FORMAT (GET_CODE (op));
8482 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8484 if (fmt[i] == 'E')
8486 int j;
8488 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8489 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8490 return true;
8493 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8494 return true;
8497 return false;
8500 /* Return true if it is appropriate to emit `ret' instructions in the
8501 body of a function. Do this only if the epilogue is simple, needing a
8502 couple of insns. Prior to reloading, we can't tell how many registers
8503 must be saved, so return false then. Return false if there is no frame
8504 marker to de-allocate. */
8506 bool
8507 ix86_can_use_return_insn_p (void)
8509 struct ix86_frame frame;
8511 if (! reload_completed || frame_pointer_needed)
8512 return 0;
8514 /* Don't allow more than 32k pop, since that's all we can do
8515 with one instruction. */
8516 if (crtl->args.pops_args && crtl->args.size >= 32768)
8517 return 0;
8519 ix86_compute_frame_layout (&frame);
8520 return (frame.stack_pointer_offset == UNITS_PER_WORD
8521 && (frame.nregs + frame.nsseregs) == 0);
8524 /* Value should be nonzero if functions must have frame pointers.
8525 Zero means the frame pointer need not be set up (and parms may
8526 be accessed via the stack pointer) in functions that seem suitable. */
8528 static bool
8529 ix86_frame_pointer_required (void)
8531 /* If we accessed previous frames, then the generated code expects
8532 to be able to access the saved ebp value in our frame. */
8533 if (cfun->machine->accesses_prev_frame)
8534 return true;
8536 /* Several x86 OSes need a frame pointer for other reasons,
8537 usually pertaining to setjmp. */
8538 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8539 return true;
8541 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8542 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8543 return true;
8545 /* Win64 SEH: very large frames need a frame pointer, as the maximum
8546 stack allocation is 4GB. */
8547 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8548 return true;
8550 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8551 turns off the frame pointer by default. Turn it back on now if
8552 we've not got a leaf function. */
8553 if (TARGET_OMIT_LEAF_FRAME_POINTER
8554 && (!crtl->is_leaf
8555 || ix86_current_function_calls_tls_descriptor))
8556 return true;
8558 if (crtl->profile && !flag_fentry)
8559 return true;
8561 return false;
8564 /* Record that the current function accesses previous call frames. */
8566 void
8567 ix86_setup_frame_addresses (void)
8569 cfun->machine->accesses_prev_frame = 1;
8572 #ifndef USE_HIDDEN_LINKONCE
8573 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8574 # define USE_HIDDEN_LINKONCE 1
8575 # else
8576 # define USE_HIDDEN_LINKONCE 0
8577 # endif
8578 #endif
8580 static int pic_labels_used;
8582 /* Fills in the label name that should be used for a pc thunk for
8583 the given register. */
8585 static void
8586 get_pc_thunk_name (char name[32], unsigned int regno)
8588 gcc_assert (!TARGET_64BIT);
8590 if (USE_HIDDEN_LINKONCE)
8591 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8592 else
8593 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8597 /* This function generates code for -fpic that loads %ebx with
8598 the return address of the caller and then returns. */
8600 static void
8601 ix86_code_end (void)
8603 rtx xops[2];
8604 int regno;
8606 for (regno = AX_REG; regno <= SP_REG; regno++)
8608 char name[32];
8609 tree decl;
8611 if (!(pic_labels_used & (1 << regno)))
8612 continue;
8614 get_pc_thunk_name (name, regno);
8616 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8617 get_identifier (name),
8618 build_function_type_list (void_type_node, NULL_TREE));
8619 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8620 NULL_TREE, void_type_node);
8621 TREE_PUBLIC (decl) = 1;
8622 TREE_STATIC (decl) = 1;
8623 DECL_IGNORED_P (decl) = 1;
8625 #if TARGET_MACHO
8626 if (TARGET_MACHO)
8628 switch_to_section (darwin_sections[text_coal_section]);
8629 fputs ("\t.weak_definition\t", asm_out_file);
8630 assemble_name (asm_out_file, name);
8631 fputs ("\n\t.private_extern\t", asm_out_file);
8632 assemble_name (asm_out_file, name);
8633 putc ('\n', asm_out_file);
8634 ASM_OUTPUT_LABEL (asm_out_file, name);
8635 DECL_WEAK (decl) = 1;
8637 else
8638 #endif
8639 if (USE_HIDDEN_LINKONCE)
8641 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8643 targetm.asm_out.unique_section (decl, 0);
8644 switch_to_section (get_named_section (decl, NULL, 0));
8646 targetm.asm_out.globalize_label (asm_out_file, name);
8647 fputs ("\t.hidden\t", asm_out_file);
8648 assemble_name (asm_out_file, name);
8649 putc ('\n', asm_out_file);
8650 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8652 else
8654 switch_to_section (text_section);
8655 ASM_OUTPUT_LABEL (asm_out_file, name);
8658 DECL_INITIAL (decl) = make_node (BLOCK);
8659 current_function_decl = decl;
8660 init_function_start (decl);
8661 first_function_block_is_cold = false;
8662 /* Make sure unwind info is emitted for the thunk if needed. */
8663 final_start_function (emit_barrier (), asm_out_file, 1);
8665 /* Pad stack IP move with 4 instructions (two NOPs count
8666 as one instruction). */
8667 if (TARGET_PAD_SHORT_FUNCTION)
8669 int i = 8;
8671 while (i--)
8672 fputs ("\tnop\n", asm_out_file);
8675 xops[0] = gen_rtx_REG (Pmode, regno);
8676 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8677 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8678 fputs ("\tret\n", asm_out_file);
8679 final_end_function ();
8680 init_insn_lengths ();
8681 free_after_compilation (cfun);
8682 set_cfun (NULL);
8683 current_function_decl = NULL;
8686 if (flag_split_stack)
8687 file_end_indicate_split_stack ();
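/* A sketch of the thunk emitted above for, e.g., %ebx:

     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx
             ret

   i.e. it copies its own return address (the caller's PC) into the
   requested register, which is how 32-bit PIC code obtains a base for
   GOT-relative addressing.  */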
8690 /* Emit code for the SET_GOT patterns. */
8692 const char *
8693 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8695 rtx xops[3];
8697 xops[0] = dest;
8699 if (TARGET_VXWORKS_RTP && flag_pic)
8701 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8702 xops[2] = gen_rtx_MEM (Pmode,
8703 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8704 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8706 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8707 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8708 an unadorned address. */
8709 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8710 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8711 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8712 return "";
8715 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8717 if (!flag_pic)
8719 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8721 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8723 #if TARGET_MACHO
8724 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8725 is what will be referenced by the Mach-O PIC subsystem. */
8726 if (!label)
8727 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8728 #endif
8730 targetm.asm_out.internal_label (asm_out_file, "L",
8731 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8733 else
8735 char name[32];
8736 get_pc_thunk_name (name, REGNO (dest));
8737 pic_labels_used |= 1 << REGNO (dest);
8739 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8740 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8741 output_asm_insn ("call\t%X2", xops);
8742 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8743 is what will be referenced by the Mach-O PIC subsystem. */
8744 #if TARGET_MACHO
8745 if (!label)
8746 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8747 else
8748 targetm.asm_out.internal_label (asm_out_file, "L",
8749 CODE_LABEL_NUMBER (label));
8750 #endif
8753 if (!TARGET_MACHO)
8754 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8756 return "";
8759 /* Generate a "push" pattern for input ARG. */
8761 static rtx
8762 gen_push (rtx arg)
8764 struct machine_function *m = cfun->machine;
8766 if (m->fs.cfa_reg == stack_pointer_rtx)
8767 m->fs.cfa_offset += UNITS_PER_WORD;
8768 m->fs.sp_offset += UNITS_PER_WORD;
8770 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8771 arg = gen_rtx_REG (word_mode, REGNO (arg));
8773 return gen_rtx_SET (VOIDmode,
8774 gen_rtx_MEM (word_mode,
8775 gen_rtx_PRE_DEC (Pmode,
8776 stack_pointer_rtx)),
8777 arg);
8780 /* Generate a "pop" pattern for input ARG. */
8782 static rtx
8783 gen_pop (rtx arg)
8785 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8786 arg = gen_rtx_REG (word_mode, REGNO (arg));
8788 return gen_rtx_SET (VOIDmode,
8789 arg,
8790 gen_rtx_MEM (word_mode,
8791 gen_rtx_POST_INC (Pmode,
8792 stack_pointer_rtx)));
8795 /* Return the number of an unused call-clobbered register if one is
8796 available for the entire function, otherwise INVALID_REGNUM. */
8798 static unsigned int
8799 ix86_select_alt_pic_regnum (void)
8801 if (crtl->is_leaf
8802 && !crtl->profile
8803 && !ix86_current_function_calls_tls_descriptor)
8805 int i, drap;
8806 /* Can't use the same register for both PIC and DRAP. */
8807 if (crtl->drap_reg)
8808 drap = REGNO (crtl->drap_reg);
8809 else
8810 drap = -1;
8811 for (i = 2; i >= 0; --i)
8812 if (i != drap && !df_regs_ever_live_p (i))
8813 return i;
8816 return INVALID_REGNUM;
8819 /* Return TRUE if we need to save REGNO. */
8821 static bool
8822 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8824 if (pic_offset_table_rtx
8825 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8826 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8827 || crtl->profile
8828 || crtl->calls_eh_return
8829 || crtl->uses_const_pool))
8830 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8832 if (crtl->calls_eh_return && maybe_eh_return)
8834 unsigned i;
8835 for (i = 0; ; i++)
8837 unsigned test = EH_RETURN_DATA_REGNO (i);
8838 if (test == INVALID_REGNUM)
8839 break;
8840 if (test == regno)
8841 return true;
8845 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8846 return true;
8848 return (df_regs_ever_live_p (regno)
8849 && !call_used_regs[regno]
8850 && !fixed_regs[regno]
8851 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8854 /* Return the number of saved general purpose registers. */
8856 static int
8857 ix86_nsaved_regs (void)
8859 int nregs = 0;
8860 int regno;
8862 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8863 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8864 nregs ++;
8865 return nregs;
8868 /* Return the number of saved SSE registers. */
8870 static int
8871 ix86_nsaved_sseregs (void)
8873 int nregs = 0;
8874 int regno;
8876 if (!TARGET_64BIT_MS_ABI)
8877 return 0;
8878 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8879 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8880 nregs ++;
8881 return nregs;
8884 /* Given FROM and TO register numbers, say whether this elimination is
8885 allowed. If stack alignment is needed, we can only replace argument
8886 pointer with hard frame pointer, or replace frame pointer with stack
8887 pointer. Otherwise, frame pointer elimination is automatically
8888 handled and all other eliminations are valid. */
8890 static bool
8891 ix86_can_eliminate (const int from, const int to)
8893 if (stack_realign_fp)
8894 return ((from == ARG_POINTER_REGNUM
8895 && to == HARD_FRAME_POINTER_REGNUM)
8896 || (from == FRAME_POINTER_REGNUM
8897 && to == STACK_POINTER_REGNUM));
8898 else
8899 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8902 /* Return the offset between two registers, one to be eliminated, and the other
8903 its replacement, at the start of a routine. */
8905 HOST_WIDE_INT
8906 ix86_initial_elimination_offset (int from, int to)
8908 struct ix86_frame frame;
8909 ix86_compute_frame_layout (&frame);
8911 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8912 return frame.hard_frame_pointer_offset;
8913 else if (from == FRAME_POINTER_REGNUM
8914 && to == HARD_FRAME_POINTER_REGNUM)
8915 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8916 else
8918 gcc_assert (to == STACK_POINTER_REGNUM);
8920 if (from == ARG_POINTER_REGNUM)
8921 return frame.stack_pointer_offset;
8923 gcc_assert (from == FRAME_POINTER_REGNUM);
8924 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8928 /* In a dynamically-aligned function, we can't know the offset from
8929 stack pointer to frame pointer, so we must ensure that setjmp
8930 eliminates fp against the hard fp (%ebp) rather than trying to
8931 index from %esp up to the top of the frame across a gap that is
8932 of unknown (at compile-time) size. */
8933 static rtx
8934 ix86_builtin_setjmp_frame_value (void)
8936 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8939 /* When using -fsplit-stack, the allocation routines set a field in
8940 the TCB to the bottom of the stack plus this much space, measured
8941 in bytes. */
8943 #define SPLIT_STACK_AVAILABLE 256
8945 /* Fill the structure ix86_frame describing the frame of the currently compiled function. */
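/* As a rough sketch of what the function below computes (which areas are
   present, and their exact alignment, depend on the target and on the
   function being compiled), the frame is laid out from the CFA downwards
   roughly as:

       return address
       [pushed static chain]
       [saved frame pointer]            <- hard_frame_pointer_offset
       GP register save area            <- reg_save_offset
       [SSE register save area]         <- sse_reg_save_offset (16-byte aligned)
       [va_arg register save area]
       local variables                  <- frame_pointer_offset at their top
       [outgoing arguments]
                                        <- stack_pointer_offset

   When the red zone is usable, part of this space is left below the final
   stack pointer instead of being explicitly allocated.  */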
8947 static void
8948 ix86_compute_frame_layout (struct ix86_frame *frame)
8950 unsigned HOST_WIDE_INT stack_alignment_needed;
8951 HOST_WIDE_INT offset;
8952 unsigned HOST_WIDE_INT preferred_alignment;
8953 HOST_WIDE_INT size = get_frame_size ();
8954 HOST_WIDE_INT to_allocate;
8956 frame->nregs = ix86_nsaved_regs ();
8957 frame->nsseregs = ix86_nsaved_sseregs ();
8959 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8960 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8962 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8963 in function prologues and in leaf functions. */
8964 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8965 && (!crtl->is_leaf || cfun->calls_alloca != 0
8966 || ix86_current_function_calls_tls_descriptor))
8968 preferred_alignment = 16;
8969 stack_alignment_needed = 16;
8970 crtl->preferred_stack_boundary = 128;
8971 crtl->stack_alignment_needed = 128;
8974 gcc_assert (!size || stack_alignment_needed);
8975 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8976 gcc_assert (preferred_alignment <= stack_alignment_needed);
8978 /* For SEH we have to limit the amount of code movement into the prologue.
8979 At present we do this via a BLOCKAGE, at which point there's very little
8980 scheduling that can be done, which means that there's very little point
8981 in doing anything except PUSHs. */
8982 if (TARGET_SEH)
8983 cfun->machine->use_fast_prologue_epilogue = false;
8985 /* During reload iteration the number of registers saved can change.
8986 Recompute the value as needed. Do not recompute when the number of registers
8987 didn't change, as reload makes multiple calls to this function and does not
8988 expect the decision to change within a single iteration.
8989 else if (!optimize_function_for_size_p (cfun)
8990 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8992 int count = frame->nregs;
8993 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8995 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8997 /* The fast prologue uses move instead of push to save registers. This
8998 is significantly longer, but also executes faster as modern hardware
8999 can execute the moves in parallel, but can't do that for push/pop.
9001 Be careful about choosing what prologue to emit: when the function takes
9002 many instructions to execute, we may use the slow version, as well as when
9003 the function is known to be outside a hot spot (this is known only with
9004 feedback). Weight the size of the function by the number of registers
9005 to save, as it is cheap to use one or two push instructions but very
9006 slow to use many of them.
9007 if (count)
9008 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9009 if (node->frequency < NODE_FREQUENCY_NORMAL
9010 || (flag_branch_probabilities
9011 && node->frequency < NODE_FREQUENCY_HOT))
9012 cfun->machine->use_fast_prologue_epilogue = false;
9013 else
9014 cfun->machine->use_fast_prologue_epilogue
9015 = !expensive_function_p (count);
9018 frame->save_regs_using_mov
9019 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9020 /* If static stack checking is enabled and done with probes,
9021 the registers need to be saved before allocating the frame. */
9022 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9024 /* Skip return address. */
9025 offset = UNITS_PER_WORD;
9027 /* Skip pushed static chain. */
9028 if (ix86_static_chain_on_stack)
9029 offset += UNITS_PER_WORD;
9031 /* Skip saved base pointer. */
9032 if (frame_pointer_needed)
9033 offset += UNITS_PER_WORD;
9034 frame->hfp_save_offset = offset;
9036 /* The traditional frame pointer location is at the top of the frame. */
9037 frame->hard_frame_pointer_offset = offset;
9039 /* Register save area */
9040 offset += frame->nregs * UNITS_PER_WORD;
9041 frame->reg_save_offset = offset;
9043 /* On SEH target, registers are pushed just before the frame pointer
9044 location. */
9045 if (TARGET_SEH)
9046 frame->hard_frame_pointer_offset = offset;
9048 /* Align and set SSE register save area. */
9049 if (frame->nsseregs)
9051 /* The only ABI that has saved SSE registers (Win64) also has a
9052 16-byte aligned default stack, and thus we don't need to be
9053 within the re-aligned local stack frame to save them. */
9054 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9055 offset = (offset + 16 - 1) & -16;
9056 offset += frame->nsseregs * 16;
9058 frame->sse_reg_save_offset = offset;
9060 /* The re-aligned stack starts here. Values before this point are not
9061 directly comparable with values below this point. In order to make
9062 sure that no value happens to be the same before and after, force
9063 the alignment computation below to add a non-zero value. */
9064 if (stack_realign_fp)
9065 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9067 /* Va-arg area */
9068 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9069 offset += frame->va_arg_size;
9071 /* Align start of frame for local function. */
9072 if (stack_realign_fp
9073 || offset != frame->sse_reg_save_offset
9074 || size != 0
9075 || !crtl->is_leaf
9076 || cfun->calls_alloca
9077 || ix86_current_function_calls_tls_descriptor)
9078 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9080 /* Frame pointer points here. */
9081 frame->frame_pointer_offset = offset;
9083 offset += size;
9085 /* Add outgoing arguments area. Can be skipped if we eliminated
9086 all the function calls as dead code.
9087 Skipping is, however, impossible when the function calls alloca. The alloca
9088 expander assumes that the last crtl->outgoing_args_size bytes
9089 of the stack frame are unused. */
9090 if (ACCUMULATE_OUTGOING_ARGS
9091 && (!crtl->is_leaf || cfun->calls_alloca
9092 || ix86_current_function_calls_tls_descriptor))
9094 offset += crtl->outgoing_args_size;
9095 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9097 else
9098 frame->outgoing_arguments_size = 0;
9100 /* Align stack boundary. Only needed if we're calling another function
9101 or using alloca. */
9102 if (!crtl->is_leaf || cfun->calls_alloca
9103 || ix86_current_function_calls_tls_descriptor)
9104 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9106 /* We've reached end of stack frame. */
9107 frame->stack_pointer_offset = offset;
9109 /* Size prologue needs to allocate. */
9110 to_allocate = offset - frame->sse_reg_save_offset;
9112 if ((!to_allocate && frame->nregs <= 1)
9113 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9114 frame->save_regs_using_mov = false;
9116 if (ix86_using_red_zone ()
9117 && crtl->sp_is_unchanging
9118 && crtl->is_leaf
9119 && !ix86_current_function_calls_tls_descriptor)
9121 frame->red_zone_size = to_allocate;
9122 if (frame->save_regs_using_mov)
9123 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9124 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9125 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9127 else
9128 frame->red_zone_size = 0;
9129 frame->stack_pointer_offset -= frame->red_zone_size;
9131 /* The SEH frame pointer location is near the bottom of the frame.
9132 This is enforced by the fact that the difference between the
9133 stack pointer and the frame pointer is limited to 240 bytes in
9134 the unwind data structure. */
9135 if (TARGET_SEH)
9137 HOST_WIDE_INT diff;
9139 /* If we can leave the frame pointer where it is, do so. This also gives
9140 the establisher frame for __builtin_frame_address (0). */
9141 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9142 if (diff <= SEH_MAX_FRAME_SIZE
9143 && (diff > 240 || (diff & 15) != 0)
9144 && !crtl->accesses_prior_frames)
9146 /* Ideally we'd determine what portion of the local stack frame
9147 (within the constraint of the lowest 240) is most heavily used.
9148 But without that complication, simply bias the frame pointer
9149 by 128 bytes so as to maximize the amount of the local stack
9150 frame that is addressable with 8-bit offsets. */
9151 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9156 /* This is semi-inlined memory_address_length, but simplified
9157 since we know that we're always dealing with reg+offset, and
9158 to avoid having to create and discard all that rtl. */
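/* For example (a sketch of the common cases rather than a full encoding
   table): a zero offset from most base registers needs no displacement
   byte (length 0), but 0(%ebp) and 0(%r13) need a disp8 (length 1);
   offsets in [-128, 127] take a disp8 (length 1) and larger offsets a
   disp32 (length 4); and any address based on %esp or %r12 additionally
   needs a SIB byte, adding 1 to the length.  */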
9160 static inline int
9161 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9163 int len = 4;
9165 if (offset == 0)
9167 /* EBP and R13 cannot be encoded without an offset. */
9168 len = (regno == BP_REG || regno == R13_REG);
9170 else if (IN_RANGE (offset, -128, 127))
9171 len = 1;
9173 /* ESP and R12 must be encoded with a SIB byte. */
9174 if (regno == SP_REG || regno == R12_REG)
9175 len++;
9177 return len;
9180 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9181 The valid base registers are taken from CFUN->MACHINE->FS. */
9183 static rtx
9184 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9186 const struct machine_function *m = cfun->machine;
9187 rtx base_reg = NULL;
9188 HOST_WIDE_INT base_offset = 0;
9190 if (m->use_fast_prologue_epilogue)
9192 /* Choose the base register most likely to allow the most scheduling
9193 opportunities. Generally FP is valid throughout the function,
9194 while DRAP must be reloaded within the epilogue. But choose either
9195 over the SP due to increased encoding size. */
9197 if (m->fs.fp_valid)
9199 base_reg = hard_frame_pointer_rtx;
9200 base_offset = m->fs.fp_offset - cfa_offset;
9202 else if (m->fs.drap_valid)
9204 base_reg = crtl->drap_reg;
9205 base_offset = 0 - cfa_offset;
9207 else if (m->fs.sp_valid)
9209 base_reg = stack_pointer_rtx;
9210 base_offset = m->fs.sp_offset - cfa_offset;
9213 else
9215 HOST_WIDE_INT toffset;
9216 int len = 16, tlen;
9218 /* Choose the base register with the smallest address encoding.
9219 With a tie, choose FP > DRAP > SP. */
9220 if (m->fs.sp_valid)
9222 base_reg = stack_pointer_rtx;
9223 base_offset = m->fs.sp_offset - cfa_offset;
9224 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9226 if (m->fs.drap_valid)
9228 toffset = 0 - cfa_offset;
9229 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9230 if (tlen <= len)
9232 base_reg = crtl->drap_reg;
9233 base_offset = toffset;
9234 len = tlen;
9237 if (m->fs.fp_valid)
9239 toffset = m->fs.fp_offset - cfa_offset;
9240 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9241 if (tlen <= len)
9243 base_reg = hard_frame_pointer_rtx;
9244 base_offset = toffset;
9245 len = tlen;
9249 gcc_assert (base_reg != NULL);
9251 return plus_constant (Pmode, base_reg, base_offset);
9254 /* Emit code to save registers in the prologue. */
9256 static void
9257 ix86_emit_save_regs (void)
9259 unsigned int regno;
9260 rtx insn;
9262 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9263 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9265 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9266 RTX_FRAME_RELATED_P (insn) = 1;
9270 /* Emit a single register save at CFA - CFA_OFFSET. */
9272 static void
9273 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9274 HOST_WIDE_INT cfa_offset)
9276 struct machine_function *m = cfun->machine;
9277 rtx reg = gen_rtx_REG (mode, regno);
9278 rtx mem, addr, base, insn;
9280 addr = choose_baseaddr (cfa_offset);
9281 mem = gen_frame_mem (mode, addr);
9283 /* For SSE saves, we need to indicate the 128-bit alignment. */
9284 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9286 insn = emit_move_insn (mem, reg);
9287 RTX_FRAME_RELATED_P (insn) = 1;
9289 base = addr;
9290 if (GET_CODE (base) == PLUS)
9291 base = XEXP (base, 0);
9292 gcc_checking_assert (REG_P (base));
9294 /* When saving registers into a re-aligned local stack frame, avoid
9295 any tricky guessing by dwarf2out. */
9296 if (m->fs.realigned)
9298 gcc_checking_assert (stack_realign_drap);
9300 if (regno == REGNO (crtl->drap_reg))
9302 /* A bit of a hack. We force the DRAP register to be saved in
9303 the re-aligned stack frame, which provides us with a copy
9304 of the CFA that will last past the prologue. Install it. */
9305 gcc_checking_assert (cfun->machine->fs.fp_valid);
9306 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9307 cfun->machine->fs.fp_offset - cfa_offset);
9308 mem = gen_rtx_MEM (mode, addr);
9309 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9311 else
9313 /* The frame pointer is a stable reference within the
9314 aligned frame. Use it. */
9315 gcc_checking_assert (cfun->machine->fs.fp_valid);
9316 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9317 cfun->machine->fs.fp_offset - cfa_offset);
9318 mem = gen_rtx_MEM (mode, addr);
9319 add_reg_note (insn, REG_CFA_EXPRESSION,
9320 gen_rtx_SET (VOIDmode, mem, reg));
9324 /* The memory may not be relative to the current CFA register,
9325 which means that we may need to generate a new pattern for
9326 use by the unwind info. */
9327 else if (base != m->fs.cfa_reg)
9329 addr = plus_constant (Pmode, m->fs.cfa_reg,
9330 m->fs.cfa_offset - cfa_offset);
9331 mem = gen_rtx_MEM (mode, addr);
9332 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9336 /* Emit code to save registers using MOV insns.
9337 First register is stored at CFA - CFA_OFFSET. */
9338 static void
9339 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9341 unsigned int regno;
9343 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9344 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9346 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9347 cfa_offset -= UNITS_PER_WORD;
9351 /* Emit code to save SSE registers using MOV insns.
9352 First register is stored at CFA - CFA_OFFSET. */
9353 static void
9354 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9356 unsigned int regno;
9358 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9359 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9361 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9362 cfa_offset -= 16;
9366 static GTY(()) rtx queued_cfa_restores;
9368 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9369 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9370 Don't add the note if the previously saved value will be left untouched
9371 within the stack red zone until return, as unwinders can find the same value
9372 in the register and on the stack. */
9374 static void
9375 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9377 if (!crtl->shrink_wrapped
9378 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9379 return;
9381 if (insn)
9383 add_reg_note (insn, REG_CFA_RESTORE, reg);
9384 RTX_FRAME_RELATED_P (insn) = 1;
9386 else
9387 queued_cfa_restores
9388 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9391 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9393 static void
9394 ix86_add_queued_cfa_restore_notes (rtx insn)
9396 rtx last;
9397 if (!queued_cfa_restores)
9398 return;
9399 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9401 XEXP (last, 1) = REG_NOTES (insn);
9402 REG_NOTES (insn) = queued_cfa_restores;
9403 queued_cfa_restores = NULL_RTX;
9404 RTX_FRAME_RELATED_P (insn) = 1;
9407 /* Expand prologue or epilogue stack adjustment.
9408 The pattern exists to put a dependency on all ebp-based memory accesses.
9409 STYLE should be negative if instructions should be marked as frame related,
9410 zero if the %r11 register is live and cannot be freely used, and positive
9411 otherwise. */
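/* For instance, the prologue code below passes STYLE == -1 so that the
   adjustment becomes part of the unwind information; a positive STYLE is
   used where %r11 may be clobbered freely, and 0 where it may not.  */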
9413 static void
9414 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9415 int style, bool set_cfa)
9417 struct machine_function *m = cfun->machine;
9418 rtx insn;
9419 bool add_frame_related_expr = false;
9421 if (Pmode == SImode)
9422 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9423 else if (x86_64_immediate_operand (offset, DImode))
9424 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9425 else
9427 rtx tmp;
9428 /* r11 is used by indirect sibcall return as well, set before the
9429 epilogue and used after the epilogue. */
9430 if (style)
9431 tmp = gen_rtx_REG (DImode, R11_REG);
9432 else
9434 gcc_assert (src != hard_frame_pointer_rtx
9435 && dest != hard_frame_pointer_rtx);
9436 tmp = hard_frame_pointer_rtx;
9438 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9439 if (style < 0)
9440 add_frame_related_expr = true;
9442 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9445 insn = emit_insn (insn);
9446 if (style >= 0)
9447 ix86_add_queued_cfa_restore_notes (insn);
9449 if (set_cfa)
9451 rtx r;
9453 gcc_assert (m->fs.cfa_reg == src);
9454 m->fs.cfa_offset += INTVAL (offset);
9455 m->fs.cfa_reg = dest;
9457 r = gen_rtx_PLUS (Pmode, src, offset);
9458 r = gen_rtx_SET (VOIDmode, dest, r);
9459 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9460 RTX_FRAME_RELATED_P (insn) = 1;
9462 else if (style < 0)
9464 RTX_FRAME_RELATED_P (insn) = 1;
9465 if (add_frame_related_expr)
9467 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9468 r = gen_rtx_SET (VOIDmode, dest, r);
9469 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9473 if (dest == stack_pointer_rtx)
9475 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9476 bool valid = m->fs.sp_valid;
9478 if (src == hard_frame_pointer_rtx)
9480 valid = m->fs.fp_valid;
9481 ooffset = m->fs.fp_offset;
9483 else if (src == crtl->drap_reg)
9485 valid = m->fs.drap_valid;
9486 ooffset = 0;
9488 else
9490 /* Else there are two possibilities: SP itself, which we set
9491 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9492 taken care of by hand along the eh_return path. */
9493 gcc_checking_assert (src == stack_pointer_rtx
9494 || offset == const0_rtx);
9497 m->fs.sp_offset = ooffset - INTVAL (offset);
9498 m->fs.sp_valid = valid;
9502 /* Find an available register to be used as the dynamic realign argument
9503 pointer register. Such a register will be written in the prologue and
9504 used at the beginning of the body, so it must not be
9505 1. a parameter passing register.
9506 2. the GOT pointer.
9507 We reuse the static-chain register if it is available. Otherwise, we
9508 use DI for i386 and R10 or R13 for x86-64; R13 is chosen when the
9509 function needs a static chain or makes a tail call.
9511 Return: the regno of the chosen register. */
9513 static unsigned int
9514 find_drap_reg (void)
9516 tree decl = cfun->decl;
9518 if (TARGET_64BIT)
9520 /* Use R13 for nested functions or functions that need a static chain.
9521 Since a function with a tail call may use any caller-saved
9522 register in the epilogue, DRAP must not use a caller-saved
9523 register in such a case. */
9524 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9525 return R13_REG;
9527 return R10_REG;
9529 else
9531 /* Use DI for nested functions or functions that need a static chain.
9532 Since a function with a tail call may use any caller-saved
9533 register in the epilogue, DRAP must not use a caller-saved
9534 register in such a case. */
9535 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9536 return DI_REG;
9538 /* Reuse static chain register if it isn't used for parameter
9539 passing. */
9540 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9542 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9543 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9544 return CX_REG;
9546 return DI_REG;
9550 /* Return minimum incoming stack alignment. */
9552 static unsigned int
9553 ix86_minimum_incoming_stack_boundary (bool sibcall)
9555 unsigned int incoming_stack_boundary;
9557 /* Prefer the one specified at command line. */
9558 if (ix86_user_incoming_stack_boundary)
9559 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9560 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9561 if -mstackrealign is used, this is not a sibcall check, and the
9562 estimated stack alignment is 128 bits. */
9563 else if (!sibcall
9564 && !TARGET_64BIT
9565 && ix86_force_align_arg_pointer
9566 && crtl->stack_alignment_estimated == 128)
9567 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9568 else
9569 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9571 /* Incoming stack alignment can be changed on individual functions
9572 via force_align_arg_pointer attribute. We use the smallest
9573 incoming stack boundary. */
9574 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9575 && lookup_attribute (ix86_force_align_arg_pointer_string,
9576 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9577 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9579 /* The incoming stack frame has to be aligned at least at
9580 parm_stack_boundary. */
9581 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9582 incoming_stack_boundary = crtl->parm_stack_boundary;
9584 /* The stack at the entry point of main is aligned by the runtime. We use
9585 the smallest incoming stack boundary. */
9586 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9587 && DECL_NAME (current_function_decl)
9588 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9589 && DECL_FILE_SCOPE_P (current_function_decl))
9590 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9592 return incoming_stack_boundary;
9595 /* Update incoming stack boundary and estimated stack alignment. */
9597 static void
9598 ix86_update_stack_boundary (void)
9600 ix86_incoming_stack_boundary
9601 = ix86_minimum_incoming_stack_boundary (false);
9603 /* x86_64 varargs needs 16-byte stack alignment for the register save
9604 area. */
9605 if (TARGET_64BIT
9606 && cfun->stdarg
9607 && crtl->stack_alignment_estimated < 128)
9608 crtl->stack_alignment_estimated = 128;
9611 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9612 needed or an rtx for DRAP otherwise. */
9614 static rtx
9615 ix86_get_drap_rtx (void)
9617 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9618 crtl->need_drap = true;
9620 if (stack_realign_drap)
9622 /* Assign DRAP to vDRAP and return vDRAP. */
9623 unsigned int regno = find_drap_reg ();
9624 rtx drap_vreg;
9625 rtx arg_ptr;
9626 rtx seq, insn;
9628 arg_ptr = gen_rtx_REG (Pmode, regno);
9629 crtl->drap_reg = arg_ptr;
9631 start_sequence ();
9632 drap_vreg = copy_to_reg (arg_ptr);
9633 seq = get_insns ();
9634 end_sequence ();
9636 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9637 if (!optimize)
9639 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9640 RTX_FRAME_RELATED_P (insn) = 1;
9642 return drap_vreg;
9644 else
9645 return NULL;
9648 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9650 static rtx
9651 ix86_internal_arg_pointer (void)
9653 return virtual_incoming_args_rtx;
9656 struct scratch_reg {
9657 rtx reg;
9658 bool saved;
9661 /* Return a short-lived scratch register for use on function entry.
9662 In 32-bit mode, it is valid only after the registers are saved
9663 in the prologue. This register must be released by means of
9664 release_scratch_register_on_entry once it is dead. */
9666 static void
9667 get_scratch_register_on_entry (struct scratch_reg *sr)
9669 int regno;
9671 sr->saved = false;
9673 if (TARGET_64BIT)
9675 /* We always use R11 in 64-bit mode. */
9676 regno = R11_REG;
9678 else
9680 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9681 bool fastcall_p
9682 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9683 bool thiscall_p
9684 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9685 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9686 int regparm = ix86_function_regparm (fntype, decl);
9687 int drap_regno
9688 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9690 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9691 for the static chain register. */
9692 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9693 && drap_regno != AX_REG)
9694 regno = AX_REG;
9695 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9696 for the static chain register. */
9697 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9698 regno = AX_REG;
9699 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9700 regno = DX_REG;
9701 /* ecx is the static chain register. */
9702 else if (regparm < 3 && !fastcall_p && !thiscall_p
9703 && !static_chain_p
9704 && drap_regno != CX_REG)
9705 regno = CX_REG;
9706 else if (ix86_save_reg (BX_REG, true))
9707 regno = BX_REG;
9708 /* esi is the static chain register. */
9709 else if (!(regparm == 3 && static_chain_p)
9710 && ix86_save_reg (SI_REG, true))
9711 regno = SI_REG;
9712 else if (ix86_save_reg (DI_REG, true))
9713 regno = DI_REG;
9714 else
9716 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9717 sr->saved = true;
9721 sr->reg = gen_rtx_REG (Pmode, regno);
9722 if (sr->saved)
9724 rtx insn = emit_insn (gen_push (sr->reg));
9725 RTX_FRAME_RELATED_P (insn) = 1;
9729 /* Release a scratch register obtained from the preceding function. */
9731 static void
9732 release_scratch_register_on_entry (struct scratch_reg *sr)
9734 if (sr->saved)
9736 struct machine_function *m = cfun->machine;
9737 rtx x, insn = emit_insn (gen_pop (sr->reg));
9739 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9740 RTX_FRAME_RELATED_P (insn) = 1;
9741 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9742 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9743 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9744 m->fs.sp_offset -= UNITS_PER_WORD;
9748 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
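/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096
   bytes, i.e. one probe per page; this is an assumption about the
   default, as targets may override the exponent.  */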
9750 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9752 static void
9753 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9755 /* We skip the probe for the first interval + a small dope of 4 words and
9756 probe that many bytes past the specified size to maintain a protection
9757 area at the bottom of the stack. */
9758 const int dope = 4 * UNITS_PER_WORD;
9759 rtx size_rtx = GEN_INT (size), last;
9761 /* See if we have a constant small number of probes to generate. If so,
9762 that's the easy case. The run-time loop is made up of 11 insns in the
9763 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9764 for n # of intervals. */
9765 if (size <= 5 * PROBE_INTERVAL)
9767 HOST_WIDE_INT i, adjust;
9768 bool first_probe = true;
9770 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9771 values of N from 1 until it exceeds SIZE. If only one probe is
9772 needed, this will not generate any code. Then adjust and probe
9773 to PROBE_INTERVAL + SIZE. */
9774 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9776 if (first_probe)
9778 adjust = 2 * PROBE_INTERVAL + dope;
9779 first_probe = false;
9781 else
9782 adjust = PROBE_INTERVAL;
9784 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9785 plus_constant (Pmode, stack_pointer_rtx,
9786 -adjust)));
9787 emit_stack_probe (stack_pointer_rtx);
9790 if (first_probe)
9791 adjust = size + PROBE_INTERVAL + dope;
9792 else
9793 adjust = size + PROBE_INTERVAL - i;
9795 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9796 plus_constant (Pmode, stack_pointer_rtx,
9797 -adjust)));
9798 emit_stack_probe (stack_pointer_rtx);
9800 /* Adjust back to account for the additional first interval. */
9801 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9802 plus_constant (Pmode, stack_pointer_rtx,
9803 PROBE_INTERVAL + dope)));
9806 /* Otherwise, do the same as above, but in a loop. Note that we must be
9807 extra careful with variables wrapping around because we might be at
9808 the very top (or the very bottom) of the address space and we have
9809 to be able to handle this case properly; in particular, we use an
9810 equality test for the loop condition. */
9811 else
9813 HOST_WIDE_INT rounded_size;
9814 struct scratch_reg sr;
9816 get_scratch_register_on_entry (&sr);
9819 /* Step 1: round SIZE to the previous multiple of the interval. */
9821 rounded_size = size & -PROBE_INTERVAL;
9824 /* Step 2: compute initial and final value of the loop counter. */
9826 /* SP = SP_0 + PROBE_INTERVAL. */
9827 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9828 plus_constant (Pmode, stack_pointer_rtx,
9829 - (PROBE_INTERVAL + dope))));
9831 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9832 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9833 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9834 gen_rtx_PLUS (Pmode, sr.reg,
9835 stack_pointer_rtx)));
9838 /* Step 3: the loop
9840 while (SP != LAST_ADDR)
9842 SP = SP + PROBE_INTERVAL
9843 probe at SP
9846 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9847 values of N from 1 until it is equal to ROUNDED_SIZE. */
9849 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9852 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9853 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9855 if (size != rounded_size)
9857 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9858 plus_constant (Pmode, stack_pointer_rtx,
9859 rounded_size - size)));
9860 emit_stack_probe (stack_pointer_rtx);
9863 /* Adjust back to account for the additional first interval. */
9864 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9865 plus_constant (Pmode, stack_pointer_rtx,
9866 PROBE_INTERVAL + dope)));
9868 release_scratch_register_on_entry (&sr);
9871 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9873 /* Even if the stack pointer isn't the CFA register, we need to correctly
9874 describe the adjustments made to it, in particular differentiate the
9875 frame-related ones from the frame-unrelated ones. */
9876 if (size > 0)
9878 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9879 XVECEXP (expr, 0, 0)
9880 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9881 plus_constant (Pmode, stack_pointer_rtx, -size));
9882 XVECEXP (expr, 0, 1)
9883 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9884 plus_constant (Pmode, stack_pointer_rtx,
9885 PROBE_INTERVAL + dope + size));
9886 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9887 RTX_FRAME_RELATED_P (last) = 1;
9889 cfun->machine->fs.sp_offset += size;
9892 /* Make sure nothing is scheduled before we are done. */
9893 emit_insn (gen_blockage ());
9896 /* Adjust the stack pointer up to REG while probing it. */
9898 const char *
9899 output_adjust_stack_and_probe (rtx reg)
9901 static int labelno = 0;
9902 char loop_lab[32], end_lab[32];
9903 rtx xops[2];
9905 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9906 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9908 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9910 /* Jump to END_LAB if SP == LAST_ADDR. */
9911 xops[0] = stack_pointer_rtx;
9912 xops[1] = reg;
9913 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9914 fputs ("\tje\t", asm_out_file);
9915 assemble_name_raw (asm_out_file, end_lab);
9916 fputc ('\n', asm_out_file);
9918 /* SP = SP + PROBE_INTERVAL. */
9919 xops[1] = GEN_INT (PROBE_INTERVAL);
9920 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9922 /* Probe at SP. */
9923 xops[1] = const0_rtx;
9924 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9926 fprintf (asm_out_file, "\tjmp\t");
9927 assemble_name_raw (asm_out_file, loop_lab);
9928 fputc ('\n', asm_out_file);
9930 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9932 return "";
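/* As an illustrative sketch (assuming 32-bit AT&T output and a 4096-byte
   PROBE_INTERVAL; the exact label names and operand sizes are
   target-dependent), the loop emitted above has the shape:

       .LPSRL0:
               cmpl    %reg, %esp
               je      .LPSRE0
               subl    $4096, %esp
               orl     $0, (%esp)
               jmp     .LPSRL0
       .LPSRE0:
*/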
9935 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9936 inclusive. These are offsets from the current stack pointer. */
9938 static void
9939 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9941 /* See if we have a constant small number of probes to generate. If so,
9942 that's the easy case. The run-time loop is made up of 7 insns in the
9943 generic case while the compile-time loop is made up of n insns for n #
9944 of intervals. */
9945 if (size <= 7 * PROBE_INTERVAL)
9947 HOST_WIDE_INT i;
9949 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9950 it exceeds SIZE. If only one probe is needed, this will not
9951 generate any code. Then probe at FIRST + SIZE. */
9952 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9953 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9954 -(first + i)));
9956 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9957 -(first + size)));
9960 /* Otherwise, do the same as above, but in a loop. Note that we must be
9961 extra careful with variables wrapping around because we might be at
9962 the very top (or the very bottom) of the address space and we have
9963 to be able to handle this case properly; in particular, we use an
9964 equality test for the loop condition. */
9965 else
9967 HOST_WIDE_INT rounded_size, last;
9968 struct scratch_reg sr;
9970 get_scratch_register_on_entry (&sr);
9973 /* Step 1: round SIZE to the previous multiple of the interval. */
9975 rounded_size = size & -PROBE_INTERVAL;
9978 /* Step 2: compute initial and final value of the loop counter. */
9980 /* TEST_OFFSET = FIRST. */
9981 emit_move_insn (sr.reg, GEN_INT (-first));
9983 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9984 last = first + rounded_size;
9987 /* Step 3: the loop
9989 while (TEST_ADDR != LAST_ADDR)
9991 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9992 probe at TEST_ADDR
9995 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9996 until it is equal to ROUNDED_SIZE. */
9998 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10001 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10002 that SIZE is equal to ROUNDED_SIZE. */
10004 if (size != rounded_size)
10005 emit_stack_probe (plus_constant (Pmode,
10006 gen_rtx_PLUS (Pmode,
10007 stack_pointer_rtx,
10008 sr.reg),
10009 rounded_size - size));
10011 release_scratch_register_on_entry (&sr);
10014 /* Make sure nothing is scheduled before we are done. */
10015 emit_insn (gen_blockage ());
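/* For example (a sketch): with FIRST == STACK_CHECK_PROTECT and a constant
   SIZE of 3 * PROBE_INTERVAL, the code above emits three probes, at
   sp - (first + PROBE_INTERVAL), sp - (first + 2 * PROBE_INTERVAL) and
   sp - (first + size); only sizes above 7 * PROBE_INTERVAL use the loop
   form.  */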
10018 /* Probe a range of stack addresses from REG to END, inclusive. These are
10019 offsets from the current stack pointer. */
10021 const char *
10022 output_probe_stack_range (rtx reg, rtx end)
10024 static int labelno = 0;
10025 char loop_lab[32], end_lab[32];
10026 rtx xops[3];
10028 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10029 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10031 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10033 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10034 xops[0] = reg;
10035 xops[1] = end;
10036 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10037 fputs ("\tje\t", asm_out_file);
10038 assemble_name_raw (asm_out_file, end_lab);
10039 fputc ('\n', asm_out_file);
10041 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10042 xops[1] = GEN_INT (PROBE_INTERVAL);
10043 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10045 /* Probe at TEST_ADDR. */
10046 xops[0] = stack_pointer_rtx;
10047 xops[1] = reg;
10048 xops[2] = const0_rtx;
10049 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10051 fprintf (asm_out_file, "\tjmp\t");
10052 assemble_name_raw (asm_out_file, loop_lab);
10053 fputc ('\n', asm_out_file);
10055 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10057 return "";
10060 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10061 to be generated in correct form. */
10062 static void
10063 ix86_finalize_stack_realign_flags (void)
10065 /* Check whether stack realignment is really needed after reload, and
10066 store the result in cfun. */
10067 unsigned int incoming_stack_boundary
10068 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10069 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10070 unsigned int stack_realign = (incoming_stack_boundary
10071 < (crtl->is_leaf
10072 ? crtl->max_used_stack_slot_alignment
10073 : crtl->stack_alignment_needed));
10075 if (crtl->stack_realign_finalized)
10077 /* After stack_realign_needed is finalized, we can no longer
10078 change it. */
10079 gcc_assert (crtl->stack_realign_needed == stack_realign);
10080 return;
10083 /* If the only reason for frame_pointer_needed is that we conservatively
10084 assumed stack realignment might be needed, but in the end nothing that
10085 needed the stack alignment had been spilled, clear frame_pointer_needed
10086 and say we don't need stack realignment. */
10087 if (stack_realign
10088 && !crtl->need_drap
10089 && frame_pointer_needed
10090 && crtl->is_leaf
10091 && flag_omit_frame_pointer
10092 && crtl->sp_is_unchanging
10093 && !ix86_current_function_calls_tls_descriptor
10094 && !crtl->accesses_prior_frames
10095 && !cfun->calls_alloca
10096 && !crtl->calls_eh_return
10097 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10098 && !ix86_frame_pointer_required ()
10099 && get_frame_size () == 0
10100 && ix86_nsaved_sseregs () == 0
10101 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10103 HARD_REG_SET set_up_by_prologue, prologue_used;
10104 basic_block bb;
10106 CLEAR_HARD_REG_SET (prologue_used);
10107 CLEAR_HARD_REG_SET (set_up_by_prologue);
10108 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10109 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10110 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10111 HARD_FRAME_POINTER_REGNUM);
10112 FOR_EACH_BB (bb)
10114 rtx insn;
10115 FOR_BB_INSNS (bb, insn)
10116 if (NONDEBUG_INSN_P (insn)
10117 && requires_stack_frame_p (insn, prologue_used,
10118 set_up_by_prologue))
10120 crtl->stack_realign_needed = stack_realign;
10121 crtl->stack_realign_finalized = true;
10122 return;
10126 frame_pointer_needed = false;
10127 stack_realign = false;
10128 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10129 crtl->stack_alignment_needed = incoming_stack_boundary;
10130 crtl->stack_alignment_estimated = incoming_stack_boundary;
10131 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10132 crtl->preferred_stack_boundary = incoming_stack_boundary;
10133 df_finish_pass (true);
10134 df_scan_alloc (NULL);
10135 df_scan_blocks ();
10136 df_compute_regs_ever_live (true);
10137 df_analyze ();
10140 crtl->stack_realign_needed = stack_realign;
10141 crtl->stack_realign_finalized = true;
10144 /* Expand the prologue into a bunch of separate insns. */
10146 void
10147 ix86_expand_prologue (void)
10149 struct machine_function *m = cfun->machine;
10150 rtx insn, t;
10151 bool pic_reg_used;
10152 struct ix86_frame frame;
10153 HOST_WIDE_INT allocate;
10154 bool int_registers_saved;
10155 bool sse_registers_saved;
10157 ix86_finalize_stack_realign_flags ();
10159 /* DRAP should not coexist with stack_realign_fp */
10160 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10162 memset (&m->fs, 0, sizeof (m->fs));
10164 /* Initialize CFA state for before the prologue. */
10165 m->fs.cfa_reg = stack_pointer_rtx;
10166 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10168 /* Track SP offset to the CFA. We continue tracking this after we've
10169 swapped the CFA register away from SP. In the case of re-alignment
10170 this is fudged; we're only interested in offsets within the local frame. */
10171 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10172 m->fs.sp_valid = true;
10174 ix86_compute_frame_layout (&frame);
10176 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10178 /* We should have already generated an error for any use of
10179 ms_hook on a nested function. */
10180 gcc_checking_assert (!ix86_static_chain_on_stack);
10182 /* Check whether profiling is active and we shall use the profiling-before-
10183 prologue variant. If so, sorry. */
10184 if (crtl->profile && flag_fentry != 0)
10185 sorry ("ms_hook_prologue attribute isn%'t compatible "
10186 "with -mfentry for 32-bit");
10188 /* In ix86_asm_output_function_label we emitted:
10189 8b ff movl.s %edi,%edi
10190 55 push %ebp
10191 8b ec movl.s %esp,%ebp
10193 This matches the hookable function prologue in Win32 API
10194 functions in Microsoft Windows XP Service Pack 2 and newer.
10195 Wine uses this to enable Windows apps to hook the Win32 API
10196 functions provided by Wine.
10198 What that means is that we've already set up the frame pointer. */
10200 if (frame_pointer_needed
10201 && !(crtl->drap_reg && crtl->stack_realign_needed))
10203 rtx push, mov;
10205 /* We've decided to use the frame pointer already set up.
10206 Describe this to the unwinder by pretending that both
10207 push and mov insns happen right here.
10209 Putting the unwind info here at the end of the ms_hook
10210 is done so that we can make absolutely certain we get
10211 the required byte sequence at the start of the function,
10212 rather than relying on an assembler that can produce
10213 the exact encoding required.
10215 However it does mean (in the unpatched case) that we have
10216 a 1 insn window where the asynchronous unwind info is
10217 incorrect. However, if we placed the unwind info at
10218 its correct location we would have incorrect unwind info
10219 in the patched case. Which is probably all moot since
10220 I don't expect Wine generates dwarf2 unwind info for the
10221 system libraries that use this feature. */
10223 insn = emit_insn (gen_blockage ());
10225 push = gen_push (hard_frame_pointer_rtx);
10226 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10227 stack_pointer_rtx);
10228 RTX_FRAME_RELATED_P (push) = 1;
10229 RTX_FRAME_RELATED_P (mov) = 1;
10231 RTX_FRAME_RELATED_P (insn) = 1;
10232 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10233 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10235 /* Note that gen_push incremented m->fs.cfa_offset, even
10236 though we didn't emit the push insn here. */
10237 m->fs.cfa_reg = hard_frame_pointer_rtx;
10238 m->fs.fp_offset = m->fs.cfa_offset;
10239 m->fs.fp_valid = true;
10241 else
10243 /* The frame pointer is not needed so pop %ebp again.
10244 This leaves us with a pristine state. */
10245 emit_insn (gen_pop (hard_frame_pointer_rtx));
10249 /* The first insn of a function that accepts its static chain on the
10250 stack is to push the register that would be filled in by a direct
10251 call. This insn will be skipped by the trampoline. */
10252 else if (ix86_static_chain_on_stack)
10254 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10255 emit_insn (gen_blockage ());
10257 /* We don't want to interpret this push insn as a register save,
10258 only as a stack adjustment. The real copy of the register as
10259 a save will be done later, if needed. */
10260 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10261 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10262 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10263 RTX_FRAME_RELATED_P (insn) = 1;
10266 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10267 DRAP is needed and stack realignment is really needed after reload. */
10268 if (stack_realign_drap)
10270 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10272 /* Only need to push parameter pointer reg if it is caller saved. */
10273 if (!call_used_regs[REGNO (crtl->drap_reg)])
10275 /* Push arg pointer reg */
10276 insn = emit_insn (gen_push (crtl->drap_reg));
10277 RTX_FRAME_RELATED_P (insn) = 1;
10280 /* Grab the argument pointer. */
10281 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10282 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10283 RTX_FRAME_RELATED_P (insn) = 1;
10284 m->fs.cfa_reg = crtl->drap_reg;
10285 m->fs.cfa_offset = 0;
10287 /* Align the stack. */
10288 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10289 stack_pointer_rtx,
10290 GEN_INT (-align_bytes)));
10291 RTX_FRAME_RELATED_P (insn) = 1;
10293 /* Replicate the return address on the stack so that the return
10294 address can be reached via the (argp - 1) slot. This is needed
10295 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10296 expand_builtin_return_addr, etc. */
10297 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10298 t = gen_frame_mem (word_mode, t);
10299 insn = emit_insn (gen_push (t));
10300 RTX_FRAME_RELATED_P (insn) = 1;
10302 /* For the purposes of frame and register save area addressing,
10303 we've started over with a new frame. */
10304 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10305 m->fs.realigned = true;
10308 int_registers_saved = (frame.nregs == 0);
10309 sse_registers_saved = (frame.nsseregs == 0);
10311 if (frame_pointer_needed && !m->fs.fp_valid)
10313 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10314 slower on all targets. Also sdb doesn't like it. */
10315 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10316 RTX_FRAME_RELATED_P (insn) = 1;
10318 /* Push registers now, before setting the frame pointer
10319 on SEH target. */
10320 if (!int_registers_saved
10321 && TARGET_SEH
10322 && !frame.save_regs_using_mov)
10324 ix86_emit_save_regs ();
10325 int_registers_saved = true;
10326 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10329 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10331 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10332 RTX_FRAME_RELATED_P (insn) = 1;
10334 if (m->fs.cfa_reg == stack_pointer_rtx)
10335 m->fs.cfa_reg = hard_frame_pointer_rtx;
10336 m->fs.fp_offset = m->fs.sp_offset;
10337 m->fs.fp_valid = true;
10341 if (!int_registers_saved)
10343 /* If saving registers via PUSH, do so now. */
10344 if (!frame.save_regs_using_mov)
10346 ix86_emit_save_regs ();
10347 int_registers_saved = true;
10348 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10351 /* When using the red zone we may start register saving before allocating
10352 the stack frame, saving one cycle of the prologue. However, avoid
10353 doing this if we have to probe the stack; at least on x86_64 the
10354 stack probe can turn into a call that clobbers a red zone location. */
10355 else if (ix86_using_red_zone ()
10356 && (! TARGET_STACK_PROBE
10357 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10359 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10360 int_registers_saved = true;
10364 if (stack_realign_fp)
10366 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10367 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10369 /* The computation of the size of the re-aligned stack frame means
10370 that we must allocate the size of the register save area before
10371 performing the actual alignment. Otherwise we cannot guarantee
10372 that there's enough storage above the realignment point. */
10373 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10374 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10375 GEN_INT (m->fs.sp_offset
10376 - frame.sse_reg_save_offset),
10377 -1, false);
10379 /* Align the stack. */
10380 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10381 stack_pointer_rtx,
10382 GEN_INT (-align_bytes)));
10384 /* For the purposes of register save area addressing, the stack
10385 pointer is no longer valid. As for the value of sp_offset,
10386 see ix86_compute_frame_layout, which we need to match in order
10387 to pass verification of stack_pointer_offset at the end. */
10388 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10389 m->fs.sp_valid = false;
10392 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10394 if (flag_stack_usage_info)
10396 /* We start to count from ARG_POINTER. */
10397 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10399 /* If it was realigned, take into account the fake frame. */
10400 if (stack_realign_drap)
10402 if (ix86_static_chain_on_stack)
10403 stack_size += UNITS_PER_WORD;
10405 if (!call_used_regs[REGNO (crtl->drap_reg)])
10406 stack_size += UNITS_PER_WORD;
10408 /* This over-estimates by 1 minimal-stack-alignment-unit but
10409 mitigates that by counting in the new return address slot. */
10410 current_function_dynamic_stack_size
10411 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10414 current_function_static_stack_size = stack_size;
10417 /* On SEH target with very large frame size, allocate an area to save
10418 SSE registers (as the very large allocation won't be described). */
10419 if (TARGET_SEH
10420 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10421 && !sse_registers_saved)
10423 HOST_WIDE_INT sse_size =
10424 frame.sse_reg_save_offset - frame.reg_save_offset;
10426 gcc_assert (int_registers_saved);
10428 /* No need to do stack checking as the area will be immediately
10429 written. */
10430 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10431 GEN_INT (-sse_size), -1,
10432 m->fs.cfa_reg == stack_pointer_rtx);
10433 allocate -= sse_size;
10434 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10435 sse_registers_saved = true;
10438 /* The stack has already been decremented by the instruction calling us
10439 so probe if the size is non-negative to preserve the protection area. */
10440 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10442 /* We expect the registers to be saved when probes are used. */
10443 gcc_assert (int_registers_saved);
10445 if (STACK_CHECK_MOVING_SP)
10447 ix86_adjust_stack_and_probe (allocate);
10448 allocate = 0;
10450 else
10452 HOST_WIDE_INT size = allocate;
10454 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10455 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10457 if (TARGET_STACK_PROBE)
10458 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10459 else
10460 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10464 if (allocate == 0)
10466 else if (!ix86_target_stack_probe ()
10467 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10469 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10470 GEN_INT (-allocate), -1,
10471 m->fs.cfa_reg == stack_pointer_rtx);
10473 else
10475 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10476 rtx r10 = NULL;
10477 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10478 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10479 bool eax_live = false;
10480 bool r10_live = false;
10482 if (TARGET_64BIT)
10483 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10484 if (!TARGET_64BIT_MS_ABI)
10485 eax_live = ix86_eax_live_at_start_p ();
10487 /* Note that SEH directives need to continue tracking the stack
10488 pointer even after the frame pointer has been set up. */
10489 if (eax_live)
10491 insn = emit_insn (gen_push (eax));
10492 allocate -= UNITS_PER_WORD;
10493 if (sp_is_cfa_reg || TARGET_SEH)
10495 if (sp_is_cfa_reg)
10496 m->fs.cfa_offset += UNITS_PER_WORD;
10497 RTX_FRAME_RELATED_P (insn) = 1;
10501 if (r10_live)
10503 r10 = gen_rtx_REG (Pmode, R10_REG);
10504 insn = emit_insn (gen_push (r10));
10505 allocate -= UNITS_PER_WORD;
10506 if (sp_is_cfa_reg || TARGET_SEH)
10508 if (sp_is_cfa_reg)
10509 m->fs.cfa_offset += UNITS_PER_WORD;
10510 RTX_FRAME_RELATED_P (insn) = 1;
10514 emit_move_insn (eax, GEN_INT (allocate));
10515 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10517 /* Use the fact that AX still contains ALLOCATE. */
10518 adjust_stack_insn = (Pmode == DImode
10519 ? gen_pro_epilogue_adjust_stack_di_sub
10520 : gen_pro_epilogue_adjust_stack_si_sub);
10522 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10523 stack_pointer_rtx, eax));
10525 if (sp_is_cfa_reg || TARGET_SEH)
10527 if (sp_is_cfa_reg)
10528 m->fs.cfa_offset += allocate;
10529 RTX_FRAME_RELATED_P (insn) = 1;
10530 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10531 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10532 plus_constant (Pmode, stack_pointer_rtx,
10533 -allocate)));
10535 m->fs.sp_offset += allocate;
10537 if (r10_live && eax_live)
10539 t = choose_baseaddr (m->fs.sp_offset - allocate);
10540 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10541 gen_frame_mem (word_mode, t));
10542 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10543 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10544 gen_frame_mem (word_mode, t));
10546 else if (eax_live || r10_live)
10548 t = choose_baseaddr (m->fs.sp_offset - allocate);
10549 emit_move_insn (gen_rtx_REG (word_mode,
10550 (eax_live ? AX_REG : R10_REG)),
10551 gen_frame_mem (word_mode, t));
10554 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10556 /* If we haven't already set up the frame pointer, do so now. */
10557 if (frame_pointer_needed && !m->fs.fp_valid)
10559 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10560 GEN_INT (frame.stack_pointer_offset
10561 - frame.hard_frame_pointer_offset));
10562 insn = emit_insn (insn);
10563 RTX_FRAME_RELATED_P (insn) = 1;
10564 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10566 if (m->fs.cfa_reg == stack_pointer_rtx)
10567 m->fs.cfa_reg = hard_frame_pointer_rtx;
10568 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10569 m->fs.fp_valid = true;
10572 if (!int_registers_saved)
10573 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10574 if (!sse_registers_saved)
10575 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10577 pic_reg_used = false;
10578 if (pic_offset_table_rtx
10579 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10580 || crtl->profile))
10582 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10584 if (alt_pic_reg_used != INVALID_REGNUM)
10585 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10587 pic_reg_used = true;
10590 if (pic_reg_used)
10592 if (TARGET_64BIT)
10594 if (ix86_cmodel == CM_LARGE_PIC)
10596 rtx label, tmp_reg;
10598 gcc_assert (Pmode == DImode);
10599 label = gen_label_rtx ();
10600 emit_label (label);
10601 LABEL_PRESERVE_P (label) = 1;
10602 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10603 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10604 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10605 label));
10606 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10607 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10608 pic_offset_table_rtx, tmp_reg));
10610 else
10611 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10613 else
10615 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10616 RTX_FRAME_RELATED_P (insn) = 1;
10617 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10621 /* In the pic_reg_used case, make sure that the got load isn't deleted
10622 when mcount needs it. Blockage to avoid call movement across mcount
10623 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10624 note. */
10625 if (crtl->profile && !flag_fentry && pic_reg_used)
10626 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10628 if (crtl->drap_reg && !crtl->stack_realign_needed)
10630 /* vDRAP is set up, but after reload it turns out stack realignment
10631 isn't necessary; here we emit prologue code to set up DRAP
10632 without the stack realignment adjustment. */
10633 t = choose_baseaddr (0);
10634 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10637 /* Prevent instructions from being scheduled into register save push
10638 sequence when access to the redzone area is done through frame pointer.
10639 The offset between the frame pointer and the stack pointer is calculated
10640 relative to the value of the stack pointer at the end of the function
10641 prologue, and moving instructions that access redzone area via frame
10642 pointer inside push sequence violates this assumption. */
10643 if (frame_pointer_needed && frame.red_zone_size)
10644 emit_insn (gen_memory_blockage ());
10646 /* Emit cld instruction if stringops are used in the function. */
10647 if (TARGET_CLD && ix86_current_function_needs_cld)
10648 emit_insn (gen_cld ());
10650 /* SEH requires that the prologue end within 256 bytes of the start of
10651 the function. Prevent instruction schedules that would extend that.
10652 Further, prevent alloca modifications to the stack pointer from being
10653 combined with prologue modifications. */
10654 if (TARGET_SEH)
10655 emit_insn (gen_prologue_use (stack_pointer_rtx));
10658 /* Emit code to restore REG using a POP insn. */
10660 static void
10661 ix86_emit_restore_reg_using_pop (rtx reg)
10663 struct machine_function *m = cfun->machine;
10664 rtx insn = emit_insn (gen_pop (reg));
10666 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10667 m->fs.sp_offset -= UNITS_PER_WORD;
10669 if (m->fs.cfa_reg == crtl->drap_reg
10670 && REGNO (reg) == REGNO (crtl->drap_reg))
10672 /* Previously we'd represented the CFA as an expression
10673 like *(%ebp - 8). We've just popped that value from
10674 the stack, which means we need to reset the CFA to
10675 the drap register. This will remain until we restore
10676 the stack pointer. */
10677 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10678 RTX_FRAME_RELATED_P (insn) = 1;
10680 /* This means that the DRAP register is valid for addressing too. */
10681 m->fs.drap_valid = true;
10682 return;
10685 if (m->fs.cfa_reg == stack_pointer_rtx)
10687 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10688 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10689 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10690 RTX_FRAME_RELATED_P (insn) = 1;
10692 m->fs.cfa_offset -= UNITS_PER_WORD;
10695 /* When the frame pointer is the CFA, and we pop it, we are
10696 swapping back to the stack pointer as the CFA. This happens
10697 for stack frames that don't allocate other data, so we assume
10698 the stack pointer is now pointing at the return address, i.e.
10699 the function entry state, which makes the offset be 1 word. */
10700 if (reg == hard_frame_pointer_rtx)
10702 m->fs.fp_valid = false;
10703 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10705 m->fs.cfa_reg = stack_pointer_rtx;
10706 m->fs.cfa_offset -= UNITS_PER_WORD;
10708 add_reg_note (insn, REG_CFA_DEF_CFA,
10709 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10710 GEN_INT (m->fs.cfa_offset)));
10711 RTX_FRAME_RELATED_P (insn) = 1;
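/* [Editorial note, not part of i386.c] A minimal standalone sketch of the
   sp_offset/cfa_offset bookkeeping the function above performs for a plain
   pop while the CFA is still the stack pointer.  The type and function
   names are hypothetical; word_size stands in for UNITS_PER_WORD.  */

struct sketch_frame_state
{
  long sp_offset;   /* distance from the CFA down to the current SP */
  long cfa_offset;  /* offset used while the CFA is SP-relative */
};

static void
sketch_pop_bookkeeping (struct sketch_frame_state *fs, long word_size)
{
  /* A pop raises SP by one word, so both tracked offsets shrink; this
     mirrors the REG_CFA_ADJUST_CFA note emitted above.  */
  fs->sp_offset -= word_size;
  fs->cfa_offset -= word_size;
}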
10716 /* Emit code to restore saved registers using POP insns. */
10718 static void
10719 ix86_emit_restore_regs_using_pop (void)
10721 unsigned int regno;
10723 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10724 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10725 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10728 /* Emit code and notes for the LEAVE instruction. */
10730 static void
10731 ix86_emit_leave (void)
10733 struct machine_function *m = cfun->machine;
10734 rtx insn = emit_insn (ix86_gen_leave ());
10736 ix86_add_queued_cfa_restore_notes (insn);
10738 gcc_assert (m->fs.fp_valid);
10739 m->fs.sp_valid = true;
10740 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10741 m->fs.fp_valid = false;
10743 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10745 m->fs.cfa_reg = stack_pointer_rtx;
10746 m->fs.cfa_offset = m->fs.sp_offset;
10748 add_reg_note (insn, REG_CFA_DEF_CFA,
10749 plus_constant (Pmode, stack_pointer_rtx,
10750 m->fs.sp_offset));
10751 RTX_FRAME_RELATED_P (insn) = 1;
10753 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10754 m->fs.fp_offset);
10757 /* Emit code to restore saved registers using MOV insns.
10758 The first register is restored from CFA - CFA_OFFSET. */
10759 static void
10760 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10761 bool maybe_eh_return)
10763 struct machine_function *m = cfun->machine;
10764 unsigned int regno;
10766 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10767 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10769 rtx reg = gen_rtx_REG (word_mode, regno);
10770 rtx insn, mem;
10772 mem = choose_baseaddr (cfa_offset);
10773 mem = gen_frame_mem (word_mode, mem);
10774 insn = emit_move_insn (reg, mem);
10776 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10778 /* Previously we'd represented the CFA as an expression
10779 like *(%ebp - 8). We've just loaded that value from
10780 the stack, which means we need to reset the CFA to
10781 the drap register. This will remain until we restore
10782 the stack pointer. */
10783 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10784 RTX_FRAME_RELATED_P (insn) = 1;
10786 /* This means that the DRAP register is valid for addressing. */
10787 m->fs.drap_valid = true;
10789 else
10790 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10792 cfa_offset -= UNITS_PER_WORD;
10796 /* Emit code to restore saved SSE registers using MOV insns.
10797 The first register is restored from CFA - CFA_OFFSET. */
10798 static void
10799 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10800 bool maybe_eh_return)
10802 unsigned int regno;
10804 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10805 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10807 rtx reg = gen_rtx_REG (V4SFmode, regno);
10808 rtx mem;
10810 mem = choose_baseaddr (cfa_offset);
10811 mem = gen_rtx_MEM (V4SFmode, mem);
10812 set_mem_align (mem, 128);
10813 emit_move_insn (reg, mem);
10815 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10817 cfa_offset -= 16;
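/* [Editorial note] Worked example for the loop above: with two saved SSE
   registers and frame.sse_reg_save_offset == 48 (a purely illustrative
   value), the first V4SF slot is loaded from CFA - 48 and the second from
   CFA - 32; each slot is 16 bytes and 128-bit aligned.  */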
10821 /* Restore function stack, frame, and registers. */
10823 void
10824 ix86_expand_epilogue (int style)
10826 struct machine_function *m = cfun->machine;
10827 struct machine_frame_state frame_state_save = m->fs;
10828 struct ix86_frame frame;
10829 bool restore_regs_via_mov;
10830 bool using_drap;
10832 ix86_finalize_stack_realign_flags ();
10833 ix86_compute_frame_layout (&frame);
10835 m->fs.sp_valid = (!frame_pointer_needed
10836 || (crtl->sp_is_unchanging
10837 && !stack_realign_fp));
10838 gcc_assert (!m->fs.sp_valid
10839 || m->fs.sp_offset == frame.stack_pointer_offset);
10841 /* The FP must be valid if the frame pointer is present. */
10842 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10843 gcc_assert (!m->fs.fp_valid
10844 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10846 /* We must have *some* valid pointer to the stack frame. */
10847 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10849 /* The DRAP is never valid at this point. */
10850 gcc_assert (!m->fs.drap_valid);
10852 /* See the comment about red zone and frame
10853 pointer usage in ix86_expand_prologue. */
10854 if (frame_pointer_needed && frame.red_zone_size)
10855 emit_insn (gen_memory_blockage ());
10857 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10858 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10860 /* Determine the CFA offset of the end of the red-zone. */
10861 m->fs.red_zone_offset = 0;
10862 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10864 /* The red-zone begins below the return address. */
10865 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10867 /* When the register save area is in the aligned portion of
10868 the stack, determine the maximum runtime displacement that
10869 matches up with the aligned frame. */
10870 if (stack_realign_drap)
10871 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10872 + UNITS_PER_WORD);
10875 /* Special care must be taken for the normal return case of a function
10876 using eh_return: the eax and edx registers are marked as saved, but
10877 not restored along this path. Adjust the save location to match. */
10878 if (crtl->calls_eh_return && style != 2)
10879 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10881 /* EH_RETURN requires the use of moves to function properly. */
10882 if (crtl->calls_eh_return)
10883 restore_regs_via_mov = true;
10884 /* SEH requires the use of pops to identify the epilogue. */
10885 else if (TARGET_SEH)
10886 restore_regs_via_mov = false;
10887 /* If we're only restoring one register and sp is not valid, then
10888 use a move instruction to restore the register, since it's
10889 less work than reloading sp and popping the register. */
10890 else if (!m->fs.sp_valid && frame.nregs <= 1)
10891 restore_regs_via_mov = true;
10892 else if (TARGET_EPILOGUE_USING_MOVE
10893 && cfun->machine->use_fast_prologue_epilogue
10894 && (frame.nregs > 1
10895 || m->fs.sp_offset != frame.reg_save_offset))
10896 restore_regs_via_mov = true;
10897 else if (frame_pointer_needed
10898 && !frame.nregs
10899 && m->fs.sp_offset != frame.reg_save_offset)
10900 restore_regs_via_mov = true;
10901 else if (frame_pointer_needed
10902 && TARGET_USE_LEAVE
10903 && cfun->machine->use_fast_prologue_epilogue
10904 && frame.nregs == 1)
10905 restore_regs_via_mov = true;
10906 else
10907 restore_regs_via_mov = false;
10909 if (restore_regs_via_mov || frame.nsseregs)
10911 /* Ensure that the entire register save area is addressable via
10912 the stack pointer, if we will restore via sp. */
10913 if (TARGET_64BIT
10914 && m->fs.sp_offset > 0x7fffffff
10915 && !(m->fs.fp_valid || m->fs.drap_valid)
10916 && (frame.nsseregs + frame.nregs) != 0)
10918 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10919 GEN_INT (m->fs.sp_offset
10920 - frame.sse_reg_save_offset),
10921 style,
10922 m->fs.cfa_reg == stack_pointer_rtx);
10926 /* If there are any SSE registers to restore, then we have to do it
10927 via moves, since there's obviously no pop for SSE regs. */
10928 if (frame.nsseregs)
10929 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10930 style == 2);
10932 if (restore_regs_via_mov)
10934 rtx t;
10936 if (frame.nregs)
10937 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10939 /* eh_return epilogues need %ecx added to the stack pointer. */
10940 if (style == 2)
10942 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10944 /* Stack align doesn't work with eh_return. */
10945 gcc_assert (!stack_realign_drap);
10946 /* Neither do regparm nested functions. */
10947 gcc_assert (!ix86_static_chain_on_stack);
10949 if (frame_pointer_needed)
10951 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10952 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10953 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10955 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10956 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10958 /* Note that we use SA as a temporary CFA, as the return
10959 address is at the proper place relative to it. We
10960 pretend this happens at the FP restore insn because
10961 prior to this insn the FP would be stored at the wrong
10962 offset relative to SA, and after this insn we have no
10963 other reasonable register to use for the CFA. We don't
10964 bother resetting the CFA to the SP for the duration of
10965 the return insn. */
10966 add_reg_note (insn, REG_CFA_DEF_CFA,
10967 plus_constant (Pmode, sa, UNITS_PER_WORD));
10968 ix86_add_queued_cfa_restore_notes (insn);
10969 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10970 RTX_FRAME_RELATED_P (insn) = 1;
10972 m->fs.cfa_reg = sa;
10973 m->fs.cfa_offset = UNITS_PER_WORD;
10974 m->fs.fp_valid = false;
10976 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10977 const0_rtx, style, false);
10979 else
10981 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10982 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10983 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10984 ix86_add_queued_cfa_restore_notes (insn);
10986 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10987 if (m->fs.cfa_offset != UNITS_PER_WORD)
10989 m->fs.cfa_offset = UNITS_PER_WORD;
10990 add_reg_note (insn, REG_CFA_DEF_CFA,
10991 plus_constant (Pmode, stack_pointer_rtx,
10992 UNITS_PER_WORD));
10993 RTX_FRAME_RELATED_P (insn) = 1;
10996 m->fs.sp_offset = UNITS_PER_WORD;
10997 m->fs.sp_valid = true;
11000 else
11002 /* SEH requires that the function end with (1) a stack adjustment
11003 if necessary, (2) a sequence of pops, and (3) a return or
11004 jump instruction. Prevent insns from the function body from
11005 being scheduled into this sequence. */
11006 if (TARGET_SEH)
11008 /* Prevent a catch region from being adjacent to the standard
11009 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11010 several other flags that would be interesting to test are
11011 set up yet. */
11012 if (flag_non_call_exceptions)
11013 emit_insn (gen_nops (const1_rtx));
11014 else
11015 emit_insn (gen_blockage ());
11018 /* The first step is to deallocate the stack frame so that we can
11019 pop the registers. Also do it on SEH targets for very large
11020 frames, as the emitted instructions aren't allowed by the ABI in
11021 epilogues. */
11022 if (!m->fs.sp_valid
11023 || (TARGET_SEH
11024 && (m->fs.sp_offset - frame.reg_save_offset
11025 >= SEH_MAX_FRAME_SIZE)))
11027 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11028 GEN_INT (m->fs.fp_offset
11029 - frame.reg_save_offset),
11030 style, false);
11032 else if (m->fs.sp_offset != frame.reg_save_offset)
11034 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11035 GEN_INT (m->fs.sp_offset
11036 - frame.reg_save_offset),
11037 style,
11038 m->fs.cfa_reg == stack_pointer_rtx);
11041 ix86_emit_restore_regs_using_pop ();
11044 /* If we used a frame pointer and haven't already got rid of it,
11045 then do so now. */
11046 if (m->fs.fp_valid)
11048 /* If the stack pointer is valid and pointing at the frame
11049 pointer store address, then we only need a pop. */
11050 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11051 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11052 /* Using leave results in shorter dependency chains on CPUs that are
11053 able to grok it fast. */
11054 else if (TARGET_USE_LEAVE
11055 || optimize_function_for_size_p (cfun)
11056 || !cfun->machine->use_fast_prologue_epilogue)
11057 ix86_emit_leave ();
11058 else
11060 pro_epilogue_adjust_stack (stack_pointer_rtx,
11061 hard_frame_pointer_rtx,
11062 const0_rtx, style, !using_drap);
11063 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11067 if (using_drap)
11069 int param_ptr_offset = UNITS_PER_WORD;
11070 rtx insn;
11072 gcc_assert (stack_realign_drap);
11074 if (ix86_static_chain_on_stack)
11075 param_ptr_offset += UNITS_PER_WORD;
11076 if (!call_used_regs[REGNO (crtl->drap_reg)])
11077 param_ptr_offset += UNITS_PER_WORD;
11079 insn = emit_insn (gen_rtx_SET
11080 (VOIDmode, stack_pointer_rtx,
11081 gen_rtx_PLUS (Pmode,
11082 crtl->drap_reg,
11083 GEN_INT (-param_ptr_offset))));
11084 m->fs.cfa_reg = stack_pointer_rtx;
11085 m->fs.cfa_offset = param_ptr_offset;
11086 m->fs.sp_offset = param_ptr_offset;
11087 m->fs.realigned = false;
11089 add_reg_note (insn, REG_CFA_DEF_CFA,
11090 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11091 GEN_INT (param_ptr_offset)));
11092 RTX_FRAME_RELATED_P (insn) = 1;
11094 if (!call_used_regs[REGNO (crtl->drap_reg)])
11095 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11098 /* At this point the stack pointer must be valid, and we must have
11099 restored all of the registers. We may not have deallocated the
11100 entire stack frame. We've delayed this until now because it may
11101 be possible to merge the local stack deallocation with the
11102 deallocation forced by ix86_static_chain_on_stack. */
11103 gcc_assert (m->fs.sp_valid);
11104 gcc_assert (!m->fs.fp_valid);
11105 gcc_assert (!m->fs.realigned);
11106 if (m->fs.sp_offset != UNITS_PER_WORD)
11108 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11109 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11110 style, true);
11112 else
11113 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11115 /* Sibcall epilogues don't want a return instruction. */
11116 if (style == 0)
11118 m->fs = frame_state_save;
11119 return;
11122 if (crtl->args.pops_args && crtl->args.size)
11124 rtx popc = GEN_INT (crtl->args.pops_args);
11126 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11127 address, do an explicit add, and jump indirectly to the caller. */
11129 if (crtl->args.pops_args >= 65536)
11131 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11132 rtx insn;
11134 /* There is no "pascal" calling convention in any 64bit ABI. */
11135 gcc_assert (!TARGET_64BIT);
11137 insn = emit_insn (gen_pop (ecx));
11138 m->fs.cfa_offset -= UNITS_PER_WORD;
11139 m->fs.sp_offset -= UNITS_PER_WORD;
11141 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11142 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11143 add_reg_note (insn, REG_CFA_REGISTER,
11144 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11145 RTX_FRAME_RELATED_P (insn) = 1;
11147 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11148 popc, -1, true);
11149 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11151 else
11152 emit_jump_insn (gen_simple_return_pop_internal (popc));
11154 else
11155 emit_jump_insn (gen_simple_return_internal ());
11157 /* Restore the state back to the state from the prologue,
11158 so that it's correct for the next epilogue. */
11159 m->fs = frame_state_save;
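/* [Editorial note, not part of i386.c] Hedged examples of the common shapes
   the expansion above produces.  With a frame pointer and TARGET_USE_LEAVE
   the tail is typically
       (register restores)   leave   ret
   while a function without a frame pointer typically ends with
       add $frame_size, %rsp   pop <saved regs>...   ret
   The exact choice follows the restore_regs_via_mov heuristics above; the
   mnemonics are illustrative, not generated output captured from this
   file.  */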
11162 /* Reset from the function's potential modifications. */
11164 static void
11165 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11166 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11168 if (pic_offset_table_rtx)
11169 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11170 #if TARGET_MACHO
11171 /* Mach-O doesn't support labels at the end of objects, so if
11172 it looks like we might want one, insert a NOP. */
11174 rtx insn = get_last_insn ();
11175 rtx deleted_debug_label = NULL_RTX;
11176 while (insn
11177 && NOTE_P (insn)
11178 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11180 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes, don't insert a nop;
11181 instead set their CODE_LABEL_NUMBER to -1, since
11182 otherwise there would be code generation differences
11183 between -g and -g0. */
11184 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11185 deleted_debug_label = insn;
11186 insn = PREV_INSN (insn);
11188 if (insn
11189 && (LABEL_P (insn)
11190 || (NOTE_P (insn)
11191 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11192 fputs ("\tnop\n", file);
11193 else if (deleted_debug_label)
11194 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11195 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11196 CODE_LABEL_NUMBER (insn) = -1;
11198 #endif
11202 /* Return a scratch register to use in the split stack prologue. The
11203 split stack prologue is used for -fsplit-stack. It consists of the first
11204 instructions in the function, even before the regular prologue.
11205 The scratch register can be any caller-saved register which is not
11206 used for parameters or for the static chain. */
11208 static unsigned int
11209 split_stack_prologue_scratch_regno (void)
11211 if (TARGET_64BIT)
11212 return R11_REG;
11213 else
11215 bool is_fastcall, is_thiscall;
11216 int regparm;
11218 is_fastcall = (lookup_attribute ("fastcall",
11219 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11220 != NULL);
11221 is_thiscall = (lookup_attribute ("thiscall",
11222 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11223 != NULL);
11224 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11226 if (is_fastcall)
11228 if (DECL_STATIC_CHAIN (cfun->decl))
11230 sorry ("-fsplit-stack does not support fastcall with "
11231 "a nested function");
11232 return INVALID_REGNUM;
11234 return AX_REG;
11236 else if (is_thiscall)
11238 if (!DECL_STATIC_CHAIN (cfun->decl))
11239 return DX_REG;
11240 return AX_REG;
11242 else if (regparm < 3)
11244 if (!DECL_STATIC_CHAIN (cfun->decl))
11245 return CX_REG;
11246 else
11248 if (regparm >= 2)
11250 sorry ("-fsplit-stack does not support 2 register "
11251 "parameters for a nested function");
11252 return INVALID_REGNUM;
11254 return DX_REG;
11257 else
11259 /* FIXME: We could make this work by pushing a register
11260 around the addition and comparison. */
11261 sorry ("-fsplit-stack does not support 3 register parameters");
11262 return INVALID_REGNUM;
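/* [Editorial note] Summary of the selection above, derived from the code
   rather than an external table: 64-bit always uses %r11; 32-bit fastcall
   uses %eax (rejected with a static chain); thiscall uses %edx, or %eax
   when a static chain is present; regparm < 3 uses %ecx, or %edx when a
   static chain occupies %ecx and fewer than two register parameters are
   used; the remaining cases are rejected with sorry ().  */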
11267 /* A SYMBOL_REF for the function which allocates new stackspace for
11268 -fsplit-stack. */
11270 static GTY(()) rtx split_stack_fn;
11272 /* A SYMBOL_REF for the more stack function when using the large
11273 model. */
11275 static GTY(()) rtx split_stack_fn_large;
11277 /* Handle -fsplit-stack. These are the first instructions in the
11278 function, even before the regular prologue. */
11280 void
11281 ix86_expand_split_stack_prologue (void)
11283 struct ix86_frame frame;
11284 HOST_WIDE_INT allocate;
11285 unsigned HOST_WIDE_INT args_size;
11286 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11287 rtx scratch_reg = NULL_RTX;
11288 rtx varargs_label = NULL_RTX;
11289 rtx fn;
11291 gcc_assert (flag_split_stack && reload_completed);
11293 ix86_finalize_stack_realign_flags ();
11294 ix86_compute_frame_layout (&frame);
11295 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11297 /* This is the label we will branch to if we have enough stack
11298 space. We expect the basic block reordering pass to reverse this
11299 branch if optimizing, so that we branch in the unlikely case. */
11300 label = gen_label_rtx ();
11302 /* We need to compare the stack pointer minus the frame size with
11303 the stack boundary in the TCB. The stack boundary always gives
11304 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11305 can compare directly. Otherwise we need to do an addition. */
11307 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11308 UNSPEC_STACK_CHECK);
11309 limit = gen_rtx_CONST (Pmode, limit);
11310 limit = gen_rtx_MEM (Pmode, limit);
11311 if (allocate < SPLIT_STACK_AVAILABLE)
11312 current = stack_pointer_rtx;
11313 else
11315 unsigned int scratch_regno;
11316 rtx offset;
11318 /* We need a scratch register to hold the stack pointer minus
11319 the required frame size. Since this is the very start of the
11320 function, the scratch register can be any caller-saved
11321 register which is not used for parameters. */
11322 offset = GEN_INT (- allocate);
11323 scratch_regno = split_stack_prologue_scratch_regno ();
11324 if (scratch_regno == INVALID_REGNUM)
11325 return;
11326 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11327 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11329 /* We don't use ix86_gen_add3 in this case because it will
11330 want to split to lea, but when not optimizing the insn
11331 will not be split after this point. */
11332 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11333 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11334 offset)));
11336 else
11338 emit_move_insn (scratch_reg, offset);
11339 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11340 stack_pointer_rtx));
11342 current = scratch_reg;
11345 ix86_expand_branch (GEU, current, limit, label);
11346 jump_insn = get_last_insn ();
11347 JUMP_LABEL (jump_insn) = label;
11349 /* Mark the jump as very likely to be taken. */
11350 add_reg_note (jump_insn, REG_BR_PROB,
11351 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11353 if (split_stack_fn == NULL_RTX)
11354 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11355 fn = split_stack_fn;
11357 /* Get more stack space. We pass in the desired stack space and the
11358 size of the arguments to copy to the new stack. In 32-bit mode
11359 we push the parameters; __morestack will return on a new stack
11360 anyhow. In 64-bit mode we pass the parameters in r10 and
11361 r11. */
11362 allocate_rtx = GEN_INT (allocate);
11363 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11364 call_fusage = NULL_RTX;
11365 if (TARGET_64BIT)
11367 rtx reg10, reg11;
11369 reg10 = gen_rtx_REG (Pmode, R10_REG);
11370 reg11 = gen_rtx_REG (Pmode, R11_REG);
11372 /* If this function uses a static chain, it will be in %r10.
11373 Preserve it across the call to __morestack. */
11374 if (DECL_STATIC_CHAIN (cfun->decl))
11376 rtx rax;
11378 rax = gen_rtx_REG (word_mode, AX_REG);
11379 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11380 use_reg (&call_fusage, rax);
11383 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11385 HOST_WIDE_INT argval;
11387 gcc_assert (Pmode == DImode);
11388 /* When using the large model we need to load the address
11389 into a register, and we've run out of registers. So we
11390 switch to a different calling convention, and we call a
11391 different function: __morestack_large. We pass the
11392 argument size in the upper 32 bits of r10 and pass the
11393 frame size in the lower 32 bits. */
11394 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11395 gcc_assert ((args_size & 0xffffffff) == args_size);
11397 if (split_stack_fn_large == NULL_RTX)
11398 split_stack_fn_large =
11399 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11401 if (ix86_cmodel == CM_LARGE_PIC)
11403 rtx label, x;
11405 label = gen_label_rtx ();
11406 emit_label (label);
11407 LABEL_PRESERVE_P (label) = 1;
11408 emit_insn (gen_set_rip_rex64 (reg10, label));
11409 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11410 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11411 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11412 UNSPEC_GOT);
11413 x = gen_rtx_CONST (Pmode, x);
11414 emit_move_insn (reg11, x);
11415 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11416 x = gen_const_mem (Pmode, x);
11417 emit_move_insn (reg11, x);
11419 else
11420 emit_move_insn (reg11, split_stack_fn_large);
11422 fn = reg11;
11424 argval = ((args_size << 16) << 16) + allocate;
11425 emit_move_insn (reg10, GEN_INT (argval));
11427 else
11429 emit_move_insn (reg10, allocate_rtx);
11430 emit_move_insn (reg11, GEN_INT (args_size));
11431 use_reg (&call_fusage, reg11);
11434 use_reg (&call_fusage, reg10);
11436 else
11438 emit_insn (gen_push (GEN_INT (args_size)));
11439 emit_insn (gen_push (allocate_rtx));
11441 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11442 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11443 NULL_RTX, false);
11444 add_function_usage_to (call_insn, call_fusage);
11446 /* In order to make call/return prediction work right, we now need
11447 to execute a return instruction. See
11448 libgcc/config/i386/morestack.S for the details on how this works.
11450 For flow purposes gcc must not see this as a return
11451 instruction--we need control flow to continue at the subsequent
11452 label. Therefore, we use an unspec. */
11453 gcc_assert (crtl->args.pops_args < 65536);
11454 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11456 /* If we are in 64-bit mode and this function uses a static chain,
11457 we saved %r10 in %rax before calling __morestack. */
11458 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11459 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11460 gen_rtx_REG (word_mode, AX_REG));
11462 /* If this function calls va_start, we need to store a pointer to
11463 the arguments on the old stack, because they may not have been
11464 all copied to the new stack. At this point the old stack can be
11465 found at the frame pointer value used by __morestack, because
11466 __morestack has set that up before calling back to us. Here we
11467 store that pointer in a scratch register, and in
11468 ix86_expand_prologue we store the scratch register in a stack
11469 slot. */
11470 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11472 unsigned int scratch_regno;
11473 rtx frame_reg;
11474 int words;
11476 scratch_regno = split_stack_prologue_scratch_regno ();
11477 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11478 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11480 /* 64-bit:
11481 fp -> old fp value
11482 return address within this function
11483 return address of caller of this function
11484 stack arguments
11485 So we add three words to get to the stack arguments.
11487 32-bit:
11488 fp -> old fp value
11489 return address within this function
11490 first argument to __morestack
11491 second argument to __morestack
11492 return address of caller of this function
11493 stack arguments
11494 So we add five words to get to the stack arguments. */
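/* [Editorial note] Worked example: with UNITS_PER_WORD == 8 in 64-bit mode
   the scratch register becomes fp + 3*8 = fp + 24; with UNITS_PER_WORD == 4
   in 32-bit mode it becomes fp + 5*4 = fp + 20.  */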
11496 words = TARGET_64BIT ? 3 : 5;
11497 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11498 gen_rtx_PLUS (Pmode, frame_reg,
11499 GEN_INT (words * UNITS_PER_WORD))));
11501 varargs_label = gen_label_rtx ();
11502 emit_jump_insn (gen_jump (varargs_label));
11503 JUMP_LABEL (get_last_insn ()) = varargs_label;
11505 emit_barrier ();
11508 emit_label (label);
11509 LABEL_NUSES (label) = 1;
11511 /* If this function calls va_start, we now have to set the scratch
11512 register for the case where we do not call __morestack. In this
11513 case we need to set it based on the stack pointer. */
11514 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11516 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11517 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11518 GEN_INT (UNITS_PER_WORD))));
11520 emit_label (varargs_label);
11521 LABEL_NUSES (varargs_label) = 1;
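/* [Editorial note, not part of i386.c] A hedged sketch of the 64-bit
   small-model code emitted above; the TCB slot, label names and exact
   scheduling are illustrative (the real limit address comes from the
   UNSPEC_STACK_CHECK constant):

       lea    -FRAME(%rsp), %r11     # only when FRAME >= SPLIT_STACK_AVAILABLE
       cmp    %fs:<guard>, %r11      # otherwise %rsp is compared directly
       jae    .Lenough
       mov    $FRAME, %r10
       mov    $ARGS, %r11
       call   __morestack
       ret                           # split_stack_return; execution continues
   .Lenough:
       ...regular prologue and body...  */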
11525 /* We may have to tell the dataflow pass that the split stack prologue
11526 is initializing a scratch register. */
11528 static void
11529 ix86_live_on_entry (bitmap regs)
11531 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11533 gcc_assert (flag_split_stack);
11534 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11538 /* Determine if OP is a suitable SUBREG RTX for an address. */
11540 static bool
11541 ix86_address_subreg_operand (rtx op)
11543 enum machine_mode mode;
11545 if (!REG_P (op))
11546 return false;
11548 mode = GET_MODE (op);
11550 if (GET_MODE_CLASS (mode) != MODE_INT)
11551 return false;
11553 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11554 failures when the register is one word out of a two word structure. */
11555 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11556 return false;
11558 /* Allow only SUBREGs of non-eliminable hard registers. */
11559 return register_no_elim_operand (op, mode);
11562 /* Extract the parts of an RTL expression that is a valid memory address
11563 for an instruction. Return 0 if the structure of the address is
11564 grossly off. Return -1 if the address contains ASHIFT, so it is not
11565 strictly valid, but is still used for computing the length of a lea instruction. */
11568 ix86_decompose_address (rtx addr, struct ix86_address *out)
11570 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11571 rtx base_reg, index_reg;
11572 HOST_WIDE_INT scale = 1;
11573 rtx scale_rtx = NULL_RTX;
11574 rtx tmp;
11575 int retval = 1;
11576 enum ix86_address_seg seg = SEG_DEFAULT;
11578 /* Allow zero-extended SImode addresses,
11579 they will be emitted with addr32 prefix. */
11580 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11582 if (GET_CODE (addr) == ZERO_EXTEND
11583 && GET_MODE (XEXP (addr, 0)) == SImode)
11585 addr = XEXP (addr, 0);
11586 if (CONST_INT_P (addr))
11587 return 0;
11589 else if (GET_CODE (addr) == AND
11590 && const_32bit_mask (XEXP (addr, 1), DImode))
11592 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11593 if (addr == NULL_RTX)
11594 return 0;
11596 if (CONST_INT_P (addr))
11597 return 0;
11601 /* Allow SImode subregs of DImode addresses,
11602 they will be emitted with addr32 prefix. */
11603 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11605 if (GET_CODE (addr) == SUBREG
11606 && GET_MODE (SUBREG_REG (addr)) == DImode)
11608 addr = SUBREG_REG (addr);
11609 if (CONST_INT_P (addr))
11610 return 0;
11614 if (REG_P (addr))
11615 base = addr;
11616 else if (GET_CODE (addr) == SUBREG)
11618 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11619 base = addr;
11620 else
11621 return 0;
11623 else if (GET_CODE (addr) == PLUS)
11625 rtx addends[4], op;
11626 int n = 0, i;
11628 op = addr;
11631 if (n >= 4)
11632 return 0;
11633 addends[n++] = XEXP (op, 1);
11634 op = XEXP (op, 0);
11636 while (GET_CODE (op) == PLUS);
11637 if (n >= 4)
11638 return 0;
11639 addends[n] = op;
11641 for (i = n; i >= 0; --i)
11643 op = addends[i];
11644 switch (GET_CODE (op))
11646 case MULT:
11647 if (index)
11648 return 0;
11649 index = XEXP (op, 0);
11650 scale_rtx = XEXP (op, 1);
11651 break;
11653 case ASHIFT:
11654 if (index)
11655 return 0;
11656 index = XEXP (op, 0);
11657 tmp = XEXP (op, 1);
11658 if (!CONST_INT_P (tmp))
11659 return 0;
11660 scale = INTVAL (tmp);
11661 if ((unsigned HOST_WIDE_INT) scale > 3)
11662 return 0;
11663 scale = 1 << scale;
11664 break;
11666 case ZERO_EXTEND:
11667 op = XEXP (op, 0);
11668 if (GET_CODE (op) != UNSPEC)
11669 return 0;
11670 /* FALLTHRU */
11672 case UNSPEC:
11673 if (XINT (op, 1) == UNSPEC_TP
11674 && TARGET_TLS_DIRECT_SEG_REFS
11675 && seg == SEG_DEFAULT)
11676 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11677 else
11678 return 0;
11679 break;
11681 case SUBREG:
11682 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11683 return 0;
11684 /* FALLTHRU */
11686 case REG:
11687 if (!base)
11688 base = op;
11689 else if (!index)
11690 index = op;
11691 else
11692 return 0;
11693 break;
11695 case CONST:
11696 case CONST_INT:
11697 case SYMBOL_REF:
11698 case LABEL_REF:
11699 if (disp)
11700 return 0;
11701 disp = op;
11702 break;
11704 default:
11705 return 0;
11709 else if (GET_CODE (addr) == MULT)
11711 index = XEXP (addr, 0); /* index*scale */
11712 scale_rtx = XEXP (addr, 1);
11714 else if (GET_CODE (addr) == ASHIFT)
11716 /* We're called for lea too, which implements ashift on occasion. */
11717 index = XEXP (addr, 0);
11718 tmp = XEXP (addr, 1);
11719 if (!CONST_INT_P (tmp))
11720 return 0;
11721 scale = INTVAL (tmp);
11722 if ((unsigned HOST_WIDE_INT) scale > 3)
11723 return 0;
11724 scale = 1 << scale;
11725 retval = -1;
11727 else if (CONST_INT_P (addr))
11729 if (!x86_64_immediate_operand (addr, VOIDmode))
11730 return 0;
11732 /* Constant addresses are sign-extended to 64 bits; we have to
11733 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11734 if (TARGET_X32
11735 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11736 return 0;
11738 disp = addr;
11740 else
11741 disp = addr; /* displacement */
11743 if (index)
11745 if (REG_P (index))
11747 else if (GET_CODE (index) == SUBREG
11748 && ix86_address_subreg_operand (SUBREG_REG (index)))
11750 else
11751 return 0;
11754 /* Address override works only on the (%reg) part of %fs:(%reg). */
11755 if (seg != SEG_DEFAULT
11756 && ((base && GET_MODE (base) != word_mode)
11757 || (index && GET_MODE (index) != word_mode)))
11758 return 0;
11760 /* Extract the integral value of scale. */
11761 if (scale_rtx)
11763 if (!CONST_INT_P (scale_rtx))
11764 return 0;
11765 scale = INTVAL (scale_rtx);
11768 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11769 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11771 /* Avoid useless 0 displacement. */
11772 if (disp == const0_rtx && (base || index))
11773 disp = NULL_RTX;
11775 /* Allow the arg pointer and stack pointer as an index if there is no scaling. */
11776 if (base_reg && index_reg && scale == 1
11777 && (index_reg == arg_pointer_rtx
11778 || index_reg == frame_pointer_rtx
11779 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11781 rtx tmp;
11782 tmp = base, base = index, index = tmp;
11783 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11786 /* Special case: %ebp cannot be encoded as a base without a displacement.
11787 Similarly %r13. */
11788 if (!disp
11789 && base_reg
11790 && (base_reg == hard_frame_pointer_rtx
11791 || base_reg == frame_pointer_rtx
11792 || base_reg == arg_pointer_rtx
11793 || (REG_P (base_reg)
11794 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11795 || REGNO (base_reg) == R13_REG))))
11796 disp = const0_rtx;
11798 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11799 Avoid this by transforming it to [%esi+0].
11800 Reload calls address legitimization without cfun defined, so we need
11801 to test cfun for being non-NULL. */
11802 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11803 && base_reg && !index_reg && !disp
11804 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11805 disp = const0_rtx;
11807 /* Special case: encode reg+reg instead of reg*2. */
11808 if (!base && index && scale == 2)
11809 base = index, base_reg = index_reg, scale = 1;
11811 /* Special case: scaling cannot be encoded without base or displacement. */
11812 if (!base && !disp && index && scale != 1)
11813 disp = const0_rtx;
11815 out->base = base;
11816 out->index = index;
11817 out->disp = disp;
11818 out->scale = scale;
11819 out->seg = seg;
11821 return retval;
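/* [Editorial note, not part of i386.c] A standalone illustration of the
   decomposition performed above.  For the address %rbx + %rcx*4 + 12, i.e.
   RTL (plus (plus (mult (reg rcx) (const_int 4)) (reg rbx)) (const_int 12)),
   the extracted parts are base = %rbx, index = %rcx, scale = 4, disp = 12
   and seg = SEG_DEFAULT.  The struct below is a hypothetical stand-in for
   struct ix86_address, using plain strings for the registers.  */

struct sketch_decomposed_address
{
  const char *base, *index;
  long scale, disp;
};

static const struct sketch_decomposed_address sketch_sib_example =
  { "rbx", "rcx", 4, 12 };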
11824 /* Return the cost of the memory address X.
11825 For i386, it is better to use a complex address than let gcc copy
11826 the address into a reg and make a new pseudo. But not if the address
11827 requires two regs - that would mean more pseudos with longer
11828 lifetimes. */
11829 static int
11830 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11831 addr_space_t as ATTRIBUTE_UNUSED,
11832 bool speed ATTRIBUTE_UNUSED)
11834 struct ix86_address parts;
11835 int cost = 1;
11836 int ok = ix86_decompose_address (x, &parts);
11838 gcc_assert (ok);
11840 if (parts.base && GET_CODE (parts.base) == SUBREG)
11841 parts.base = SUBREG_REG (parts.base);
11842 if (parts.index && GET_CODE (parts.index) == SUBREG)
11843 parts.index = SUBREG_REG (parts.index);
11845 /* Attempt to minimize number of registers in the address. */
11846 if ((parts.base
11847 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11848 || (parts.index
11849 && (!REG_P (parts.index)
11850 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11851 cost++;
11853 if (parts.base
11854 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11855 && parts.index
11856 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11857 && parts.base != parts.index)
11858 cost++;
11860 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11861 since its predecode logic can't detect the length of instructions
11862 and it degenerates to vector decoding. Increase the cost of such
11863 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11864 to split such addresses or even refuse such addresses at all.
11866 The following addressing modes are affected:
11867 [base+scale*index]
11868 [scale*index+disp]
11869 [base+index]
11871 The first and last cases may be avoidable by explicitly coding the zero in
11872 the memory address, but I don't have an AMD-K6 machine handy to check this
11873 theory. */
11875 if (TARGET_K6
11876 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11877 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11878 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11879 cost += 10;
11881 return cost;
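/* [Editorial note] Worked examples for the cost computed above (starting
   from 1): an address built only from hard registers, e.g. 4(%ebx,%ecx,2),
   costs 1; one using a single pseudo, e.g. 8(pseudo_59), costs 2; one using
   two distinct pseudos as base and index costs 3; and on K6 the affected
   forms listed in the comment above add another 10.  Register names and
   pseudo numbers are illustrative.  */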
11884 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11885 this is used to form addresses to local data when -fPIC is in
11886 use. */
11888 static bool
11889 darwin_local_data_pic (rtx disp)
11891 return (GET_CODE (disp) == UNSPEC
11892 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11895 /* Determine if a given RTX is a valid constant. We already know this
11896 satisfies CONSTANT_P. */
11898 static bool
11899 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11901 switch (GET_CODE (x))
11903 case CONST:
11904 x = XEXP (x, 0);
11906 if (GET_CODE (x) == PLUS)
11908 if (!CONST_INT_P (XEXP (x, 1)))
11909 return false;
11910 x = XEXP (x, 0);
11913 if (TARGET_MACHO && darwin_local_data_pic (x))
11914 return true;
11916 /* Only some unspecs are valid as "constants". */
11917 if (GET_CODE (x) == UNSPEC)
11918 switch (XINT (x, 1))
11920 case UNSPEC_GOT:
11921 case UNSPEC_GOTOFF:
11922 case UNSPEC_PLTOFF:
11923 return TARGET_64BIT;
11924 case UNSPEC_TPOFF:
11925 case UNSPEC_NTPOFF:
11926 x = XVECEXP (x, 0, 0);
11927 return (GET_CODE (x) == SYMBOL_REF
11928 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11929 case UNSPEC_DTPOFF:
11930 x = XVECEXP (x, 0, 0);
11931 return (GET_CODE (x) == SYMBOL_REF
11932 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11933 default:
11934 return false;
11937 /* We must have drilled down to a symbol. */
11938 if (GET_CODE (x) == LABEL_REF)
11939 return true;
11940 if (GET_CODE (x) != SYMBOL_REF)
11941 return false;
11942 /* FALLTHRU */
11944 case SYMBOL_REF:
11945 /* TLS symbols are never valid. */
11946 if (SYMBOL_REF_TLS_MODEL (x))
11947 return false;
11949 /* DLLIMPORT symbols are never valid. */
11950 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11951 && SYMBOL_REF_DLLIMPORT_P (x))
11952 return false;
11954 #if TARGET_MACHO
11955 /* mdynamic-no-pic */
11956 if (MACHO_DYNAMIC_NO_PIC_P)
11957 return machopic_symbol_defined_p (x);
11958 #endif
11959 break;
11961 case CONST_DOUBLE:
11962 if (GET_MODE (x) == TImode
11963 && x != CONST0_RTX (TImode)
11964 && !TARGET_64BIT)
11965 return false;
11966 break;
11968 case CONST_VECTOR:
11969 if (!standard_sse_constant_p (x))
11970 return false;
11972 default:
11973 break;
11976 /* Otherwise we handle everything else in the move patterns. */
11977 return true;
11980 /* Determine if it's legal to put X into the constant pool. This
11981 is not possible for the address of thread-local symbols, which
11982 is checked above. */
11984 static bool
11985 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11987 /* We can always put integral constants and vectors in memory. */
11988 switch (GET_CODE (x))
11990 case CONST_INT:
11991 case CONST_DOUBLE:
11992 case CONST_VECTOR:
11993 return false;
11995 default:
11996 break;
11998 return !ix86_legitimate_constant_p (mode, x);
12002 /* Nonzero if the constant value X is a legitimate general operand
12003 when generating PIC code. It is given that flag_pic is on and
12004 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12006 bool
12007 legitimate_pic_operand_p (rtx x)
12009 rtx inner;
12011 switch (GET_CODE (x))
12013 case CONST:
12014 inner = XEXP (x, 0);
12015 if (GET_CODE (inner) == PLUS
12016 && CONST_INT_P (XEXP (inner, 1)))
12017 inner = XEXP (inner, 0);
12019 /* Only some unspecs are valid as "constants". */
12020 if (GET_CODE (inner) == UNSPEC)
12021 switch (XINT (inner, 1))
12023 case UNSPEC_GOT:
12024 case UNSPEC_GOTOFF:
12025 case UNSPEC_PLTOFF:
12026 return TARGET_64BIT;
12027 case UNSPEC_TPOFF:
12028 x = XVECEXP (inner, 0, 0);
12029 return (GET_CODE (x) == SYMBOL_REF
12030 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12031 case UNSPEC_MACHOPIC_OFFSET:
12032 return legitimate_pic_address_disp_p (x);
12033 default:
12034 return false;
12036 /* FALLTHRU */
12038 case SYMBOL_REF:
12039 case LABEL_REF:
12040 return legitimate_pic_address_disp_p (x);
12042 default:
12043 return true;
12047 /* Determine if a given CONST RTX is a valid memory displacement
12048 in PIC mode. */
12050 bool
12051 legitimate_pic_address_disp_p (rtx disp)
12053 bool saw_plus;
12055 /* In 64bit mode we can allow direct addresses of symbols and labels
12056 when they are not dynamic symbols. */
12057 if (TARGET_64BIT)
12059 rtx op0 = disp, op1;
12061 switch (GET_CODE (disp))
12063 case LABEL_REF:
12064 return true;
12066 case CONST:
12067 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12068 break;
12069 op0 = XEXP (XEXP (disp, 0), 0);
12070 op1 = XEXP (XEXP (disp, 0), 1);
12071 if (!CONST_INT_P (op1)
12072 || INTVAL (op1) >= 16*1024*1024
12073 || INTVAL (op1) < -16*1024*1024)
12074 break;
12075 if (GET_CODE (op0) == LABEL_REF)
12076 return true;
12077 if (GET_CODE (op0) == CONST
12078 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12079 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12080 return true;
12081 if (GET_CODE (op0) == UNSPEC
12082 && XINT (op0, 1) == UNSPEC_PCREL)
12083 return true;
12084 if (GET_CODE (op0) != SYMBOL_REF)
12085 break;
12086 /* FALLTHRU */
12088 case SYMBOL_REF:
12089 /* TLS references should always be enclosed in UNSPEC. */
12090 if (SYMBOL_REF_TLS_MODEL (op0))
12091 return false;
12092 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12093 && ix86_cmodel != CM_LARGE_PIC)
12094 return true;
12095 break;
12097 default:
12098 break;
12101 if (GET_CODE (disp) != CONST)
12102 return false;
12103 disp = XEXP (disp, 0);
12105 if (TARGET_64BIT)
12107 /* It is unsafe to allow PLUS expressions. This limits the allowed
12108 distance of GOT table references. We should not need these anyway. */
12109 if (GET_CODE (disp) != UNSPEC
12110 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12111 && XINT (disp, 1) != UNSPEC_GOTOFF
12112 && XINT (disp, 1) != UNSPEC_PCREL
12113 && XINT (disp, 1) != UNSPEC_PLTOFF))
12114 return false;
12116 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12117 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12118 return false;
12119 return true;
12122 saw_plus = false;
12123 if (GET_CODE (disp) == PLUS)
12125 if (!CONST_INT_P (XEXP (disp, 1)))
12126 return false;
12127 disp = XEXP (disp, 0);
12128 saw_plus = true;
12131 if (TARGET_MACHO && darwin_local_data_pic (disp))
12132 return true;
12134 if (GET_CODE (disp) != UNSPEC)
12135 return false;
12137 switch (XINT (disp, 1))
12139 case UNSPEC_GOT:
12140 if (saw_plus)
12141 return false;
12142 /* We need to check for both symbols and labels because VxWorks loads
12143 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12144 details. */
12145 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12146 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12147 case UNSPEC_GOTOFF:
12148 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12149 While the ABI also specifies a 32bit relocation, we don't produce it in
12150 the small PIC model at all. */
12151 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12152 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12153 && !TARGET_64BIT)
12154 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12155 return false;
12156 case UNSPEC_GOTTPOFF:
12157 case UNSPEC_GOTNTPOFF:
12158 case UNSPEC_INDNTPOFF:
12159 if (saw_plus)
12160 return false;
12161 disp = XVECEXP (disp, 0, 0);
12162 return (GET_CODE (disp) == SYMBOL_REF
12163 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12164 case UNSPEC_NTPOFF:
12165 disp = XVECEXP (disp, 0, 0);
12166 return (GET_CODE (disp) == SYMBOL_REF
12167 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12168 case UNSPEC_DTPOFF:
12169 disp = XVECEXP (disp, 0, 0);
12170 return (GET_CODE (disp) == SYMBOL_REF
12171 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12174 return false;
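/* [Editorial note] Hedged examples of 32-bit PIC displacements accepted
   above: a local-data reference foo@GOTOFF carries
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)), and is still accepted
   when a constant offset has been folded in; a global reference bar@GOT
   carries (const (unspec [(symbol_ref "bar")] UNSPEC_GOT)) and is rejected
   once such an offset is present (the saw_plus check).  */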
12177 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12178 replace the input X, or the original X if no replacement is called for.
12179 The output parameter *WIN is 1 if the calling macro should goto WIN,
12180 0 if it should not. */
12182 bool
12183 ix86_legitimize_reload_address (rtx x,
12184 enum machine_mode mode ATTRIBUTE_UNUSED,
12185 int opnum, int type,
12186 int ind_levels ATTRIBUTE_UNUSED)
12188 /* Reload can generate:
12190 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12191 (reg:DI 97))
12192 (reg:DI 2 cx))
12194 This RTX is rejected by ix86_legitimate_address_p due to
12195 non-strictness of base register 97. Following this rejection,
12196 reload pushes all three components into separate registers,
12197 creating an invalid memory address RTX.
12199 The following code reloads only the invalid part of the
12200 memory address RTX. */
12202 if (GET_CODE (x) == PLUS
12203 && REG_P (XEXP (x, 1))
12204 && GET_CODE (XEXP (x, 0)) == PLUS
12205 && REG_P (XEXP (XEXP (x, 0), 1)))
12207 rtx base, index;
12208 bool something_reloaded = false;
12210 base = XEXP (XEXP (x, 0), 1);
12211 if (!REG_OK_FOR_BASE_STRICT_P (base))
12213 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12214 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12215 opnum, (enum reload_type) type);
12216 something_reloaded = true;
12219 index = XEXP (x, 1);
12220 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12222 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12223 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12224 opnum, (enum reload_type) type);
12225 something_reloaded = true;
12228 gcc_assert (something_reloaded);
12229 return true;
12232 return false;
12235 /* Recognizes RTL expressions that are valid memory addresses for an
12236 instruction. The MODE argument is the machine mode for the MEM
12237 expression that wants to use this address.
12239 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12240 convert common non-canonical forms to canonical form so that they will
12241 be recognized. */
12243 static bool
12244 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12245 rtx addr, bool strict)
12247 struct ix86_address parts;
12248 rtx base, index, disp;
12249 HOST_WIDE_INT scale;
12251 if (ix86_decompose_address (addr, &parts) <= 0)
12252 /* Decomposition failed. */
12253 return false;
12255 base = parts.base;
12256 index = parts.index;
12257 disp = parts.disp;
12258 scale = parts.scale;
12260 /* Validate base register. */
12261 if (base)
12263 rtx reg;
12265 if (REG_P (base))
12266 reg = base;
12267 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12268 reg = SUBREG_REG (base);
12269 else
12270 /* Base is not a register. */
12271 return false;
12273 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12274 return false;
12276 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12277 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12278 /* Base is not valid. */
12279 return false;
12282 /* Validate index register. */
12283 if (index)
12285 rtx reg;
12287 if (REG_P (index))
12288 reg = index;
12289 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12290 reg = SUBREG_REG (index);
12291 else
12292 /* Index is not a register. */
12293 return false;
12295 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12296 return false;
12298 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12299 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12300 /* Index is not valid. */
12301 return false;
12304 /* Index and base should have the same mode. */
12305 if (base && index
12306 && GET_MODE (base) != GET_MODE (index))
12307 return false;
12309 /* Validate scale factor. */
12310 if (scale != 1)
12312 if (!index)
12313 /* Scale without index. */
12314 return false;
12316 if (scale != 2 && scale != 4 && scale != 8)
12317 /* Scale is not a valid multiplier. */
12318 return false;
12321 /* Validate displacement. */
12322 if (disp)
12324 if (GET_CODE (disp) == CONST
12325 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12326 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12327 switch (XINT (XEXP (disp, 0), 1))
12329 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12330 used. While the ABI also specifies 32bit relocations, we don't produce
12331 them at all and use IP-relative addressing instead. */
12332 case UNSPEC_GOT:
12333 case UNSPEC_GOTOFF:
12334 gcc_assert (flag_pic);
12335 if (!TARGET_64BIT)
12336 goto is_legitimate_pic;
12338 /* 64bit address unspec. */
12339 return false;
12341 case UNSPEC_GOTPCREL:
12342 case UNSPEC_PCREL:
12343 gcc_assert (flag_pic);
12344 goto is_legitimate_pic;
12346 case UNSPEC_GOTTPOFF:
12347 case UNSPEC_GOTNTPOFF:
12348 case UNSPEC_INDNTPOFF:
12349 case UNSPEC_NTPOFF:
12350 case UNSPEC_DTPOFF:
12351 break;
12353 case UNSPEC_STACK_CHECK:
12354 gcc_assert (flag_split_stack);
12355 break;
12357 default:
12358 /* Invalid address unspec. */
12359 return false;
12362 else if (SYMBOLIC_CONST (disp)
12363 && (flag_pic
12364 || (TARGET_MACHO
12365 #if TARGET_MACHO
12366 && MACHOPIC_INDIRECT
12367 && !machopic_operand_p (disp)
12368 #endif
12372 is_legitimate_pic:
12373 if (TARGET_64BIT && (index || base))
12375 /* foo@dtpoff(%rX) is ok. */
12376 if (GET_CODE (disp) != CONST
12377 || GET_CODE (XEXP (disp, 0)) != PLUS
12378 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12379 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12380 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12381 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12382 /* Non-constant pic memory reference. */
12383 return false;
12385 else if ((!TARGET_MACHO || flag_pic)
12386 && ! legitimate_pic_address_disp_p (disp))
12387 /* Displacement is an invalid pic construct. */
12388 return false;
12389 #if TARGET_MACHO
12390 else if (MACHO_DYNAMIC_NO_PIC_P
12391 && !ix86_legitimate_constant_p (Pmode, disp))
12392 /* displacement must be referenced via non_lazy_pointer */
12393 return false;
12394 #endif
12396 /* This code used to verify that a symbolic pic displacement
12397 includes the pic_offset_table_rtx register.
12399 While this is a good idea, unfortunately these constructs may
12400 be created by the "adds using lea" optimization for incorrect
12401 code like:
12403 int a;
12404 int foo(int i)
12406 return *(&a+i);
12409 This code is nonsensical, but results in addressing the
12410 GOT table with a pic_offset_table_rtx base. We can't
12411 just refuse it easily, since it gets matched by the
12412 "addsi3" pattern, which later gets split to lea when the
12413 output register differs from the input. While this
12414 could be handled by a separate addsi pattern for this case
12415 that never results in lea, disabling this test seems to be the
12416 easier and correct fix for the crash. */
12418 else if (GET_CODE (disp) != LABEL_REF
12419 && !CONST_INT_P (disp)
12420 && (GET_CODE (disp) != CONST
12421 || !ix86_legitimate_constant_p (Pmode, disp))
12422 && (GET_CODE (disp) != SYMBOL_REF
12423 || !ix86_legitimate_constant_p (Pmode, disp)))
12424 /* Displacement is not constant. */
12425 return false;
12426 else if (TARGET_64BIT
12427 && !x86_64_immediate_operand (disp, VOIDmode))
12428 /* Displacement is out of range. */
12429 return false;
12432 /* Everything looks valid. */
12433 return true;
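/* [Editorial note] Illustrative outcomes of the checks above: a scale of 3
   (anything other than 1, 2, 4 or 8) is rejected, as is a scale with no
   index; an SImode base combined with a DImode index fails the matching-mode
   check; and a symbolic displacement under -fPIC is accepted only when
   legitimate_pic_address_disp_p approves it.  */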
12436 /* Determine if a given RTX is a valid constant address. */
12438 bool
12439 constant_address_p (rtx x)
12441 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12444 /* Return a unique alias set for the GOT. */
12446 static alias_set_type
12447 ix86_GOT_alias_set (void)
12449 static alias_set_type set = -1;
12450 if (set == -1)
12451 set = new_alias_set ();
12452 return set;
12455 /* Return a legitimate reference for ORIG (an address) using the
12456 register REG. If REG is 0, a new pseudo is generated.
12458 There are two types of references that must be handled:
12460 1. Global data references must load the address from the GOT, via
12461 the PIC reg. An insn is emitted to do this load, and the reg is
12462 returned.
12464 2. Static data references, constant pool addresses, and code labels
12465 compute the address as an offset from the GOT, whose base is in
12466 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12467 differentiate them from global data objects. The returned
12468 address is the PIC reg + an unspec constant.
12470 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12471 reg also appears in the address. */
12473 static rtx
12474 legitimize_pic_address (rtx orig, rtx reg)
12476 rtx addr = orig;
12477 rtx new_rtx = orig;
12479 #if TARGET_MACHO
12480 if (TARGET_MACHO && !TARGET_64BIT)
12482 if (reg == 0)
12483 reg = gen_reg_rtx (Pmode);
12484 /* Use the generic Mach-O PIC machinery. */
12485 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12487 #endif
12489 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12490 new_rtx = addr;
12491 else if (TARGET_64BIT
12492 && ix86_cmodel != CM_SMALL_PIC
12493 && gotoff_operand (addr, Pmode))
12495 rtx tmpreg;
12496 /* This symbol may be referenced via a displacement from the PIC
12497 base address (@GOTOFF). */
12499 if (reload_in_progress)
12500 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12501 if (GET_CODE (addr) == CONST)
12502 addr = XEXP (addr, 0);
12503 if (GET_CODE (addr) == PLUS)
12505 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12506 UNSPEC_GOTOFF);
12507 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12509 else
12510 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12511 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12512 if (!reg)
12513 tmpreg = gen_reg_rtx (Pmode);
12514 else
12515 tmpreg = reg;
12516 emit_move_insn (tmpreg, new_rtx);
12518 if (reg != 0)
12520 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12521 tmpreg, 1, OPTAB_DIRECT);
12522 new_rtx = reg;
12524 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12526 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12528 /* This symbol may be referenced via a displacement from the PIC
12529 base address (@GOTOFF). */
12531 if (reload_in_progress)
12532 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12533 if (GET_CODE (addr) == CONST)
12534 addr = XEXP (addr, 0);
12535 if (GET_CODE (addr) == PLUS)
12537 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12538 UNSPEC_GOTOFF);
12539 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12541 else
12542 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12543 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12544 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12546 if (reg != 0)
12548 emit_move_insn (reg, new_rtx);
12549 new_rtx = reg;
12552 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12553 /* We can't use @GOTOFF for text labels on VxWorks;
12554 see gotoff_operand. */
12555 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12557 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12559 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12560 return legitimize_dllimport_symbol (addr, true);
12561 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12562 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12563 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12565 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12566 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12570 /* For x64 PE-COFF there is no GOT table, so we use the address
12571 directly. */
12572 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12574 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12575 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12577 if (reg == 0)
12578 reg = gen_reg_rtx (Pmode);
12579 emit_move_insn (reg, new_rtx);
12580 new_rtx = reg;
12582 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12584 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12585 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12586 new_rtx = gen_const_mem (Pmode, new_rtx);
12587 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12589 if (reg == 0)
12590 reg = gen_reg_rtx (Pmode);
12591 /* Use gen_movsi directly, otherwise the address is loaded
12592 into a register for CSE. We don't want to CSE these addresses;
12593 instead we CSE addresses from the GOT table, so skip this. */
12594 emit_insn (gen_movsi (reg, new_rtx));
12595 new_rtx = reg;
12597 else
12599 /* This symbol must be referenced via a load from the
12600 Global Offset Table (@GOT). */
12602 if (reload_in_progress)
12603 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12604 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12605 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12606 if (TARGET_64BIT)
12607 new_rtx = force_reg (Pmode, new_rtx);
12608 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12609 new_rtx = gen_const_mem (Pmode, new_rtx);
12610 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12612 if (reg == 0)
12613 reg = gen_reg_rtx (Pmode);
12614 emit_move_insn (reg, new_rtx);
12615 new_rtx = reg;
12618 else
12620 if (CONST_INT_P (addr)
12621 && !x86_64_immediate_operand (addr, VOIDmode))
12623 if (reg)
12625 emit_move_insn (reg, addr);
12626 new_rtx = reg;
12628 else
12629 new_rtx = force_reg (Pmode, addr);
12631 else if (GET_CODE (addr) == CONST)
12633 addr = XEXP (addr, 0);
12635 /* We must match the forms we generated earlier. Assume the only
12636 unspecs that can get here are ours; not that we could do
12637 anything with them anyway.... */
12638 if (GET_CODE (addr) == UNSPEC
12639 || (GET_CODE (addr) == PLUS
12640 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12641 return orig;
12642 gcc_assert (GET_CODE (addr) == PLUS);
12644 if (GET_CODE (addr) == PLUS)
12646 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12648 /* Check first to see if this is a constant offset from a @GOTOFF
12649 symbol reference. */
12650 if (gotoff_operand (op0, Pmode)
12651 && CONST_INT_P (op1))
12653 if (!TARGET_64BIT)
12655 if (reload_in_progress)
12656 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12657 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12658 UNSPEC_GOTOFF);
12659 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12660 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12661 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12663 if (reg != 0)
12665 emit_move_insn (reg, new_rtx);
12666 new_rtx = reg;
12669 else
12671 if (INTVAL (op1) < -16*1024*1024
12672 || INTVAL (op1) >= 16*1024*1024)
12674 if (!x86_64_immediate_operand (op1, Pmode))
12675 op1 = force_reg (Pmode, op1);
12676 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12680 else
12682 rtx base = legitimize_pic_address (op0, reg);
12683 enum machine_mode mode = GET_MODE (base);
12684 new_rtx
12685 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12687 if (CONST_INT_P (new_rtx))
12689 if (INTVAL (new_rtx) < -16*1024*1024
12690 || INTVAL (new_rtx) >= 16*1024*1024)
12692 if (!x86_64_immediate_operand (new_rtx, mode))
12693 new_rtx = force_reg (mode, new_rtx);
12694 new_rtx
12695 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12697 else
12698 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12700 else
12702 if (GET_CODE (new_rtx) == PLUS
12703 && CONSTANT_P (XEXP (new_rtx, 1)))
12705 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12706 new_rtx = XEXP (new_rtx, 1);
12708 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12713 return new_rtx;
12716 /* Load the thread pointer. If TO_REG is true, force it into a register. */
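/* For reference, the thread pointer built below is represented as
   (unspec [(const_int 0)] UNSPEC_TP) in ptr_mode; when DImode is requested
   but ptr_mode is SImode (x32), it is zero-extended first.  */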
12718 static rtx
12719 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12721 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12723 if (GET_MODE (tp) != tp_mode)
12725 gcc_assert (GET_MODE (tp) == SImode);
12726 gcc_assert (tp_mode == DImode);
12728 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12731 if (to_reg)
12732 tp = copy_to_mode_reg (tp_mode, tp);
12734 return tp;
12737 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12739 static GTY(()) rtx ix86_tls_symbol;
12741 static rtx
12742 ix86_tls_get_addr (void)
12744 if (!ix86_tls_symbol)
12746 const char *sym
12747 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12748 ? "___tls_get_addr" : "__tls_get_addr");
12750 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12753 return ix86_tls_symbol;
12756 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12758 static GTY(()) rtx ix86_tls_module_base_symbol;
12761 ix86_tls_module_base (void)
12763 if (!ix86_tls_module_base_symbol)
12765 ix86_tls_module_base_symbol
12766 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12768 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12769 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12772 return ix86_tls_module_base_symbol;
12775 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12776 false if we expect this to be used for a memory address and true if
12777 we expect to load the address into a register. */
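/* Rough sketch of the results built below (details vary with target flags):
   local-exec with GNU TLS yields a segment-relative address such as
     (plus tp (const (unspec [x] UNSPEC_NTPOFF))),
   initial-exec loads the offset from the GOT instead of using a constant,
   and the dynamic models emit a tls_get_addr call (or a GNU2 TLS descriptor
   sequence) whose result is combined with the thread pointer.  */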
12779 static rtx
12780 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12782 rtx dest, base, off;
12783 rtx pic = NULL_RTX, tp = NULL_RTX;
12784 enum machine_mode tp_mode = Pmode;
12785 int type;
12787 switch (model)
12789 case TLS_MODEL_GLOBAL_DYNAMIC:
12790 dest = gen_reg_rtx (Pmode);
12792 if (!TARGET_64BIT)
12794 if (flag_pic)
12795 pic = pic_offset_table_rtx;
12796 else
12798 pic = gen_reg_rtx (Pmode);
12799 emit_insn (gen_set_got (pic));
12803 if (TARGET_GNU2_TLS)
12805 if (TARGET_64BIT)
12806 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12807 else
12808 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12810 tp = get_thread_pointer (Pmode, true);
12811 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12813 if (GET_MODE (x) != Pmode)
12814 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12816 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12818 else
12820 rtx caddr = ix86_tls_get_addr ();
12822 if (TARGET_64BIT)
12824 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12825 rtx insns;
12827 start_sequence ();
12828 emit_call_insn
12829 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12830 insns = get_insns ();
12831 end_sequence ();
12833 if (GET_MODE (x) != Pmode)
12834 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12836 RTL_CONST_CALL_P (insns) = 1;
12837 emit_libcall_block (insns, dest, rax, x);
12839 else
12840 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12842 break;
12844 case TLS_MODEL_LOCAL_DYNAMIC:
12845 base = gen_reg_rtx (Pmode);
12847 if (!TARGET_64BIT)
12849 if (flag_pic)
12850 pic = pic_offset_table_rtx;
12851 else
12853 pic = gen_reg_rtx (Pmode);
12854 emit_insn (gen_set_got (pic));
12858 if (TARGET_GNU2_TLS)
12860 rtx tmp = ix86_tls_module_base ();
12862 if (TARGET_64BIT)
12863 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12864 else
12865 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12867 tp = get_thread_pointer (Pmode, true);
12868 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12869 gen_rtx_MINUS (Pmode, tmp, tp));
12871 else
12873 rtx caddr = ix86_tls_get_addr ();
12875 if (TARGET_64BIT)
12877 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12878 rtx insns, eqv;
12880 start_sequence ();
12881 emit_call_insn
12882 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12883 insns = get_insns ();
12884 end_sequence ();
12886 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12887 share the LD_BASE result with other LD model accesses. */
12888 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12889 UNSPEC_TLS_LD_BASE);
12891 RTL_CONST_CALL_P (insns) = 1;
12892 emit_libcall_block (insns, base, rax, eqv);
12894 else
12895 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12898 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12899 off = gen_rtx_CONST (Pmode, off);
12901 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12903 if (TARGET_GNU2_TLS)
12905 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12907 if (GET_MODE (x) != Pmode)
12908 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12910 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12912 break;
12914 case TLS_MODEL_INITIAL_EXEC:
12915 if (TARGET_64BIT)
12917 if (TARGET_SUN_TLS && !TARGET_X32)
12919 /* The Sun linker took the AMD64 TLS spec literally
12920 and can only handle %rax as the destination of the
12921 initial-exec code sequence. */
12923 dest = gen_reg_rtx (DImode);
12924 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12925 return dest;
12928 /* Generate DImode references to avoid %fs:(%reg32)
12929 problems and the linker IE->LE relaxation bug. */
12930 tp_mode = DImode;
12931 pic = NULL;
12932 type = UNSPEC_GOTNTPOFF;
12934 else if (flag_pic)
12936 if (reload_in_progress)
12937 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12938 pic = pic_offset_table_rtx;
12939 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12941 else if (!TARGET_ANY_GNU_TLS)
12943 pic = gen_reg_rtx (Pmode);
12944 emit_insn (gen_set_got (pic));
12945 type = UNSPEC_GOTTPOFF;
12947 else
12949 pic = NULL;
12950 type = UNSPEC_INDNTPOFF;
12953 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12954 off = gen_rtx_CONST (tp_mode, off);
12955 if (pic)
12956 off = gen_rtx_PLUS (tp_mode, pic, off);
12957 off = gen_const_mem (tp_mode, off);
12958 set_mem_alias_set (off, ix86_GOT_alias_set ());
12960 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12962 base = get_thread_pointer (tp_mode,
12963 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12964 off = force_reg (tp_mode, off);
12965 return gen_rtx_PLUS (tp_mode, base, off);
12967 else
12969 base = get_thread_pointer (Pmode, true);
12970 dest = gen_reg_rtx (Pmode);
12971 emit_insn (ix86_gen_sub3 (dest, base, off));
12973 break;
12975 case TLS_MODEL_LOCAL_EXEC:
12976 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12977 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12978 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12979 off = gen_rtx_CONST (Pmode, off);
12981 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12983 base = get_thread_pointer (Pmode,
12984 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12985 return gen_rtx_PLUS (Pmode, base, off);
12987 else
12989 base = get_thread_pointer (Pmode, true);
12990 dest = gen_reg_rtx (Pmode);
12991 emit_insn (ix86_gen_sub3 (dest, base, off));
12993 break;
12995 default:
12996 gcc_unreachable ();
12999 return dest;
13002 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13003 to symbol DECL. */
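/* For illustration: if DECL's assembler name is "foo", the decl built below
   gets DECL_RTL of the form (mem (symbol_ref "*__imp_foo")) -- or
   "*__imp__foo" when a user label prefix is in use -- i.e. a load from the
   import table slot holding foo's real address.  "foo" is just an example
   name.  */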
13005 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13006 htab_t dllimport_map;
13008 static tree
13009 get_dllimport_decl (tree decl)
13011 struct tree_map *h, in;
13012 void **loc;
13013 const char *name;
13014 const char *prefix;
13015 size_t namelen, prefixlen;
13016 char *imp_name;
13017 tree to;
13018 rtx rtl;
13020 if (!dllimport_map)
13021 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13023 in.hash = htab_hash_pointer (decl);
13024 in.base.from = decl;
13025 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13026 h = (struct tree_map *) *loc;
13027 if (h)
13028 return h->to;
13030 *loc = h = ggc_alloc_tree_map ();
13031 h->hash = in.hash;
13032 h->base.from = decl;
13033 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13034 VAR_DECL, NULL, ptr_type_node);
13035 DECL_ARTIFICIAL (to) = 1;
13036 DECL_IGNORED_P (to) = 1;
13037 DECL_EXTERNAL (to) = 1;
13038 TREE_READONLY (to) = 1;
13040 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13041 name = targetm.strip_name_encoding (name);
13042 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13043 ? "*__imp_" : "*__imp__";
13044 namelen = strlen (name);
13045 prefixlen = strlen (prefix);
13046 imp_name = (char *) alloca (namelen + prefixlen + 1);
13047 memcpy (imp_name, prefix, prefixlen);
13048 memcpy (imp_name + prefixlen, name, namelen + 1);
13050 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13051 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13052 SET_SYMBOL_REF_DECL (rtl, to);
13053 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13055 rtl = gen_const_mem (Pmode, rtl);
13056 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13058 SET_DECL_RTL (to, rtl);
13059 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13061 return to;
13064 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13065 true if we require the result be a register. */
13067 static rtx
13068 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13070 tree imp_decl;
13071 rtx x;
13073 gcc_assert (SYMBOL_REF_DECL (symbol));
13074 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13076 x = DECL_RTL (imp_decl);
13077 if (want_reg)
13078 x = force_reg (Pmode, x);
13079 return x;
13082 /* Try machine-dependent ways of modifying an illegitimate address
13083 to be legitimate. If we find one, return the new, valid address.
13084 This macro is used in only one place: `memory_address' in explow.c.
13086 OLDX is the address as it was before break_out_memory_refs was called.
13087 In some cases it is useful to look at this to decide what needs to be done.
13089 It is always safe for this macro to do nothing. It exists to recognize
13090 opportunities to optimize the output.
13092 For the 80386, we handle X+REG by loading X into a register R and
13093 using R+REG. R will go in a general reg and indexing will be used.
13094 However, if REG is a broken-out memory address or multiplication,
13095 nothing needs to be done because REG can certainly go in a general reg.
13097 When -fpic is used, special handling is needed for symbolic references.
13098 See comments by legitimize_pic_address in i386.c for details. */
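/* For example, the canonicalizations below turn a scaled-index address such
   as (plus (reg) (ashift (reg) (const_int 2))) into
   (plus (mult (reg) (const_int 4)) (reg)), which matches the SIB form that
   ix86_legitimate_address_p accepts.  */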
13100 static rtx
13101 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13102 enum machine_mode mode)
13104 int changed = 0;
13105 unsigned log;
13107 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13108 if (log)
13109 return legitimize_tls_address (x, (enum tls_model) log, false);
13110 if (GET_CODE (x) == CONST
13111 && GET_CODE (XEXP (x, 0)) == PLUS
13112 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13113 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13115 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13116 (enum tls_model) log, false);
13117 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13120 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13122 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13123 return legitimize_dllimport_symbol (x, true);
13124 if (GET_CODE (x) == CONST
13125 && GET_CODE (XEXP (x, 0)) == PLUS
13126 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13127 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13129 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13130 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13134 if (flag_pic && SYMBOLIC_CONST (x))
13135 return legitimize_pic_address (x, 0);
13137 #if TARGET_MACHO
13138 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13139 return machopic_indirect_data_reference (x, 0);
13140 #endif
13142 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13143 if (GET_CODE (x) == ASHIFT
13144 && CONST_INT_P (XEXP (x, 1))
13145 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13147 changed = 1;
13148 log = INTVAL (XEXP (x, 1));
13149 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13150 GEN_INT (1 << log));
13153 if (GET_CODE (x) == PLUS)
13155 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13157 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13158 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13159 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13161 changed = 1;
13162 log = INTVAL (XEXP (XEXP (x, 0), 1));
13163 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13164 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13165 GEN_INT (1 << log));
13168 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13169 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13170 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13172 changed = 1;
13173 log = INTVAL (XEXP (XEXP (x, 1), 1));
13174 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13175 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13176 GEN_INT (1 << log));
13179 /* Put multiply first if it isn't already. */
13180 if (GET_CODE (XEXP (x, 1)) == MULT)
13182 rtx tmp = XEXP (x, 0);
13183 XEXP (x, 0) = XEXP (x, 1);
13184 XEXP (x, 1) = tmp;
13185 changed = 1;
13188 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13189 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13190 created by virtual register instantiation, register elimination, and
13191 similar optimizations. */
13192 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13194 changed = 1;
13195 x = gen_rtx_PLUS (Pmode,
13196 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13197 XEXP (XEXP (x, 1), 0)),
13198 XEXP (XEXP (x, 1), 1));
13201 /* Canonicalize
13202 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13203 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13204 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13205 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13206 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13207 && CONSTANT_P (XEXP (x, 1)))
13209 rtx constant;
13210 rtx other = NULL_RTX;
13212 if (CONST_INT_P (XEXP (x, 1)))
13214 constant = XEXP (x, 1);
13215 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13217 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13219 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13220 other = XEXP (x, 1);
13222 else
13223 constant = 0;
13225 if (constant)
13227 changed = 1;
13228 x = gen_rtx_PLUS (Pmode,
13229 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13230 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13231 plus_constant (Pmode, other,
13232 INTVAL (constant)));
13236 if (changed && ix86_legitimate_address_p (mode, x, false))
13237 return x;
13239 if (GET_CODE (XEXP (x, 0)) == MULT)
13241 changed = 1;
13242 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13245 if (GET_CODE (XEXP (x, 1)) == MULT)
13247 changed = 1;
13248 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13251 if (changed
13252 && REG_P (XEXP (x, 1))
13253 && REG_P (XEXP (x, 0)))
13254 return x;
13256 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13258 changed = 1;
13259 x = legitimize_pic_address (x, 0);
13262 if (changed && ix86_legitimate_address_p (mode, x, false))
13263 return x;
13265 if (REG_P (XEXP (x, 0)))
13267 rtx temp = gen_reg_rtx (Pmode);
13268 rtx val = force_operand (XEXP (x, 1), temp);
13269 if (val != temp)
13271 val = convert_to_mode (Pmode, val, 1);
13272 emit_move_insn (temp, val);
13275 XEXP (x, 1) = temp;
13276 return x;
13279 else if (REG_P (XEXP (x, 1)))
13281 rtx temp = gen_reg_rtx (Pmode);
13282 rtx val = force_operand (XEXP (x, 0), temp);
13283 if (val != temp)
13285 val = convert_to_mode (Pmode, val, 1);
13286 emit_move_insn (temp, val);
13289 XEXP (x, 0) = temp;
13290 return x;
13294 return x;
13297 /* Print an integer constant expression in assembler syntax. Addition
13298 and subtraction are the only arithmetic that may appear in these
13299 expressions. FILE is the stdio stream to write to, X is the rtx, and
13300 CODE is the operand print code from the output string. */
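/* For example, a (const (unspec [foo] UNSPEC_GOTOFF)) operand is printed as
   "foo@GOTOFF", and on 64-bit targets UNSPEC_GOTPCREL prints as
   "foo@GOTPCREL(%rip)" in AT&T syntax ("foo@GOTPCREL[rip]" for Intel).
   "foo" is a placeholder symbol name.  */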
13302 static void
13303 output_pic_addr_const (FILE *file, rtx x, int code)
13305 char buf[256];
13307 switch (GET_CODE (x))
13309 case PC:
13310 gcc_assert (flag_pic);
13311 putc ('.', file);
13312 break;
13314 case SYMBOL_REF:
13315 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13316 output_addr_const (file, x);
13317 else
13319 const char *name = XSTR (x, 0);
13321 /* Mark the decl as referenced so that cgraph will
13322 output the function. */
13323 if (SYMBOL_REF_DECL (x))
13324 mark_decl_referenced (SYMBOL_REF_DECL (x));
13326 #if TARGET_MACHO
13327 if (MACHOPIC_INDIRECT
13328 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13329 name = machopic_indirection_name (x, /*stub_p=*/true);
13330 #endif
13331 assemble_name (file, name);
13333 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13334 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13335 fputs ("@PLT", file);
13336 break;
13338 case LABEL_REF:
13339 x = XEXP (x, 0);
13340 /* FALLTHRU */
13341 case CODE_LABEL:
13342 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13343 assemble_name (asm_out_file, buf);
13344 break;
13346 case CONST_INT:
13347 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13348 break;
13350 case CONST:
13351 /* This used to output parentheses around the expression,
13352 but that does not work on the 386 (either ATT or BSD assembler). */
13353 output_pic_addr_const (file, XEXP (x, 0), code);
13354 break;
13356 case CONST_DOUBLE:
13357 if (GET_MODE (x) == VOIDmode)
13359 /* We can use %d if the number is <32 bits and positive. */
13360 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13361 fprintf (file, "0x%lx%08lx",
13362 (unsigned long) CONST_DOUBLE_HIGH (x),
13363 (unsigned long) CONST_DOUBLE_LOW (x));
13364 else
13365 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13367 else
13368 /* We can't handle floating point constants;
13369 TARGET_PRINT_OPERAND must handle them. */
13370 output_operand_lossage ("floating constant misused");
13371 break;
13373 case PLUS:
13374 /* Some assemblers need integer constants to appear first. */
13375 if (CONST_INT_P (XEXP (x, 0)))
13377 output_pic_addr_const (file, XEXP (x, 0), code);
13378 putc ('+', file);
13379 output_pic_addr_const (file, XEXP (x, 1), code);
13381 else
13383 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13384 output_pic_addr_const (file, XEXP (x, 1), code);
13385 putc ('+', file);
13386 output_pic_addr_const (file, XEXP (x, 0), code);
13388 break;
13390 case MINUS:
13391 if (!TARGET_MACHO)
13392 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13393 output_pic_addr_const (file, XEXP (x, 0), code);
13394 putc ('-', file);
13395 output_pic_addr_const (file, XEXP (x, 1), code);
13396 if (!TARGET_MACHO)
13397 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13398 break;
13400 case UNSPEC:
13401 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13403 bool f = i386_asm_output_addr_const_extra (file, x);
13404 gcc_assert (f);
13405 break;
13408 gcc_assert (XVECLEN (x, 0) == 1);
13409 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13410 switch (XINT (x, 1))
13412 case UNSPEC_GOT:
13413 fputs ("@GOT", file);
13414 break;
13415 case UNSPEC_GOTOFF:
13416 fputs ("@GOTOFF", file);
13417 break;
13418 case UNSPEC_PLTOFF:
13419 fputs ("@PLTOFF", file);
13420 break;
13421 case UNSPEC_PCREL:
13422 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13423 "(%rip)" : "[rip]", file);
13424 break;
13425 case UNSPEC_GOTPCREL:
13426 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13427 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13428 break;
13429 case UNSPEC_GOTTPOFF:
13430 /* FIXME: This might be @TPOFF in Sun ld too. */
13431 fputs ("@gottpoff", file);
13432 break;
13433 case UNSPEC_TPOFF:
13434 fputs ("@tpoff", file);
13435 break;
13436 case UNSPEC_NTPOFF:
13437 if (TARGET_64BIT)
13438 fputs ("@tpoff", file);
13439 else
13440 fputs ("@ntpoff", file);
13441 break;
13442 case UNSPEC_DTPOFF:
13443 fputs ("@dtpoff", file);
13444 break;
13445 case UNSPEC_GOTNTPOFF:
13446 if (TARGET_64BIT)
13447 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13448 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13449 else
13450 fputs ("@gotntpoff", file);
13451 break;
13452 case UNSPEC_INDNTPOFF:
13453 fputs ("@indntpoff", file);
13454 break;
13455 #if TARGET_MACHO
13456 case UNSPEC_MACHOPIC_OFFSET:
13457 putc ('-', file);
13458 machopic_output_function_base_name (file);
13459 break;
13460 #endif
13461 default:
13462 output_operand_lossage ("invalid UNSPEC as operand");
13463 break;
13465 break;
13467 default:
13468 output_operand_lossage ("invalid expression as operand");
13472 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13473 We need to emit DTP-relative relocations. */
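/* Assuming ASM_LONG expands to a ".long" directive, this emits for a TLS
   symbol x something like ".long x@dtpoff" for SIZE 4 and
   ".long x@dtpoff, 0" for SIZE 8.  */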
13475 static void ATTRIBUTE_UNUSED
13476 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13478 fputs (ASM_LONG, file);
13479 output_addr_const (file, x);
13480 fputs ("@dtpoff", file);
13481 switch (size)
13483 case 4:
13484 break;
13485 case 8:
13486 fputs (", 0", file);
13487 break;
13488 default:
13489 gcc_unreachable ();
13493 /* Return true if X is a representation of the PIC register. This copes
13494 with calls from ix86_find_base_term, where the register might have
13495 been replaced by a cselib value. */
13497 static bool
13498 ix86_pic_register_p (rtx x)
13500 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13501 return (pic_offset_table_rtx
13502 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13503 else
13504 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13507 /* Helper function for ix86_delegitimize_address.
13508 Attempt to delegitimize TLS local-exec accesses. */
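/* Sketch: an address such as (plus (reg) (const (unspec [sym] UNSPEC_NTPOFF)))
   carrying an %fs/%gs segment override is unwound so that SYM (plus any base,
   index and constant offset) is visible again; anything else is returned
   unchanged.  */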
13510 static rtx
13511 ix86_delegitimize_tls_address (rtx orig_x)
13513 rtx x = orig_x, unspec;
13514 struct ix86_address addr;
13516 if (!TARGET_TLS_DIRECT_SEG_REFS)
13517 return orig_x;
13518 if (MEM_P (x))
13519 x = XEXP (x, 0);
13520 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13521 return orig_x;
13522 if (ix86_decompose_address (x, &addr) == 0
13523 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13524 || addr.disp == NULL_RTX
13525 || GET_CODE (addr.disp) != CONST)
13526 return orig_x;
13527 unspec = XEXP (addr.disp, 0);
13528 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13529 unspec = XEXP (unspec, 0);
13530 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13531 return orig_x;
13532 x = XVECEXP (unspec, 0, 0);
13533 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13534 if (unspec != XEXP (addr.disp, 0))
13535 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13536 if (addr.index)
13538 rtx idx = addr.index;
13539 if (addr.scale != 1)
13540 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13541 x = gen_rtx_PLUS (Pmode, idx, x);
13543 if (addr.base)
13544 x = gen_rtx_PLUS (Pmode, addr.base, x);
13545 if (MEM_P (orig_x))
13546 x = replace_equiv_address_nv (orig_x, x);
13547 return x;
13550 /* In the name of slightly smaller debug output, and to cater to
13551 general assembler lossage, recognize PIC+GOTOFF and turn it back
13552 into a direct symbol reference.
13554 On Darwin, this is necessary to avoid a crash, because Darwin
13555 has a different PIC label for each routine but the DWARF debugging
13556 information is not associated with any particular routine, so it's
13557 necessary to remove references to the PIC label from RTL stored by
13558 the DWARF output code. */
13560 static rtx
13561 ix86_delegitimize_address (rtx x)
13563 rtx orig_x = delegitimize_mem_from_attrs (x);
13564 /* addend is NULL or some rtx if x is something+GOTOFF where
13565 something doesn't include the PIC register. */
13566 rtx addend = NULL_RTX;
13567 /* reg_addend is NULL or a multiple of some register. */
13568 rtx reg_addend = NULL_RTX;
13569 /* const_addend is NULL or a const_int. */
13570 rtx const_addend = NULL_RTX;
13571 /* This is the result, or NULL. */
13572 rtx result = NULL_RTX;
13574 x = orig_x;
13576 if (MEM_P (x))
13577 x = XEXP (x, 0);
13579 if (TARGET_64BIT)
13581 if (GET_CODE (x) == CONST
13582 && GET_CODE (XEXP (x, 0)) == PLUS
13583 && GET_MODE (XEXP (x, 0)) == Pmode
13584 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13585 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13586 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13588 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13589 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13590 if (MEM_P (orig_x))
13591 x = replace_equiv_address_nv (orig_x, x);
13592 return x;
13594 if (GET_CODE (x) != CONST
13595 || GET_CODE (XEXP (x, 0)) != UNSPEC
13596 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13597 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13598 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13599 return ix86_delegitimize_tls_address (orig_x);
13600 x = XVECEXP (XEXP (x, 0), 0, 0);
13601 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13603 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13604 GET_MODE (x), 0);
13605 if (x == NULL_RTX)
13606 return orig_x;
13608 return x;
13611 if (GET_CODE (x) != PLUS
13612 || GET_CODE (XEXP (x, 1)) != CONST)
13613 return ix86_delegitimize_tls_address (orig_x);
13615 if (ix86_pic_register_p (XEXP (x, 0)))
13616 /* %ebx + GOT/GOTOFF */
13618 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13620 /* %ebx + %reg * scale + GOT/GOTOFF */
13621 reg_addend = XEXP (x, 0);
13622 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13623 reg_addend = XEXP (reg_addend, 1);
13624 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13625 reg_addend = XEXP (reg_addend, 0);
13626 else
13628 reg_addend = NULL_RTX;
13629 addend = XEXP (x, 0);
13632 else
13633 addend = XEXP (x, 0);
13635 x = XEXP (XEXP (x, 1), 0);
13636 if (GET_CODE (x) == PLUS
13637 && CONST_INT_P (XEXP (x, 1)))
13639 const_addend = XEXP (x, 1);
13640 x = XEXP (x, 0);
13643 if (GET_CODE (x) == UNSPEC
13644 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13645 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13646 result = XVECEXP (x, 0, 0);
13648 if (TARGET_MACHO && darwin_local_data_pic (x)
13649 && !MEM_P (orig_x))
13650 result = XVECEXP (x, 0, 0);
13652 if (! result)
13653 return ix86_delegitimize_tls_address (orig_x);
13655 if (const_addend)
13656 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13657 if (reg_addend)
13658 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13659 if (addend)
13661 /* If the rest of original X doesn't involve the PIC register, add
13662 addend and subtract pic_offset_table_rtx. This can happen e.g.
13663 for code like:
13664 leal (%ebx, %ecx, 4), %ecx
13666 movl foo@GOTOFF(%ecx), %edx
13667 in which case we return (%ecx - %ebx) + foo. */
13668 if (pic_offset_table_rtx)
13669 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13670 pic_offset_table_rtx),
13671 result);
13672 else
13673 return orig_x;
13675 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13677 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13678 if (result == NULL_RTX)
13679 return orig_x;
13681 return result;
13684 /* If X is a machine specific address (i.e. a symbol or label being
13685 referenced as a displacement from the GOT implemented using an
13686 UNSPEC), then return the base term. Otherwise return X. */
13689 ix86_find_base_term (rtx x)
13691 rtx term;
13693 if (TARGET_64BIT)
13695 if (GET_CODE (x) != CONST)
13696 return x;
13697 term = XEXP (x, 0);
13698 if (GET_CODE (term) == PLUS
13699 && (CONST_INT_P (XEXP (term, 1))
13700 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13701 term = XEXP (term, 0);
13702 if (GET_CODE (term) != UNSPEC
13703 || (XINT (term, 1) != UNSPEC_GOTPCREL
13704 && XINT (term, 1) != UNSPEC_PCREL))
13705 return x;
13707 return XVECEXP (term, 0, 0);
13710 return ix86_delegitimize_address (x);
13713 static void
13714 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13715 bool fp, FILE *file)
13717 const char *suffix;
13719 if (mode == CCFPmode || mode == CCFPUmode)
13721 code = ix86_fp_compare_code_to_integer (code);
13722 mode = CCmode;
13724 if (reverse)
13725 code = reverse_condition (code);
13727 switch (code)
13729 case EQ:
13730 switch (mode)
13732 case CCAmode:
13733 suffix = "a";
13734 break;
13736 case CCCmode:
13737 suffix = "c";
13738 break;
13740 case CCOmode:
13741 suffix = "o";
13742 break;
13744 case CCSmode:
13745 suffix = "s";
13746 break;
13748 default:
13749 suffix = "e";
13751 break;
13752 case NE:
13753 switch (mode)
13755 case CCAmode:
13756 suffix = "na";
13757 break;
13759 case CCCmode:
13760 suffix = "nc";
13761 break;
13763 case CCOmode:
13764 suffix = "no";
13765 break;
13767 case CCSmode:
13768 suffix = "ns";
13769 break;
13771 default:
13772 suffix = "ne";
13774 break;
13775 case GT:
13776 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13777 suffix = "g";
13778 break;
13779 case GTU:
13780 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13781 Those same assemblers have the same but opposite lossage on cmov. */
13782 if (mode == CCmode)
13783 suffix = fp ? "nbe" : "a";
13784 else if (mode == CCCmode)
13785 suffix = "b";
13786 else
13787 gcc_unreachable ();
13788 break;
13789 case LT:
13790 switch (mode)
13792 case CCNOmode:
13793 case CCGOCmode:
13794 suffix = "s";
13795 break;
13797 case CCmode:
13798 case CCGCmode:
13799 suffix = "l";
13800 break;
13802 default:
13803 gcc_unreachable ();
13805 break;
13806 case LTU:
13807 gcc_assert (mode == CCmode || mode == CCCmode);
13808 suffix = "b";
13809 break;
13810 case GE:
13811 switch (mode)
13813 case CCNOmode:
13814 case CCGOCmode:
13815 suffix = "ns";
13816 break;
13818 case CCmode:
13819 case CCGCmode:
13820 suffix = "ge";
13821 break;
13823 default:
13824 gcc_unreachable ();
13826 break;
13827 case GEU:
13828 /* ??? As above. */
13829 gcc_assert (mode == CCmode || mode == CCCmode);
13830 suffix = fp ? "nb" : "ae";
13831 break;
13832 case LE:
13833 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13834 suffix = "le";
13835 break;
13836 case LEU:
13837 /* ??? As above. */
13838 if (mode == CCmode)
13839 suffix = "be";
13840 else if (mode == CCCmode)
13841 suffix = fp ? "nb" : "ae";
13842 else
13843 gcc_unreachable ();
13844 break;
13845 case UNORDERED:
13846 suffix = fp ? "u" : "p";
13847 break;
13848 case ORDERED:
13849 suffix = fp ? "nu" : "np";
13850 break;
13851 default:
13852 gcc_unreachable ();
13854 fputs (suffix, file);
13857 /* Print the name of register X to FILE based on its machine mode and number.
13858 If CODE is 'w', pretend the mode is HImode.
13859 If CODE is 'b', pretend the mode is QImode.
13860 If CODE is 'k', pretend the mode is SImode.
13861 If CODE is 'q', pretend the mode is DImode.
13862 If CODE is 'x', pretend the mode is V4SFmode.
13863 If CODE is 't', pretend the mode is V8SFmode.
13864 If CODE is 'h', pretend the reg is the 'high' byte register.
13865 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13866 If CODE is 'd', duplicate the operand for AVX instruction.
13869 void
13870 print_reg (rtx x, int code, FILE *file)
13872 const char *reg;
13873 unsigned int regno;
13874 bool duplicated = code == 'd' && TARGET_AVX;
13876 if (ASSEMBLER_DIALECT == ASM_ATT)
13877 putc ('%', file);
13879 if (x == pc_rtx)
13881 gcc_assert (TARGET_64BIT);
13882 fputs ("rip", file);
13883 return;
13886 regno = true_regnum (x);
13887 gcc_assert (regno != ARG_POINTER_REGNUM
13888 && regno != FRAME_POINTER_REGNUM
13889 && regno != FLAGS_REG
13890 && regno != FPSR_REG
13891 && regno != FPCR_REG);
13893 if (code == 'w' || MMX_REG_P (x))
13894 code = 2;
13895 else if (code == 'b')
13896 code = 1;
13897 else if (code == 'k')
13898 code = 4;
13899 else if (code == 'q')
13900 code = 8;
13901 else if (code == 'y')
13902 code = 3;
13903 else if (code == 'h')
13904 code = 0;
13905 else if (code == 'x')
13906 code = 16;
13907 else if (code == 't')
13908 code = 32;
13909 else
13910 code = GET_MODE_SIZE (GET_MODE (x));
13912 /* Irritatingly, AMD extended registers use a different naming convention
13913 from the normal registers: "r%d[bwd]". */
13914 if (REX_INT_REGNO_P (regno))
13916 gcc_assert (TARGET_64BIT);
13917 putc ('r', file);
13918 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13919 switch (code)
13921 case 0:
13922 error ("extended registers have no high halves");
13923 break;
13924 case 1:
13925 putc ('b', file);
13926 break;
13927 case 2:
13928 putc ('w', file);
13929 break;
13930 case 4:
13931 putc ('d', file);
13932 break;
13933 case 8:
13934 /* no suffix */
13935 break;
13936 default:
13937 error ("unsupported operand size for extended register");
13938 break;
13940 return;
13943 reg = NULL;
13944 switch (code)
13946 case 3:
13947 if (STACK_TOP_P (x))
13949 reg = "st(0)";
13950 break;
13952 /* FALLTHRU */
13953 case 8:
13954 case 4:
13955 case 12:
13956 if (! ANY_FP_REG_P (x))
13957 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13958 /* FALLTHRU */
13959 case 16:
13960 case 2:
13961 normal:
13962 reg = hi_reg_name[regno];
13963 break;
13964 case 1:
13965 if (regno >= ARRAY_SIZE (qi_reg_name))
13966 goto normal;
13967 reg = qi_reg_name[regno];
13968 break;
13969 case 0:
13970 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13971 goto normal;
13972 reg = qi_high_reg_name[regno];
13973 break;
13974 case 32:
13975 if (SSE_REG_P (x))
13977 gcc_assert (!duplicated);
13978 putc ('y', file);
13979 fputs (hi_reg_name[regno] + 1, file);
13980 return;
13982 break;
13983 default:
13984 gcc_unreachable ();
13987 fputs (reg, file);
13988 if (duplicated)
13990 if (ASSEMBLER_DIALECT == ASM_ATT)
13991 fprintf (file, ", %%%s", reg);
13992 else
13993 fprintf (file, ", %s", reg);
13997 /* Locate some local-dynamic symbol still in use by this function
13998 so that we can print its name in some tls_local_dynamic_base
13999 pattern. */
14001 static int
14002 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14004 rtx x = *px;
14006 if (GET_CODE (x) == SYMBOL_REF
14007 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14009 cfun->machine->some_ld_name = XSTR (x, 0);
14010 return 1;
14013 return 0;
14016 static const char *
14017 get_some_local_dynamic_name (void)
14019 rtx insn;
14021 if (cfun->machine->some_ld_name)
14022 return cfun->machine->some_ld_name;
14024 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14025 if (NONDEBUG_INSN_P (insn)
14026 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14027 return cfun->machine->some_ld_name;
14029 return NULL;
14032 /* Meaning of CODE:
14033 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14034 C -- print opcode suffix for set/cmov insn.
14035 c -- like C, but print reversed condition
14036 F,f -- likewise, but for floating-point.
14037 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14038 otherwise nothing
14039 R -- print the prefix for register names.
14040 z -- print the opcode suffix for the size of the current operand.
14041 Z -- likewise, with special suffixes for x87 instructions.
14042 * -- print a star (in certain assembler syntax)
14043 A -- print an absolute memory reference.
14044 E -- print address with DImode register names if TARGET_64BIT.
14045 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14046 s -- print a shift double count, followed by the assembler's argument
14047 delimiter.
14048 b -- print the QImode name of the register for the indicated operand.
14049 %b0 would print %al if operands[0] is reg 0.
14050 w -- likewise, print the HImode name of the register.
14051 k -- likewise, print the SImode name of the register.
14052 q -- likewise, print the DImode name of the register.
14053 x -- likewise, print the V4SFmode name of the register.
14054 t -- likewise, print the V8SFmode name of the register.
14055 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14056 y -- print "st(0)" instead of "st" as a register.
14057 d -- print duplicated register operand for AVX instruction.
14058 D -- print condition for SSE cmp instruction.
14059 P -- if PIC, print an @PLT suffix.
14060 p -- print raw symbol name.
14061 X -- don't print any sort of PIC '@' suffix for a symbol.
14062 & -- print some in-use local-dynamic symbol name.
14063 H -- print a memory address offset by 8; used for sse high-parts
14064 Y -- print condition for XOP pcom* instruction.
14065 + -- print a branch hint as 'cs' or 'ds' prefix
14066 ; -- print a semicolon (after prefixes due to bug in older gas).
14067 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14068 @ -- print a segment register of thread base pointer load
14069 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
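For instance (an illustrative case, not taken from an actual pattern): with
operands[1] holding the DImode register %rax, "%k1" in an insn template
prints "%eax" and "%w1" prints "%ax", while "%z1" on a 4-byte integer
operand emits the "l" opcode suffix (AT&T syntax only).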
14072 void
14073 ix86_print_operand (FILE *file, rtx x, int code)
14075 if (code)
14077 switch (code)
14079 case 'A':
14080 switch (ASSEMBLER_DIALECT)
14082 case ASM_ATT:
14083 putc ('*', file);
14084 break;
14086 case ASM_INTEL:
14087 /* Intel syntax. For absolute addresses, registers should not
14088 be surrounded by brackets. */
14089 if (!REG_P (x))
14091 putc ('[', file);
14092 ix86_print_operand (file, x, 0);
14093 putc (']', file);
14094 return;
14096 break;
14098 default:
14099 gcc_unreachable ();
14102 ix86_print_operand (file, x, 0);
14103 return;
14105 case 'E':
14106 /* Wrap address in an UNSPEC to declare special handling. */
14107 if (TARGET_64BIT)
14108 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14110 output_address (x);
14111 return;
14113 case 'L':
14114 if (ASSEMBLER_DIALECT == ASM_ATT)
14115 putc ('l', file);
14116 return;
14118 case 'W':
14119 if (ASSEMBLER_DIALECT == ASM_ATT)
14120 putc ('w', file);
14121 return;
14123 case 'B':
14124 if (ASSEMBLER_DIALECT == ASM_ATT)
14125 putc ('b', file);
14126 return;
14128 case 'Q':
14129 if (ASSEMBLER_DIALECT == ASM_ATT)
14130 putc ('l', file);
14131 return;
14133 case 'S':
14134 if (ASSEMBLER_DIALECT == ASM_ATT)
14135 putc ('s', file);
14136 return;
14138 case 'T':
14139 if (ASSEMBLER_DIALECT == ASM_ATT)
14140 putc ('t', file);
14141 return;
14143 case 'O':
14144 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14145 if (ASSEMBLER_DIALECT != ASM_ATT)
14146 return;
14148 switch (GET_MODE_SIZE (GET_MODE (x)))
14150 case 2:
14151 putc ('w', file);
14152 break;
14154 case 4:
14155 putc ('l', file);
14156 break;
14158 case 8:
14159 putc ('q', file);
14160 break;
14162 default:
14163 output_operand_lossage
14164 ("invalid operand size for operand code 'O'");
14165 return;
14168 putc ('.', file);
14169 #endif
14170 return;
14172 case 'z':
14173 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14175 /* Opcodes don't get size suffixes if using Intel opcodes. */
14176 if (ASSEMBLER_DIALECT == ASM_INTEL)
14177 return;
14179 switch (GET_MODE_SIZE (GET_MODE (x)))
14181 case 1:
14182 putc ('b', file);
14183 return;
14185 case 2:
14186 putc ('w', file);
14187 return;
14189 case 4:
14190 putc ('l', file);
14191 return;
14193 case 8:
14194 putc ('q', file);
14195 return;
14197 default:
14198 output_operand_lossage
14199 ("invalid operand size for operand code 'z'");
14200 return;
14204 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14205 warning
14206 (0, "non-integer operand used with operand code 'z'");
14207 /* FALLTHRU */
14209 case 'Z':
14210 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14211 if (ASSEMBLER_DIALECT == ASM_INTEL)
14212 return;
14214 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14216 switch (GET_MODE_SIZE (GET_MODE (x)))
14218 case 2:
14219 #ifdef HAVE_AS_IX86_FILDS
14220 putc ('s', file);
14221 #endif
14222 return;
14224 case 4:
14225 putc ('l', file);
14226 return;
14228 case 8:
14229 #ifdef HAVE_AS_IX86_FILDQ
14230 putc ('q', file);
14231 #else
14232 fputs ("ll", file);
14233 #endif
14234 return;
14236 default:
14237 break;
14240 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14242 /* 387 opcodes don't get size suffixes
14243 if the operands are registers. */
14244 if (STACK_REG_P (x))
14245 return;
14247 switch (GET_MODE_SIZE (GET_MODE (x)))
14249 case 4:
14250 putc ('s', file);
14251 return;
14253 case 8:
14254 putc ('l', file);
14255 return;
14257 case 12:
14258 case 16:
14259 putc ('t', file);
14260 return;
14262 default:
14263 break;
14266 else
14268 output_operand_lossage
14269 ("invalid operand type used with operand code 'Z'");
14270 return;
14273 output_operand_lossage
14274 ("invalid operand size for operand code 'Z'");
14275 return;
14277 case 'd':
14278 case 'b':
14279 case 'w':
14280 case 'k':
14281 case 'q':
14282 case 'h':
14283 case 't':
14284 case 'y':
14285 case 'x':
14286 case 'X':
14287 case 'P':
14288 case 'p':
14289 break;
14291 case 's':
14292 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14294 ix86_print_operand (file, x, 0);
14295 fputs (", ", file);
14297 return;
14299 case 'Y':
14300 switch (GET_CODE (x))
14302 case NE:
14303 fputs ("neq", file);
14304 break;
14305 case EQ:
14306 fputs ("eq", file);
14307 break;
14308 case GE:
14309 case GEU:
14310 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14311 break;
14312 case GT:
14313 case GTU:
14314 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14315 break;
14316 case LE:
14317 case LEU:
14318 fputs ("le", file);
14319 break;
14320 case LT:
14321 case LTU:
14322 fputs ("lt", file);
14323 break;
14324 case UNORDERED:
14325 fputs ("unord", file);
14326 break;
14327 case ORDERED:
14328 fputs ("ord", file);
14329 break;
14330 case UNEQ:
14331 fputs ("ueq", file);
14332 break;
14333 case UNGE:
14334 fputs ("nlt", file);
14335 break;
14336 case UNGT:
14337 fputs ("nle", file);
14338 break;
14339 case UNLE:
14340 fputs ("ule", file);
14341 break;
14342 case UNLT:
14343 fputs ("ult", file);
14344 break;
14345 case LTGT:
14346 fputs ("une", file);
14347 break;
14348 default:
14349 output_operand_lossage ("operand is not a condition code, "
14350 "invalid operand code 'Y'");
14351 return;
14353 return;
14355 case 'D':
14356 /* Little bit of braindamage here. The SSE compare instructions
14357 use completely different names for the comparisons than the
14358 fp conditional moves do. */
14359 switch (GET_CODE (x))
14361 case UNEQ:
14362 if (TARGET_AVX)
14364 fputs ("eq_us", file);
14365 break;
14367 case EQ:
14368 fputs ("eq", file);
14369 break;
14370 case UNLT:
14371 if (TARGET_AVX)
14373 fputs ("nge", file);
14374 break;
14376 case LT:
14377 fputs ("lt", file);
14378 break;
14379 case UNLE:
14380 if (TARGET_AVX)
14382 fputs ("ngt", file);
14383 break;
14385 case LE:
14386 fputs ("le", file);
14387 break;
14388 case UNORDERED:
14389 fputs ("unord", file);
14390 break;
14391 case LTGT:
14392 if (TARGET_AVX)
14394 fputs ("neq_oq", file);
14395 break;
14397 case NE:
14398 fputs ("neq", file);
14399 break;
14400 case GE:
14401 if (TARGET_AVX)
14403 fputs ("ge", file);
14404 break;
14406 case UNGE:
14407 fputs ("nlt", file);
14408 break;
14409 case GT:
14410 if (TARGET_AVX)
14412 fputs ("gt", file);
14413 break;
14415 case UNGT:
14416 fputs ("nle", file);
14417 break;
14418 case ORDERED:
14419 fputs ("ord", file);
14420 break;
14421 default:
14422 output_operand_lossage ("operand is not a condition code, "
14423 "invalid operand code 'D'");
14424 return;
14426 return;
14428 case 'F':
14429 case 'f':
14430 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14431 if (ASSEMBLER_DIALECT == ASM_ATT)
14432 putc ('.', file);
14433 #endif
14435 case 'C':
14436 case 'c':
14437 if (!COMPARISON_P (x))
14439 output_operand_lossage ("operand is not a condition code, "
14440 "invalid operand code '%c'", code);
14441 return;
14443 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14444 code == 'c' || code == 'f',
14445 code == 'F' || code == 'f',
14446 file);
14447 return;
14449 case 'H':
14450 if (!offsettable_memref_p (x))
14452 output_operand_lossage ("operand is not an offsettable memory "
14453 "reference, invalid operand code 'H'");
14454 return;
14456 /* It doesn't actually matter what mode we use here, as we're
14457 only going to use this for printing. */
14458 x = adjust_address_nv (x, DImode, 8);
14459 break;
14461 case 'K':
14462 gcc_assert (CONST_INT_P (x));
14464 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14465 #ifdef HAVE_AS_IX86_HLE
14466 fputs ("xacquire ", file);
14467 #else
14468 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14469 #endif
14470 else if (INTVAL (x) & IX86_HLE_RELEASE)
14471 #ifdef HAVE_AS_IX86_HLE
14472 fputs ("xrelease ", file);
14473 #else
14474 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14475 #endif
14476 /* We do not want to print the value of the operand. */
14477 return;
14479 case '*':
14480 if (ASSEMBLER_DIALECT == ASM_ATT)
14481 putc ('*', file);
14482 return;
14484 case '&':
14486 const char *name = get_some_local_dynamic_name ();
14487 if (name == NULL)
14488 output_operand_lossage ("'%%&' used without any "
14489 "local dynamic TLS references");
14490 else
14491 assemble_name (file, name);
14492 return;
14495 case '+':
14497 rtx x;
14499 if (!optimize
14500 || optimize_function_for_size_p (cfun)
14501 || !TARGET_BRANCH_PREDICTION_HINTS)
14502 return;
14504 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14505 if (x)
14507 int pred_val = INTVAL (XEXP (x, 0));
14509 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14510 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14512 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14513 bool cputaken
14514 = final_forward_branch_p (current_output_insn) == 0;
14516 /* Emit hints only when the default branch prediction
14517 heuristics would fail. */
14518 if (taken != cputaken)
14520 /* We use 3e (DS) prefix for taken branches and
14521 2e (CS) prefix for not taken branches. */
14522 if (taken)
14523 fputs ("ds ; ", file);
14524 else
14525 fputs ("cs ; ", file);
14529 return;
14532 case ';':
14533 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14534 putc (';', file);
14535 #endif
14536 return;
14538 case '@':
14539 if (ASSEMBLER_DIALECT == ASM_ATT)
14540 putc ('%', file);
14542 /* The kernel uses a different segment register for performance
14543 reasons; a system call would not have to trash the userspace
14544 segment register, which would be expensive. */
14545 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14546 fputs ("fs", file);
14547 else
14548 fputs ("gs", file);
14549 return;
14551 case '~':
14552 putc (TARGET_AVX2 ? 'i' : 'f', file);
14553 return;
14555 case '^':
14556 if (TARGET_64BIT && Pmode != word_mode)
14557 fputs ("addr32 ", file);
14558 return;
14560 default:
14561 output_operand_lossage ("invalid operand code '%c'", code);
14565 if (REG_P (x))
14566 print_reg (x, code, file);
14568 else if (MEM_P (x))
14570 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14571 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14572 && GET_MODE (x) != BLKmode)
14574 const char * size;
14575 switch (GET_MODE_SIZE (GET_MODE (x)))
14577 case 1: size = "BYTE"; break;
14578 case 2: size = "WORD"; break;
14579 case 4: size = "DWORD"; break;
14580 case 8: size = "QWORD"; break;
14581 case 12: size = "TBYTE"; break;
14582 case 16:
14583 if (GET_MODE (x) == XFmode)
14584 size = "TBYTE";
14585 else
14586 size = "XMMWORD";
14587 break;
14588 case 32: size = "YMMWORD"; break;
14589 default:
14590 gcc_unreachable ();
14593 /* Check for explicit size override (codes 'b', 'w', 'k',
14594 'q' and 'x') */
14595 if (code == 'b')
14596 size = "BYTE";
14597 else if (code == 'w')
14598 size = "WORD";
14599 else if (code == 'k')
14600 size = "DWORD";
14601 else if (code == 'q')
14602 size = "QWORD";
14603 else if (code == 'x')
14604 size = "XMMWORD";
14606 fputs (size, file);
14607 fputs (" PTR ", file);
14610 x = XEXP (x, 0);
14611 /* Avoid (%rip) for call operands. */
14612 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14613 && !CONST_INT_P (x))
14614 output_addr_const (file, x);
14615 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14616 output_operand_lossage ("invalid constraints for operand");
14617 else
14618 output_address (x);
14621 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14623 REAL_VALUE_TYPE r;
14624 long l;
14626 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14627 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14629 if (ASSEMBLER_DIALECT == ASM_ATT)
14630 putc ('$', file);
14631 /* Sign-extend the 32-bit SFmode immediate to 8 bytes. */
14632 if (code == 'q')
14633 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14634 (unsigned long long) (int) l);
14635 else
14636 fprintf (file, "0x%08x", (unsigned int) l);
14639 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14641 REAL_VALUE_TYPE r;
14642 long l[2];
14644 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14645 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14647 if (ASSEMBLER_DIALECT == ASM_ATT)
14648 putc ('$', file);
14649 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14652 /* These float cases don't actually occur as immediate operands. */
14653 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14655 char dstr[30];
14657 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14658 fputs (dstr, file);
14661 else
14663 /* We have patterns that allow zero sets of memory, for instance.
14664 In 64-bit mode, we should probably support all 8-byte vectors,
14665 since we can in fact encode that into an immediate. */
14666 if (GET_CODE (x) == CONST_VECTOR)
14668 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14669 x = const0_rtx;
14672 if (code != 'P' && code != 'p')
14674 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14676 if (ASSEMBLER_DIALECT == ASM_ATT)
14677 putc ('$', file);
14679 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14680 || GET_CODE (x) == LABEL_REF)
14682 if (ASSEMBLER_DIALECT == ASM_ATT)
14683 putc ('$', file);
14684 else
14685 fputs ("OFFSET FLAT:", file);
14688 if (CONST_INT_P (x))
14689 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14690 else if (flag_pic || MACHOPIC_INDIRECT)
14691 output_pic_addr_const (file, x, code);
14692 else
14693 output_addr_const (file, x);
14697 static bool
14698 ix86_print_operand_punct_valid_p (unsigned char code)
14700 return (code == '@' || code == '*' || code == '+' || code == '&'
14701 || code == ';' || code == '~' || code == '^');
14704 /* Print a memory operand whose address is ADDR. */
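/* For example, a base+index*scale+displacement address with base %ebp,
   index %ecx, scale 4 and displacement -4 is printed as "-4(%ebp,%ecx,4)"
   in AT&T syntax and as "[ebp-4+ecx*4]" in Intel syntax.  */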
14706 static void
14707 ix86_print_operand_address (FILE *file, rtx addr)
14709 struct ix86_address parts;
14710 rtx base, index, disp;
14711 int scale;
14712 int ok;
14713 bool vsib = false;
14714 int code = 0;
14716 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14718 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14719 gcc_assert (parts.index == NULL_RTX);
14720 parts.index = XVECEXP (addr, 0, 1);
14721 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14722 addr = XVECEXP (addr, 0, 0);
14723 vsib = true;
14725 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14727 gcc_assert (TARGET_64BIT);
14728 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14729 code = 'q';
14731 else
14732 ok = ix86_decompose_address (addr, &parts);
14734 gcc_assert (ok);
14736 base = parts.base;
14737 index = parts.index;
14738 disp = parts.disp;
14739 scale = parts.scale;
14741 switch (parts.seg)
14743 case SEG_DEFAULT:
14744 break;
14745 case SEG_FS:
14746 case SEG_GS:
14747 if (ASSEMBLER_DIALECT == ASM_ATT)
14748 putc ('%', file);
14749 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14750 break;
14751 default:
14752 gcc_unreachable ();
14755 /* Use the one-byte-shorter RIP-relative addressing in 64-bit mode. */
14756 if (TARGET_64BIT && !base && !index)
14758 rtx symbol = disp;
14760 if (GET_CODE (disp) == CONST
14761 && GET_CODE (XEXP (disp, 0)) == PLUS
14762 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14763 symbol = XEXP (XEXP (disp, 0), 0);
14765 if (GET_CODE (symbol) == LABEL_REF
14766 || (GET_CODE (symbol) == SYMBOL_REF
14767 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14768 base = pc_rtx;
14770 if (!base && !index)
14772 /* A displacement-only address requires special attention. */
14774 if (CONST_INT_P (disp))
14776 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14777 fputs ("ds:", file);
14778 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14780 else if (flag_pic)
14781 output_pic_addr_const (file, disp, 0);
14782 else
14783 output_addr_const (file, disp);
14785 else
14787 /* Print SImode register names to force addr32 prefix. */
14788 if (SImode_address_operand (addr, VOIDmode))
14790 #ifdef ENABLE_CHECKING
14791 gcc_assert (TARGET_64BIT);
14792 switch (GET_CODE (addr))
14794 case SUBREG:
14795 gcc_assert (GET_MODE (addr) == SImode);
14796 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14797 break;
14798 case ZERO_EXTEND:
14799 case AND:
14800 gcc_assert (GET_MODE (addr) == DImode);
14801 break;
14802 default:
14803 gcc_unreachable ();
14805 #endif
14806 gcc_assert (!code);
14807 code = 'k';
14809 else if (code == 0
14810 && TARGET_X32
14811 && disp
14812 && CONST_INT_P (disp)
14813 && INTVAL (disp) < -16*1024*1024)
14815 /* X32 runs in 64-bit mode, where displacement, DISP, in
14816 address DISP(%r64), is encoded as 32-bit immediate sign-
14817 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14818 address is %r64 + 0xffffffffbffffd00. When %r64 <
14819 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14820 which is invalid for x32. The correct address is %r64
14821 - 0x40000300 == 0xf7ffdd64. To properly encode
14822 -0x40000300(%r64) for x32, we zero-extend negative
14823 displacement by forcing addr32 prefix which truncates
14824 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14825 zero-extend all negative displacements, including -1(%rsp).
14826 However, for small negative displacements, sign-extension
14827 won't cause overflow. We only zero-extend negative
14828 displacements if they are < -16*1024*1024, which is also used
14829 to check legitimate address displacements for PIC. */
14830 code = 'k';
14833 if (ASSEMBLER_DIALECT == ASM_ATT)
14835 if (disp)
14837 if (flag_pic)
14838 output_pic_addr_const (file, disp, 0);
14839 else if (GET_CODE (disp) == LABEL_REF)
14840 output_asm_label (disp);
14841 else
14842 output_addr_const (file, disp);
14845 putc ('(', file);
14846 if (base)
14847 print_reg (base, code, file);
14848 if (index)
14850 putc (',', file);
14851 print_reg (index, vsib ? 0 : code, file);
14852 if (scale != 1 || vsib)
14853 fprintf (file, ",%d", scale);
14855 putc (')', file);
14857 else
14859 rtx offset = NULL_RTX;
14861 if (disp)
14863 /* Pull out the offset of a symbol; print any symbol itself. */
14864 if (GET_CODE (disp) == CONST
14865 && GET_CODE (XEXP (disp, 0)) == PLUS
14866 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14868 offset = XEXP (XEXP (disp, 0), 1);
14869 disp = gen_rtx_CONST (VOIDmode,
14870 XEXP (XEXP (disp, 0), 0));
14873 if (flag_pic)
14874 output_pic_addr_const (file, disp, 0);
14875 else if (GET_CODE (disp) == LABEL_REF)
14876 output_asm_label (disp);
14877 else if (CONST_INT_P (disp))
14878 offset = disp;
14879 else
14880 output_addr_const (file, disp);
14883 putc ('[', file);
14884 if (base)
14886 print_reg (base, code, file);
14887 if (offset)
14889 if (INTVAL (offset) >= 0)
14890 putc ('+', file);
14891 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14894 else if (offset)
14895 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14896 else
14897 putc ('0', file);
14899 if (index)
14901 putc ('+', file);
14902 print_reg (index, vsib ? 0 : code, file);
14903 if (scale != 1 || vsib)
14904 fprintf (file, "*%d", scale);
14906 putc (']', file);
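/* For illustration (a rough sketch; exact output depends on the operands):
   with base %rbp, index %rax, scale 4 and displacement -8 the code above
   prints "-8(%rbp,%rax,4)" in AT&T syntax and "[rbp-8+rax*4]" in Intel
   syntax, while a bare constant displacement such as 64 is printed as-is
   in AT&T syntax and as "ds:64" in Intel syntax.  */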
14911 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14913 static bool
14914 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14916 rtx op;
14918 if (GET_CODE (x) != UNSPEC)
14919 return false;
14921 op = XVECEXP (x, 0, 0);
14922 switch (XINT (x, 1))
14924 case UNSPEC_GOTTPOFF:
14925 output_addr_const (file, op);
14926 /* FIXME: This might be @TPOFF in Sun ld. */
14927 fputs ("@gottpoff", file);
14928 break;
14929 case UNSPEC_TPOFF:
14930 output_addr_const (file, op);
14931 fputs ("@tpoff", file);
14932 break;
14933 case UNSPEC_NTPOFF:
14934 output_addr_const (file, op);
14935 if (TARGET_64BIT)
14936 fputs ("@tpoff", file);
14937 else
14938 fputs ("@ntpoff", file);
14939 break;
14940 case UNSPEC_DTPOFF:
14941 output_addr_const (file, op);
14942 fputs ("@dtpoff", file);
14943 break;
14944 case UNSPEC_GOTNTPOFF:
14945 output_addr_const (file, op);
14946 if (TARGET_64BIT)
14947 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14948 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14949 else
14950 fputs ("@gotntpoff", file);
14951 break;
14952 case UNSPEC_INDNTPOFF:
14953 output_addr_const (file, op);
14954 fputs ("@indntpoff", file);
14955 break;
14956 #if TARGET_MACHO
14957 case UNSPEC_MACHOPIC_OFFSET:
14958 output_addr_const (file, op);
14959 putc ('-', file);
14960 machopic_output_function_base_name (file);
14961 break;
14962 #endif
14964 case UNSPEC_STACK_CHECK:
14966 int offset;
14968 gcc_assert (flag_split_stack);
14970 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14971 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14972 #else
14973 gcc_unreachable ();
14974 #endif
14976 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14978 break;
14980 default:
14981 return false;
14984 return true;
14987 /* Split one or more double-mode RTL references into pairs of half-mode
14988 references. The RTL can be REG, offsettable MEM, integer constant, or
14989 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14990 split and "num" is its length. lo_half and hi_half are output arrays
14991 that parallel "operands". */
14993 void
14994 split_double_mode (enum machine_mode mode, rtx operands[],
14995 int num, rtx lo_half[], rtx hi_half[])
14997 enum machine_mode half_mode;
14998 unsigned int byte;
15000 switch (mode)
15002 case TImode:
15003 half_mode = DImode;
15004 break;
15005 case DImode:
15006 half_mode = SImode;
15007 break;
15008 default:
15009 gcc_unreachable ();
15012 byte = GET_MODE_SIZE (half_mode);
15014 while (num--)
15016 rtx op = operands[num];
15018 /* simplify_subreg refuses to split volatile memory addresses,
15019 but we still have to handle them. */
15020 if (MEM_P (op))
15022 lo_half[num] = adjust_address (op, half_mode, 0);
15023 hi_half[num] = adjust_address (op, half_mode, byte);
15025 else
15027 lo_half[num] = simplify_gen_subreg (half_mode, op,
15028 GET_MODE (op) == VOIDmode
15029 ? mode : GET_MODE (op), 0);
15030 hi_half[num] = simplify_gen_subreg (half_mode, op,
15031 GET_MODE (op) == VOIDmode
15032 ? mode : GET_MODE (op), byte);
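/* A minimal usage sketch (hypothetical caller, not a specific splitter):

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   after which lo[i] and hi[i] are the SImode low and high halves of
   operands[i]; for a MEM they address byte offsets 0 and 4.  */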
15037 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15038 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15039 is the expression of the binary operation. The output may either be
15040 emitted here, or returned to the caller, like all output_* functions.
15042 There is no guarantee that the operands are the same mode, as they
15043 might be within FLOAT or FLOAT_EXTEND expressions. */
15045 #ifndef SYSV386_COMPAT
15046 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15047 wants to fix the assemblers because that causes incompatibility
15048 with gcc. No-one wants to fix gcc because that causes
15049 incompatibility with assemblers... You can use the option of
15050 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15051 #define SYSV386_COMPAT 1
15052 #endif
15054 const char *
15055 output_387_binary_op (rtx insn, rtx *operands)
15057 static char buf[40];
15058 const char *p;
15059 const char *ssep;
15060 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15062 #ifdef ENABLE_CHECKING
15063 /* Even if we do not want to check the inputs, this documents input
15064 constraints, which helps in understanding the following code. */
15065 if (STACK_REG_P (operands[0])
15066 && ((REG_P (operands[1])
15067 && REGNO (operands[0]) == REGNO (operands[1])
15068 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15069 || (REG_P (operands[2])
15070 && REGNO (operands[0]) == REGNO (operands[2])
15071 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15072 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15073 ; /* ok */
15074 else
15075 gcc_assert (is_sse);
15076 #endif
15078 switch (GET_CODE (operands[3]))
15080 case PLUS:
15081 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15082 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15083 p = "fiadd";
15084 else
15085 p = "fadd";
15086 ssep = "vadd";
15087 break;
15089 case MINUS:
15090 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15091 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15092 p = "fisub";
15093 else
15094 p = "fsub";
15095 ssep = "vsub";
15096 break;
15098 case MULT:
15099 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15100 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15101 p = "fimul";
15102 else
15103 p = "fmul";
15104 ssep = "vmul";
15105 break;
15107 case DIV:
15108 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15109 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15110 p = "fidiv";
15111 else
15112 p = "fdiv";
15113 ssep = "vdiv";
15114 break;
15116 default:
15117 gcc_unreachable ();
15120 if (is_sse)
15122 if (TARGET_AVX)
15124 strcpy (buf, ssep);
15125 if (GET_MODE (operands[0]) == SFmode)
15126 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15127 else
15128 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15130 else
15132 strcpy (buf, ssep + 1);
15133 if (GET_MODE (operands[0]) == SFmode)
15134 strcat (buf, "ss\t{%2, %0|%0, %2}");
15135 else
15136 strcat (buf, "sd\t{%2, %0|%0, %2}");
15138 return buf;
15140 strcpy (buf, p);
15142 switch (GET_CODE (operands[3]))
15144 case MULT:
15145 case PLUS:
15146 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15148 rtx temp = operands[2];
15149 operands[2] = operands[1];
15150 operands[1] = temp;
15153 /* We now know operands[0] == operands[1]. */
15155 if (MEM_P (operands[2]))
15157 p = "%Z2\t%2";
15158 break;
15161 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15163 if (STACK_TOP_P (operands[0]))
15164 /* How is it that we are storing to a dead operand[2]?
15165 Well, presumably operands[1] is dead too. We can't
15166 store the result to st(0) as st(0) gets popped on this
15167 instruction. Instead store to operands[2] (which I
15168 think has to be st(1)). st(1) will be popped later.
15169 gcc <= 2.8.1 didn't have this check and generated
15170 assembly code that the Unixware assembler rejected. */
15171 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15172 else
15173 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15174 break;
15177 if (STACK_TOP_P (operands[0]))
15178 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15179 else
15180 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15181 break;
15183 case MINUS:
15184 case DIV:
15185 if (MEM_P (operands[1]))
15187 p = "r%Z1\t%1";
15188 break;
15191 if (MEM_P (operands[2]))
15193 p = "%Z2\t%2";
15194 break;
15197 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15199 #if SYSV386_COMPAT
15200 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15201 derived assemblers, confusingly reverse the direction of
15202 the operation for fsub{r} and fdiv{r} when the
15203 destination register is not st(0). The Intel assembler
15204 doesn't have this brain damage. Read !SYSV386_COMPAT to
15205 figure out what the hardware really does. */
15206 if (STACK_TOP_P (operands[0]))
15207 p = "{p\t%0, %2|rp\t%2, %0}";
15208 else
15209 p = "{rp\t%2, %0|p\t%0, %2}";
15210 #else
15211 if (STACK_TOP_P (operands[0]))
15212 /* As above for fmul/fadd, we can't store to st(0). */
15213 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15214 else
15215 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15216 #endif
15217 break;
15220 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15222 #if SYSV386_COMPAT
15223 if (STACK_TOP_P (operands[0]))
15224 p = "{rp\t%0, %1|p\t%1, %0}";
15225 else
15226 p = "{p\t%1, %0|rp\t%0, %1}";
15227 #else
15228 if (STACK_TOP_P (operands[0]))
15229 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15230 else
15231 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15232 #endif
15233 break;
15236 if (STACK_TOP_P (operands[0]))
15238 if (STACK_TOP_P (operands[1]))
15239 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15240 else
15241 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15242 break;
15244 else if (STACK_TOP_P (operands[1]))
15246 #if SYSV386_COMPAT
15247 p = "{\t%1, %0|r\t%0, %1}";
15248 #else
15249 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15250 #endif
15252 else
15254 #if SYSV386_COMPAT
15255 p = "{r\t%2, %0|\t%0, %2}";
15256 #else
15257 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15258 #endif
15260 break;
15262 default:
15263 gcc_unreachable ();
15266 strcat (buf, p);
15267 return buf;
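/* For illustration, assuming SFmode SSE operands: a PLUS yields
   "vaddss\t{%2, %1, %0|%0, %1, %2}" with AVX and
   "addss\t{%2, %0|%0, %2}" without it, while the x87 path with a memory
   operands[2] yields "fadd%Z2\t%2"; the popping x87 variants append "p"
   as noted in the per-case comments above.  */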
15270 /* Check if a 256bit AVX register is referenced inside of EXP. */
15272 static int
15273 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15275 rtx exp = *pexp;
15277 if (GET_CODE (exp) == SUBREG)
15278 exp = SUBREG_REG (exp);
15280 if (REG_P (exp)
15281 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15282 return 1;
15284 return 0;
15287 /* Return needed mode for entity in optimize_mode_switching pass. */
15289 static int
15290 ix86_avx_u128_mode_needed (rtx insn)
15292 if (CALL_P (insn))
15294 rtx link;
15296 /* Needed mode is set to AVX_U128_CLEAN if there are
15297 no 256bit modes used in function arguments. */
15298 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15299 link;
15300 link = XEXP (link, 1))
15302 if (GET_CODE (XEXP (link, 0)) == USE)
15304 rtx arg = XEXP (XEXP (link, 0), 0);
15306 if (ix86_check_avx256_register (&arg, NULL))
15307 return AVX_U128_ANY;
15311 return AVX_U128_CLEAN;
15314 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15315 changes state only when a 256bit register is written to, but we need
15316 to prevent the compiler from moving the optimal insertion point above
15317 an eventual read from a 256bit register. */
15318 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15319 return AVX_U128_DIRTY;
15321 return AVX_U128_ANY;
15324 /* Return mode that i387 must be switched into
15325 prior to the execution of insn. */
15327 static int
15328 ix86_i387_mode_needed (int entity, rtx insn)
15330 enum attr_i387_cw mode;
15332 /* The mode UNINITIALIZED is used to store the control word after a
15333 function call or ASM pattern. The mode ANY specifies that the function
15334 has no requirements on the control word and makes no changes in the
15335 bits we are interested in. */
15337 if (CALL_P (insn)
15338 || (NONJUMP_INSN_P (insn)
15339 && (asm_noperands (PATTERN (insn)) >= 0
15340 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15341 return I387_CW_UNINITIALIZED;
15343 if (recog_memoized (insn) < 0)
15344 return I387_CW_ANY;
15346 mode = get_attr_i387_cw (insn);
15348 switch (entity)
15350 case I387_TRUNC:
15351 if (mode == I387_CW_TRUNC)
15352 return mode;
15353 break;
15355 case I387_FLOOR:
15356 if (mode == I387_CW_FLOOR)
15357 return mode;
15358 break;
15360 case I387_CEIL:
15361 if (mode == I387_CW_CEIL)
15362 return mode;
15363 break;
15365 case I387_MASK_PM:
15366 if (mode == I387_CW_MASK_PM)
15367 return mode;
15368 break;
15370 default:
15371 gcc_unreachable ();
15374 return I387_CW_ANY;
15377 /* Return mode that entity must be switched into
15378 prior to the execution of insn. */
15381 ix86_mode_needed (int entity, rtx insn)
15383 switch (entity)
15385 case AVX_U128:
15386 return ix86_avx_u128_mode_needed (insn);
15387 case I387_TRUNC:
15388 case I387_FLOOR:
15389 case I387_CEIL:
15390 case I387_MASK_PM:
15391 return ix86_i387_mode_needed (entity, insn);
15392 default:
15393 gcc_unreachable ();
15395 return 0;
15398 /* Check if a 256bit AVX register is referenced in stores. */
15400 static void
15401 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15403 if (ix86_check_avx256_register (&dest, NULL))
15405 bool *used = (bool *) data;
15406 *used = true;
15410 /* Calculate mode of upper 128bit AVX registers after the insn. */
15412 static int
15413 ix86_avx_u128_mode_after (int mode, rtx insn)
15415 rtx pat = PATTERN (insn);
15417 if (vzeroupper_operation (pat, VOIDmode)
15418 || vzeroall_operation (pat, VOIDmode))
15419 return AVX_U128_CLEAN;
15421 /* We know that the state is clean after a CALL insn if there are no
15422 256bit registers used in the function return register. */
15423 if (CALL_P (insn))
15425 bool avx_reg256_found = false;
15426 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15427 if (!avx_reg256_found)
15428 return AVX_U128_CLEAN;
15431 /* Otherwise, return current mode. Remember that if insn
15432 references AVX 256bit registers, the mode was already changed
15433 to DIRTY from MODE_NEEDED. */
15434 return mode;
15437 /* Return the mode that an insn results in. */
15440 ix86_mode_after (int entity, int mode, rtx insn)
15442 switch (entity)
15444 case AVX_U128:
15445 return ix86_avx_u128_mode_after (mode, insn);
15446 case I387_TRUNC:
15447 case I387_FLOOR:
15448 case I387_CEIL:
15449 case I387_MASK_PM:
15450 return mode;
15451 default:
15452 gcc_unreachable ();
15456 static int
15457 ix86_avx_u128_mode_entry (void)
15459 tree arg;
15461 /* Entry mode is set to AVX_U128_DIRTY if there are
15462 256bit modes used in function arguments. */
15463 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15464 arg = TREE_CHAIN (arg))
15466 rtx incoming = DECL_INCOMING_RTL (arg);
15468 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15469 return AVX_U128_DIRTY;
15472 return AVX_U128_CLEAN;
15475 /* Return a mode that ENTITY is assumed to be
15476 switched to at function entry. */
15479 ix86_mode_entry (int entity)
15481 switch (entity)
15483 case AVX_U128:
15484 return ix86_avx_u128_mode_entry ();
15485 case I387_TRUNC:
15486 case I387_FLOOR:
15487 case I387_CEIL:
15488 case I387_MASK_PM:
15489 return I387_CW_ANY;
15490 default:
15491 gcc_unreachable ();
15495 static int
15496 ix86_avx_u128_mode_exit (void)
15498 rtx reg = crtl->return_rtx;
15500 /* Exit mode is set to AVX_U128_DIRTY if there are
15501 256bit modes used in the function return register. */
15502 if (reg && ix86_check_avx256_register (&reg, NULL))
15503 return AVX_U128_DIRTY;
15505 return AVX_U128_CLEAN;
15508 /* Return a mode that ENTITY is assumed to be
15509 switched to at function exit. */
15512 ix86_mode_exit (int entity)
15514 switch (entity)
15516 case AVX_U128:
15517 return ix86_avx_u128_mode_exit ();
15518 case I387_TRUNC:
15519 case I387_FLOOR:
15520 case I387_CEIL:
15521 case I387_MASK_PM:
15522 return I387_CW_ANY;
15523 default:
15524 gcc_unreachable ();
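/* A worked example of the AVX_U128 entity (illustrative scenario): a
   function whose incoming arguments include a 256bit value enters in
   AVX_U128_DIRTY; insns touching 256bit registers keep the state DIRTY;
   a call whose argument list uses no 256bit registers needs
   AVX_U128_CLEAN, so the mode switching pass has ix86_emit_mode_set emit
   a vzeroupper in front of it, avoiding an AVX->SSE transition penalty.  */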
15528 /* Output code to initialize control word copies used by trunc?f?i and
15529 rounding patterns.  MODE selects the rounding mode to prepare; the
15530 initialized control word copy is stored in the stack slot for that mode. */
15532 static void
15533 emit_i387_cw_initialization (int mode)
15535 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15536 rtx new_mode;
15538 enum ix86_stack_slot slot;
15540 rtx reg = gen_reg_rtx (HImode);
15542 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15543 emit_move_insn (reg, copy_rtx (stored_mode));
15545 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15546 || optimize_function_for_size_p (cfun))
15548 switch (mode)
15550 case I387_CW_TRUNC:
15551 /* round toward zero (truncate) */
15552 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15553 slot = SLOT_CW_TRUNC;
15554 break;
15556 case I387_CW_FLOOR:
15557 /* round down toward -oo */
15558 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15559 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15560 slot = SLOT_CW_FLOOR;
15561 break;
15563 case I387_CW_CEIL:
15564 /* round up toward +oo */
15565 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15566 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15567 slot = SLOT_CW_CEIL;
15568 break;
15570 case I387_CW_MASK_PM:
15571 /* mask precision exception for nearbyint() */
15572 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15573 slot = SLOT_CW_MASK_PM;
15574 break;
15576 default:
15577 gcc_unreachable ();
15580 else
15582 switch (mode)
15584 case I387_CW_TRUNC:
15585 /* round toward zero (truncate) */
15586 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15587 slot = SLOT_CW_TRUNC;
15588 break;
15590 case I387_CW_FLOOR:
15591 /* round down toward -oo */
15592 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15593 slot = SLOT_CW_FLOOR;
15594 break;
15596 case I387_CW_CEIL:
15597 /* round up toward +oo */
15598 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15599 slot = SLOT_CW_CEIL;
15600 break;
15602 case I387_CW_MASK_PM:
15603 /* mask precision exception for nearbyint() */
15604 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15605 slot = SLOT_CW_MASK_PM;
15606 break;
15608 default:
15609 gcc_unreachable ();
15613 gcc_assert (slot < MAX_386_STACK_LOCALS);
15615 new_mode = assign_386_stack_local (HImode, slot);
15616 emit_move_insn (new_mode, reg);
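/* Background on the control word bits used above (standard x87 FCW
   layout): bits 11:10 form the rounding-control field -- 00 nearest,
   01 down, 10 up, 11 truncate -- hence the OR with 0x0c00 for truncation
   and with 0x0400/0x0800 after clearing the field for floor/ceil, while
   bit 5 (0x0020) is the precision-exception mask set for nearbyint.  */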
15619 /* Emit vzeroupper. */
15621 void
15622 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15624 int i;
15626 /* Cancel automatic vzeroupper insertion if there are
15627 live call-saved SSE registers at the insertion point. */
15629 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15630 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15631 return;
15633 if (TARGET_64BIT)
15634 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15635 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15636 return;
15638 emit_insn (gen_avx_vzeroupper ());
15641 /* Generate one or more insns to set ENTITY to MODE. */
15643 void
15644 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15646 switch (entity)
15648 case AVX_U128:
15649 if (mode == AVX_U128_CLEAN)
15650 ix86_avx_emit_vzeroupper (regs_live);
15651 break;
15652 case I387_TRUNC:
15653 case I387_FLOOR:
15654 case I387_CEIL:
15655 case I387_MASK_PM:
15656 if (mode != I387_CW_ANY
15657 && mode != I387_CW_UNINITIALIZED)
15658 emit_i387_cw_initialization (mode);
15659 break;
15660 default:
15661 gcc_unreachable ();
15665 /* Output code for INSN to convert a float to a signed int. OPERANDS
15666 are the insn operands. The output may be [HSD]Imode and the input
15667 operand may be [SDX]Fmode. */
15669 const char *
15670 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15672 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15673 int dimode_p = GET_MODE (operands[0]) == DImode;
15674 int round_mode = get_attr_i387_cw (insn);
15676 /* Jump through a hoop or two for DImode, since the hardware has no
15677 non-popping instruction. We used to do this a different way, but
15678 that was somewhat fragile and broke with post-reload splitters. */
15679 if ((dimode_p || fisttp) && !stack_top_dies)
15680 output_asm_insn ("fld\t%y1", operands);
15682 gcc_assert (STACK_TOP_P (operands[1]));
15683 gcc_assert (MEM_P (operands[0]));
15684 gcc_assert (GET_MODE (operands[1]) != TFmode);
15686 if (fisttp)
15687 output_asm_insn ("fisttp%Z0\t%0", operands);
15688 else
15690 if (round_mode != I387_CW_ANY)
15691 output_asm_insn ("fldcw\t%3", operands);
15692 if (stack_top_dies || dimode_p)
15693 output_asm_insn ("fistp%Z0\t%0", operands);
15694 else
15695 output_asm_insn ("fist%Z0\t%0", operands);
15696 if (round_mode != I387_CW_ANY)
15697 output_asm_insn ("fldcw\t%2", operands);
15700 return "";
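/* For illustration (a rough sketch, assuming an SImode destination, no
   FISTTP and a truncating control word in operands[3]):

       fldcw  %3        # switch to round-toward-zero
       fistpl %0        # store and pop ("fistl" if the value stays live)
       fldcw  %2        # restore the saved control word

   with a leading "fld %y1" when a popping DImode/FISTTP store would
   otherwise consume a still-live stack top.  */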
15703 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15704 have the values zero or one, indicates the ffreep insn's operand
15705 from the OPERANDS array. */
15707 static const char *
15708 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15710 if (TARGET_USE_FFREEP)
15711 #ifdef HAVE_AS_IX86_FFREEP
15712 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15713 #else
15715 static char retval[32];
15716 int regno = REGNO (operands[opno]);
15718 gcc_assert (STACK_REGNO_P (regno));
15720 regno -= FIRST_STACK_REG;
15722 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15723 return retval;
15725 #endif
15727 return opno ? "fstp\t%y1" : "fstp\t%y0";
15731 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15732 should be used. UNORDERED_P is true when fucom should be used. */
15734 const char *
15735 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15737 int stack_top_dies;
15738 rtx cmp_op0, cmp_op1;
15739 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15741 if (eflags_p)
15743 cmp_op0 = operands[0];
15744 cmp_op1 = operands[1];
15746 else
15748 cmp_op0 = operands[1];
15749 cmp_op1 = operands[2];
15752 if (is_sse)
15754 if (GET_MODE (operands[0]) == SFmode)
15755 if (unordered_p)
15756 return "%vucomiss\t{%1, %0|%0, %1}";
15757 else
15758 return "%vcomiss\t{%1, %0|%0, %1}";
15759 else
15760 if (unordered_p)
15761 return "%vucomisd\t{%1, %0|%0, %1}";
15762 else
15763 return "%vcomisd\t{%1, %0|%0, %1}";
15766 gcc_assert (STACK_TOP_P (cmp_op0));
15768 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15770 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15772 if (stack_top_dies)
15774 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15775 return output_387_ffreep (operands, 1);
15777 else
15778 return "ftst\n\tfnstsw\t%0";
15781 if (STACK_REG_P (cmp_op1)
15782 && stack_top_dies
15783 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15784 && REGNO (cmp_op1) != FIRST_STACK_REG)
15786 /* If both the top of the 387 stack and the other operand (which is
15787 also a stack register) die, then this must be a
15788 `fcompp' float compare. */
15790 if (eflags_p)
15792 /* There is no double popping fcomi variant. Fortunately,
15793 eflags is immune from the fstp's cc clobbering. */
15794 if (unordered_p)
15795 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15796 else
15797 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15798 return output_387_ffreep (operands, 0);
15800 else
15802 if (unordered_p)
15803 return "fucompp\n\tfnstsw\t%0";
15804 else
15805 return "fcompp\n\tfnstsw\t%0";
15808 else
15810 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15812 static const char * const alt[16] =
15814 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15815 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15816 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15817 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15819 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15820 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15821 NULL,
15822 NULL,
15824 "fcomi\t{%y1, %0|%0, %y1}",
15825 "fcomip\t{%y1, %0|%0, %y1}",
15826 "fucomi\t{%y1, %0|%0, %y1}",
15827 "fucomip\t{%y1, %0|%0, %y1}",
15829 NULL,
15830 NULL,
15831 NULL,
15832 NULL
15835 int mask;
15836 const char *ret;
15838 mask = eflags_p << 3;
15839 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15840 mask |= unordered_p << 1;
15841 mask |= stack_top_dies;
15843 gcc_assert (mask < 16);
15844 ret = alt[mask];
15845 gcc_assert (ret);
15847 return ret;
15851 void
15852 ix86_output_addr_vec_elt (FILE *file, int value)
15854 const char *directive = ASM_LONG;
15856 #ifdef ASM_QUAD
15857 if (TARGET_LP64)
15858 directive = ASM_QUAD;
15859 #else
15860 gcc_assert (!TARGET_64BIT);
15861 #endif
15863 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15866 void
15867 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15869 const char *directive = ASM_LONG;
15871 #ifdef ASM_QUAD
15872 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15873 directive = ASM_QUAD;
15874 #else
15875 gcc_assert (!TARGET_64BIT);
15876 #endif
15877 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15878 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15879 fprintf (file, "%s%s%d-%s%d\n",
15880 directive, LPREFIX, value, LPREFIX, rel);
15881 else if (HAVE_AS_GOTOFF_IN_DATA)
15882 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15883 #if TARGET_MACHO
15884 else if (TARGET_MACHO)
15886 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15887 machopic_output_function_base_name (file);
15888 putc ('\n', file);
15890 #endif
15891 else
15892 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15893 GOT_SYMBOL_NAME, LPREFIX, value);
15896 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15897 for the target. */
15899 void
15900 ix86_expand_clear (rtx dest)
15902 rtx tmp;
15904 /* We play register width games, which are only valid after reload. */
15905 gcc_assert (reload_completed);
15907 /* Avoid HImode and its attendant prefix byte. */
15908 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15909 dest = gen_rtx_REG (SImode, REGNO (dest));
15910 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15912 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15913 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15915 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15916 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15919 emit_insn (tmp);
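/* For illustration: the xor form chosen above assembles to
   "xorl %eax, %eax" (2 bytes, clobbers the flags, breaks the dependency
   on the old value), whereas the TARGET_USE_MOV0 path keeps
   "movl $0, %eax" (5 bytes) and leaves the flags untouched -- which is
   why the CLOBBER is added only for the xor variant.  */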
15922 /* X is an unchanging MEM. If it is a constant pool reference, return
15923 the constant pool rtx, else NULL. */
15926 maybe_get_pool_constant (rtx x)
15928 x = ix86_delegitimize_address (XEXP (x, 0));
15930 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15931 return get_pool_constant (x);
15933 return NULL_RTX;
15936 void
15937 ix86_expand_move (enum machine_mode mode, rtx operands[])
15939 rtx op0, op1;
15940 enum tls_model model;
15942 op0 = operands[0];
15943 op1 = operands[1];
15945 if (GET_CODE (op1) == SYMBOL_REF)
15947 model = SYMBOL_REF_TLS_MODEL (op1);
15948 if (model)
15950 op1 = legitimize_tls_address (op1, model, true);
15951 op1 = force_operand (op1, op0);
15952 if (op1 == op0)
15953 return;
15954 op1 = convert_to_mode (mode, op1, 1);
15956 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15957 && SYMBOL_REF_DLLIMPORT_P (op1))
15958 op1 = legitimize_dllimport_symbol (op1, false);
15960 else if (GET_CODE (op1) == CONST
15961 && GET_CODE (XEXP (op1, 0)) == PLUS
15962 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15964 rtx addend = XEXP (XEXP (op1, 0), 1);
15965 rtx symbol = XEXP (XEXP (op1, 0), 0);
15966 rtx tmp = NULL;
15968 model = SYMBOL_REF_TLS_MODEL (symbol);
15969 if (model)
15970 tmp = legitimize_tls_address (symbol, model, true);
15971 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15972 && SYMBOL_REF_DLLIMPORT_P (symbol))
15973 tmp = legitimize_dllimport_symbol (symbol, true);
15975 if (tmp)
15977 tmp = force_operand (tmp, NULL);
15978 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15979 op0, 1, OPTAB_DIRECT);
15980 if (tmp == op0)
15981 return;
15982 op1 = convert_to_mode (mode, tmp, 1);
15986 if ((flag_pic || MACHOPIC_INDIRECT)
15987 && symbolic_operand (op1, mode))
15989 if (TARGET_MACHO && !TARGET_64BIT)
15991 #if TARGET_MACHO
15992 /* dynamic-no-pic */
15993 if (MACHOPIC_INDIRECT)
15995 rtx temp = ((reload_in_progress
15996 || ((op0 && REG_P (op0))
15997 && mode == Pmode))
15998 ? op0 : gen_reg_rtx (Pmode));
15999 op1 = machopic_indirect_data_reference (op1, temp);
16000 if (MACHOPIC_PURE)
16001 op1 = machopic_legitimize_pic_address (op1, mode,
16002 temp == op1 ? 0 : temp);
16004 if (op0 != op1 && GET_CODE (op0) != MEM)
16006 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16007 emit_insn (insn);
16008 return;
16010 if (GET_CODE (op0) == MEM)
16011 op1 = force_reg (Pmode, op1);
16012 else
16014 rtx temp = op0;
16015 if (GET_CODE (temp) != REG)
16016 temp = gen_reg_rtx (Pmode);
16017 temp = legitimize_pic_address (op1, temp);
16018 if (temp == op0)
16019 return;
16020 op1 = temp;
16022 /* dynamic-no-pic */
16023 #endif
16025 else
16027 if (MEM_P (op0))
16028 op1 = force_reg (mode, op1);
16029 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16031 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16032 op1 = legitimize_pic_address (op1, reg);
16033 if (op0 == op1)
16034 return;
16035 op1 = convert_to_mode (mode, op1, 1);
16039 else
16041 if (MEM_P (op0)
16042 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16043 || !push_operand (op0, mode))
16044 && MEM_P (op1))
16045 op1 = force_reg (mode, op1);
16047 if (push_operand (op0, mode)
16048 && ! general_no_elim_operand (op1, mode))
16049 op1 = copy_to_mode_reg (mode, op1);
16051 /* Force large constants in 64bit compilation into a register
16052 to get them CSEed. */
16053 if (can_create_pseudo_p ()
16054 && (mode == DImode) && TARGET_64BIT
16055 && immediate_operand (op1, mode)
16056 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16057 && !register_operand (op0, mode)
16058 && optimize)
16059 op1 = copy_to_mode_reg (mode, op1);
16061 if (can_create_pseudo_p ()
16062 && FLOAT_MODE_P (mode)
16063 && GET_CODE (op1) == CONST_DOUBLE)
16065 /* If we are loading a floating point constant to a register,
16066 force the value to memory now, since we'll get better code
16067 out of the back end. */
16069 op1 = validize_mem (force_const_mem (mode, op1));
16070 if (!register_operand (op0, mode))
16072 rtx temp = gen_reg_rtx (mode);
16073 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16074 emit_move_insn (op0, temp);
16075 return;
16080 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16083 void
16084 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16086 rtx op0 = operands[0], op1 = operands[1];
16087 unsigned int align = GET_MODE_ALIGNMENT (mode);
16089 /* Force constants other than zero into memory. We do not know how
16090 the instructions used to build constants modify the upper 64 bits
16091 of the register; once we have that information we may be able
16092 to handle some of them more efficiently. */
16093 if (can_create_pseudo_p ()
16094 && register_operand (op0, mode)
16095 && (CONSTANT_P (op1)
16096 || (GET_CODE (op1) == SUBREG
16097 && CONSTANT_P (SUBREG_REG (op1))))
16098 && !standard_sse_constant_p (op1))
16099 op1 = validize_mem (force_const_mem (mode, op1));
16101 /* We need to check memory alignment for SSE mode since an attribute
16102 can make operands unaligned. */
16103 if (can_create_pseudo_p ()
16104 && SSE_REG_MODE_P (mode)
16105 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16106 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16108 rtx tmp[2];
16110 /* ix86_expand_vector_move_misalign() does not like constants ... */
16111 if (CONSTANT_P (op1)
16112 || (GET_CODE (op1) == SUBREG
16113 && CONSTANT_P (SUBREG_REG (op1))))
16114 op1 = validize_mem (force_const_mem (mode, op1));
16116 /* ... nor both arguments in memory. */
16117 if (!register_operand (op0, mode)
16118 && !register_operand (op1, mode))
16119 op1 = force_reg (mode, op1);
16121 tmp[0] = op0; tmp[1] = op1;
16122 ix86_expand_vector_move_misalign (mode, tmp);
16123 return;
16126 /* Make operand1 a register if it isn't already. */
16127 if (can_create_pseudo_p ()
16128 && !register_operand (op0, mode)
16129 && !register_operand (op1, mode))
16131 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16132 return;
16135 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16138 /* Split 32-byte AVX unaligned load and store if needed. */
16140 static void
16141 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16143 rtx m;
16144 rtx (*extract) (rtx, rtx, rtx);
16145 rtx (*load_unaligned) (rtx, rtx);
16146 rtx (*store_unaligned) (rtx, rtx);
16147 enum machine_mode mode;
16149 switch (GET_MODE (op0))
16151 default:
16152 gcc_unreachable ();
16153 case V32QImode:
16154 extract = gen_avx_vextractf128v32qi;
16155 load_unaligned = gen_avx_loaddqu256;
16156 store_unaligned = gen_avx_storedqu256;
16157 mode = V16QImode;
16158 break;
16159 case V8SFmode:
16160 extract = gen_avx_vextractf128v8sf;
16161 load_unaligned = gen_avx_loadups256;
16162 store_unaligned = gen_avx_storeups256;
16163 mode = V4SFmode;
16164 break;
16165 case V4DFmode:
16166 extract = gen_avx_vextractf128v4df;
16167 load_unaligned = gen_avx_loadupd256;
16168 store_unaligned = gen_avx_storeupd256;
16169 mode = V2DFmode;
16170 break;
16173 if (MEM_P (op1))
16175 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16177 rtx r = gen_reg_rtx (mode);
16178 m = adjust_address (op1, mode, 0);
16179 emit_move_insn (r, m);
16180 m = adjust_address (op1, mode, 16);
16181 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16182 emit_move_insn (op0, r);
16184 else
16185 emit_insn (load_unaligned (op0, op1));
16187 else if (MEM_P (op0))
16189 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16191 m = adjust_address (op0, mode, 0);
16192 emit_insn (extract (m, op1, const0_rtx));
16193 m = adjust_address (op0, mode, 16);
16194 emit_insn (extract (m, op1, const1_rtx));
16196 else
16197 emit_insn (store_unaligned (op0, op1));
16199 else
16200 gcc_unreachable ();
16203 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16204 straight to ix86_expand_vector_move. */
16205 /* Code generation for scalar reg-reg moves of single and double precision data:
16206 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16207 movaps reg, reg
16208 else
16209 movss reg, reg
16210 if (x86_sse_partial_reg_dependency == true)
16211 movapd reg, reg
16212 else
16213 movsd reg, reg
16215 Code generation for scalar loads of double precision data:
16216 if (x86_sse_split_regs == true)
16217 movlpd mem, reg (gas syntax)
16218 else
16219 movsd mem, reg
16221 Code generation for unaligned packed loads of single precision data
16222 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16223 if (x86_sse_unaligned_move_optimal)
16224 movups mem, reg
16226 if (x86_sse_partial_reg_dependency == true)
16228 xorps reg, reg
16229 movlps mem, reg
16230 movhps mem+8, reg
16232 else
16234 movlps mem, reg
16235 movhps mem+8, reg
16238 Code generation for unaligned packed loads of double precision data
16239 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16240 if (x86_sse_unaligned_move_optimal)
16241 movupd mem, reg
16243 if (x86_sse_split_regs == true)
16245 movlpd mem, reg
16246 movhpd mem+8, reg
16248 else
16250 movsd mem, reg
16251 movhpd mem+8, reg
16255 void
16256 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16258 rtx op0, op1, m;
16260 op0 = operands[0];
16261 op1 = operands[1];
16263 if (TARGET_AVX
16264 && GET_MODE_SIZE (mode) == 32)
16266 switch (GET_MODE_CLASS (mode))
16268 case MODE_VECTOR_INT:
16269 case MODE_INT:
16270 op0 = gen_lowpart (V32QImode, op0);
16271 op1 = gen_lowpart (V32QImode, op1);
16272 /* FALLTHRU */
16274 case MODE_VECTOR_FLOAT:
16275 ix86_avx256_split_vector_move_misalign (op0, op1);
16276 break;
16278 default:
16279 gcc_unreachable ();
16282 return;
16285 if (MEM_P (op1))
16287 /* ??? If we have typed data, then it would appear that using
16288 movdqu is the only way to get unaligned data loaded with
16289 integer type. */
16290 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16292 op0 = gen_lowpart (V16QImode, op0);
16293 op1 = gen_lowpart (V16QImode, op1);
16294 /* We will eventually emit movups based on insn attributes. */
16295 emit_insn (gen_sse2_loaddqu (op0, op1));
16297 else if (TARGET_SSE2 && mode == V2DFmode)
16299 rtx zero;
16301 if (TARGET_AVX
16302 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16303 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16304 || optimize_function_for_size_p (cfun))
16306 /* We will eventually emit movups based on insn attributes. */
16307 emit_insn (gen_sse2_loadupd (op0, op1));
16308 return;
16311 /* When SSE registers are split into halves, we can avoid
16312 writing to the top half twice. */
16313 if (TARGET_SSE_SPLIT_REGS)
16315 emit_clobber (op0);
16316 zero = op0;
16318 else
16320 /* ??? Not sure about the best option for the Intel chips.
16321 The following would seem to satisfy; the register is
16322 entirely cleared, breaking the dependency chain. We
16323 then store to the upper half, with a dependency depth
16324 of one. A rumor has it that Intel recommends two movsd
16325 followed by an unpacklpd, but this is unconfirmed. And
16326 given that the dependency depth of the unpacklpd would
16327 still be one, I'm not sure why this would be better. */
16328 zero = CONST0_RTX (V2DFmode);
16331 m = adjust_address (op1, DFmode, 0);
16332 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16333 m = adjust_address (op1, DFmode, 8);
16334 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16336 else
16338 if (TARGET_AVX
16339 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16340 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16341 || optimize_function_for_size_p (cfun))
16343 op0 = gen_lowpart (V4SFmode, op0);
16344 op1 = gen_lowpart (V4SFmode, op1);
16345 emit_insn (gen_sse_loadups (op0, op1));
16346 return;
16349 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16350 emit_move_insn (op0, CONST0_RTX (mode));
16351 else
16352 emit_clobber (op0);
16354 if (mode != V4SFmode)
16355 op0 = gen_lowpart (V4SFmode, op0);
16357 m = adjust_address (op1, V2SFmode, 0);
16358 emit_insn (gen_sse_loadlps (op0, op0, m));
16359 m = adjust_address (op1, V2SFmode, 8);
16360 emit_insn (gen_sse_loadhps (op0, op0, m));
16363 else if (MEM_P (op0))
16365 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16367 op0 = gen_lowpart (V16QImode, op0);
16368 op1 = gen_lowpart (V16QImode, op1);
16369 /* We will eventually emit movups based on insn attributes. */
16370 emit_insn (gen_sse2_storedqu (op0, op1));
16372 else if (TARGET_SSE2 && mode == V2DFmode)
16374 if (TARGET_AVX
16375 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16376 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16377 || optimize_function_for_size_p (cfun))
16378 /* We will eventually emit movups based on insn attributes. */
16379 emit_insn (gen_sse2_storeupd (op0, op1));
16380 else
16382 m = adjust_address (op0, DFmode, 0);
16383 emit_insn (gen_sse2_storelpd (m, op1));
16384 m = adjust_address (op0, DFmode, 8);
16385 emit_insn (gen_sse2_storehpd (m, op1));
16388 else
16390 if (mode != V4SFmode)
16391 op1 = gen_lowpart (V4SFmode, op1);
16393 if (TARGET_AVX
16394 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16395 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16396 || optimize_function_for_size_p (cfun))
16398 op0 = gen_lowpart (V4SFmode, op0);
16399 emit_insn (gen_sse_storeups (op0, op1));
16401 else
16403 m = adjust_address (op0, V2SFmode, 0);
16404 emit_insn (gen_sse_storelps (m, op1));
16405 m = adjust_address (op0, V2SFmode, 8);
16406 emit_insn (gen_sse_storehps (m, op1));
16410 else
16411 gcc_unreachable ();
16414 /* Expand a push in MODE. This is some mode for which we do not support
16415 proper push instructions, at least from the registers that we expect
16416 the value to live in. */
16418 void
16419 ix86_expand_push (enum machine_mode mode, rtx x)
16421 rtx tmp;
16423 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16424 GEN_INT (-GET_MODE_SIZE (mode)),
16425 stack_pointer_rtx, 1, OPTAB_DIRECT);
16426 if (tmp != stack_pointer_rtx)
16427 emit_move_insn (stack_pointer_rtx, tmp);
16429 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16431 /* When we push an operand onto the stack, it has to be aligned at least
16432 at the function argument boundary. However, since we don't have
16433 the argument type, we can't determine the actual argument
16434 boundary. */
16435 emit_move_insn (tmp, x);
16438 /* Helper function of ix86_fixup_binary_operands to canonicalize
16439 operand order. Returns true if the operands should be swapped. */
16441 static bool
16442 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16443 rtx operands[])
16445 rtx dst = operands[0];
16446 rtx src1 = operands[1];
16447 rtx src2 = operands[2];
16449 /* If the operation is not commutative, we can't do anything. */
16450 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16451 return false;
16453 /* Highest priority is that src1 should match dst. */
16454 if (rtx_equal_p (dst, src1))
16455 return false;
16456 if (rtx_equal_p (dst, src2))
16457 return true;
16459 /* Next highest priority is that immediate constants come second. */
16460 if (immediate_operand (src2, mode))
16461 return false;
16462 if (immediate_operand (src1, mode))
16463 return true;
16465 /* Lowest priority is that memory references should come second. */
16466 if (MEM_P (src2))
16467 return false;
16468 if (MEM_P (src1))
16469 return true;
16471 return false;
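/* For illustration: given (set (reg:SI d) (plus:SI (mem:SI m) (reg:SI d)))
   the operands are swapped so that src1 matches the destination, allowing
   a single "addl m, d" style instruction; an immediate or memory src1 is
   likewise moved into the second position when the destination does not
   already match.  */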
16475 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16476 destination to use for the operation. If different from the true
16477 destination in operands[0], a copy operation will be required. */
16480 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16481 rtx operands[])
16483 rtx dst = operands[0];
16484 rtx src1 = operands[1];
16485 rtx src2 = operands[2];
16487 /* Canonicalize operand order. */
16488 if (ix86_swap_binary_operands_p (code, mode, operands))
16490 rtx temp;
16492 /* It is invalid to swap operands of different modes. */
16493 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16495 temp = src1;
16496 src1 = src2;
16497 src2 = temp;
16500 /* Both source operands cannot be in memory. */
16501 if (MEM_P (src1) && MEM_P (src2))
16503 /* Optimization: Only read from memory once. */
16504 if (rtx_equal_p (src1, src2))
16506 src2 = force_reg (mode, src2);
16507 src1 = src2;
16509 else
16510 src2 = force_reg (mode, src2);
16513 /* If the destination is memory, and we do not have matching source
16514 operands, do things in registers. */
16515 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16516 dst = gen_reg_rtx (mode);
16518 /* Source 1 cannot be a constant. */
16519 if (CONSTANT_P (src1))
16520 src1 = force_reg (mode, src1);
16522 /* Source 1 cannot be a non-matching memory. */
16523 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16524 src1 = force_reg (mode, src1);
16526 /* Improve address combine. */
16527 if (code == PLUS
16528 && GET_MODE_CLASS (mode) == MODE_INT
16529 && MEM_P (src2))
16530 src2 = force_reg (mode, src2);
16532 operands[1] = src1;
16533 operands[2] = src2;
16534 return dst;
16537 /* Similarly, but assume that the destination has already been
16538 set up properly. */
16540 void
16541 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16542 enum machine_mode mode, rtx operands[])
16544 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16545 gcc_assert (dst == operands[0]);
16548 /* Attempt to expand a binary operator. Make the expansion closer to the
16549 actual machine than just general_operand, which would allow 3 separate
16550 memory references (one output, two input) in a single insn. */
16552 void
16553 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16554 rtx operands[])
16556 rtx src1, src2, dst, op, clob;
16558 dst = ix86_fixup_binary_operands (code, mode, operands);
16559 src1 = operands[1];
16560 src2 = operands[2];
16562 /* Emit the instruction. */
16564 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16565 if (reload_in_progress)
16567 /* Reload doesn't know about the flags register, and doesn't know that
16568 it doesn't want to clobber it. We can only do this with PLUS. */
16569 gcc_assert (code == PLUS);
16570 emit_insn (op);
16572 else if (reload_completed
16573 && code == PLUS
16574 && !rtx_equal_p (dst, src1))
16576 /* This is going to be an LEA; avoid splitting it later. */
16577 emit_insn (op);
16579 else
16581 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16582 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16585 /* Fix up the destination if needed. */
16586 if (dst != operands[0])
16587 emit_move_insn (operands[0], dst);
16590 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16591 the given OPERANDS. */
16593 void
16594 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16595 rtx operands[])
16597 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16598 if (GET_CODE (operands[1]) == SUBREG)
16600 op1 = operands[1];
16601 op2 = operands[2];
16603 else if (GET_CODE (operands[2]) == SUBREG)
16605 op1 = operands[2];
16606 op2 = operands[1];
16608 /* Optimize (__m128i) d | (__m128i) e and similar code
16609 when d and e are float vectors into float vector logical
16610 insn. In C/C++ without using intrinsics there is no other way
16611 to express vector logical operation on float vectors than
16612 to cast them temporarily to integer vectors. */
16613 if (op1
16614 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16615 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16616 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16617 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16618 && SUBREG_BYTE (op1) == 0
16619 && (GET_CODE (op2) == CONST_VECTOR
16620 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16621 && SUBREG_BYTE (op2) == 0))
16622 && can_create_pseudo_p ())
16624 rtx dst;
16625 switch (GET_MODE (SUBREG_REG (op1)))
16627 case V4SFmode:
16628 case V8SFmode:
16629 case V2DFmode:
16630 case V4DFmode:
16631 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16632 if (GET_CODE (op2) == CONST_VECTOR)
16634 op2 = gen_lowpart (GET_MODE (dst), op2);
16635 op2 = force_reg (GET_MODE (dst), op2);
16637 else
16639 op1 = operands[1];
16640 op2 = SUBREG_REG (operands[2]);
16641 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16642 op2 = force_reg (GET_MODE (dst), op2);
16644 op1 = SUBREG_REG (op1);
16645 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16646 op1 = force_reg (GET_MODE (dst), op1);
16647 emit_insn (gen_rtx_SET (VOIDmode, dst,
16648 gen_rtx_fmt_ee (code, GET_MODE (dst),
16649 op1, op2)));
16650 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16651 return;
16652 default:
16653 break;
16656 if (!nonimmediate_operand (operands[1], mode))
16657 operands[1] = force_reg (mode, operands[1]);
16658 if (!nonimmediate_operand (operands[2], mode))
16659 operands[2] = force_reg (mode, operands[2]);
16660 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16661 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16662 gen_rtx_fmt_ee (code, mode, operands[1],
16663 operands[2])));
16666 /* Return TRUE or FALSE depending on whether the binary operator meets the
16667 appropriate constraints. */
16669 bool
16670 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16671 rtx operands[3])
16673 rtx dst = operands[0];
16674 rtx src1 = operands[1];
16675 rtx src2 = operands[2];
16677 /* Both source operands cannot be in memory. */
16678 if (MEM_P (src1) && MEM_P (src2))
16679 return false;
16681 /* Canonicalize operand order for commutative operators. */
16682 if (ix86_swap_binary_operands_p (code, mode, operands))
16684 rtx temp = src1;
16685 src1 = src2;
16686 src2 = temp;
16689 /* If the destination is memory, we must have a matching source operand. */
16690 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16691 return false;
16693 /* Source 1 cannot be a constant. */
16694 if (CONSTANT_P (src1))
16695 return false;
16697 /* Source 1 cannot be a non-matching memory. */
16698 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16699 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16700 return (code == AND
16701 && (mode == HImode
16702 || mode == SImode
16703 || (TARGET_64BIT && mode == DImode))
16704 && satisfies_constraint_L (src2));
16706 return true;
16709 /* Attempt to expand a unary operator. Make the expansion closer to the
16710 actual machine than just general_operand, which would allow 2 separate
16711 memory references (one output, one input) in a single insn. */
16713 void
16714 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16715 rtx operands[])
16717 int matching_memory;
16718 rtx src, dst, op, clob;
16720 dst = operands[0];
16721 src = operands[1];
16723 /* If the destination is memory, and we do not have matching source
16724 operands, do things in registers. */
16725 matching_memory = 0;
16726 if (MEM_P (dst))
16728 if (rtx_equal_p (dst, src))
16729 matching_memory = 1;
16730 else
16731 dst = gen_reg_rtx (mode);
16734 /* When source operand is memory, destination must match. */
16735 if (MEM_P (src) && !matching_memory)
16736 src = force_reg (mode, src);
16738 /* Emit the instruction. */
16740 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16741 if (reload_in_progress || code == NOT)
16743 /* Reload doesn't know about the flags register, and doesn't know that
16744 it doesn't want to clobber it. */
16745 gcc_assert (code == NOT);
16746 emit_insn (op);
16748 else
16750 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16751 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16754 /* Fix up the destination if needed. */
16755 if (dst != operands[0])
16756 emit_move_insn (operands[0], dst);
16759 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16760 divisor are within the range [0-255]. */
16762 void
16763 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16764 bool signed_p)
16766 rtx end_label, qimode_label;
16767 rtx insn, div, mod;
16768 rtx scratch, tmp0, tmp1, tmp2;
16769 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16770 rtx (*gen_zero_extend) (rtx, rtx);
16771 rtx (*gen_test_ccno_1) (rtx, rtx);
16773 switch (mode)
16775 case SImode:
16776 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16777 gen_test_ccno_1 = gen_testsi_ccno_1;
16778 gen_zero_extend = gen_zero_extendqisi2;
16779 break;
16780 case DImode:
16781 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16782 gen_test_ccno_1 = gen_testdi_ccno_1;
16783 gen_zero_extend = gen_zero_extendqidi2;
16784 break;
16785 default:
16786 gcc_unreachable ();
16789 end_label = gen_label_rtx ();
16790 qimode_label = gen_label_rtx ();
16792 scratch = gen_reg_rtx (mode);
16794 /* Use 8bit unsigned divmod if dividend and divisor are within
16795 the range [0-255]. */
16796 emit_move_insn (scratch, operands[2]);
16797 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16798 scratch, 1, OPTAB_DIRECT);
16799 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16800 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16801 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16802 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16803 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16804 pc_rtx);
16805 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16806 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16807 JUMP_LABEL (insn) = qimode_label;
16809 /* Generate the original signed/unsigned divmod. */
16810 div = gen_divmod4_1 (operands[0], operands[1],
16811 operands[2], operands[3]);
16812 emit_insn (div);
16814 /* Branch to the end. */
16815 emit_jump_insn (gen_jump (end_label));
16816 emit_barrier ();
16818 /* Generate 8bit unsigned divide. */
16819 emit_label (qimode_label);
16820 /* Don't use operands[0] for result of 8bit divide since not all
16821 registers support QImode ZERO_EXTRACT. */
16822 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16823 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16824 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16825 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16827 if (signed_p)
16829 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16830 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16832 else
16834 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16835 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16838 /* Extract remainder from AH. */
16839 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16840 if (REG_P (operands[1]))
16841 insn = emit_move_insn (operands[1], tmp1);
16842 else
16844 /* Need a new scratch register since the old one has the result
16845 of the 8bit divide. */
16846 scratch = gen_reg_rtx (mode);
16847 emit_move_insn (scratch, tmp1);
16848 insn = emit_move_insn (operands[1], scratch);
16850 set_unique_reg_note (insn, REG_EQUAL, mod);
16852 /* Zero extend quotient from AL. */
16853 tmp1 = gen_lowpart (QImode, tmp0);
16854 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16855 set_unique_reg_note (insn, REG_EQUAL, div);
16857 emit_label (end_label);
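/* A rough sketch of the sequence emitted above (register names and labels
   are illustrative only):

       movl   <dividend>, <scratch>
       orl    <divisor>, <scratch>
       testl  $-0x100, <scratch>
       je     .Lqimode            # both operands fit in [0-255]
       <full 32bit/64bit (i)div>
       jmp    .Lend
     .Lqimode:
       <8bit unsigned divide; AL = quotient, AH = remainder>
     .Lend:  */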
16860 #define LEA_MAX_STALL (3)
16861 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
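/* Background (an assumption about the intended target, not stated here):
   these limits model in-order Atom-class pipelines, where an lea executes
   in the address-generation unit and may stall when one of its inputs was
   produced by a non-AGU instruction only a few cycles earlier.  The
   helpers below therefore measure distances in half-cycles and stop after
   LEA_SEARCH_THRESHOLD steps; their results feed the lea splitting
   heuristics such as ix86_lea_outperforms elsewhere in this file.  */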
16863 /* Increase given DISTANCE in half-cycles according to
16864 dependencies between PREV and NEXT instructions.
16865 Add 1 half-cycle if there is no dependency and
16866 go to the next cycle if there is some dependency. */
16868 static unsigned int
16869 increase_distance (rtx prev, rtx next, unsigned int distance)
16871 df_ref *use_rec;
16872 df_ref *def_rec;
16874 if (!prev || !next)
16875 return distance + (distance & 1) + 2;
16877 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16878 return distance + 1;
16880 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16881 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16882 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16883 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16884 return distance + (distance & 1) + 2;
16886 return distance + 1;
16889 /* Function checks if instruction INSN defines register number
16890 REGNO1 or REGNO2. */
16892 static bool
16893 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16894 rtx insn)
16896 df_ref *def_rec;
16898 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16899 if (DF_REF_REG_DEF_P (*def_rec)
16900 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16901 && (regno1 == DF_REF_REGNO (*def_rec)
16902 || regno2 == DF_REF_REGNO (*def_rec)))
16904 return true;
16907 return false;
16910 /* Function checks if instruction INSN uses register number
16911 REGNO as a part of address expression. */
16913 static bool
16914 insn_uses_reg_mem (unsigned int regno, rtx insn)
16916 df_ref *use_rec;
16918 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16919 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16920 return true;
16922 return false;
16925 /* Search backward for non-agu definition of register number REGNO1
16926 or register number REGNO2 in basic block starting from instruction
16927 START up to head of basic block or instruction INSN.
16929 The function stores true in *FOUND if a definition was found,
16930 and false otherwise.
16932 Distance in half-cycles between START and found instruction or head
16933 of BB is added to DISTANCE and returned. */
16935 static int
16936 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16937 rtx insn, int distance,
16938 rtx start, bool *found)
16940 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16941 rtx prev = start;
16942 rtx next = NULL;
16944 *found = false;
16946 while (prev
16947 && prev != insn
16948 && distance < LEA_SEARCH_THRESHOLD)
16950 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16952 distance = increase_distance (prev, next, distance);
16953 if (insn_defines_reg (regno1, regno2, prev))
16955 if (recog_memoized (prev) < 0
16956 || get_attr_type (prev) != TYPE_LEA)
16958 *found = true;
16959 return distance;
16963 next = prev;
16965 if (prev == BB_HEAD (bb))
16966 break;
16968 prev = PREV_INSN (prev);
16971 return distance;
16974 /* Search backward for non-agu definition of register number REGNO1
16975 or register number REGNO2 in INSN's basic block until
16976 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16977 2. Reach neighbour BBs boundary, or
16978 3. Reach agu definition.
16979 Returns the distance between the non-agu definition point and INSN.
16980 If no definition point, returns -1. */
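/* The helpers above accumulate DISTANCE in half-cycles; the value returned
   here is converted to whole cycles. */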
16982 static int
16983 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16984 rtx insn)
16986 basic_block bb = BLOCK_FOR_INSN (insn);
16987 int distance = 0;
16988 bool found = false;
16990 if (insn != BB_HEAD (bb))
16991 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16992 distance, PREV_INSN (insn),
16993 &found);
16995 if (!found && distance < LEA_SEARCH_THRESHOLD)
16997 edge e;
16998 edge_iterator ei;
16999 bool simple_loop = false;
17001 FOR_EACH_EDGE (e, ei, bb->preds)
17002 if (e->src == bb)
17004 simple_loop = true;
17005 break;
17008 if (simple_loop)
17009 distance = distance_non_agu_define_in_bb (regno1, regno2,
17010 insn, distance,
17011 BB_END (bb), &found);
17012 else
17014 int shortest_dist = -1;
17015 bool found_in_bb = false;
17017 FOR_EACH_EDGE (e, ei, bb->preds)
17019 int bb_dist
17020 = distance_non_agu_define_in_bb (regno1, regno2,
17021 insn, distance,
17022 BB_END (e->src),
17023 &found_in_bb);
17024 if (found_in_bb)
17026 if (shortest_dist < 0)
17027 shortest_dist = bb_dist;
17028 else if (bb_dist > 0)
17029 shortest_dist = MIN (bb_dist, shortest_dist);
17031 found = true;
17035 distance = shortest_dist;
17039 /* get_attr_type may modify recog data. We want to make sure
17040 that recog data is valid for instruction INSN, on which
17041 distance_non_agu_define is called. INSN is unchanged here. */
17042 extract_insn_cached (insn);
17044 if (!found)
17045 return -1;
17047 return distance >> 1;
17050 /* Return the distance in half-cycles between INSN and the next
17051 insn that uses register number REGNO in a memory address, added
17052 to DISTANCE. Return -1 if REGNO is set.
17054 Put true value into *FOUND if register usage was found and
17055 false otherwise.
17056 Put true value into *REDEFINED if register redefinition was
17057 found and false otherwise. */
17059 static int
17060 distance_agu_use_in_bb (unsigned int regno,
17061 rtx insn, int distance, rtx start,
17062 bool *found, bool *redefined)
17064 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17065 rtx next = start;
17066 rtx prev = NULL;
17068 *found = false;
17069 *redefined = false;
17071 while (next
17072 && next != insn
17073 && distance < LEA_SEARCH_THRESHOLD)
17075 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17077 distance = increase_distance (prev, next, distance);
17078 if (insn_uses_reg_mem (regno, next))
17080 /* Return DISTANCE if OP0 is used in memory
17081 address in NEXT. */
17082 *found = true;
17083 return distance;
17086 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17088 /* Return -1 if OP0 is set in NEXT. */
17089 *redefined = true;
17090 return -1;
17093 prev = next;
17096 if (next == BB_END (bb))
17097 break;
17099 next = NEXT_INSN (next);
17102 return distance;
17105 /* Return the distance between INSN and the next insn that uses
17106 register number REGNO0 in a memory address. Return -1 if no such
17107 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17109 static int
17110 distance_agu_use (unsigned int regno0, rtx insn)
17112 basic_block bb = BLOCK_FOR_INSN (insn);
17113 int distance = 0;
17114 bool found = false;
17115 bool redefined = false;
17117 if (insn != BB_END (bb))
17118 distance = distance_agu_use_in_bb (regno0, insn, distance,
17119 NEXT_INSN (insn),
17120 &found, &redefined);
17122 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17124 edge e;
17125 edge_iterator ei;
17126 bool simple_loop = false;
17128 FOR_EACH_EDGE (e, ei, bb->succs)
17129 if (e->dest == bb)
17131 simple_loop = true;
17132 break;
17135 if (simple_loop)
17136 distance = distance_agu_use_in_bb (regno0, insn,
17137 distance, BB_HEAD (bb),
17138 &found, &redefined);
17139 else
17141 int shortest_dist = -1;
17142 bool found_in_bb = false;
17143 bool redefined_in_bb = false;
17145 FOR_EACH_EDGE (e, ei, bb->succs)
17147 int bb_dist
17148 = distance_agu_use_in_bb (regno0, insn,
17149 distance, BB_HEAD (e->dest),
17150 &found_in_bb, &redefined_in_bb);
17151 if (found_in_bb)
17153 if (shortest_dist < 0)
17154 shortest_dist = bb_dist;
17155 else if (bb_dist > 0)
17156 shortest_dist = MIN (bb_dist, shortest_dist);
17158 found = true;
17162 distance = shortest_dist;
17166 if (!found || redefined)
17167 return -1;
17169 return distance >> 1;
17172 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17173 there is a dilemma of choosing LEA or ADD.
17174 Negative value: ADD is preferred over LEA
17175 Zero: Neutral
17176 Positive value: LEA is preferred over ADD. */
17177 #define IX86_LEA_PRIORITY 0
17179 /* Return true if using the lea INSN has a performance advantage
17180 over a sequence of instructions. The instruction sequence has
17181 SPLIT_COST cycles higher latency than the lea. */
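/* For example, with IX86_LEA_PRIORITY 0 and SPLIT_COST 1, a non-AGU
   producer two cycles back (dist_define 2) and an AGU consumer one cycle
   ahead (dist_use 1) give 2 + 1 >= 1, so the lea is kept. */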
17183 static bool
17184 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17185 unsigned int regno2, int split_cost)
17187 int dist_define, dist_use;
17189 dist_define = distance_non_agu_define (regno1, regno2, insn);
17190 dist_use = distance_agu_use (regno0, insn);
17192 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17194 /* If there is no non-AGU operand definition, no AGU
17195 operand usage and the split cost is 0, then both the lea
17196 and non-lea variants have the same priority. Currently
17197 we prefer lea for 64-bit code and non-lea for 32-bit
17198 code. */
17199 if (dist_use < 0 && split_cost == 0)
17200 return TARGET_64BIT || IX86_LEA_PRIORITY;
17201 else
17202 return true;
17205 /* With a longer definition distance, lea becomes preferable.
17206 Here we adjust the distance to take into account the splitting cost
17207 and lea priority. */
17208 dist_define += split_cost + IX86_LEA_PRIORITY;
17210 /* If there is no use in a memory address, then we just check
17211 that the split cost exceeds the AGU stall. */
17212 if (dist_use < 0)
17213 return dist_define > LEA_MAX_STALL;
17215 /* If this insn has both backward non-agu dependence and forward
17216 agu dependence, the one with the shorter distance takes effect. */
17217 return dist_define >= dist_use;
17220 /* Return true if it is legal to clobber flags by INSN and
17221 false otherwise. */
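/* The scan below walks forward from INSN to the end of its basic block:
   a use of the flags register before any redefinition makes the clobber
   illegal, while a redefinition seen first makes it safe. If neither is
   found, the flags must not be live on exit from the block. */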
17223 static bool
17224 ix86_ok_to_clobber_flags (rtx insn)
17226 basic_block bb = BLOCK_FOR_INSN (insn);
17227 df_ref *use;
17228 bitmap live;
17230 while (insn)
17232 if (NONDEBUG_INSN_P (insn))
17234 for (use = DF_INSN_USES (insn); *use; use++)
17235 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17236 return false;
17238 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17239 return true;
17242 if (insn == BB_END (bb))
17243 break;
17245 insn = NEXT_INSN (insn);
17248 live = df_get_live_out (bb);
17249 return !REGNO_REG_SET_P (live, FLAGS_REG);
17252 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17253 move and add to avoid AGU stalls. */
17255 bool
17256 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17258 unsigned int regno0, regno1, regno2;
17260 /* Check if we need to optimize. */
17261 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17262 return false;
17264 /* Check it is correct to split here. */
17265 if (!ix86_ok_to_clobber_flags (insn))
17266 return false;
17268 regno0 = true_regnum (operands[0]);
17269 regno1 = true_regnum (operands[1]);
17270 regno2 = true_regnum (operands[2]);
17272 /* We need to split only adds with a non-destructive
17273 destination operand. */
17274 if (regno0 == regno1 || regno0 == regno2)
17275 return false;
17276 else
17277 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17280 /* Return true if we should emit lea instruction instead of mov
17281 instruction. */
17283 bool
17284 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17286 unsigned int regno0, regno1;
17288 /* Check if we need to optimize. */
17289 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17290 return false;
17292 /* Use lea for reg to reg moves only. */
17293 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17294 return false;
17296 regno0 = true_regnum (operands[0]);
17297 regno1 = true_regnum (operands[1]);
17299 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17302 /* Return true if we need to split lea into a sequence of
17303 instructions to avoid AGU stalls. */
17305 bool
17306 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17308 unsigned int regno0, regno1, regno2;
17309 int split_cost;
17310 struct ix86_address parts;
17311 int ok;
17313 /* Check if we need to optimize. */
17314 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17315 return false;
17317 /* Check it is correct to split here. */
17318 if (!ix86_ok_to_clobber_flags (insn))
17319 return false;
17321 ok = ix86_decompose_address (operands[1], &parts);
17322 gcc_assert (ok);
17324 /* There should be at least two components in the address. */
17325 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17326 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17327 return false;
17329 /* We should not split into an add if a non-legitimate PIC
17330 operand is used as the displacement. */
17331 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17332 return false;
17334 regno0 = true_regnum (operands[0]);
17335 regno1 = INVALID_REGNUM;
17336 regno2 = INVALID_REGNUM;
17338 if (parts.base)
17339 regno1 = true_regnum (parts.base);
17340 if (parts.index)
17341 regno2 = true_regnum (parts.index);
17343 split_cost = 0;
17345 /* Compute how many cycles we will add to execution time
17346 if we split the lea into a sequence of instructions. */
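/* For example, lea 4(%ebx,%ecx,2), %eax with %eax distinct from both
   sources is costed as 1 (mov) + 1 (add of base and index) + 1 (shift
   for the scale) + 1 (add of the displacement) - 1 (the lea being
   removed) = 3. */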
17347 if (parts.base || parts.index)
17349 /* Have to use a mov instruction if the non-destructive
17350 destination form is used. */
17351 if (regno1 != regno0 && regno2 != regno0)
17352 split_cost += 1;
17354 /* Have to add index to base if both exist. */
17355 if (parts.base && parts.index)
17356 split_cost += 1;
17358 /* Have to use shift and adds if scale is 2 or greater. */
17359 if (parts.scale > 1)
17361 if (regno0 != regno1)
17362 split_cost += 1;
17363 else if (regno2 == regno0)
17364 split_cost += 4;
17365 else
17366 split_cost += parts.scale;
17369 /* Have to use an add instruction with an immediate if
17370 disp is nonzero. */
17371 if (parts.disp && parts.disp != const0_rtx)
17372 split_cost += 1;
17374 /* Subtract the price of lea. */
17375 split_cost -= 1;
17378 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17381 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17382 matches destination. RTX includes clobber of FLAGS_REG. */
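/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
   (parallel [(set dst (plus:SI dst src))
              (clobber (reg:CC FLAGS_REG))]). */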
17384 static void
17385 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17386 rtx dst, rtx src)
17388 rtx op, clob;
17390 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17391 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17393 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17396 /* Return true if the definition of REGNO1 is nearest to the insn. */
17398 static bool
17399 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17401 rtx prev = insn;
17402 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17404 if (insn == start)
17405 return false;
17406 while (prev && prev != start)
17408 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17410 prev = PREV_INSN (prev);
17411 continue;
17413 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17414 return true;
17415 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17416 return false;
17417 prev = PREV_INSN (prev);
17420 /* None of the regs is defined in the bb. */
17421 return false;
17424 /* Split a lea instruction into a sequence of instructions
17425 which are executed on the ALU to avoid AGU stalls.
17426 It is assumed that it is allowed to clobber the flags register
17427 at the lea position. */
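/* For example, lea (%ebx,%ecx,4), %eax can be split into (roughly)
   mov %ecx, %eax; sal $2, %eax; add %ebx, %eax. */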
17429 void
17430 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17432 unsigned int regno0, regno1, regno2;
17433 struct ix86_address parts;
17434 rtx target, tmp;
17435 int ok, adds;
17437 ok = ix86_decompose_address (operands[1], &parts);
17438 gcc_assert (ok);
17440 target = gen_lowpart (mode, operands[0]);
17442 regno0 = true_regnum (target);
17443 regno1 = INVALID_REGNUM;
17444 regno2 = INVALID_REGNUM;
17446 if (parts.base)
17448 parts.base = gen_lowpart (mode, parts.base);
17449 regno1 = true_regnum (parts.base);
17452 if (parts.index)
17454 parts.index = gen_lowpart (mode, parts.index);
17455 regno2 = true_regnum (parts.index);
17458 if (parts.disp)
17459 parts.disp = gen_lowpart (mode, parts.disp);
17461 if (parts.scale > 1)
17463 /* Case r1 = r1 + ... */
17464 if (regno1 == regno0)
17466 /* If we have the case r1 = r1 + C * r1, then we
17467 would have to use multiplication, which is very
17468 expensive. Assume the cost model is wrong if we
17469 see such a case here. */
17470 gcc_assert (regno2 != regno0);
17472 for (adds = parts.scale; adds > 0; adds--)
17473 ix86_emit_binop (PLUS, mode, target, parts.index);
17475 else
17477 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17478 if (regno0 != regno2)
17479 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17481 /* Use shift for scaling. */
17482 ix86_emit_binop (ASHIFT, mode, target,
17483 GEN_INT (exact_log2 (parts.scale)));
17485 if (parts.base)
17486 ix86_emit_binop (PLUS, mode, target, parts.base);
17488 if (parts.disp && parts.disp != const0_rtx)
17489 ix86_emit_binop (PLUS, mode, target, parts.disp);
17492 else if (!parts.base && !parts.index)
17494 gcc_assert(parts.disp);
17495 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17497 else
17499 if (!parts.base)
17501 if (regno0 != regno2)
17502 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17504 else if (!parts.index)
17506 if (regno0 != regno1)
17507 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17509 else
17511 if (regno0 == regno1)
17512 tmp = parts.index;
17513 else if (regno0 == regno2)
17514 tmp = parts.base;
17515 else
17517 rtx tmp1;
17519 /* Find better operand for SET instruction, depending
17520 on which definition is farther from the insn. */
17521 if (find_nearest_reg_def (insn, regno1, regno2))
17522 tmp = parts.index, tmp1 = parts.base;
17523 else
17524 tmp = parts.base, tmp1 = parts.index;
17526 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17528 if (parts.disp && parts.disp != const0_rtx)
17529 ix86_emit_binop (PLUS, mode, target, parts.disp);
17531 ix86_emit_binop (PLUS, mode, target, tmp1);
17532 return;
17535 ix86_emit_binop (PLUS, mode, target, tmp);
17538 if (parts.disp && parts.disp != const0_rtx)
17539 ix86_emit_binop (PLUS, mode, target, parts.disp);
17543 /* Return true if it is ok to optimize an ADD operation to an LEA
17544 operation to avoid flag register consumption. For most processors,
17545 ADD is faster than LEA. For processors like Atom, if the
17546 destination register of the LEA holds an actual address which will be
17547 used soon, LEA is better; otherwise ADD is better. */
17549 bool
17550 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17552 unsigned int regno0 = true_regnum (operands[0]);
17553 unsigned int regno1 = true_regnum (operands[1]);
17554 unsigned int regno2 = true_regnum (operands[2]);
17556 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
17557 if (regno0 != regno1 && regno0 != regno2)
17558 return true;
17560 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17561 return false;
17563 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17566 /* Return true if destination reg of SET_BODY is shift count of
17567 USE_BODY. */
17569 static bool
17570 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17572 rtx set_dest;
17573 rtx shift_rtx;
17574 int i;
17576 /* Retrieve destination of SET_BODY. */
17577 switch (GET_CODE (set_body))
17579 case SET:
17580 set_dest = SET_DEST (set_body);
17581 if (!set_dest || !REG_P (set_dest))
17582 return false;
17583 break;
17584 case PARALLEL:
17585 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17586 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17587 use_body))
17588 return true;
17589 default:
17590 return false;
17591 break;
17594 /* Retrieve shift count of USE_BODY. */
17595 switch (GET_CODE (use_body))
17597 case SET:
17598 shift_rtx = XEXP (use_body, 1);
17599 break;
17600 case PARALLEL:
17601 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17602 if (ix86_dep_by_shift_count_body (set_body,
17603 XVECEXP (use_body, 0, i)))
17604 return true;
17605 default:
17606 return false;
17607 break;
17610 if (shift_rtx
17611 && (GET_CODE (shift_rtx) == ASHIFT
17612 || GET_CODE (shift_rtx) == LSHIFTRT
17613 || GET_CODE (shift_rtx) == ASHIFTRT
17614 || GET_CODE (shift_rtx) == ROTATE
17615 || GET_CODE (shift_rtx) == ROTATERT))
17617 rtx shift_count = XEXP (shift_rtx, 1);
17619 /* Return true if shift count is dest of SET_BODY. */
17620 if (REG_P (shift_count))
17622 /* Add this check since the function can be invoked before register
17623 allocation by the pre-reload scheduler. */
17624 if (reload_completed
17625 && true_regnum (set_dest) == true_regnum (shift_count))
17626 return true;
17627 else if (REGNO (set_dest) == REGNO (shift_count))
17628 return true;
17632 return false;
17635 /* Return true if destination reg of SET_INSN is shift count of
17636 USE_INSN. */
17638 bool
17639 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17641 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17642 PATTERN (use_insn));
17645 /* Return TRUE or FALSE depending on whether the unary operator meets the
17646 appropriate constraints. */
17648 bool
17649 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17650 enum machine_mode mode ATTRIBUTE_UNUSED,
17651 rtx operands[2] ATTRIBUTE_UNUSED)
17653 /* If one of operands is memory, source and destination must match. */
17654 if ((MEM_P (operands[0])
17655 || MEM_P (operands[1]))
17656 && ! rtx_equal_p (operands[0], operands[1]))
17657 return false;
17658 return true;
17661 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17662 are ok, keeping in mind the possible movddup alternative. */
17664 bool
17665 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17667 if (MEM_P (operands[0]))
17668 return rtx_equal_p (operands[0], operands[1 + high]);
17669 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17670 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17671 return true;
17674 /* Post-reload splitter for converting an SF or DFmode value in an
17675 SSE register into an unsigned SImode. */
17677 void
17678 ix86_split_convert_uns_si_sse (rtx operands[])
17680 enum machine_mode vecmode;
17681 rtx value, large, zero_or_two31, input, two31, x;
17683 large = operands[1];
17684 zero_or_two31 = operands[2];
17685 input = operands[3];
17686 two31 = operands[4];
17687 vecmode = GET_MODE (large);
17688 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17690 /* Load up the value into the low element. We must ensure that the other
17691 elements are valid floats -- zero is the easiest such value. */
17692 if (MEM_P (input))
17694 if (vecmode == V4SFmode)
17695 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17696 else
17697 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17699 else
17701 input = gen_rtx_REG (vecmode, REGNO (input));
17702 emit_move_insn (value, CONST0_RTX (vecmode));
17703 if (vecmode == V4SFmode)
17704 emit_insn (gen_sse_movss (value, value, input));
17705 else
17706 emit_insn (gen_sse2_movsd (value, value, input));
17709 emit_move_insn (large, two31);
17710 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17712 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17713 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17715 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17716 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17718 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17719 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17721 large = gen_rtx_REG (V4SImode, REGNO (large));
17722 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17724 x = gen_rtx_REG (V4SImode, REGNO (value));
17725 if (vecmode == V4SFmode)
17726 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17727 else
17728 emit_insn (gen_sse2_cvttpd2dq (x, value));
17729 value = x;
17731 emit_insn (gen_xorv4si3 (value, value, large));
17734 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17735 Expects the 64-bit DImode to be supplied in a pair of integral
17736 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17737 -mfpmath=sse, !optimize_size only. */
17739 void
17740 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17742 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17743 rtx int_xmm, fp_xmm;
17744 rtx biases, exponents;
17745 rtx x;
17747 int_xmm = gen_reg_rtx (V4SImode);
17748 if (TARGET_INTER_UNIT_MOVES)
17749 emit_insn (gen_movdi_to_sse (int_xmm, input));
17750 else if (TARGET_SSE_SPLIT_REGS)
17752 emit_clobber (int_xmm);
17753 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17755 else
17757 x = gen_reg_rtx (V2DImode);
17758 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17759 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17762 x = gen_rtx_CONST_VECTOR (V4SImode,
17763 gen_rtvec (4, GEN_INT (0x43300000UL),
17764 GEN_INT (0x45300000UL),
17765 const0_rtx, const0_rtx));
17766 exponents = validize_mem (force_const_mem (V4SImode, x));
17768 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17769 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17771 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17772 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17773 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17774 (0x1.0p84 + double(fp_value_hi_xmm)).
17775 Note these exponents differ by 32. */
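/* After the bias subtraction below, the two doubles are exactly lo and
   hi * 0x1p32, so a single addition reconstructs the unsigned 64-bit
   value, correctly rounded to DFmode. */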
17777 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17779 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17780 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17781 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17782 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17783 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17784 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17785 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17786 biases = validize_mem (force_const_mem (V2DFmode, biases));
17787 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17789 /* Add the upper and lower DFmode values together. */
17790 if (TARGET_SSE3)
17791 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17792 else
17794 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17795 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17796 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17799 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17802 /* Not used, but eases macroization of patterns. */
17803 void
17804 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17805 rtx input ATTRIBUTE_UNUSED)
17807 gcc_unreachable ();
17810 /* Convert an unsigned SImode value into a DFmode. Only currently used
17811 for SSE, but applicable anywhere. */
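/* The input has its sign bit flipped by adding INT_MIN, is converted as
   a signed SImode value, and then 0x1p31 is added back in DFmode; DFmode
   has enough precision for the result to be exact. */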
17813 void
17814 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17816 REAL_VALUE_TYPE TWO31r;
17817 rtx x, fp;
17819 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17820 NULL, 1, OPTAB_DIRECT);
17822 fp = gen_reg_rtx (DFmode);
17823 emit_insn (gen_floatsidf2 (fp, x));
17825 real_ldexp (&TWO31r, &dconst1, 31);
17826 x = const_double_from_real_value (TWO31r, DFmode);
17828 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17829 if (x != target)
17830 emit_move_insn (target, x);
17833 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17834 32-bit mode; otherwise we have a direct convert instruction. */
17836 void
17837 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17839 REAL_VALUE_TYPE TWO32r;
17840 rtx fp_lo, fp_hi, x;
17842 fp_lo = gen_reg_rtx (DFmode);
17843 fp_hi = gen_reg_rtx (DFmode);
17845 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17847 real_ldexp (&TWO32r, &dconst1, 32);
17848 x = const_double_from_real_value (TWO32r, DFmode);
17849 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17851 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17853 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17854 0, OPTAB_DIRECT);
17855 if (x != target)
17856 emit_move_insn (target, x);
17859 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17860 For x86_32, -mfpmath=sse, !optimize_size only. */
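/* The input is split into 16-bit halves, each half is converted exactly,
   and the result is computed as hi * 0x1p16 + lo in SFmode. */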
17861 void
17862 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17864 REAL_VALUE_TYPE ONE16r;
17865 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17867 real_ldexp (&ONE16r, &dconst1, 16);
17868 x = const_double_from_real_value (ONE16r, SFmode);
17869 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17870 NULL, 0, OPTAB_DIRECT);
17871 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17872 NULL, 0, OPTAB_DIRECT);
17873 fp_hi = gen_reg_rtx (SFmode);
17874 fp_lo = gen_reg_rtx (SFmode);
17875 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17876 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17877 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17878 0, OPTAB_DIRECT);
17879 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17880 0, OPTAB_DIRECT);
17881 if (!rtx_equal_p (target, fp_hi))
17882 emit_move_insn (target, fp_hi);
17885 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17886 a vector of unsigned ints VAL to vector of floats TARGET. */
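/* This uses the same 16-bit split as the scalar SFmode conversion above,
   applied element-wise: (VAL >> 16) * 0x1p16 + (VAL & 0xffff). */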
17888 void
17889 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17891 rtx tmp[8];
17892 REAL_VALUE_TYPE TWO16r;
17893 enum machine_mode intmode = GET_MODE (val);
17894 enum machine_mode fltmode = GET_MODE (target);
17895 rtx (*cvt) (rtx, rtx);
17897 if (intmode == V4SImode)
17898 cvt = gen_floatv4siv4sf2;
17899 else
17900 cvt = gen_floatv8siv8sf2;
17901 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17902 tmp[0] = force_reg (intmode, tmp[0]);
17903 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17904 OPTAB_DIRECT);
17905 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17906 NULL_RTX, 1, OPTAB_DIRECT);
17907 tmp[3] = gen_reg_rtx (fltmode);
17908 emit_insn (cvt (tmp[3], tmp[1]));
17909 tmp[4] = gen_reg_rtx (fltmode);
17910 emit_insn (cvt (tmp[4], tmp[2]));
17911 real_ldexp (&TWO16r, &dconst1, 16);
17912 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17913 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17914 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17915 OPTAB_DIRECT);
17916 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17917 OPTAB_DIRECT);
17918 if (tmp[7] != target)
17919 emit_move_insn (target, tmp[7]);
17922 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17923 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17924 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17925 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17928 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17930 REAL_VALUE_TYPE TWO31r;
17931 rtx two31r, tmp[4];
17932 enum machine_mode mode = GET_MODE (val);
17933 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17934 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17935 rtx (*cmp) (rtx, rtx, rtx, rtx);
17936 int i;
17938 for (i = 0; i < 3; i++)
17939 tmp[i] = gen_reg_rtx (mode);
17940 real_ldexp (&TWO31r, &dconst1, 31);
17941 two31r = const_double_from_real_value (TWO31r, scalarmode);
17942 two31r = ix86_build_const_vector (mode, 1, two31r);
17943 two31r = force_reg (mode, two31r);
17944 switch (mode)
17946 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17947 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17948 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17949 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17950 default: gcc_unreachable ();
17952 tmp[3] = gen_rtx_LE (mode, two31r, val);
17953 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17954 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17955 0, OPTAB_DIRECT);
17956 if (intmode == V4SImode || TARGET_AVX2)
17957 *xorp = expand_simple_binop (intmode, ASHIFT,
17958 gen_lowpart (intmode, tmp[0]),
17959 GEN_INT (31), NULL_RTX, 0,
17960 OPTAB_DIRECT);
17961 else
17963 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17964 two31 = ix86_build_const_vector (intmode, 1, two31);
17965 *xorp = expand_simple_binop (intmode, AND,
17966 gen_lowpart (intmode, tmp[0]),
17967 two31, NULL_RTX, 0,
17968 OPTAB_DIRECT);
17970 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17971 0, OPTAB_DIRECT);
17974 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17975 then replicate the value for all elements of the vector
17976 register. */
17979 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17981 int i, n_elt;
17982 rtvec v;
17983 enum machine_mode scalar_mode;
17985 switch (mode)
17987 case V32QImode:
17988 case V16QImode:
17989 case V16HImode:
17990 case V8HImode:
17991 case V8SImode:
17992 case V4SImode:
17993 case V4DImode:
17994 case V2DImode:
17995 gcc_assert (vect);
17996 case V8SFmode:
17997 case V4SFmode:
17998 case V4DFmode:
17999 case V2DFmode:
18000 n_elt = GET_MODE_NUNITS (mode);
18001 v = rtvec_alloc (n_elt);
18002 scalar_mode = GET_MODE_INNER (mode);
18004 RTVEC_ELT (v, 0) = value;
18006 for (i = 1; i < n_elt; ++i)
18007 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18009 return gen_rtx_CONST_VECTOR (mode, v);
18011 default:
18012 gcc_unreachable ();
18016 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18017 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18018 for an SSE register. If VECT is true, then replicate the mask for
18019 all elements of the vector register. If INVERT is true, then create
18020 a mask excluding the sign bit. */
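/* For example, for V4SFmode with VECT set this yields a vector of four
   SFmode values whose bit pattern is 0x80000000 (just the sign bit), or
   0x7fffffff when INVERT is true. */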
18023 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18025 enum machine_mode vec_mode, imode;
18026 HOST_WIDE_INT hi, lo;
18027 int shift = 63;
18028 rtx v;
18029 rtx mask;
18031 /* Find the sign bit, sign extended to 2*HWI. */
18032 switch (mode)
18034 case V8SImode:
18035 case V4SImode:
18036 case V8SFmode:
18037 case V4SFmode:
18038 vec_mode = mode;
18039 mode = GET_MODE_INNER (mode);
18040 imode = SImode;
18041 lo = 0x80000000, hi = lo < 0;
18042 break;
18044 case V4DImode:
18045 case V2DImode:
18046 case V4DFmode:
18047 case V2DFmode:
18048 vec_mode = mode;
18049 mode = GET_MODE_INNER (mode);
18050 imode = DImode;
18051 if (HOST_BITS_PER_WIDE_INT >= 64)
18052 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18053 else
18054 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18055 break;
18057 case TImode:
18058 case TFmode:
18059 vec_mode = VOIDmode;
18060 if (HOST_BITS_PER_WIDE_INT >= 64)
18062 imode = TImode;
18063 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18065 else
18067 rtvec vec;
18069 imode = DImode;
18070 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18072 if (invert)
18074 lo = ~lo, hi = ~hi;
18075 v = constm1_rtx;
18077 else
18078 v = const0_rtx;
18080 mask = immed_double_const (lo, hi, imode);
18082 vec = gen_rtvec (2, v, mask);
18083 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18084 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18086 return v;
18088 break;
18090 default:
18091 gcc_unreachable ();
18094 if (invert)
18095 lo = ~lo, hi = ~hi;
18097 /* Force this value into the low part of a fp vector constant. */
18098 mask = immed_double_const (lo, hi, imode);
18099 mask = gen_lowpart (mode, mask);
18101 if (vec_mode == VOIDmode)
18102 return force_reg (mode, mask);
18104 v = ix86_build_const_vector (vec_mode, vect, mask);
18105 return force_reg (vec_mode, v);
18108 /* Generate code for floating point ABS or NEG. */
18110 void
18111 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18112 rtx operands[])
18114 rtx mask, set, dst, src;
18115 bool use_sse = false;
18116 bool vector_mode = VECTOR_MODE_P (mode);
18117 enum machine_mode vmode = mode;
18119 if (vector_mode)
18120 use_sse = true;
18121 else if (mode == TFmode)
18122 use_sse = true;
18123 else if (TARGET_SSE_MATH)
18125 use_sse = SSE_FLOAT_MODE_P (mode);
18126 if (mode == SFmode)
18127 vmode = V4SFmode;
18128 else if (mode == DFmode)
18129 vmode = V2DFmode;
18132 /* NEG and ABS performed with SSE use bitwise mask operations.
18133 Create the appropriate mask now. */
18134 if (use_sse)
18135 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18136 else
18137 mask = NULL_RTX;
18139 dst = operands[0];
18140 src = operands[1];
18142 set = gen_rtx_fmt_e (code, mode, src);
18143 set = gen_rtx_SET (VOIDmode, dst, set);
18145 if (mask)
18147 rtx use, clob;
18148 rtvec par;
18150 use = gen_rtx_USE (VOIDmode, mask);
18151 if (vector_mode)
18152 par = gen_rtvec (2, set, use);
18153 else
18155 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18156 par = gen_rtvec (3, set, use, clob);
18158 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18160 else
18161 emit_insn (set);
18164 /* Expand a copysign operation. Special case operand 0 being a constant. */
18166 void
18167 ix86_expand_copysign (rtx operands[])
18169 enum machine_mode mode, vmode;
18170 rtx dest, op0, op1, mask, nmask;
18172 dest = operands[0];
18173 op0 = operands[1];
18174 op1 = operands[2];
18176 mode = GET_MODE (dest);
18178 if (mode == SFmode)
18179 vmode = V4SFmode;
18180 else if (mode == DFmode)
18181 vmode = V2DFmode;
18182 else
18183 vmode = mode;
18185 if (GET_CODE (op0) == CONST_DOUBLE)
18187 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18189 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18190 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18192 if (mode == SFmode || mode == DFmode)
18194 if (op0 == CONST0_RTX (mode))
18195 op0 = CONST0_RTX (vmode);
18196 else
18198 rtx v = ix86_build_const_vector (vmode, false, op0);
18200 op0 = force_reg (vmode, v);
18203 else if (op0 != CONST0_RTX (mode))
18204 op0 = force_reg (mode, op0);
18206 mask = ix86_build_signbit_mask (vmode, 0, 0);
18208 if (mode == SFmode)
18209 copysign_insn = gen_copysignsf3_const;
18210 else if (mode == DFmode)
18211 copysign_insn = gen_copysigndf3_const;
18212 else
18213 copysign_insn = gen_copysigntf3_const;
18215 emit_insn (copysign_insn (dest, op0, op1, mask));
18217 else
18219 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18221 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18222 mask = ix86_build_signbit_mask (vmode, 0, 0);
18224 if (mode == SFmode)
18225 copysign_insn = gen_copysignsf3_var;
18226 else if (mode == DFmode)
18227 copysign_insn = gen_copysigndf3_var;
18228 else
18229 copysign_insn = gen_copysigntf3_var;
18231 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18235 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18236 be a constant, and so has already been expanded into a vector constant. */
18238 void
18239 ix86_split_copysign_const (rtx operands[])
18241 enum machine_mode mode, vmode;
18242 rtx dest, op0, mask, x;
18244 dest = operands[0];
18245 op0 = operands[1];
18246 mask = operands[3];
18248 mode = GET_MODE (dest);
18249 vmode = GET_MODE (mask);
18251 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18252 x = gen_rtx_AND (vmode, dest, mask);
18253 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18255 if (op0 != CONST0_RTX (vmode))
18257 x = gen_rtx_IOR (vmode, dest, op0);
18258 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18262 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18263 so we have to do two masks. */
18265 void
18266 ix86_split_copysign_var (rtx operands[])
18268 enum machine_mode mode, vmode;
18269 rtx dest, scratch, op0, op1, mask, nmask, x;
18271 dest = operands[0];
18272 scratch = operands[1];
18273 op0 = operands[2];
18274 op1 = operands[3];
18275 nmask = operands[4];
18276 mask = operands[5];
18278 mode = GET_MODE (dest);
18279 vmode = GET_MODE (mask);
18281 if (rtx_equal_p (op0, op1))
18283 /* Shouldn't happen often (it's useless, obviously), but when it does
18284 we'd generate incorrect code if we continue below. */
18285 emit_move_insn (dest, op0);
18286 return;
18289 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18291 gcc_assert (REGNO (op1) == REGNO (scratch));
18293 x = gen_rtx_AND (vmode, scratch, mask);
18294 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18296 dest = mask;
18297 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18298 x = gen_rtx_NOT (vmode, dest);
18299 x = gen_rtx_AND (vmode, x, op0);
18300 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18302 else
18304 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18306 x = gen_rtx_AND (vmode, scratch, mask);
18308 else /* alternative 2,4 */
18310 gcc_assert (REGNO (mask) == REGNO (scratch));
18311 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18312 x = gen_rtx_AND (vmode, scratch, op1);
18314 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18316 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18318 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18319 x = gen_rtx_AND (vmode, dest, nmask);
18321 else /* alternative 3,4 */
18323 gcc_assert (REGNO (nmask) == REGNO (dest));
18324 dest = nmask;
18325 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18326 x = gen_rtx_AND (vmode, dest, op0);
18328 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18331 x = gen_rtx_IOR (vmode, dest, scratch);
18332 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18335 /* Return TRUE or FALSE depending on whether the first SET in INSN
18336 has source and destination with matching CC modes, and whether the
18337 CC mode is at least as constrained as REQ_MODE. */
18339 bool
18340 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18342 rtx set;
18343 enum machine_mode set_mode;
18345 set = PATTERN (insn);
18346 if (GET_CODE (set) == PARALLEL)
18347 set = XVECEXP (set, 0, 0);
18348 gcc_assert (GET_CODE (set) == SET);
18349 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18351 set_mode = GET_MODE (SET_DEST (set));
18352 switch (set_mode)
18354 case CCNOmode:
18355 if (req_mode != CCNOmode
18356 && (req_mode != CCmode
18357 || XEXP (SET_SRC (set), 1) != const0_rtx))
18358 return false;
18359 break;
18360 case CCmode:
18361 if (req_mode == CCGCmode)
18362 return false;
18363 /* FALLTHRU */
18364 case CCGCmode:
18365 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18366 return false;
18367 /* FALLTHRU */
18368 case CCGOCmode:
18369 if (req_mode == CCZmode)
18370 return false;
18371 /* FALLTHRU */
18372 case CCZmode:
18373 break;
18375 case CCAmode:
18376 case CCCmode:
18377 case CCOmode:
18378 case CCSmode:
18379 if (set_mode != req_mode)
18380 return false;
18381 break;
18383 default:
18384 gcc_unreachable ();
18387 return GET_MODE (SET_SRC (set)) == set_mode;
18390 /* Generate insn patterns to do an integer compare of OPERANDS. */
18392 static rtx
18393 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18395 enum machine_mode cmpmode;
18396 rtx tmp, flags;
18398 cmpmode = SELECT_CC_MODE (code, op0, op1);
18399 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18401 /* This is very simple, but making the interface the same as in the
18402 FP case makes the rest of the code easier. */
18403 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18404 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18406 /* Return the test that should be put into the flags user, i.e.
18407 the bcc, scc, or cmov instruction. */
18408 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18411 /* Figure out whether to use ordered or unordered fp comparisons.
18412 Return the appropriate mode to use. */
18414 enum machine_mode
18415 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18417 /* ??? In order to make all comparisons reversible, we do all comparisons
18418 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18419 between trapping and nontrapping forms of all comparisons, we can make inequality
18420 comparisons trapping again, since that results in better code when using
18421 FCOM-based compares. */
18422 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18425 enum machine_mode
18426 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18428 enum machine_mode mode = GET_MODE (op0);
18430 if (SCALAR_FLOAT_MODE_P (mode))
18432 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18433 return ix86_fp_compare_mode (code);
18436 switch (code)
18438 /* Only zero flag is needed. */
18439 case EQ: /* ZF=0 */
18440 case NE: /* ZF!=0 */
18441 return CCZmode;
18442 /* Codes needing carry flag. */
18443 case GEU: /* CF=0 */
18444 case LTU: /* CF=1 */
18445 /* Detect overflow checks. They need just the carry flag. */
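/* For example, the unsigned comparison (x + y) LTU x tests whether the
   addition overflowed; here op0 is the PLUS and op1 matches its first
   operand. */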
18446 if (GET_CODE (op0) == PLUS
18447 && rtx_equal_p (op1, XEXP (op0, 0)))
18448 return CCCmode;
18449 else
18450 return CCmode;
18451 case GTU: /* CF=0 & ZF=0 */
18452 case LEU: /* CF=1 | ZF=1 */
18453 /* Detect overflow checks. They need just the carry flag. */
18454 if (GET_CODE (op0) == MINUS
18455 && rtx_equal_p (op1, XEXP (op0, 0)))
18456 return CCCmode;
18457 else
18458 return CCmode;
18459 /* Codes possibly doable with only the sign flag when
18460 comparing against zero. */
18461 case GE: /* SF=OF or SF=0 */
18462 case LT: /* SF<>OF or SF=1 */
18463 if (op1 == const0_rtx)
18464 return CCGOCmode;
18465 else
18466 /* For other cases Carry flag is not required. */
18467 return CCGCmode;
18468 /* Codes doable only with the sign flag when comparing
18469 against zero, but we lack a jump instruction for that,
18470 so we need to use relational tests against the overflow
18471 flag, which thus needs to be zero. */
18472 case GT: /* ZF=0 & SF=OF */
18473 case LE: /* ZF=1 | SF<>OF */
18474 if (op1 == const0_rtx)
18475 return CCNOmode;
18476 else
18477 return CCGCmode;
18478 /* The strcmp pattern does (use flags), and combine may ask us for the proper
18479 mode. */
18480 case USE:
18481 return CCmode;
18482 default:
18483 gcc_unreachable ();
18487 /* Return the fixed registers used for condition codes. */
18489 static bool
18490 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18492 *p1 = FLAGS_REG;
18493 *p2 = FPSR_REG;
18494 return true;
18497 /* If two condition code modes are compatible, return a condition code
18498 mode which is compatible with both. Otherwise, return
18499 VOIDmode. */
18501 static enum machine_mode
18502 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18504 if (m1 == m2)
18505 return m1;
18507 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18508 return VOIDmode;
18510 if ((m1 == CCGCmode && m2 == CCGOCmode)
18511 || (m1 == CCGOCmode && m2 == CCGCmode))
18512 return CCGCmode;
18514 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18515 return m2;
18516 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18517 return m1;
18519 switch (m1)
18521 default:
18522 gcc_unreachable ();
18524 case CCmode:
18525 case CCGCmode:
18526 case CCGOCmode:
18527 case CCNOmode:
18528 case CCAmode:
18529 case CCCmode:
18530 case CCOmode:
18531 case CCSmode:
18532 case CCZmode:
18533 switch (m2)
18535 default:
18536 return VOIDmode;
18538 case CCmode:
18539 case CCGCmode:
18540 case CCGOCmode:
18541 case CCNOmode:
18542 case CCAmode:
18543 case CCCmode:
18544 case CCOmode:
18545 case CCSmode:
18546 case CCZmode:
18547 return CCmode;
18550 case CCFPmode:
18551 case CCFPUmode:
18552 /* These are only compatible with themselves, which we already
18553 checked above. */
18554 return VOIDmode;
18559 /* Return a comparison we can do that is equivalent to
18560 swap_condition (code), apart possibly from orderedness.
18561 But never change orderedness if TARGET_IEEE_FP, returning
18562 UNKNOWN in that case if necessary. */
18564 static enum rtx_code
18565 ix86_fp_swap_condition (enum rtx_code code)
18567 switch (code)
18569 case GT: /* GTU - CF=0 & ZF=0 */
18570 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18571 case GE: /* GEU - CF=0 */
18572 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18573 case UNLT: /* LTU - CF=1 */
18574 return TARGET_IEEE_FP ? UNKNOWN : GT;
18575 case UNLE: /* LEU - CF=1 | ZF=1 */
18576 return TARGET_IEEE_FP ? UNKNOWN : GE;
18577 default:
18578 return swap_condition (code);
18582 /* Return the cost of comparison CODE using the best strategy for performance.
18583 All following functions use the number of instructions as the cost metric.
18584 In the future this should be tweaked to compute bytes for optimize_size and
18585 take into account the performance of various instructions on various CPUs. */
18587 static int
18588 ix86_fp_comparison_cost (enum rtx_code code)
18590 int arith_cost;
18592 /* The cost of code using bit-twiddling on %ah. */
18593 switch (code)
18595 case UNLE:
18596 case UNLT:
18597 case LTGT:
18598 case GT:
18599 case GE:
18600 case UNORDERED:
18601 case ORDERED:
18602 case UNEQ:
18603 arith_cost = 4;
18604 break;
18605 case LT:
18606 case NE:
18607 case EQ:
18608 case UNGE:
18609 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18610 break;
18611 case LE:
18612 case UNGT:
18613 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18614 break;
18615 default:
18616 gcc_unreachable ();
18619 switch (ix86_fp_comparison_strategy (code))
18621 case IX86_FPCMP_COMI:
18622 return arith_cost > 4 ? 3 : 2;
18623 case IX86_FPCMP_SAHF:
18624 return arith_cost > 4 ? 4 : 3;
18625 default:
18626 return arith_cost;
18630 /* Return the strategy to use for floating-point comparisons. We assume that
18631 fcomi is always preferable where available, since that is also true when looking
18632 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18634 enum ix86_fpcmp_strategy
18635 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18637 /* Do fcomi/sahf based test when profitable. */
18639 if (TARGET_CMOVE)
18640 return IX86_FPCMP_COMI;
18642 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18643 return IX86_FPCMP_SAHF;
18645 return IX86_FPCMP_ARITH;
18648 /* Swap, force into registers, or otherwise massage the two operands
18649 to a fp comparison. The operands are updated in place; the new
18650 comparison code is returned. */
18652 static enum rtx_code
18653 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18655 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18656 rtx op0 = *pop0, op1 = *pop1;
18657 enum machine_mode op_mode = GET_MODE (op0);
18658 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18660 /* All of the unordered compare instructions only work on registers.
18661 The same is true of the fcomi compare instructions. The XFmode
18662 compare instructions require registers except when comparing
18663 against zero or when converting operand 1 from fixed point to
18664 floating point. */
18666 if (!is_sse
18667 && (fpcmp_mode == CCFPUmode
18668 || (op_mode == XFmode
18669 && ! (standard_80387_constant_p (op0) == 1
18670 || standard_80387_constant_p (op1) == 1)
18671 && GET_CODE (op1) != FLOAT)
18672 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18674 op0 = force_reg (op_mode, op0);
18675 op1 = force_reg (op_mode, op1);
18677 else
18679 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18680 things around if they appear profitable, otherwise force op0
18681 into a register. */
18683 if (standard_80387_constant_p (op0) == 0
18684 || (MEM_P (op0)
18685 && ! (standard_80387_constant_p (op1) == 0
18686 || MEM_P (op1))))
18688 enum rtx_code new_code = ix86_fp_swap_condition (code);
18689 if (new_code != UNKNOWN)
18691 rtx tmp;
18692 tmp = op0, op0 = op1, op1 = tmp;
18693 code = new_code;
18697 if (!REG_P (op0))
18698 op0 = force_reg (op_mode, op0);
18700 if (CONSTANT_P (op1))
18702 int tmp = standard_80387_constant_p (op1);
18703 if (tmp == 0)
18704 op1 = validize_mem (force_const_mem (op_mode, op1));
18705 else if (tmp == 1)
18707 if (TARGET_CMOVE)
18708 op1 = force_reg (op_mode, op1);
18710 else
18711 op1 = force_reg (op_mode, op1);
18715 /* Try to rearrange the comparison to make it cheaper. */
18716 if (ix86_fp_comparison_cost (code)
18717 > ix86_fp_comparison_cost (swap_condition (code))
18718 && (REG_P (op1) || can_create_pseudo_p ()))
18720 rtx tmp;
18721 tmp = op0, op0 = op1, op1 = tmp;
18722 code = swap_condition (code);
18723 if (!REG_P (op0))
18724 op0 = force_reg (op_mode, op0);
18727 *pop0 = op0;
18728 *pop1 = op1;
18729 return code;
18732 /* Convert comparison codes we use to represent FP comparison to integer
18733 code that will result in proper branch. Return UNKNOWN if no such code
18734 is available. */
18736 enum rtx_code
18737 ix86_fp_compare_code_to_integer (enum rtx_code code)
18739 switch (code)
18741 case GT:
18742 return GTU;
18743 case GE:
18744 return GEU;
18745 case ORDERED:
18746 case UNORDERED:
18747 return code;
18748 break;
18749 case UNEQ:
18750 return EQ;
18751 break;
18752 case UNLT:
18753 return LTU;
18754 break;
18755 case UNLE:
18756 return LEU;
18757 break;
18758 case LTGT:
18759 return NE;
18760 break;
18761 default:
18762 return UNKNOWN;
18766 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18768 static rtx
18769 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18771 enum machine_mode fpcmp_mode, intcmp_mode;
18772 rtx tmp, tmp2;
18774 fpcmp_mode = ix86_fp_compare_mode (code);
18775 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18777 /* Do fcomi/sahf based test when profitable. */
18778 switch (ix86_fp_comparison_strategy (code))
18780 case IX86_FPCMP_COMI:
18781 intcmp_mode = fpcmp_mode;
18782 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18783 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18784 tmp);
18785 emit_insn (tmp);
18786 break;
18788 case IX86_FPCMP_SAHF:
18789 intcmp_mode = fpcmp_mode;
18790 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18791 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18792 tmp);
18794 if (!scratch)
18795 scratch = gen_reg_rtx (HImode);
18796 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18797 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18798 break;
18800 case IX86_FPCMP_ARITH:
18801 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18802 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18803 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18804 if (!scratch)
18805 scratch = gen_reg_rtx (HImode);
18806 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18808 /* In the unordered case, we have to check C2 for NaN's, which
18809 doesn't happen to work out to anything nice combination-wise.
18810 So do some bit twiddling on the value we've got in AH to come
18811 up with an appropriate set of condition codes. */
18813 intcmp_mode = CCNOmode;
18814 switch (code)
18816 case GT:
18817 case UNGT:
18818 if (code == GT || !TARGET_IEEE_FP)
18820 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18821 code = EQ;
18823 else
18825 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18826 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18827 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18828 intcmp_mode = CCmode;
18829 code = GEU;
18831 break;
18832 case LT:
18833 case UNLT:
18834 if (code == LT && TARGET_IEEE_FP)
18836 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18837 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18838 intcmp_mode = CCmode;
18839 code = EQ;
18841 else
18843 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18844 code = NE;
18846 break;
18847 case GE:
18848 case UNGE:
18849 if (code == GE || !TARGET_IEEE_FP)
18851 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18852 code = EQ;
18854 else
18856 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18857 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18858 code = NE;
18860 break;
18861 case LE:
18862 case UNLE:
18863 if (code == LE && TARGET_IEEE_FP)
18865 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18866 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18867 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18868 intcmp_mode = CCmode;
18869 code = LTU;
18871 else
18873 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18874 code = NE;
18876 break;
18877 case EQ:
18878 case UNEQ:
18879 if (code == EQ && TARGET_IEEE_FP)
18881 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18882 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18883 intcmp_mode = CCmode;
18884 code = EQ;
18886 else
18888 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18889 code = NE;
18891 break;
18892 case NE:
18893 case LTGT:
18894 if (code == NE && TARGET_IEEE_FP)
18896 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18897 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18898 GEN_INT (0x40)));
18899 code = NE;
18901 else
18903 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18904 code = EQ;
18906 break;
18908 case UNORDERED:
18909 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18910 code = NE;
18911 break;
18912 case ORDERED:
18913 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18914 code = EQ;
18915 break;
18917 default:
18918 gcc_unreachable ();
18920 break;
18922 default:
18923 gcc_unreachable();
18926 /* Return the test that should be put into the flags user, i.e.
18927 the bcc, scc, or cmov instruction. */
18928 return gen_rtx_fmt_ee (code, VOIDmode,
18929 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18930 const0_rtx);
18933 static rtx
18934 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18936 rtx ret;
18938 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18939 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18941 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18943 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18944 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18946 else
18947 ret = ix86_expand_int_compare (code, op0, op1);
18949 return ret;
18952 void
18953 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18955 enum machine_mode mode = GET_MODE (op0);
18956 rtx tmp;
18958 switch (mode)
18960 case SFmode:
18961 case DFmode:
18962 case XFmode:
18963 case QImode:
18964 case HImode:
18965 case SImode:
18966 simple:
18967 tmp = ix86_expand_compare (code, op0, op1);
18968 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18969 gen_rtx_LABEL_REF (VOIDmode, label),
18970 pc_rtx);
18971 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18972 return;
18974 case DImode:
18975 if (TARGET_64BIT)
18976 goto simple;
18977 case TImode:
18978 /* Expand DImode branch into multiple compare+branch. */
18980 rtx lo[2], hi[2], label2;
18981 enum rtx_code code1, code2, code3;
18982 enum machine_mode submode;
18984 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18986 tmp = op0, op0 = op1, op1 = tmp;
18987 code = swap_condition (code);
18990 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18991 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18993 submode = mode == DImode ? SImode : DImode;
18995 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18996 avoid two branches. This costs one extra insn, so disable when
18997 optimizing for size. */
18999 if ((code == EQ || code == NE)
19000 && (!optimize_insn_for_size_p ()
19001 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19003 rtx xor0, xor1;
19005 xor1 = hi[0];
19006 if (hi[1] != const0_rtx)
19007 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19008 NULL_RTX, 0, OPTAB_WIDEN);
19010 xor0 = lo[0];
19011 if (lo[1] != const0_rtx)
19012 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19013 NULL_RTX, 0, OPTAB_WIDEN);
19015 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19016 NULL_RTX, 0, OPTAB_WIDEN);
19018 ix86_expand_branch (code, tmp, const0_rtx, label);
19019 return;
19022 /* Otherwise, if we are doing a less-than or greater-or-equal
19023 comparison, op1 is a constant and the low word is zero, then
19024 we can just examine the high word. Similarly for a low word
19025 of -1 and a less-or-equal or greater-than comparison. */
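/* For illustration: on a 32-bit target, x <u 0x0000000500000000 holds
   exactly when hi(x) <u 5 because the low word of the constant is zero,
   and x <=u 0x00000004FFFFFFFF holds exactly when hi(x) <=u 4 because
   the low word is all ones, so a single high-word compare suffices.  */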
19027 if (CONST_INT_P (hi[1]))
19028 switch (code)
19030 case LT: case LTU: case GE: case GEU:
19031 if (lo[1] == const0_rtx)
19033 ix86_expand_branch (code, hi[0], hi[1], label);
19034 return;
19036 break;
19037 case LE: case LEU: case GT: case GTU:
19038 if (lo[1] == constm1_rtx)
19040 ix86_expand_branch (code, hi[0], hi[1], label);
19041 return;
19043 break;
19044 default:
19045 break;
19048 /* Otherwise, we need two or three jumps. */
19050 label2 = gen_label_rtx ();
19052 code1 = code;
19053 code2 = swap_condition (code);
19054 code3 = unsigned_condition (code);
19056 switch (code)
19058 case LT: case GT: case LTU: case GTU:
19059 break;
19061 case LE: code1 = LT; code2 = GT; break;
19062 case GE: code1 = GT; code2 = LT; break;
19063 case LEU: code1 = LTU; code2 = GTU; break;
19064 case GEU: code1 = GTU; code2 = LTU; break;
19066 case EQ: code1 = UNKNOWN; code2 = NE; break;
19067 case NE: code2 = UNKNOWN; break;
19069 default:
19070 gcc_unreachable ();
19074 * a < b =>
19075 * if (hi(a) < hi(b)) goto true;
19076 * if (hi(a) > hi(b)) goto false;
19077 * if (lo(a) < lo(b)) goto true;
19078 * false:
19081 if (code1 != UNKNOWN)
19082 ix86_expand_branch (code1, hi[0], hi[1], label);
19083 if (code2 != UNKNOWN)
19084 ix86_expand_branch (code2, hi[0], hi[1], label2);
19086 ix86_expand_branch (code3, lo[0], lo[1], label);
19088 if (code2 != UNKNOWN)
19089 emit_label (label2);
19090 return;
19093 default:
19094 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19095 goto simple;
19099 /* Split branch based on floating point condition. */
19100 void
19101 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19102 rtx target1, rtx target2, rtx tmp, rtx pushed)
19104 rtx condition;
19105 rtx i;
19107 if (target2 != pc_rtx)
19109 rtx tmp = target2;
19110 code = reverse_condition_maybe_unordered (code);
19111 target2 = target1;
19112 target1 = tmp;
19115 condition = ix86_expand_fp_compare (code, op1, op2,
19116 tmp);
19118 /* Remove pushed operand from stack. */
19119 if (pushed)
19120 ix86_free_from_memory (GET_MODE (pushed));
19122 i = emit_jump_insn (gen_rtx_SET
19123 (VOIDmode, pc_rtx,
19124 gen_rtx_IF_THEN_ELSE (VOIDmode,
19125 condition, target1, target2)));
19126 if (split_branch_probability >= 0)
19127 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19130 void
19131 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19133 rtx ret;
19135 gcc_assert (GET_MODE (dest) == QImode);
19137 ret = ix86_expand_compare (code, op0, op1);
19138 PUT_MODE (ret, QImode);
19139 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19142 /* Expand a comparison setting or clearing the carry flag. Return true when
19143 successful and set *pop to the comparison operation. */
19144 static bool
19145 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19147 enum machine_mode mode =
19148 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19150 /* Do not handle double-mode compares that go through the special path. */
19151 if (mode == (TARGET_64BIT ? TImode : DImode))
19152 return false;
19154 if (SCALAR_FLOAT_MODE_P (mode))
19156 rtx compare_op, compare_seq;
19158 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19160 /* Shortcut: the following common codes never translate
19161 into carry-flag compares. */
19162 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19163 || code == ORDERED || code == UNORDERED)
19164 return false;
19166 /* These comparisons require the zero flag; swap operands so they won't need it. */
19167 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19168 && !TARGET_IEEE_FP)
19170 rtx tmp = op0;
19171 op0 = op1;
19172 op1 = tmp;
19173 code = swap_condition (code);
19176 /* Try to expand the comparison and verify that we end up with
19177 a carry-flag-based comparison. This fails only when we decide
19178 to expand the comparison using arithmetic, which is not a
19179 common scenario. */
19180 start_sequence ();
19181 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19182 compare_seq = get_insns ();
19183 end_sequence ();
19185 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19186 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19187 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19188 else
19189 code = GET_CODE (compare_op);
19191 if (code != LTU && code != GEU)
19192 return false;
19194 emit_insn (compare_seq);
19195 *pop = compare_op;
19196 return true;
19199 if (!INTEGRAL_MODE_P (mode))
19200 return false;
19202 switch (code)
19204 case LTU:
19205 case GEU:
19206 break;
19208 /* Convert a==0 into (unsigned)a<1. */
19209 case EQ:
19210 case NE:
19211 if (op1 != const0_rtx)
19212 return false;
19213 op1 = const1_rtx;
19214 code = (code == EQ ? LTU : GEU);
19215 break;
19217 /* Convert a>b into b<a or a>=b+1. */
19218 case GTU:
19219 case LEU:
19220 if (CONST_INT_P (op1))
19222 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19223 /* Bail out on overflow. We could still swap the operands, but that
19224 would force loading of the constant into a register. */
19225 if (op1 == const0_rtx
19226 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19227 return false;
19228 code = (code == GTU ? GEU : LTU);
19230 else
19232 rtx tmp = op1;
19233 op1 = op0;
19234 op0 = tmp;
19235 code = (code == GTU ? LTU : GEU);
19237 break;
19239 /* Convert a>=0 into (unsigned)a<0x80000000. */
19240 case LT:
19241 case GE:
19242 if (mode == DImode || op1 != const0_rtx)
19243 return false;
19244 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19245 code = (code == LT ? GEU : LTU);
19246 break;
19247 case LE:
19248 case GT:
19249 if (mode == DImode || op1 != constm1_rtx)
19250 return false;
19251 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19252 code = (code == LE ? GEU : LTU);
19253 break;
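/* A small sanity check of the rewrites above, assuming 32-bit SImode:
   a < 0 and a <= -1 both mean the sign bit is set, i.e.
   (unsigned) a >= 0x80000000, while a >= 0 and a > -1 both mean
   (unsigned) a < 0x80000000, so each form maps onto a carry-flag
   (LTU/GEU) comparison against the sign-bit constant.  */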
19255 default:
19256 return false;
19258 /* Swapping operands may cause a constant to appear as the first operand. */
19259 if (!nonimmediate_operand (op0, VOIDmode))
19261 if (!can_create_pseudo_p ())
19262 return false;
19263 op0 = force_reg (mode, op0);
19265 *pop = ix86_expand_compare (code, op0, op1);
19266 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19267 return true;
19270 bool
19271 ix86_expand_int_movcc (rtx operands[])
19273 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19274 rtx compare_seq, compare_op;
19275 enum machine_mode mode = GET_MODE (operands[0]);
19276 bool sign_bit_compare_p = false;
19277 rtx op0 = XEXP (operands[1], 0);
19278 rtx op1 = XEXP (operands[1], 1);
19280 if (GET_MODE (op0) == TImode
19281 || (GET_MODE (op0) == DImode
19282 && !TARGET_64BIT))
19283 return false;
19285 start_sequence ();
19286 compare_op = ix86_expand_compare (code, op0, op1);
19287 compare_seq = get_insns ();
19288 end_sequence ();
19290 compare_code = GET_CODE (compare_op);
19292 if ((op1 == const0_rtx && (code == GE || code == LT))
19293 || (op1 == constm1_rtx && (code == GT || code == LE)))
19294 sign_bit_compare_p = true;
19296 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19297 HImode insns, we'd be swallowed in word prefix ops. */
19299 if ((mode != HImode || TARGET_FAST_PREFIX)
19300 && (mode != (TARGET_64BIT ? TImode : DImode))
19301 && CONST_INT_P (operands[2])
19302 && CONST_INT_P (operands[3]))
19304 rtx out = operands[0];
19305 HOST_WIDE_INT ct = INTVAL (operands[2]);
19306 HOST_WIDE_INT cf = INTVAL (operands[3]);
19307 HOST_WIDE_INT diff;
19309 diff = ct - cf;
19310 /* Sign-bit compares are better done using shifts than by using
19311 sbb. */
19312 if (sign_bit_compare_p
19313 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19315 /* Detect overlap between destination and compare sources. */
19316 rtx tmp = out;
19318 if (!sign_bit_compare_p)
19320 rtx flags;
19321 bool fpcmp = false;
19323 compare_code = GET_CODE (compare_op);
19325 flags = XEXP (compare_op, 0);
19327 if (GET_MODE (flags) == CCFPmode
19328 || GET_MODE (flags) == CCFPUmode)
19330 fpcmp = true;
19331 compare_code
19332 = ix86_fp_compare_code_to_integer (compare_code);
19335 /* To simplify the rest of the code, restrict to the GEU case. */
19336 if (compare_code == LTU)
19338 HOST_WIDE_INT tmp = ct;
19339 ct = cf;
19340 cf = tmp;
19341 compare_code = reverse_condition (compare_code);
19342 code = reverse_condition (code);
19344 else
19346 if (fpcmp)
19347 PUT_CODE (compare_op,
19348 reverse_condition_maybe_unordered
19349 (GET_CODE (compare_op)));
19350 else
19351 PUT_CODE (compare_op,
19352 reverse_condition (GET_CODE (compare_op)));
19354 diff = ct - cf;
19356 if (reg_overlap_mentioned_p (out, op0)
19357 || reg_overlap_mentioned_p (out, op1))
19358 tmp = gen_reg_rtx (mode);
19360 if (mode == DImode)
19361 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19362 else
19363 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19364 flags, compare_op));
19366 else
19368 if (code == GT || code == GE)
19369 code = reverse_condition (code);
19370 else
19372 HOST_WIDE_INT tmp = ct;
19373 ct = cf;
19374 cf = tmp;
19375 diff = ct - cf;
19377 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19380 if (diff == 1)
19383 * cmpl op0,op1
19384 * sbbl dest,dest
19385 * [addl dest, ct]
19387 * Size 5 - 8.
19389 if (ct)
19390 tmp = expand_simple_binop (mode, PLUS,
19391 tmp, GEN_INT (ct),
19392 copy_rtx (tmp), 1, OPTAB_DIRECT);
19394 else if (cf == -1)
19397 * cmpl op0,op1
19398 * sbbl dest,dest
19399 * orl $ct, dest
19401 * Size 8.
19403 tmp = expand_simple_binop (mode, IOR,
19404 tmp, GEN_INT (ct),
19405 copy_rtx (tmp), 1, OPTAB_DIRECT);
19407 else if (diff == -1 && ct)
19410 * cmpl op0,op1
19411 * sbbl dest,dest
19412 * notl dest
19413 * [addl dest, cf]
19415 * Size 8 - 11.
19417 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19418 if (cf)
19419 tmp = expand_simple_binop (mode, PLUS,
19420 copy_rtx (tmp), GEN_INT (cf),
19421 copy_rtx (tmp), 1, OPTAB_DIRECT);
19423 else
19426 * cmpl op0,op1
19427 * sbbl dest,dest
19428 * [notl dest]
19429 * andl cf - ct, dest
19430 * [addl dest, ct]
19432 * Size 8 - 11.
19435 if (cf == 0)
19437 cf = ct;
19438 ct = 0;
19439 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19442 tmp = expand_simple_binop (mode, AND,
19443 copy_rtx (tmp),
19444 gen_int_mode (cf - ct, mode),
19445 copy_rtx (tmp), 1, OPTAB_DIRECT);
19446 if (ct)
19447 tmp = expand_simple_binop (mode, PLUS,
19448 copy_rtx (tmp), GEN_INT (ct),
19449 copy_rtx (tmp), 1, OPTAB_DIRECT);
19452 if (!rtx_equal_p (tmp, out))
19453 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19455 return true;
19458 if (diff < 0)
19460 enum machine_mode cmp_mode = GET_MODE (op0);
19462 HOST_WIDE_INT tmp;
19463 tmp = ct, ct = cf, cf = tmp;
19464 diff = -diff;
19466 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19468 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19470 /* We may be reversing an unordered compare to a normal compare, which
19471 is not valid in general (we may convert a non-trapping condition
19472 to a trapping one); however, on i386 we currently emit all
19473 comparisons unordered. */
19474 compare_code = reverse_condition_maybe_unordered (compare_code);
19475 code = reverse_condition_maybe_unordered (code);
19477 else
19479 compare_code = reverse_condition (compare_code);
19480 code = reverse_condition (code);
19484 compare_code = UNKNOWN;
19485 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19486 && CONST_INT_P (op1))
19488 if (op1 == const0_rtx
19489 && (code == LT || code == GE))
19490 compare_code = code;
19491 else if (op1 == constm1_rtx)
19493 if (code == LE)
19494 compare_code = LT;
19495 else if (code == GT)
19496 compare_code = GE;
19500 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19501 if (compare_code != UNKNOWN
19502 && GET_MODE (op0) == GET_MODE (out)
19503 && (cf == -1 || ct == -1))
19505 /* If the lea code below could be used, only optimize
19506 if it results in a 2-insn sequence. */
19508 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19509 || diff == 3 || diff == 5 || diff == 9)
19510 || (compare_code == LT && ct == -1)
19511 || (compare_code == GE && cf == -1))
19514 * notl op1 (if necessary)
19515 * sarl $31, op1
19516 * orl cf, op1
19518 if (ct != -1)
19520 cf = ct;
19521 ct = -1;
19522 code = reverse_condition (code);
19525 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19527 out = expand_simple_binop (mode, IOR,
19528 out, GEN_INT (cf),
19529 out, 1, OPTAB_DIRECT);
19530 if (out != operands[0])
19531 emit_move_insn (operands[0], out);
19533 return true;
19538 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19539 || diff == 3 || diff == 5 || diff == 9)
19540 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19541 && (mode != DImode
19542 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19545 * xorl dest,dest
19546 * cmpl op1,op2
19547 * setcc dest
19548 * lea cf(dest*(ct-cf)),dest
19550 * Size 14.
19552 * This also catches the degenerate setcc-only case.
19555 rtx tmp;
19556 int nops;
19558 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19560 nops = 0;
19561 /* On x86_64 the lea instruction operates on Pmode, so we need
19562 to get the arithmetic done in the proper mode to match. */
19563 if (diff == 1)
19564 tmp = copy_rtx (out);
19565 else
19567 rtx out1;
19568 out1 = copy_rtx (out);
19569 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19570 nops++;
19571 if (diff & 1)
19573 tmp = gen_rtx_PLUS (mode, tmp, out1);
19574 nops++;
19577 if (cf != 0)
19579 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19580 nops++;
19582 if (!rtx_equal_p (tmp, out))
19584 if (nops == 1)
19585 out = force_operand (tmp, copy_rtx (out));
19586 else
19587 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19589 if (!rtx_equal_p (out, operands[0]))
19590 emit_move_insn (operands[0], copy_rtx (out));
19592 return true;
19596 * General case: Jumpful:
19597 * xorl dest,dest cmpl op1, op2
19598 * cmpl op1, op2 movl ct, dest
19599 * setcc dest jcc 1f
19600 * decl dest movl cf, dest
19601 * andl (cf-ct),dest 1:
19602 * addl ct,dest
19604 * Size 20. Size 14.
19606 * This is reasonably steep, but branch mispredict costs are
19607 * high on modern cpus, so consider failing only if optimizing
19608 * for space.
19611 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19612 && BRANCH_COST (optimize_insn_for_speed_p (),
19613 false) >= 2)
19615 if (cf == 0)
19617 enum machine_mode cmp_mode = GET_MODE (op0);
19619 cf = ct;
19620 ct = 0;
19622 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19624 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19626 /* We may be reversing an unordered compare to a normal compare,
19627 which is not valid in general (we may convert a non-trapping
19628 condition to a trapping one); however, on i386 we currently
19629 emit all comparisons unordered. */
19630 code = reverse_condition_maybe_unordered (code);
19632 else
19634 code = reverse_condition (code);
19635 if (compare_code != UNKNOWN)
19636 compare_code = reverse_condition (compare_code);
19640 if (compare_code != UNKNOWN)
19642 /* notl op1 (if needed)
19643 sarl $31, op1
19644 andl (cf-ct), op1
19645 addl ct, op1
19647 For x < 0 (resp. x <= -1) there will be no notl,
19648 so if possible swap the constants to get rid of the
19649 complement.
19650 True/false will be -1/0 while code below (store flag
19651 followed by decrement) is 0/-1, so the constants need
19652 to be exchanged once more. */
19654 if (compare_code == GE || !cf)
19656 code = reverse_condition (code);
19657 compare_code = LT;
19659 else
19661 HOST_WIDE_INT tmp = cf;
19662 cf = ct;
19663 ct = tmp;
19666 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19668 else
19670 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19672 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19673 constm1_rtx,
19674 copy_rtx (out), 1, OPTAB_DIRECT);
19677 out = expand_simple_binop (mode, AND, copy_rtx (out),
19678 gen_int_mode (cf - ct, mode),
19679 copy_rtx (out), 1, OPTAB_DIRECT);
19680 if (ct)
19681 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19682 copy_rtx (out), 1, OPTAB_DIRECT);
19683 if (!rtx_equal_p (out, operands[0]))
19684 emit_move_insn (operands[0], copy_rtx (out));
19686 return true;
19690 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19692 /* Try a few things more with specific constants and a variable. */
19694 optab op;
19695 rtx var, orig_out, out, tmp;
19697 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19698 return false;
19700 /* If one of the two operands is an interesting constant, load a
19701 constant with the above and mask it in with a logical operation. */
19703 if (CONST_INT_P (operands[2]))
19705 var = operands[3];
19706 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19707 operands[3] = constm1_rtx, op = and_optab;
19708 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19709 operands[3] = const0_rtx, op = ior_optab;
19710 else
19711 return false;
19713 else if (CONST_INT_P (operands[3]))
19715 var = operands[2];
19716 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19717 operands[2] = constm1_rtx, op = and_optab;
19718 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19719 operands[2] = const0_rtx, op = ior_optab;
19720 else
19721 return false;
19723 else
19724 return false;
19726 orig_out = operands[0];
19727 tmp = gen_reg_rtx (mode);
19728 operands[0] = tmp;
19730 /* Recurse to get the constant loaded. */
19731 if (ix86_expand_int_movcc (operands) == 0)
19732 return false;
19734 /* Mask in the interesting variable. */
19735 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19736 OPTAB_WIDEN);
19737 if (!rtx_equal_p (out, orig_out))
19738 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19740 return true;
19744 * For comparison with above,
19746 * movl cf,dest
19747 * movl ct,tmp
19748 * cmpl op1,op2
19749 * cmovcc tmp,dest
19751 * Size 15.
19754 if (! nonimmediate_operand (operands[2], mode))
19755 operands[2] = force_reg (mode, operands[2]);
19756 if (! nonimmediate_operand (operands[3], mode))
19757 operands[3] = force_reg (mode, operands[3]);
19759 if (! register_operand (operands[2], VOIDmode)
19760 && (mode == QImode
19761 || ! register_operand (operands[3], VOIDmode)))
19762 operands[2] = force_reg (mode, operands[2]);
19764 if (mode == QImode
19765 && ! register_operand (operands[3], VOIDmode))
19766 operands[3] = force_reg (mode, operands[3]);
19768 emit_insn (compare_seq);
19769 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19770 gen_rtx_IF_THEN_ELSE (mode,
19771 compare_op, operands[2],
19772 operands[3])));
19773 return true;
19776 /* Swap, force into registers, or otherwise massage the two operands
19777 to an sse comparison with a mask result. Thus we differ a bit from
19778 ix86_prepare_fp_compare_args which expects to produce a flags result.
19780 The DEST operand exists to help determine whether to commute commutative
19781 operators. The POP0/POP1 operands are updated in place. The new
19782 comparison code is returned, or UNKNOWN if not implementable. */
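/* For illustration of the swapping below: GE, GT, UNLE and UNLT have
   no direct SSE encoding before AVX, so a request such as GT (a, b)
   comes back as LT with *POP0 and *POP1 exchanged, i.e. LT (b, a),
   which the SSE compare expanders can then emit directly.  */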
19784 static enum rtx_code
19785 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19786 rtx *pop0, rtx *pop1)
19788 rtx tmp;
19790 switch (code)
19792 case LTGT:
19793 case UNEQ:
19794 /* AVX supports all the needed comparisons. */
19795 if (TARGET_AVX)
19796 break;
19797 /* We have no LTGT as an operator. We could implement it with
19798 NE & ORDERED, but this requires an extra temporary. It's
19799 not clear that it's worth it. */
19800 return UNKNOWN;
19802 case LT:
19803 case LE:
19804 case UNGT:
19805 case UNGE:
19806 /* These are supported directly. */
19807 break;
19809 case EQ:
19810 case NE:
19811 case UNORDERED:
19812 case ORDERED:
19813 /* AVX has 3 operand comparisons, no need to swap anything. */
19814 if (TARGET_AVX)
19815 break;
19816 /* For commutative operators, try to canonicalize the destination
19817 operand to be first in the comparison - this helps reload to
19818 avoid extra moves. */
19819 if (!dest || !rtx_equal_p (dest, *pop1))
19820 break;
19821 /* FALLTHRU */
19823 case GE:
19824 case GT:
19825 case UNLE:
19826 case UNLT:
19827 /* These are not supported directly before AVX, and furthermore
19828 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19829 comparison operands to transform into something that is
19830 supported. */
19831 tmp = *pop0;
19832 *pop0 = *pop1;
19833 *pop1 = tmp;
19834 code = swap_condition (code);
19835 break;
19837 default:
19838 gcc_unreachable ();
19841 return code;
19844 /* Detect conditional moves that exactly match min/max operational
19845 semantics. Note that this is IEEE safe, as long as we don't
19846 interchange the operands.
19848 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19849 and TRUE if the operation is successful and instructions are emitted. */
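/* A short example of the pattern matched below: for
   d = (a < b) ? a : b the compare operands and the movcc arms line up,
   so the whole thing can become a single MINSS/MINSD (or the IEEE-safe
   UNSPEC variant when NaNs and signed zeros must be honored).  */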
19851 static bool
19852 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19853 rtx cmp_op1, rtx if_true, rtx if_false)
19855 enum machine_mode mode;
19856 bool is_min;
19857 rtx tmp;
19859 if (code == LT)
19861 else if (code == UNGE)
19863 tmp = if_true;
19864 if_true = if_false;
19865 if_false = tmp;
19867 else
19868 return false;
19870 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19871 is_min = true;
19872 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19873 is_min = false;
19874 else
19875 return false;
19877 mode = GET_MODE (dest);
19879 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19880 but MODE may be a vector mode and thus not appropriate. */
19881 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19883 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19884 rtvec v;
19886 if_true = force_reg (mode, if_true);
19887 v = gen_rtvec (2, if_true, if_false);
19888 tmp = gen_rtx_UNSPEC (mode, v, u);
19890 else
19892 code = is_min ? SMIN : SMAX;
19893 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19896 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19897 return true;
19900 /* Expand an sse vector comparison. Return the register with the result. */
19902 static rtx
19903 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19904 rtx op_true, rtx op_false)
19906 enum machine_mode mode = GET_MODE (dest);
19907 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19908 rtx x;
19910 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19911 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19912 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19914 if (optimize
19915 || reg_overlap_mentioned_p (dest, op_true)
19916 || reg_overlap_mentioned_p (dest, op_false))
19917 dest = gen_reg_rtx (mode);
19919 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19920 if (cmp_mode != mode)
19922 x = force_reg (cmp_mode, x);
19923 convert_move (dest, x, false);
19925 else
19926 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19928 return dest;
19931 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19932 operations. This is used for both scalar and vector conditional moves. */
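/* The general fallback below computes, element-wise,
   dest = (cmp & op_true) | (~cmp & op_false),
   relying on the comparison result being all-ones or all-zeros per
   element; SSE4.1/AVX blend instructions and XOP vpcmov replace that
   three-instruction sequence when available.  */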
19934 static void
19935 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19937 enum machine_mode mode = GET_MODE (dest);
19938 rtx t2, t3, x;
19940 if (vector_all_ones_operand (op_true, mode)
19941 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19943 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19945 else if (op_false == CONST0_RTX (mode))
19947 op_true = force_reg (mode, op_true);
19948 x = gen_rtx_AND (mode, cmp, op_true);
19949 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19951 else if (op_true == CONST0_RTX (mode))
19953 op_false = force_reg (mode, op_false);
19954 x = gen_rtx_NOT (mode, cmp);
19955 x = gen_rtx_AND (mode, x, op_false);
19956 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19958 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19960 op_false = force_reg (mode, op_false);
19961 x = gen_rtx_IOR (mode, cmp, op_false);
19962 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19964 else if (TARGET_XOP)
19966 op_true = force_reg (mode, op_true);
19968 if (!nonimmediate_operand (op_false, mode))
19969 op_false = force_reg (mode, op_false);
19971 emit_insn (gen_rtx_SET (mode, dest,
19972 gen_rtx_IF_THEN_ELSE (mode, cmp,
19973 op_true,
19974 op_false)));
19976 else
19978 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19980 if (!nonimmediate_operand (op_true, mode))
19981 op_true = force_reg (mode, op_true);
19983 op_false = force_reg (mode, op_false);
19985 switch (mode)
19987 case V4SFmode:
19988 if (TARGET_SSE4_1)
19989 gen = gen_sse4_1_blendvps;
19990 break;
19991 case V2DFmode:
19992 if (TARGET_SSE4_1)
19993 gen = gen_sse4_1_blendvpd;
19994 break;
19995 case V16QImode:
19996 case V8HImode:
19997 case V4SImode:
19998 case V2DImode:
19999 if (TARGET_SSE4_1)
20001 gen = gen_sse4_1_pblendvb;
20002 dest = gen_lowpart (V16QImode, dest);
20003 op_false = gen_lowpart (V16QImode, op_false);
20004 op_true = gen_lowpart (V16QImode, op_true);
20005 cmp = gen_lowpart (V16QImode, cmp);
20007 break;
20008 case V8SFmode:
20009 if (TARGET_AVX)
20010 gen = gen_avx_blendvps256;
20011 break;
20012 case V4DFmode:
20013 if (TARGET_AVX)
20014 gen = gen_avx_blendvpd256;
20015 break;
20016 case V32QImode:
20017 case V16HImode:
20018 case V8SImode:
20019 case V4DImode:
20020 if (TARGET_AVX2)
20022 gen = gen_avx2_pblendvb;
20023 dest = gen_lowpart (V32QImode, dest);
20024 op_false = gen_lowpart (V32QImode, op_false);
20025 op_true = gen_lowpart (V32QImode, op_true);
20026 cmp = gen_lowpart (V32QImode, cmp);
20028 break;
20029 default:
20030 break;
20033 if (gen != NULL)
20034 emit_insn (gen (dest, op_false, op_true, cmp));
20035 else
20037 op_true = force_reg (mode, op_true);
20039 t2 = gen_reg_rtx (mode);
20040 if (optimize)
20041 t3 = gen_reg_rtx (mode);
20042 else
20043 t3 = dest;
20045 x = gen_rtx_AND (mode, op_true, cmp);
20046 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20048 x = gen_rtx_NOT (mode, cmp);
20049 x = gen_rtx_AND (mode, x, op_false);
20050 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20052 x = gen_rtx_IOR (mode, t3, t2);
20053 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20058 /* Expand a floating-point conditional move. Return true if successful. */
20060 bool
20061 ix86_expand_fp_movcc (rtx operands[])
20063 enum machine_mode mode = GET_MODE (operands[0]);
20064 enum rtx_code code = GET_CODE (operands[1]);
20065 rtx tmp, compare_op;
20066 rtx op0 = XEXP (operands[1], 0);
20067 rtx op1 = XEXP (operands[1], 1);
20069 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20071 enum machine_mode cmode;
20073 /* Since we have no cmove for SSE registers, don't force bad register
20074 allocation just to gain access to it. Deny movcc when the
20075 comparison mode doesn't match the move mode. */
20076 cmode = GET_MODE (op0);
20077 if (cmode == VOIDmode)
20078 cmode = GET_MODE (op1);
20079 if (cmode != mode)
20080 return false;
20082 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20083 if (code == UNKNOWN)
20084 return false;
20086 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20087 operands[2], operands[3]))
20088 return true;
20090 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20091 operands[2], operands[3]);
20092 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20093 return true;
20096 if (GET_MODE (op0) == TImode
20097 || (GET_MODE (op0) == DImode
20098 && !TARGET_64BIT))
20099 return false;
20101 /* The floating point conditional move instructions don't directly
20102 support conditions resulting from a signed integer comparison. */
20104 compare_op = ix86_expand_compare (code, op0, op1);
20105 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20107 tmp = gen_reg_rtx (QImode);
20108 ix86_expand_setcc (tmp, code, op0, op1);
20110 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20113 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20114 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20115 operands[2], operands[3])));
20117 return true;
20120 /* Expand a floating-point vector conditional move; a vcond operation
20121 rather than a movcc operation. */
20123 bool
20124 ix86_expand_fp_vcond (rtx operands[])
20126 enum rtx_code code = GET_CODE (operands[3]);
20127 rtx cmp;
20129 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20130 &operands[4], &operands[5]);
20131 if (code == UNKNOWN)
20133 rtx temp;
20134 switch (GET_CODE (operands[3]))
20136 case LTGT:
20137 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20138 operands[5], operands[0], operands[0]);
20139 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20140 operands[5], operands[1], operands[2]);
20141 code = AND;
20142 break;
20143 case UNEQ:
20144 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20145 operands[5], operands[0], operands[0]);
20146 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20147 operands[5], operands[1], operands[2]);
20148 code = IOR;
20149 break;
20150 default:
20151 gcc_unreachable ();
20153 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20154 OPTAB_DIRECT);
20155 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20156 return true;
20159 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20160 operands[5], operands[1], operands[2]))
20161 return true;
20163 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20164 operands[1], operands[2]);
20165 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20166 return true;
20169 /* Expand a signed/unsigned integral vector conditional move. */
20171 bool
20172 ix86_expand_int_vcond (rtx operands[])
20174 enum machine_mode data_mode = GET_MODE (operands[0]);
20175 enum machine_mode mode = GET_MODE (operands[4]);
20176 enum rtx_code code = GET_CODE (operands[3]);
20177 bool negate = false;
20178 rtx x, cop0, cop1;
20180 cop0 = operands[4];
20181 cop1 = operands[5];
20183 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20184 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20185 if ((code == LT || code == GE)
20186 && data_mode == mode
20187 && cop1 == CONST0_RTX (mode)
20188 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20189 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20190 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20191 && (GET_MODE_SIZE (data_mode) == 16
20192 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20194 rtx negop = operands[2 - (code == LT)];
20195 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20196 if (negop == CONST1_RTX (data_mode))
20198 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20199 operands[0], 1, OPTAB_DIRECT);
20200 if (res != operands[0])
20201 emit_move_insn (operands[0], res);
20202 return true;
20204 else if (GET_MODE_INNER (data_mode) != DImode
20205 && vector_all_ones_operand (negop, data_mode))
20207 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20208 operands[0], 0, OPTAB_DIRECT);
20209 if (res != operands[0])
20210 emit_move_insn (operands[0], res);
20211 return true;
20215 if (!nonimmediate_operand (cop1, mode))
20216 cop1 = force_reg (mode, cop1);
20217 if (!general_operand (operands[1], data_mode))
20218 operands[1] = force_reg (data_mode, operands[1]);
20219 if (!general_operand (operands[2], data_mode))
20220 operands[2] = force_reg (data_mode, operands[2]);
20222 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20223 if (TARGET_XOP
20224 && (mode == V16QImode || mode == V8HImode
20225 || mode == V4SImode || mode == V2DImode))
20227 else
20229 /* Canonicalize the comparison to EQ, GT, GTU. */
20230 switch (code)
20232 case EQ:
20233 case GT:
20234 case GTU:
20235 break;
20237 case NE:
20238 case LE:
20239 case LEU:
20240 code = reverse_condition (code);
20241 negate = true;
20242 break;
20244 case GE:
20245 case GEU:
20246 code = reverse_condition (code);
20247 negate = true;
20248 /* FALLTHRU */
20250 case LT:
20251 case LTU:
20252 code = swap_condition (code);
20253 x = cop0, cop0 = cop1, cop1 = x;
20254 break;
20256 default:
20257 gcc_unreachable ();
20260 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20261 if (mode == V2DImode)
20263 switch (code)
20265 case EQ:
20266 /* SSE4.1 supports EQ. */
20267 if (!TARGET_SSE4_1)
20268 return false;
20269 break;
20271 case GT:
20272 case GTU:
20273 /* SSE4.2 supports GT/GTU. */
20274 if (!TARGET_SSE4_2)
20275 return false;
20276 break;
20278 default:
20279 gcc_unreachable ();
20283 /* Unsigned parallel compare is not supported by the hardware.
20284 Play some tricks to turn this into a signed comparison
20285 against 0. */
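/* A concrete instance of the trick below for V4SImode: subtracting
   0x80000000 from each element (equivalently, flipping its sign bit)
   maps the unsigned range 0..0xffffffff onto the signed range
   INT_MIN..INT_MAX monotonically, so a >u b holds exactly when
   (a ^ 0x80000000) >s (b ^ 0x80000000), and the signed PCMPGT
   comparison can be used.  */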
20286 if (code == GTU)
20288 cop0 = force_reg (mode, cop0);
20290 switch (mode)
20292 case V8SImode:
20293 case V4DImode:
20294 case V4SImode:
20295 case V2DImode:
20297 rtx t1, t2, mask;
20298 rtx (*gen_sub3) (rtx, rtx, rtx);
20300 switch (mode)
20302 case V8SImode: gen_sub3 = gen_subv8si3; break;
20303 case V4DImode: gen_sub3 = gen_subv4di3; break;
20304 case V4SImode: gen_sub3 = gen_subv4si3; break;
20305 case V2DImode: gen_sub3 = gen_subv2di3; break;
20306 default:
20307 gcc_unreachable ();
20309 /* Subtract (-(INT MAX) - 1) from both operands to make
20310 them signed. */
20311 mask = ix86_build_signbit_mask (mode, true, false);
20312 t1 = gen_reg_rtx (mode);
20313 emit_insn (gen_sub3 (t1, cop0, mask));
20315 t2 = gen_reg_rtx (mode);
20316 emit_insn (gen_sub3 (t2, cop1, mask));
20318 cop0 = t1;
20319 cop1 = t2;
20320 code = GT;
20322 break;
20324 case V32QImode:
20325 case V16HImode:
20326 case V16QImode:
20327 case V8HImode:
20328 /* Perform a parallel unsigned saturating subtraction. */
20329 x = gen_reg_rtx (mode);
20330 emit_insn (gen_rtx_SET (VOIDmode, x,
20331 gen_rtx_US_MINUS (mode, cop0, cop1)));
20333 cop0 = x;
20334 cop1 = CONST0_RTX (mode);
20335 code = EQ;
20336 negate = !negate;
20337 break;
20339 default:
20340 gcc_unreachable ();
20345 /* Allow the comparison to be done in one mode, but the movcc to
20346 happen in another mode. */
20347 if (data_mode == mode)
20349 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20350 operands[1+negate], operands[2-negate]);
20352 else
20354 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20355 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20356 code, cop0, cop1,
20357 operands[1+negate], operands[2-negate]);
20358 x = gen_lowpart (data_mode, x);
20361 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20362 operands[2-negate]);
20363 return true;
20366 /* Expand a variable vector permutation. */
20368 void
20369 ix86_expand_vec_perm (rtx operands[])
20371 rtx target = operands[0];
20372 rtx op0 = operands[1];
20373 rtx op1 = operands[2];
20374 rtx mask = operands[3];
20375 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20376 enum machine_mode mode = GET_MODE (op0);
20377 enum machine_mode maskmode = GET_MODE (mask);
20378 int w, e, i;
20379 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20381 /* Number of elements in the vector. */
20382 w = GET_MODE_NUNITS (mode);
20383 e = GET_MODE_UNIT_SIZE (mode);
20384 gcc_assert (w <= 32);
20386 if (TARGET_AVX2)
20388 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20390 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20391 a constant shuffle operand. With a tiny bit of effort we can
20392 use VPERMD instead. A re-interpretation stall for V4DFmode is
20393 unfortunate but there's no avoiding it.
20394 Similarly for V16HImode we don't have instructions for variable
20395 shuffling, while for V32QImode we can, after preparing suitable
20396 masks, use vpshufb; vpshufb; vpermq; vpor. */
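/* For illustration of the mask rewrite below: a V4DImode selector
   { 3 0 2 1 } is first replicated to { 3 3 0 0 2 2 1 1 }, doubled to
   { 6 6 0 0 4 4 2 2 }, and then { 0 1 0 1 ... } is added to give
   { 6 7 0 1 4 5 2 3 }, the V8SImode selector that picks the same
   64-bit elements as pairs of 32-bit words for VPERMD.  */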
20398 if (mode == V16HImode)
20400 maskmode = mode = V32QImode;
20401 w = 32;
20402 e = 1;
20404 else
20406 maskmode = mode = V8SImode;
20407 w = 8;
20408 e = 4;
20410 t1 = gen_reg_rtx (maskmode);
20412 /* Replicate the low bits of the V4DImode mask into V8SImode:
20413 mask = { A B C D }
20414 t1 = { A A B B C C D D }. */
20415 for (i = 0; i < w / 2; ++i)
20416 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20417 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20418 vt = force_reg (maskmode, vt);
20419 mask = gen_lowpart (maskmode, mask);
20420 if (maskmode == V8SImode)
20421 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20422 else
20423 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20425 /* Multiply the shuffle indices by two. */
20426 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20427 OPTAB_DIRECT);
20429 /* Add one to the odd shuffle indices:
20430 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20431 for (i = 0; i < w / 2; ++i)
20433 vec[i * 2] = const0_rtx;
20434 vec[i * 2 + 1] = const1_rtx;
20436 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20437 vt = force_const_mem (maskmode, vt);
20438 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20439 OPTAB_DIRECT);
20441 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20442 operands[3] = mask = t1;
20443 target = gen_lowpart (mode, target);
20444 op0 = gen_lowpart (mode, op0);
20445 op1 = gen_lowpart (mode, op1);
20448 switch (mode)
20450 case V8SImode:
20451 /* The VPERMD and VPERMPS instructions already properly ignore
20452 the high bits of the shuffle elements. No need for us to
20453 perform an AND ourselves. */
20454 if (one_operand_shuffle)
20455 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20456 else
20458 t1 = gen_reg_rtx (V8SImode);
20459 t2 = gen_reg_rtx (V8SImode);
20460 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20461 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20462 goto merge_two;
20464 return;
20466 case V8SFmode:
20467 mask = gen_lowpart (V8SFmode, mask);
20468 if (one_operand_shuffle)
20469 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20470 else
20472 t1 = gen_reg_rtx (V8SFmode);
20473 t2 = gen_reg_rtx (V8SFmode);
20474 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20475 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20476 goto merge_two;
20478 return;
20480 case V4SImode:
20481 /* By combining the two 128-bit input vectors into one 256-bit
20482 input vector, we can use VPERMD and VPERMPS for the full
20483 two-operand shuffle. */
20484 t1 = gen_reg_rtx (V8SImode);
20485 t2 = gen_reg_rtx (V8SImode);
20486 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20487 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20488 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20489 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20490 return;
20492 case V4SFmode:
20493 t1 = gen_reg_rtx (V8SFmode);
20494 t2 = gen_reg_rtx (V8SImode);
20495 mask = gen_lowpart (V4SImode, mask);
20496 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20497 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20498 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20499 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20500 return;
20502 case V32QImode:
20503 t1 = gen_reg_rtx (V32QImode);
20504 t2 = gen_reg_rtx (V32QImode);
20505 t3 = gen_reg_rtx (V32QImode);
20506 vt2 = GEN_INT (128);
20507 for (i = 0; i < 32; i++)
20508 vec[i] = vt2;
20509 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20510 vt = force_reg (V32QImode, vt);
20511 for (i = 0; i < 32; i++)
20512 vec[i] = i < 16 ? vt2 : const0_rtx;
20513 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20514 vt2 = force_reg (V32QImode, vt2);
20515 /* From mask create two adjusted masks, which contain the same
20516 bits as mask in the low 7 bits of each vector element.
20517 The first mask will have the most significant bit clear
20518 if it requests element from the same 128-bit lane
20519 and MSB set if it requests element from the other 128-bit lane.
20520 The second mask will have the opposite values of the MSB,
20521 and additionally will have its 128-bit lanes swapped.
20522 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20523 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20524 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20525 stands for the other 12 bytes. */
20526 /* The bit that tells whether an element comes from the same lane or the
20527 other lane is bit 4, so shift it up by 3 to the MSB position. */
20528 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20529 gen_lowpart (V4DImode, mask),
20530 GEN_INT (3)));
20531 /* Clear MSB bits from the mask just in case it had them set. */
20532 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20533 /* After this t1 will have MSB set for elements from other lane. */
20534 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20535 /* Clear bits other than MSB. */
20536 emit_insn (gen_andv32qi3 (t1, t1, vt));
20537 /* Or in the lower bits from mask into t3. */
20538 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20539 /* And invert MSB bits in t1, so MSB is set for elements from the same
20540 lane. */
20541 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20542 /* Swap 128-bit lanes in t3. */
20543 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20544 gen_lowpart (V4DImode, t3),
20545 const2_rtx, GEN_INT (3),
20546 const0_rtx, const1_rtx));
20547 /* And or in the lower bits from mask into t1. */
20548 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20549 if (one_operand_shuffle)
20551 /* Each of these shuffles will put 0s in places where
20552 element from the other 128-bit lane is needed, otherwise
20553 will shuffle in the requested value. */
20554 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20555 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20556 /* For t3 the 128-bit lanes are swapped again. */
20557 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20558 gen_lowpart (V4DImode, t3),
20559 const2_rtx, GEN_INT (3),
20560 const0_rtx, const1_rtx));
20561 /* ORing both together yields the result. */
20562 emit_insn (gen_iorv32qi3 (target, t1, t3));
20563 return;
20566 t4 = gen_reg_rtx (V32QImode);
20567 /* Similar to the one_operand_shuffle code above, just
20568 repeated twice, once for each operand. The merge_two:
20569 code will merge the two results together. */
20570 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20571 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20572 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20573 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20574 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20575 gen_lowpart (V4DImode, t4),
20576 const2_rtx, GEN_INT (3),
20577 const0_rtx, const1_rtx));
20578 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20579 gen_lowpart (V4DImode, t3),
20580 const2_rtx, GEN_INT (3),
20581 const0_rtx, const1_rtx));
20582 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20583 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20584 t1 = t4;
20585 t2 = t3;
20586 goto merge_two;
20588 default:
20589 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20590 break;
20594 if (TARGET_XOP)
20596 /* The XOP VPPERM insn supports three inputs. By ignoring the
20597 one_operand_shuffle special case, we avoid creating another
20598 set of constant vectors in memory. */
20599 one_operand_shuffle = false;
20601 /* mask = mask & {2*w-1, ...} */
20602 vt = GEN_INT (2*w - 1);
20604 else
20606 /* mask = mask & {w-1, ...} */
20607 vt = GEN_INT (w - 1);
20610 for (i = 0; i < w; i++)
20611 vec[i] = vt;
20612 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20613 mask = expand_simple_binop (maskmode, AND, mask, vt,
20614 NULL_RTX, 0, OPTAB_DIRECT);
20616 /* For non-QImode operations, convert the word permutation control
20617 into a byte permutation control. */
20618 if (mode != V16QImode)
20620 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20621 GEN_INT (exact_log2 (e)),
20622 NULL_RTX, 0, OPTAB_DIRECT);
20624 /* Convert mask to vector of chars. */
20625 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20627 /* Replicate each of the input bytes into byte positions:
20628 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20629 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20630 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20631 for (i = 0; i < 16; ++i)
20632 vec[i] = GEN_INT (i/e * e);
20633 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20634 vt = force_const_mem (V16QImode, vt);
20635 if (TARGET_XOP)
20636 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20637 else
20638 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20640 /* Convert it into the byte positions by doing
20641 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20642 for (i = 0; i < 16; ++i)
20643 vec[i] = GEN_INT (i % e);
20644 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20645 vt = force_const_mem (V16QImode, vt);
20646 emit_insn (gen_addv16qi3 (mask, mask, vt));
20649 /* The actual shuffle operations all operate on V16QImode. */
20650 op0 = gen_lowpart (V16QImode, op0);
20651 op1 = gen_lowpart (V16QImode, op1);
20652 target = gen_lowpart (V16QImode, target);
20654 if (TARGET_XOP)
20656 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20658 else if (one_operand_shuffle)
20660 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20662 else
20664 rtx xops[6];
20665 bool ok;
20667 /* Shuffle the two input vectors independently. */
20668 t1 = gen_reg_rtx (V16QImode);
20669 t2 = gen_reg_rtx (V16QImode);
20670 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20671 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20673 merge_two:
20674 /* Then merge them together. The key is whether any given control
20675 element contained a bit set that indicates the second word. */
20676 mask = operands[3];
20677 vt = GEN_INT (w);
20678 if (maskmode == V2DImode && !TARGET_SSE4_1)
20680 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20681 more shuffle to convert the V2DI input mask into a V4SI
20682 input mask. At that point the masking that
20683 ix86_expand_int_vcond performs will work as desired. */
20684 rtx t3 = gen_reg_rtx (V4SImode);
20685 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20686 const0_rtx, const0_rtx,
20687 const2_rtx, const2_rtx));
20688 mask = t3;
20689 maskmode = V4SImode;
20690 e = w = 4;
20693 for (i = 0; i < w; i++)
20694 vec[i] = vt;
20695 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20696 vt = force_reg (maskmode, vt);
20697 mask = expand_simple_binop (maskmode, AND, mask, vt,
20698 NULL_RTX, 0, OPTAB_DIRECT);
20700 xops[0] = gen_lowpart (mode, operands[0]);
20701 xops[1] = gen_lowpart (mode, t2);
20702 xops[2] = gen_lowpart (mode, t1);
20703 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20704 xops[4] = mask;
20705 xops[5] = vt;
20706 ok = ix86_expand_int_vcond (xops);
20707 gcc_assert (ok);
20711 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
20712 true if we should do zero extension, else sign extension. HIGH_P is
20713 true if we want the N/2 high elements, else the low elements. */
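/* A sketch of what this amounts to for V16QImode -> V8HImode: with
   SSE4.1 the low half goes through PMOVZXBW/PMOVSXBW directly, while
   without it the source is interleaved (PUNPCKLBW/PUNPCKHBW) with
   either a zero vector or with (0 > src), whose bytes are the needed
   sign-extension fill.  */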
20715 void
20716 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20718 enum machine_mode imode = GET_MODE (src);
20719 rtx tmp;
20721 if (TARGET_SSE4_1)
20723 rtx (*unpack)(rtx, rtx);
20724 rtx (*extract)(rtx, rtx) = NULL;
20725 enum machine_mode halfmode = BLKmode;
20727 switch (imode)
20729 case V32QImode:
20730 if (unsigned_p)
20731 unpack = gen_avx2_zero_extendv16qiv16hi2;
20732 else
20733 unpack = gen_avx2_sign_extendv16qiv16hi2;
20734 halfmode = V16QImode;
20735 extract
20736 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20737 break;
20738 case V16HImode:
20739 if (unsigned_p)
20740 unpack = gen_avx2_zero_extendv8hiv8si2;
20741 else
20742 unpack = gen_avx2_sign_extendv8hiv8si2;
20743 halfmode = V8HImode;
20744 extract
20745 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20746 break;
20747 case V8SImode:
20748 if (unsigned_p)
20749 unpack = gen_avx2_zero_extendv4siv4di2;
20750 else
20751 unpack = gen_avx2_sign_extendv4siv4di2;
20752 halfmode = V4SImode;
20753 extract
20754 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20755 break;
20756 case V16QImode:
20757 if (unsigned_p)
20758 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20759 else
20760 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20761 break;
20762 case V8HImode:
20763 if (unsigned_p)
20764 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20765 else
20766 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20767 break;
20768 case V4SImode:
20769 if (unsigned_p)
20770 unpack = gen_sse4_1_zero_extendv2siv2di2;
20771 else
20772 unpack = gen_sse4_1_sign_extendv2siv2di2;
20773 break;
20774 default:
20775 gcc_unreachable ();
20778 if (GET_MODE_SIZE (imode) == 32)
20780 tmp = gen_reg_rtx (halfmode);
20781 emit_insn (extract (tmp, src));
20783 else if (high_p)
20785 /* Shift higher 8 bytes to lower 8 bytes. */
20786 tmp = gen_reg_rtx (imode);
20787 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20788 gen_lowpart (V1TImode, src),
20789 GEN_INT (64)));
20791 else
20792 tmp = src;
20794 emit_insn (unpack (dest, tmp));
20796 else
20798 rtx (*unpack)(rtx, rtx, rtx);
20800 switch (imode)
20802 case V16QImode:
20803 if (high_p)
20804 unpack = gen_vec_interleave_highv16qi;
20805 else
20806 unpack = gen_vec_interleave_lowv16qi;
20807 break;
20808 case V8HImode:
20809 if (high_p)
20810 unpack = gen_vec_interleave_highv8hi;
20811 else
20812 unpack = gen_vec_interleave_lowv8hi;
20813 break;
20814 case V4SImode:
20815 if (high_p)
20816 unpack = gen_vec_interleave_highv4si;
20817 else
20818 unpack = gen_vec_interleave_lowv4si;
20819 break;
20820 default:
20821 gcc_unreachable ();
20824 if (unsigned_p)
20825 tmp = force_reg (imode, CONST0_RTX (imode));
20826 else
20827 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20828 src, pc_rtx, pc_rtx);
20830 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20834 /* Expand a conditional increment or decrement using adc/sbb instructions.
20835 The default case using setcc followed by a conditional move can be
20836 done by generic code. */
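/* Roughly, the transformation below turns e.g.
   x += (a <u b) ? 1 : 0 into a compare that leaves CF = (a <u b)
   followed by a single "adc $0, x" (and the decrement / reversed
   conditions likewise map onto "sbb $0, x"), instead of a setcc
   plus add sequence.  */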
20837 bool
20838 ix86_expand_int_addcc (rtx operands[])
20840 enum rtx_code code = GET_CODE (operands[1]);
20841 rtx flags;
20842 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20843 rtx compare_op;
20844 rtx val = const0_rtx;
20845 bool fpcmp = false;
20846 enum machine_mode mode;
20847 rtx op0 = XEXP (operands[1], 0);
20848 rtx op1 = XEXP (operands[1], 1);
20850 if (operands[3] != const1_rtx
20851 && operands[3] != constm1_rtx)
20852 return false;
20853 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20854 return false;
20855 code = GET_CODE (compare_op);
20857 flags = XEXP (compare_op, 0);
20859 if (GET_MODE (flags) == CCFPmode
20860 || GET_MODE (flags) == CCFPUmode)
20862 fpcmp = true;
20863 code = ix86_fp_compare_code_to_integer (code);
20866 if (code != LTU)
20868 val = constm1_rtx;
20869 if (fpcmp)
20870 PUT_CODE (compare_op,
20871 reverse_condition_maybe_unordered
20872 (GET_CODE (compare_op)));
20873 else
20874 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20877 mode = GET_MODE (operands[0]);
20879 /* Construct either adc or sbb insn. */
20880 if ((code == LTU) == (operands[3] == constm1_rtx))
20882 switch (mode)
20884 case QImode:
20885 insn = gen_subqi3_carry;
20886 break;
20887 case HImode:
20888 insn = gen_subhi3_carry;
20889 break;
20890 case SImode:
20891 insn = gen_subsi3_carry;
20892 break;
20893 case DImode:
20894 insn = gen_subdi3_carry;
20895 break;
20896 default:
20897 gcc_unreachable ();
20900 else
20902 switch (mode)
20904 case QImode:
20905 insn = gen_addqi3_carry;
20906 break;
20907 case HImode:
20908 insn = gen_addhi3_carry;
20909 break;
20910 case SImode:
20911 insn = gen_addsi3_carry;
20912 break;
20913 case DImode:
20914 insn = gen_adddi3_carry;
20915 break;
20916 default:
20917 gcc_unreachable ();
20920 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20922 return true;
20926 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20927 but works for floating point parameters and non-offsettable memories.
20928 For pushes, it returns just stack offsets; the values will be saved
20929 in the right order. At most four parts are generated. */
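/* For example (assuming the usual i386 type layouts): on a 32-bit
   target a DFmode value comes back as two SImode parts, XFmode as
   three and TFmode as four, while on a 64-bit target XFmode and
   TFmode each come back as a DImode low part plus an SImode or
   DImode upper part.  */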
20931 static int
20932 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20934 int size;
20936 if (!TARGET_64BIT)
20937 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20938 else
20939 size = (GET_MODE_SIZE (mode) + 4) / 8;
20941 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20942 gcc_assert (size >= 2 && size <= 4);
20944 /* Optimize constant pool references to immediates. This is used by fp
20945 moves, which force all constants to memory to allow combining. */
20946 if (MEM_P (operand) && MEM_READONLY_P (operand))
20948 rtx tmp = maybe_get_pool_constant (operand);
20949 if (tmp)
20950 operand = tmp;
20953 if (MEM_P (operand) && !offsettable_memref_p (operand))
20955 /* The only non-offsettable memories we handle are pushes. */
20956 int ok = push_operand (operand, VOIDmode);
20958 gcc_assert (ok);
20960 operand = copy_rtx (operand);
20961 PUT_MODE (operand, word_mode);
20962 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20963 return size;
20966 if (GET_CODE (operand) == CONST_VECTOR)
20968 enum machine_mode imode = int_mode_for_mode (mode);
20969 /* Caution: if we looked through a constant pool memory above,
20970 the operand may actually have a different mode now. That's
20971 ok, since we want to pun this all the way back to an integer. */
20972 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20973 gcc_assert (operand != NULL);
20974 mode = imode;
20977 if (!TARGET_64BIT)
20979 if (mode == DImode)
20980 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20981 else
20983 int i;
20985 if (REG_P (operand))
20987 gcc_assert (reload_completed);
20988 for (i = 0; i < size; i++)
20989 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20991 else if (offsettable_memref_p (operand))
20993 operand = adjust_address (operand, SImode, 0);
20994 parts[0] = operand;
20995 for (i = 1; i < size; i++)
20996 parts[i] = adjust_address (operand, SImode, 4 * i);
20998 else if (GET_CODE (operand) == CONST_DOUBLE)
21000 REAL_VALUE_TYPE r;
21001 long l[4];
21003 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21004 switch (mode)
21006 case TFmode:
21007 real_to_target (l, &r, mode);
21008 parts[3] = gen_int_mode (l[3], SImode);
21009 parts[2] = gen_int_mode (l[2], SImode);
21010 break;
21011 case XFmode:
21012 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21013 long double may not be 80-bit. */
21014 real_to_target (l, &r, mode);
21015 parts[2] = gen_int_mode (l[2], SImode);
21016 break;
21017 case DFmode:
21018 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21019 break;
21020 default:
21021 gcc_unreachable ();
21023 parts[1] = gen_int_mode (l[1], SImode);
21024 parts[0] = gen_int_mode (l[0], SImode);
21026 else
21027 gcc_unreachable ();
21030 else
21032 if (mode == TImode)
21033 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21034 if (mode == XFmode || mode == TFmode)
21036 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21037 if (REG_P (operand))
21039 gcc_assert (reload_completed);
21040 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21041 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21043 else if (offsettable_memref_p (operand))
21045 operand = adjust_address (operand, DImode, 0);
21046 parts[0] = operand;
21047 parts[1] = adjust_address (operand, upper_mode, 8);
21049 else if (GET_CODE (operand) == CONST_DOUBLE)
21051 REAL_VALUE_TYPE r;
21052 long l[4];
21054 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21055 real_to_target (l, &r, mode);
21057 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21058 if (HOST_BITS_PER_WIDE_INT >= 64)
21059 parts[0]
21060 = gen_int_mode
21061 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21062 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21063 DImode);
21064 else
21065 parts[0] = immed_double_const (l[0], l[1], DImode);
21067 if (upper_mode == SImode)
21068 parts[1] = gen_int_mode (l[2], SImode);
21069 else if (HOST_BITS_PER_WIDE_INT >= 64)
21070 parts[1]
21071 = gen_int_mode
21072 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21073 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21074 DImode);
21075 else
21076 parts[1] = immed_double_const (l[2], l[3], DImode);
21078 else
21079 gcc_unreachable ();
21083 return size;
21086 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21087 All required insns are emitted by this function. Operands 2-5 hold
21088 the destination parts and operands 6-9 the source parts, in the
21089 order in which the moves must be performed. */
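/* As a rough illustration: on a 32-bit target a DImode move "dst = src"
   from memory is split into two SImode moves, e.g.

     movl 0(src_addr), dst_lo
     movl 4(src_addr), dst_hi

   and the two moves must be emitted in the opposite order whenever dst_lo
   happens to be the register holding src_addr, otherwise the first move
   would clobber the address before the second one uses it.  The operand
   names here are purely illustrative.  */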
21091 void
21092 ix86_split_long_move (rtx operands[])
21094 rtx part[2][4];
21095 int nparts, i, j;
21096 int push = 0;
21097 int collisions = 0;
21098 enum machine_mode mode = GET_MODE (operands[0]);
21099 bool collisionparts[4];
21101 /* The DFmode expanders may ask us to move a double.
21102 For a 64-bit target this is a single move. By hiding that fact
21103 here we simplify the i386.md splitters. */
21104 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21106 /* Optimize constant pool references into immediates. This is used by
21107 fp moves, which force all constants to memory to allow combining. */
21109 if (MEM_P (operands[1])
21110 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21111 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21112 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21113 if (push_operand (operands[0], VOIDmode))
21115 operands[0] = copy_rtx (operands[0]);
21116 PUT_MODE (operands[0], word_mode);
21118 else
21119 operands[0] = gen_lowpart (DImode, operands[0]);
21120 operands[1] = gen_lowpart (DImode, operands[1]);
21121 emit_move_insn (operands[0], operands[1]);
21122 return;
21125 /* The only non-offsettable memory we handle is push. */
21126 if (push_operand (operands[0], VOIDmode))
21127 push = 1;
21128 else
21129 gcc_assert (!MEM_P (operands[0])
21130 || offsettable_memref_p (operands[0]));
21132 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21133 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21135 /* When emitting a push, take care with source operands on the stack. */
21136 if (push && MEM_P (operands[1])
21137 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21139 rtx src_base = XEXP (part[1][nparts - 1], 0);
21141 /* Compensate for the stack decrement by 4. */
21142 if (!TARGET_64BIT && nparts == 3
21143 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21144 src_base = plus_constant (Pmode, src_base, 4);
21146 /* src_base refers to the stack pointer and is
21147 automatically decreased by each emitted push. */
21148 for (i = 0; i < nparts; i++)
21149 part[1][i] = change_address (part[1][i],
21150 GET_MODE (part[1][i]), src_base);
21153 /* We need to do the copy in the right order in case an address register
21154 of the source overlaps the destination. */
21155 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21157 rtx tmp;
21159 for (i = 0; i < nparts; i++)
21161 collisionparts[i]
21162 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21163 if (collisionparts[i])
21164 collisions++;
21167 /* Collision in the middle part can be handled by reordering. */
21168 if (collisions == 1 && nparts == 3 && collisionparts [1])
21170 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21171 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21173 else if (collisions == 1
21174 && nparts == 4
21175 && (collisionparts [1] || collisionparts [2]))
21177 if (collisionparts [1])
21179 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21180 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21182 else
21184 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21185 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21189 /* If there are more collisions, we can't handle them by reordering.
21190 Do an lea into the last destination part and use only one colliding move. */
21191 else if (collisions > 1)
21193 rtx base;
21195 collisions = 1;
21197 base = part[0][nparts - 1];
21199 /* Handle the case when the last part isn't valid for lea.
21200 Happens in 64-bit mode storing the 12-byte XFmode. */
21201 if (GET_MODE (base) != Pmode)
21202 base = gen_rtx_REG (Pmode, REGNO (base));
21204 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21205 part[1][0] = replace_equiv_address (part[1][0], base);
21206 for (i = 1; i < nparts; i++)
21208 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21209 part[1][i] = replace_equiv_address (part[1][i], tmp);
21214 if (push)
21216 if (!TARGET_64BIT)
21218 if (nparts == 3)
21220 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21221 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21222 stack_pointer_rtx, GEN_INT (-4)));
21223 emit_move_insn (part[0][2], part[1][2]);
21225 else if (nparts == 4)
21227 emit_move_insn (part[0][3], part[1][3]);
21228 emit_move_insn (part[0][2], part[1][2]);
21231 else
21233 /* In 64-bit mode we don't have a 32-bit push available. If the operand is a
21234 register, that is OK - we will just use the larger counterpart. We also
21235 retype memory operands - these come from an attempt to avoid a REX prefix
21236 when moving the second half of a TFmode value. */
21237 if (GET_MODE (part[1][1]) == SImode)
21239 switch (GET_CODE (part[1][1]))
21241 case MEM:
21242 part[1][1] = adjust_address (part[1][1], DImode, 0);
21243 break;
21245 case REG:
21246 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21247 break;
21249 default:
21250 gcc_unreachable ();
21253 if (GET_MODE (part[1][0]) == SImode)
21254 part[1][0] = part[1][1];
21257 emit_move_insn (part[0][1], part[1][1]);
21258 emit_move_insn (part[0][0], part[1][0]);
21259 return;
21262 /* Choose correct order to not overwrite the source before it is copied. */
21263 if ((REG_P (part[0][0])
21264 && REG_P (part[1][1])
21265 && (REGNO (part[0][0]) == REGNO (part[1][1])
21266 || (nparts == 3
21267 && REGNO (part[0][0]) == REGNO (part[1][2]))
21268 || (nparts == 4
21269 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21270 || (collisions > 0
21271 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21273 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21275 operands[2 + i] = part[0][j];
21276 operands[6 + i] = part[1][j];
21279 else
21281 for (i = 0; i < nparts; i++)
21283 operands[2 + i] = part[0][i];
21284 operands[6 + i] = part[1][i];
21288 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21289 if (optimize_insn_for_size_p ())
21291 for (j = 0; j < nparts - 1; j++)
21292 if (CONST_INT_P (operands[6 + j])
21293 && operands[6 + j] != const0_rtx
21294 && REG_P (operands[2 + j]))
21295 for (i = j; i < nparts - 1; i++)
21296 if (CONST_INT_P (operands[7 + i])
21297 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21298 operands[7 + i] = operands[2 + j];
21301 for (i = 0; i < nparts; i++)
21302 emit_move_insn (operands[2 + i], operands[6 + i]);
21304 return;
21307 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21308 left shift by a constant, either using a single shift or
21309 a sequence of add instructions. */
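/* As a rough illustration: when adds are cheap enough relative to a
   constant shift, "x << 2" is emitted as two self-additions, since
   x + x == x << 1:

     addl %eax, %eax        # x <<= 1
     addl %eax, %eax        # x <<= 1 again, giving x << 2

   otherwise a single "shll $2, %eax" is used.  The register choice here
   is purely illustrative.  */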
21311 static void
21312 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21314 rtx (*insn)(rtx, rtx, rtx);
21316 if (count == 1
21317 || (count * ix86_cost->add <= ix86_cost->shift_const
21318 && !optimize_insn_for_size_p ()))
21320 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21321 while (count-- > 0)
21322 emit_insn (insn (operand, operand, operand));
21324 else
21326 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21327 emit_insn (insn (operand, operand, GEN_INT (count)));
21331 void
21332 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21334 rtx (*gen_ashl3)(rtx, rtx, rtx);
21335 rtx (*gen_shld)(rtx, rtx, rtx);
21336 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21338 rtx low[2], high[2];
21339 int count;
21341 if (CONST_INT_P (operands[2]))
21343 split_double_mode (mode, operands, 2, low, high);
21344 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21346 if (count >= half_width)
21348 emit_move_insn (high[0], low[1]);
21349 emit_move_insn (low[0], const0_rtx);
21351 if (count > half_width)
21352 ix86_expand_ashl_const (high[0], count - half_width, mode);
21354 else
21356 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21358 if (!rtx_equal_p (operands[0], operands[1]))
21359 emit_move_insn (operands[0], operands[1]);
21361 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21362 ix86_expand_ashl_const (low[0], count, mode);
21364 return;
21367 split_double_mode (mode, operands, 1, low, high);
21369 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21371 if (operands[1] == const1_rtx)
21373 /* Assuming we've chosen QImode-capable registers, 1 << N
21374 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21375 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21377 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21379 ix86_expand_clear (low[0]);
21380 ix86_expand_clear (high[0]);
21381 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21383 d = gen_lowpart (QImode, low[0]);
21384 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21385 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21386 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21388 d = gen_lowpart (QImode, high[0]);
21389 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21390 s = gen_rtx_NE (QImode, flags, const0_rtx);
21391 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21394 /* Otherwise, we can get the same results by manually performing
21395 a bit extract operation on bit 5/6, and then performing the two
21396 shifts. The two methods of getting 0/1 into low/high are exactly
21397 the same size. Avoiding the shift in the bit extract case helps
21398 pentium4 a bit; no one else seems to care much either way. */
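/* A rough sketch of the bit-extract variant for the DImode case on a
   32-bit target, in C-like pseudocode:

     high = (count >> 5) & 1;      bit 5 tells whether count >= 32
     low  = high ^ 1;              exactly one half receives the 1
     low  <<= count;               the hardware shift masks count to 0..31
     high <<= count;

   so for count == 40 this ends with high == 1 << 8 and low == 0, i.e.
   the 64-bit value 1 << 40.  */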
21399 else
21401 enum machine_mode half_mode;
21402 rtx (*gen_lshr3)(rtx, rtx, rtx);
21403 rtx (*gen_and3)(rtx, rtx, rtx);
21404 rtx (*gen_xor3)(rtx, rtx, rtx);
21405 HOST_WIDE_INT bits;
21406 rtx x;
21408 if (mode == DImode)
21410 half_mode = SImode;
21411 gen_lshr3 = gen_lshrsi3;
21412 gen_and3 = gen_andsi3;
21413 gen_xor3 = gen_xorsi3;
21414 bits = 5;
21416 else
21418 half_mode = DImode;
21419 gen_lshr3 = gen_lshrdi3;
21420 gen_and3 = gen_anddi3;
21421 gen_xor3 = gen_xordi3;
21422 bits = 6;
21425 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21426 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21427 else
21428 x = gen_lowpart (half_mode, operands[2]);
21429 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21431 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21432 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21433 emit_move_insn (low[0], high[0]);
21434 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21437 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21438 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21439 return;
21442 if (operands[1] == constm1_rtx)
21444 /* For -1 << N, we can avoid the shld instruction, because we
21445 know that we're shifting 0...31/63 ones into a -1. */
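/* As a rough illustration: for (-1) << n with n < 32 on a 32-bit target
   the high half simply stays 0xFFFFFFFF, e.g.

     (-1) << 10  ==  high 0xFFFFFFFF : low 0xFFFFFC00

   so only the low half needs the variable shift, and the usual shld that
   would feed the low half's top bits into the high half can be skipped
   because those bits are already all ones.  Counts of 32 or more are
   fixed up by the shift-adjust sequence emitted further below.  */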
21446 emit_move_insn (low[0], constm1_rtx);
21447 if (optimize_insn_for_size_p ())
21448 emit_move_insn (high[0], low[0]);
21449 else
21450 emit_move_insn (high[0], constm1_rtx);
21452 else
21454 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21456 if (!rtx_equal_p (operands[0], operands[1]))
21457 emit_move_insn (operands[0], operands[1]);
21459 split_double_mode (mode, operands, 1, low, high);
21460 emit_insn (gen_shld (high[0], low[0], operands[2]));
21463 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21465 if (TARGET_CMOVE && scratch)
21467 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21468 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21470 ix86_expand_clear (scratch);
21471 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21473 else
21475 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21476 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21478 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21482 void
21483 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21485 rtx (*gen_ashr3)(rtx, rtx, rtx)
21486 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21487 rtx (*gen_shrd)(rtx, rtx, rtx);
21488 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21490 rtx low[2], high[2];
21491 int count;
21493 if (CONST_INT_P (operands[2]))
21495 split_double_mode (mode, operands, 2, low, high);
21496 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21498 if (count == GET_MODE_BITSIZE (mode) - 1)
21500 emit_move_insn (high[0], high[1]);
21501 emit_insn (gen_ashr3 (high[0], high[0],
21502 GEN_INT (half_width - 1)));
21503 emit_move_insn (low[0], high[0]);
21506 else if (count >= half_width)
21508 emit_move_insn (low[0], high[1]);
21509 emit_move_insn (high[0], low[0]);
21510 emit_insn (gen_ashr3 (high[0], high[0],
21511 GEN_INT (half_width - 1)));
21513 if (count > half_width)
21514 emit_insn (gen_ashr3 (low[0], low[0],
21515 GEN_INT (count - half_width)));
21517 else
21519 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21521 if (!rtx_equal_p (operands[0], operands[1]))
21522 emit_move_insn (operands[0], operands[1]);
21524 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21525 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21528 else
21530 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21532 if (!rtx_equal_p (operands[0], operands[1]))
21533 emit_move_insn (operands[0], operands[1]);
21535 split_double_mode (mode, operands, 1, low, high);
21537 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21538 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21540 if (TARGET_CMOVE && scratch)
21542 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21543 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21545 emit_move_insn (scratch, high[0]);
21546 emit_insn (gen_ashr3 (scratch, scratch,
21547 GEN_INT (half_width - 1)));
21548 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21549 scratch));
21551 else
21553 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21554 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21556 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21561 void
21562 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21564 rtx (*gen_lshr3)(rtx, rtx, rtx)
21565 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21566 rtx (*gen_shrd)(rtx, rtx, rtx);
21567 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21569 rtx low[2], high[2];
21570 int count;
21572 if (CONST_INT_P (operands[2]))
21574 split_double_mode (mode, operands, 2, low, high);
21575 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21577 if (count >= half_width)
21579 emit_move_insn (low[0], high[1]);
21580 ix86_expand_clear (high[0]);
21582 if (count > half_width)
21583 emit_insn (gen_lshr3 (low[0], low[0],
21584 GEN_INT (count - half_width)));
21586 else
21588 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21590 if (!rtx_equal_p (operands[0], operands[1]))
21591 emit_move_insn (operands[0], operands[1]);
21593 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21594 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21597 else
21599 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21601 if (!rtx_equal_p (operands[0], operands[1]))
21602 emit_move_insn (operands[0], operands[1]);
21604 split_double_mode (mode, operands, 1, low, high);
21606 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21607 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21609 if (TARGET_CMOVE && scratch)
21611 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21612 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21614 ix86_expand_clear (scratch);
21615 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21616 scratch));
21618 else
21620 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21621 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21623 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21628 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21629 static void
21630 predict_jump (int prob)
21632 rtx insn = get_last_insn ();
21633 gcc_assert (JUMP_P (insn));
21634 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21637 /* Helper function for the string operations below. Test whether VARIABLE is
21638 aligned to VALUE bytes; return a label that is branched to when it is. */
21639 static rtx
21640 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21642 rtx label = gen_label_rtx ();
21643 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21644 if (GET_MODE (variable) == DImode)
21645 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21646 else
21647 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21648 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21649 1, label);
21650 if (epilogue)
21651 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21652 else
21653 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21654 return label;
21657 /* Decrease COUNTREG by VALUE. */
21658 static void
21659 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21661 rtx (*gen_add)(rtx, rtx, rtx)
21662 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21664 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21667 /* Zero extend possibly SImode EXP to Pmode register. */
rtx
21669 ix86_zero_extend_to_Pmode (rtx exp)
21671 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21674 /* Divide COUNTREG by SCALE. */
21675 static rtx
21676 scale_counter (rtx countreg, int scale)
21678 rtx sc;
21680 if (scale == 1)
21681 return countreg;
21682 if (CONST_INT_P (countreg))
21683 return GEN_INT (INTVAL (countreg) / scale);
21684 gcc_assert (REG_P (countreg));
21686 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21687 GEN_INT (exact_log2 (scale)),
21688 NULL, 1, OPTAB_DIRECT);
21689 return sc;
21692 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21693 DImode for constant loop counts. */
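/* As a rough illustration: a constant count of 1000 fits in SImode, while
   a constant count of 0x100000000 needs DImode on a 64-bit target;
   non-constant counts keep their own mode, or default to Pmode.  */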
21695 static enum machine_mode
21696 counter_mode (rtx count_exp)
21698 if (GET_MODE (count_exp) != VOIDmode)
21699 return GET_MODE (count_exp);
21700 if (!CONST_INT_P (count_exp))
21701 return Pmode;
21702 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21703 return DImode;
21704 return SImode;
21707 /* When SRCPTR is non-NULL, output a simple loop to move memory
21708 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21709 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
21710 equivalent loop to set memory to VALUE (which is expected to be in MODE).
21712 The size is rounded down to a whole number of chunks moved at once.
21713 SRCMEM and DESTMEM provide the MEM rtxen used to supply proper aliasing info. */
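/* A rough sketch of the emitted structure, in C-like pseudocode, where
   PIECE stands for GET_MODE_SIZE (MODE) * UNROLL:

     size = count & ~(PIECE - 1);
     for (iter = 0; iter < size; iter += PIECE)
       copy (or store) PIECE bytes at offset iter;   unrolled UNROLL times
     destptr += iter;
     srcptr += iter;                                 for the copy case

   the remaining tail of fewer than PIECE bytes is left to the caller's
   epilogue code.  */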
21716 static void
21717 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21718 rtx destptr, rtx srcptr, rtx value,
21719 rtx count, enum machine_mode mode, int unroll,
21720 int expected_size)
21722 rtx out_label, top_label, iter, tmp;
21723 enum machine_mode iter_mode = counter_mode (count);
21724 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21725 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21726 rtx size;
21727 rtx x_addr;
21728 rtx y_addr;
21729 int i;
21731 top_label = gen_label_rtx ();
21732 out_label = gen_label_rtx ();
21733 iter = gen_reg_rtx (iter_mode);
21735 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21736 NULL, 1, OPTAB_DIRECT);
21737 /* Those two should combine. */
21738 if (piece_size == const1_rtx)
21740 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21741 true, out_label);
21742 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21744 emit_move_insn (iter, const0_rtx);
21746 emit_label (top_label);
21748 tmp = convert_modes (Pmode, iter_mode, iter, true);
21749 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21750 destmem = change_address (destmem, mode, x_addr);
21752 if (srcmem)
21754 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21755 srcmem = change_address (srcmem, mode, y_addr);
21757 /* When unrolling for chips that reorder memory reads and writes,
21758 we can save registers by using a single temporary.
21759 Using 4 temporaries is also overkill in 32-bit mode. */
21760 if (!TARGET_64BIT && 0)
21762 for (i = 0; i < unroll; i++)
21764 if (i)
21766 destmem =
21767 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21768 srcmem =
21769 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21771 emit_move_insn (destmem, srcmem);
21774 else
21776 rtx tmpreg[4];
21777 gcc_assert (unroll <= 4);
21778 for (i = 0; i < unroll; i++)
21780 tmpreg[i] = gen_reg_rtx (mode);
21781 if (i)
21783 srcmem =
21784 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21786 emit_move_insn (tmpreg[i], srcmem);
21788 for (i = 0; i < unroll; i++)
21790 if (i)
21792 destmem =
21793 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21795 emit_move_insn (destmem, tmpreg[i]);
21799 else
21800 for (i = 0; i < unroll; i++)
21802 if (i)
21803 destmem =
21804 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21805 emit_move_insn (destmem, value);
21808 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21809 true, OPTAB_LIB_WIDEN);
21810 if (tmp != iter)
21811 emit_move_insn (iter, tmp);
21813 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21814 true, top_label);
21815 if (expected_size != -1)
21817 expected_size /= GET_MODE_SIZE (mode) * unroll;
21818 if (expected_size == 0)
21819 predict_jump (0);
21820 else if (expected_size > REG_BR_PROB_BASE)
21821 predict_jump (REG_BR_PROB_BASE - 1);
21822 else
21823 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21825 else
21826 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21827 iter = ix86_zero_extend_to_Pmode (iter);
21828 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21829 true, OPTAB_LIB_WIDEN);
21830 if (tmp != destptr)
21831 emit_move_insn (destptr, tmp);
21832 if (srcptr)
21834 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21835 true, OPTAB_LIB_WIDEN);
21836 if (tmp != srcptr)
21837 emit_move_insn (srcptr, tmp);
21839 emit_label (out_label);
21842 /* Output a "rep; mov" instruction.
21843 The arguments have the same meaning as for the previous function. */
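/* As a rough illustration: for a SImode copy the byte count is scaled
   down first, so a known 32-byte copy ends up with a count register of 8,
   and DESTEXP/SRCEXP describe the final pointer values as
   destptr + (countreg << 2) and srcptr + (countreg << 2).  */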
21844 static void
21845 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21846 rtx destptr, rtx srcptr,
21847 rtx count,
21848 enum machine_mode mode)
21850 rtx destexp;
21851 rtx srcexp;
21852 rtx countreg;
21853 HOST_WIDE_INT rounded_count;
21855 /* If the size is known and a multiple of 4, it is shorter to use rep movsl. */
21856 if (mode == QImode && CONST_INT_P (count)
21857 && !(INTVAL (count) & 3))
21858 mode = SImode;
21860 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21861 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21862 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21863 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21864 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21865 if (mode != QImode)
21867 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21868 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21869 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21870 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21871 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21872 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21874 else
21876 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21877 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21879 if (CONST_INT_P (count))
21881 rounded_count = (INTVAL (count)
21882 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21883 destmem = shallow_copy_rtx (destmem);
21884 srcmem = shallow_copy_rtx (srcmem);
21885 set_mem_size (destmem, rounded_count);
21886 set_mem_size (srcmem, rounded_count);
21888 else
21890 if (MEM_SIZE_KNOWN_P (destmem))
21891 clear_mem_size (destmem);
21892 if (MEM_SIZE_KNOWN_P (srcmem))
21893 clear_mem_size (srcmem);
21895 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21896 destexp, srcexp));
21899 /* Output a "rep; stos" instruction.
21900 The arguments have the same meaning as for the previous function. */
21901 static void
21902 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21903 rtx count, enum machine_mode mode,
21904 rtx orig_value)
21906 rtx destexp;
21907 rtx countreg;
21908 HOST_WIDE_INT rounded_count;
21910 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21911 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21912 value = force_reg (mode, gen_lowpart (mode, value));
21913 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21914 if (mode != QImode)
21916 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21917 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21918 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21920 else
21921 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21922 if (orig_value == const0_rtx && CONST_INT_P (count))
21924 rounded_count = (INTVAL (count)
21925 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21926 destmem = shallow_copy_rtx (destmem);
21927 set_mem_size (destmem, rounded_count);
21929 else if (MEM_SIZE_KNOWN_P (destmem))
21930 clear_mem_size (destmem);
21931 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21934 static void
21935 emit_strmov (rtx destmem, rtx srcmem,
21936 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21938 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21939 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21940 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21943 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
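/* As a rough illustration: for a known remaining count the epilogue is
   straight-line code driven by the bits of that count.  13 leftover bytes
   (8 + 4 + 1) become one DImode, one SImode and one QImode move on a
   64-bit target; on a 32-bit target the 8-byte piece is done as two
   SImode moves instead.  */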
21944 static void
21945 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21946 rtx destptr, rtx srcptr, rtx count, int max_size)
21948 rtx src, dest;
21949 if (CONST_INT_P (count))
21951 HOST_WIDE_INT countval = INTVAL (count);
21952 int offset = 0;
21954 if ((countval & 0x10) && max_size > 16)
21956 if (TARGET_64BIT)
21958 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21959 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21961 else
21962 gcc_unreachable ();
21963 offset += 16;
21965 if ((countval & 0x08) && max_size > 8)
21967 if (TARGET_64BIT)
21968 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21969 else
21971 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21972 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21974 offset += 8;
21976 if ((countval & 0x04) && max_size > 4)
21978 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21979 offset += 4;
21981 if ((countval & 0x02) && max_size > 2)
21983 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21984 offset += 2;
21986 if ((countval & 0x01) && max_size > 1)
21988 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21989 offset += 1;
21991 return;
21993 if (max_size > 8)
21995 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21996 count, 1, OPTAB_DIRECT);
21997 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21998 count, QImode, 1, 4);
21999 return;
22002 /* When single stringop instructions are available, we can cheaply advance the
22003 dest and src pointers with them. Otherwise we save code size by maintaining an
22004 offset register (zero is readily available from the preceding rep operation) and using x86 addressing modes.
22006 if (TARGET_SINGLE_STRINGOP)
22008 if (max_size > 4)
22010 rtx label = ix86_expand_aligntest (count, 4, true);
22011 src = change_address (srcmem, SImode, srcptr);
22012 dest = change_address (destmem, SImode, destptr);
22013 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22014 emit_label (label);
22015 LABEL_NUSES (label) = 1;
22017 if (max_size > 2)
22019 rtx label = ix86_expand_aligntest (count, 2, true);
22020 src = change_address (srcmem, HImode, srcptr);
22021 dest = change_address (destmem, HImode, destptr);
22022 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22023 emit_label (label);
22024 LABEL_NUSES (label) = 1;
22026 if (max_size > 1)
22028 rtx label = ix86_expand_aligntest (count, 1, true);
22029 src = change_address (srcmem, QImode, srcptr);
22030 dest = change_address (destmem, QImode, destptr);
22031 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22032 emit_label (label);
22033 LABEL_NUSES (label) = 1;
22036 else
22038 rtx offset = force_reg (Pmode, const0_rtx);
22039 rtx tmp;
22041 if (max_size > 4)
22043 rtx label = ix86_expand_aligntest (count, 4, true);
22044 src = change_address (srcmem, SImode, srcptr);
22045 dest = change_address (destmem, SImode, destptr);
22046 emit_move_insn (dest, src);
22047 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22048 true, OPTAB_LIB_WIDEN);
22049 if (tmp != offset)
22050 emit_move_insn (offset, tmp);
22051 emit_label (label);
22052 LABEL_NUSES (label) = 1;
22054 if (max_size > 2)
22056 rtx label = ix86_expand_aligntest (count, 2, true);
22057 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22058 src = change_address (srcmem, HImode, tmp);
22059 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22060 dest = change_address (destmem, HImode, tmp);
22061 emit_move_insn (dest, src);
22062 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22063 true, OPTAB_LIB_WIDEN);
22064 if (tmp != offset)
22065 emit_move_insn (offset, tmp);
22066 emit_label (label);
22067 LABEL_NUSES (label) = 1;
22069 if (max_size > 1)
22071 rtx label = ix86_expand_aligntest (count, 1, true);
22072 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22073 src = change_address (srcmem, QImode, tmp);
22074 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22075 dest = change_address (destmem, QImode, tmp);
22076 emit_move_insn (dest, src);
22077 emit_label (label);
22078 LABEL_NUSES (label) = 1;
22083 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22084 static void
22085 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22086 rtx count, int max_size)
22088 count =
22089 expand_simple_binop (counter_mode (count), AND, count,
22090 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22091 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22092 gen_lowpart (QImode, value), count, QImode,
22093 1, max_size / 2);
22096 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22097 static void
22098 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22100 rtx dest;
22102 if (CONST_INT_P (count))
22104 HOST_WIDE_INT countval = INTVAL (count);
22105 int offset = 0;
22107 if ((countval & 0x10) && max_size > 16)
22109 if (TARGET_64BIT)
22111 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22112 emit_insn (gen_strset (destptr, dest, value));
22113 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22114 emit_insn (gen_strset (destptr, dest, value));
22116 else
22117 gcc_unreachable ();
22118 offset += 16;
22120 if ((countval & 0x08) && max_size > 8)
22122 if (TARGET_64BIT)
22124 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22125 emit_insn (gen_strset (destptr, dest, value));
22127 else
22129 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22130 emit_insn (gen_strset (destptr, dest, value));
22131 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22132 emit_insn (gen_strset (destptr, dest, value));
22134 offset += 8;
22136 if ((countval & 0x04) && max_size > 4)
22138 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22139 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22140 offset += 4;
22142 if ((countval & 0x02) && max_size > 2)
22144 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22145 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22146 offset += 2;
22148 if ((countval & 0x01) && max_size > 1)
22150 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22151 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22152 offset += 1;
22154 return;
22156 if (max_size > 32)
22158 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22159 return;
22161 if (max_size > 16)
22163 rtx label = ix86_expand_aligntest (count, 16, true);
22164 if (TARGET_64BIT)
22166 dest = change_address (destmem, DImode, destptr);
22167 emit_insn (gen_strset (destptr, dest, value));
22168 emit_insn (gen_strset (destptr, dest, value));
22170 else
22172 dest = change_address (destmem, SImode, destptr);
22173 emit_insn (gen_strset (destptr, dest, value));
22174 emit_insn (gen_strset (destptr, dest, value));
22175 emit_insn (gen_strset (destptr, dest, value));
22176 emit_insn (gen_strset (destptr, dest, value));
22178 emit_label (label);
22179 LABEL_NUSES (label) = 1;
22181 if (max_size > 8)
22183 rtx label = ix86_expand_aligntest (count, 8, true);
22184 if (TARGET_64BIT)
22186 dest = change_address (destmem, DImode, destptr);
22187 emit_insn (gen_strset (destptr, dest, value));
22189 else
22191 dest = change_address (destmem, SImode, destptr);
22192 emit_insn (gen_strset (destptr, dest, value));
22193 emit_insn (gen_strset (destptr, dest, value));
22195 emit_label (label);
22196 LABEL_NUSES (label) = 1;
22198 if (max_size > 4)
22200 rtx label = ix86_expand_aligntest (count, 4, true);
22201 dest = change_address (destmem, SImode, destptr);
22202 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22203 emit_label (label);
22204 LABEL_NUSES (label) = 1;
22206 if (max_size > 2)
22208 rtx label = ix86_expand_aligntest (count, 2, true);
22209 dest = change_address (destmem, HImode, destptr);
22210 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22211 emit_label (label);
22212 LABEL_NUSES (label) = 1;
22214 if (max_size > 1)
22216 rtx label = ix86_expand_aligntest (count, 1, true);
22217 dest = change_address (destmem, QImode, destptr);
22218 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22219 emit_label (label);
22220 LABEL_NUSES (label) = 1;
22224 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN, to
22225 DESIRED_ALIGNMENT. */
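/* As a rough illustration: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the
   prologue emitted here behaves like

     if (dest & 1) { copy 1 byte;  count -= 1; }
     if (dest & 2) { copy 2 bytes; count -= 2; }
     if (dest & 4) { copy 4 bytes; count -= 4; }

   in C-like pseudocode, after which the destination is 8-byte aligned;
   each test is the conditional branch produced by ix86_expand_aligntest.  */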
22226 static void
22227 expand_movmem_prologue (rtx destmem, rtx srcmem,
22228 rtx destptr, rtx srcptr, rtx count,
22229 int align, int desired_alignment)
22231 if (align <= 1 && desired_alignment > 1)
22233 rtx label = ix86_expand_aligntest (destptr, 1, false);
22234 srcmem = change_address (srcmem, QImode, srcptr);
22235 destmem = change_address (destmem, QImode, destptr);
22236 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22237 ix86_adjust_counter (count, 1);
22238 emit_label (label);
22239 LABEL_NUSES (label) = 1;
22241 if (align <= 2 && desired_alignment > 2)
22243 rtx label = ix86_expand_aligntest (destptr, 2, false);
22244 srcmem = change_address (srcmem, HImode, srcptr);
22245 destmem = change_address (destmem, HImode, destptr);
22246 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22247 ix86_adjust_counter (count, 2);
22248 emit_label (label);
22249 LABEL_NUSES (label) = 1;
22251 if (align <= 4 && desired_alignment > 4)
22253 rtx label = ix86_expand_aligntest (destptr, 4, false);
22254 srcmem = change_address (srcmem, SImode, srcptr);
22255 destmem = change_address (destmem, SImode, destptr);
22256 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22257 ix86_adjust_counter (count, 4);
22258 emit_label (label);
22259 LABEL_NUSES (label) = 1;
22261 gcc_assert (desired_alignment <= 8);
22264 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22265 ALIGN_BYTES is how many bytes need to be copied. */
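/* As a rough illustration: when ALIGN_BYTES is known to be, say, 5, the
   prologue is unconditional: one 1-byte copy followed by one 4-byte copy,
   after which the MEM alignment and size information on DST and SRC is
   updated to account for the 5 bytes already handled.  */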
22266 static rtx
22267 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22268 int desired_align, int align_bytes)
22270 rtx src = *srcp;
22271 rtx orig_dst = dst;
22272 rtx orig_src = src;
22273 int off = 0;
22274 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22275 if (src_align_bytes >= 0)
22276 src_align_bytes = desired_align - src_align_bytes;
22277 if (align_bytes & 1)
22279 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22280 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22281 off = 1;
22282 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22284 if (align_bytes & 2)
22286 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22287 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22288 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22289 set_mem_align (dst, 2 * BITS_PER_UNIT);
22290 if (src_align_bytes >= 0
22291 && (src_align_bytes & 1) == (align_bytes & 1)
22292 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22293 set_mem_align (src, 2 * BITS_PER_UNIT);
22294 off = 2;
22295 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22297 if (align_bytes & 4)
22299 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22300 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22301 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22302 set_mem_align (dst, 4 * BITS_PER_UNIT);
22303 if (src_align_bytes >= 0)
22305 unsigned int src_align = 0;
22306 if ((src_align_bytes & 3) == (align_bytes & 3))
22307 src_align = 4;
22308 else if ((src_align_bytes & 1) == (align_bytes & 1))
22309 src_align = 2;
22310 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22311 set_mem_align (src, src_align * BITS_PER_UNIT);
22313 off = 4;
22314 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22316 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22317 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22318 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22319 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22320 if (src_align_bytes >= 0)
22322 unsigned int src_align = 0;
22323 if ((src_align_bytes & 7) == (align_bytes & 7))
22324 src_align = 8;
22325 else if ((src_align_bytes & 3) == (align_bytes & 3))
22326 src_align = 4;
22327 else if ((src_align_bytes & 1) == (align_bytes & 1))
22328 src_align = 2;
22329 if (src_align > (unsigned int) desired_align)
22330 src_align = desired_align;
22331 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22332 set_mem_align (src, src_align * BITS_PER_UNIT);
22334 if (MEM_SIZE_KNOWN_P (orig_dst))
22335 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22336 if (MEM_SIZE_KNOWN_P (orig_src))
22337 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22338 *srcp = src;
22339 return dst;
22342 /* Store enough of VALUE into DEST to align DEST, known to be aligned by ALIGN, to
22343 DESIRED_ALIGNMENT. */
22344 static void
22345 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22346 int align, int desired_alignment)
22348 if (align <= 1 && desired_alignment > 1)
22350 rtx label = ix86_expand_aligntest (destptr, 1, false);
22351 destmem = change_address (destmem, QImode, destptr);
22352 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22353 ix86_adjust_counter (count, 1);
22354 emit_label (label);
22355 LABEL_NUSES (label) = 1;
22357 if (align <= 2 && desired_alignment > 2)
22359 rtx label = ix86_expand_aligntest (destptr, 2, false);
22360 destmem = change_address (destmem, HImode, destptr);
22361 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22362 ix86_adjust_counter (count, 2);
22363 emit_label (label);
22364 LABEL_NUSES (label) = 1;
22366 if (align <= 4 && desired_alignment > 4)
22368 rtx label = ix86_expand_aligntest (destptr, 4, false);
22369 destmem = change_address (destmem, SImode, destptr);
22370 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22371 ix86_adjust_counter (count, 4);
22372 emit_label (label);
22373 LABEL_NUSES (label) = 1;
22375 gcc_assert (desired_alignment <= 8);
22378 /* Store enough of VALUE into DST to align DST, known to be aligned by ALIGN, to
22379 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22380 static rtx
22381 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22382 int desired_align, int align_bytes)
22384 int off = 0;
22385 rtx orig_dst = dst;
22386 if (align_bytes & 1)
22388 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22389 off = 1;
22390 emit_insn (gen_strset (destreg, dst,
22391 gen_lowpart (QImode, value)));
22393 if (align_bytes & 2)
22395 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22396 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22397 set_mem_align (dst, 2 * BITS_PER_UNIT);
22398 off = 2;
22399 emit_insn (gen_strset (destreg, dst,
22400 gen_lowpart (HImode, value)));
22402 if (align_bytes & 4)
22404 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22405 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22406 set_mem_align (dst, 4 * BITS_PER_UNIT);
22407 off = 4;
22408 emit_insn (gen_strset (destreg, dst,
22409 gen_lowpart (SImode, value)));
22411 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22412 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22413 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22414 if (MEM_SIZE_KNOWN_P (orig_dst))
22415 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22416 return dst;
22419 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22420 static enum stringop_alg
22421 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22422 int *dynamic_check, bool *noalign)
22424 const struct stringop_algs * algs;
22425 bool optimize_for_speed;
22426 /* Algorithms using the rep prefix want at least edi and ecx;
22427 additionally, memset wants eax and memcpy wants esi. Don't
22428 consider such algorithms if the user has appropriated those
22429 registers for their own purposes. */
22430 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22431 || (memset
22432 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22433 *noalign = false;
22435 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22436 || (alg != rep_prefix_1_byte \
22437 && alg != rep_prefix_4_byte \
22438 && alg != rep_prefix_8_byte))
22439 const struct processor_costs *cost;
22441 /* Even if the string operation call is cold, we still might spend a lot
22442 of time processing large blocks. */
22443 if (optimize_function_for_size_p (cfun)
22444 || (optimize_insn_for_size_p ()
22445 && expected_size != -1 && expected_size < 256))
22446 optimize_for_speed = false;
22447 else
22448 optimize_for_speed = true;
22450 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22452 *dynamic_check = -1;
22453 if (memset)
22454 algs = &cost->memset[TARGET_64BIT != 0];
22455 else
22456 algs = &cost->memcpy[TARGET_64BIT != 0];
22457 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22458 return ix86_stringop_alg;
22459 /* rep; movq or rep; movl is the smallest variant. */
22460 else if (!optimize_for_speed)
22462 if (!count || (count & 3))
22463 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22464 else
22465 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22467 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22469 else if (expected_size != -1 && expected_size < 4)
22470 return loop_1_byte;
22471 else if (expected_size != -1)
22473 unsigned int i;
22474 enum stringop_alg alg = libcall;
22475 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22477 /* We get here if the algorithms that were not libcall-based
22478 were rep-prefix based and we are unable to use rep prefixes
22479 based on global register usage. Break out of the loop and
22480 use the heuristic below. */
22481 if (algs->size[i].max == 0)
22482 break;
22483 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22485 enum stringop_alg candidate = algs->size[i].alg;
22487 if (candidate != libcall && ALG_USABLE_P (candidate))
22488 alg = candidate;
22489 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22490 last non-libcall inline algorithm. */
22491 if (TARGET_INLINE_ALL_STRINGOPS)
22493 /* When the current size is best copied by a libcall,
22494 but we are still forced to inline, run the heuristic below
22495 that will pick code for medium-sized blocks. */
22496 if (alg != libcall)
22497 return alg;
22498 break;
22500 else if (ALG_USABLE_P (candidate))
22502 *noalign = algs->size[i].noalign;
22503 return candidate;
22507 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22509 /* When asked to inline the call anyway, try to pick a meaningful choice.
22510 We look for the maximal size of block that is faster to copy by hand and
22511 take blocks of at most that size, guessing that the average size will
22512 be roughly half of the block.
22514 If this turns out to be bad, we might simply specify the preferred
22515 choice in ix86_costs. */
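/* As a rough illustration: if the cost table says inline code beats a
   libcall only for blocks up to 4096 bytes, the recursion below re-runs
   the decision with expected_size == 2048 (half of that maximum), and
   with -minline-stringops-dynamically it also arranges a runtime size
   check against 4096 so that larger blocks still go through the
   library call.  */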
22516 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22517 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22519 int max = -1;
22520 enum stringop_alg alg;
22521 int i;
22522 bool any_alg_usable_p = true;
22524 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22526 enum stringop_alg candidate = algs->size[i].alg;
22527 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22529 if (candidate != libcall && candidate
22530 && ALG_USABLE_P (candidate))
22531 max = algs->size[i].max;
22533 /* If there aren't any usable algorithms, then recursing on
22534 smaller sizes isn't going to find anything. Just return the
22535 simple byte-at-a-time copy loop. */
22536 if (!any_alg_usable_p)
22538 /* Pick something reasonable. */
22539 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22540 *dynamic_check = 128;
22541 return loop_1_byte;
22543 if (max == -1)
22544 max = 4096;
22545 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22546 gcc_assert (*dynamic_check == -1);
22547 gcc_assert (alg != libcall);
22548 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22549 *dynamic_check = max;
22550 return alg;
22552 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22553 #undef ALG_USABLE_P
22556 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22557 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22558 static int
22559 decide_alignment (int align,
22560 enum stringop_alg alg,
22561 int expected_size)
22563 int desired_align = 0;
22564 switch (alg)
22566 case no_stringop:
22567 gcc_unreachable ();
22568 case loop:
22569 case unrolled_loop:
22570 desired_align = GET_MODE_SIZE (Pmode);
22571 break;
22572 case rep_prefix_8_byte:
22573 desired_align = 8;
22574 break;
22575 case rep_prefix_4_byte:
22576 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22577 copying a whole cacheline at once. */
22578 if (TARGET_PENTIUMPRO)
22579 desired_align = 8;
22580 else
22581 desired_align = 4;
22582 break;
22583 case rep_prefix_1_byte:
22584 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22585 copying a whole cacheline at once. */
22586 if (TARGET_PENTIUMPRO)
22587 desired_align = 8;
22588 else
22589 desired_align = 1;
22590 break;
22591 case loop_1_byte:
22592 desired_align = 1;
22593 break;
22594 case libcall:
22595 return 0;
22598 if (optimize_size)
22599 desired_align = 1;
22600 if (desired_align < align)
22601 desired_align = align;
22602 if (expected_size != -1 && expected_size < 4)
22603 desired_align = align;
22604 return desired_align;
22607 /* Return the smallest power of 2 greater than VAL. */
22608 static int
22609 smallest_pow2_greater_than (int val)
22611 int ret = 1;
22612 while (ret <= val)
22613 ret <<= 1;
22614 return ret;
22617 /* Expand string move (memcpy) operation. Use i386 string operations
22618 when profitable. expand_setmem contains similar code. The code
22619 depends upon architecture, block size and alignment, but always has
22620 the same overall structure:
22622 1) Prologue guard: Conditional that jumps up to epilogues for small
22623 blocks that can be handled by epilogue alone. This is faster
22624 but also needed for correctness, since the prologue assumes the block
22625 is larger than the desired alignment.
22627 Optional dynamic check for size and libcall for large
22628 blocks is emitted here too, with -minline-stringops-dynamically.
22630 2) Prologue: copy first few bytes in order to get destination
22631 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22632 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22633 copied. We emit either a jump tree on power of two sized
22634 blocks, or a byte loop.
22636 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22637 with the specified algorithm.
22639 4) Epilogue: code copying tail of the block that is too small to be
22640 handled by main body (or up to size guarded by prologue guard). */
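/* A rough sketch of the emitted shape, in C-like pseudocode:

     if (count < epilogue_size_needed)
       goto epilogue;                                    step 1
     copy a few bytes until dest reaches desired_align;  step 2
     main copy loop or rep-prefixed instruction;         step 3
   epilogue:
     copy the remaining count & (size_needed - 1) bytes; step 4

   the exact guards and the optional runtime libcall check depend on the
   chosen algorithm and on -minline-stringops-dynamically.  */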
22642 bool
22643 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22644 rtx expected_align_exp, rtx expected_size_exp)
22646 rtx destreg;
22647 rtx srcreg;
22648 rtx label = NULL;
22649 rtx tmp;
22650 rtx jump_around_label = NULL;
22651 HOST_WIDE_INT align = 1;
22652 unsigned HOST_WIDE_INT count = 0;
22653 HOST_WIDE_INT expected_size = -1;
22654 int size_needed = 0, epilogue_size_needed;
22655 int desired_align = 0, align_bytes = 0;
22656 enum stringop_alg alg;
22657 int dynamic_check;
22658 bool need_zero_guard = false;
22659 bool noalign;
22661 if (CONST_INT_P (align_exp))
22662 align = INTVAL (align_exp);
22663 /* i386 can do misaligned access at a reasonably increased cost. */
22664 if (CONST_INT_P (expected_align_exp)
22665 && INTVAL (expected_align_exp) > align)
22666 align = INTVAL (expected_align_exp);
22667 /* ALIGN is the minimum of destination and source alignment, but we care here
22668 just about destination alignment. */
22669 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22670 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22672 if (CONST_INT_P (count_exp))
22673 count = expected_size = INTVAL (count_exp);
22674 if (CONST_INT_P (expected_size_exp) && count == 0)
22675 expected_size = INTVAL (expected_size_exp);
22677 /* Make sure we don't need to care about overflow later on. */
22678 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22679 return false;
22681 /* Step 0: Decide on preferred algorithm, desired alignment and
22682 size of chunks to be copied by main loop. */
22684 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22685 desired_align = decide_alignment (align, alg, expected_size);
22687 if (!TARGET_ALIGN_STRINGOPS || noalign)
22688 align = desired_align;
22690 if (alg == libcall)
22691 return false;
22692 gcc_assert (alg != no_stringop);
22693 if (!count)
22694 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22695 destreg = copy_addr_to_reg (XEXP (dst, 0));
22696 srcreg = copy_addr_to_reg (XEXP (src, 0));
22697 switch (alg)
22699 case libcall:
22700 case no_stringop:
22701 gcc_unreachable ();
22702 case loop:
22703 need_zero_guard = true;
22704 size_needed = GET_MODE_SIZE (word_mode);
22705 break;
22706 case unrolled_loop:
22707 need_zero_guard = true;
22708 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22709 break;
22710 case rep_prefix_8_byte:
22711 size_needed = 8;
22712 break;
22713 case rep_prefix_4_byte:
22714 size_needed = 4;
22715 break;
22716 case rep_prefix_1_byte:
22717 size_needed = 1;
22718 break;
22719 case loop_1_byte:
22720 need_zero_guard = true;
22721 size_needed = 1;
22722 break;
22725 epilogue_size_needed = size_needed;
22727 /* Step 1: Prologue guard. */
22729 /* Alignment code needs count to be in register. */
22730 if (CONST_INT_P (count_exp) && desired_align > align)
22732 if (INTVAL (count_exp) > desired_align
22733 && INTVAL (count_exp) > size_needed)
22735 align_bytes
22736 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22737 if (align_bytes <= 0)
22738 align_bytes = 0;
22739 else
22740 align_bytes = desired_align - align_bytes;
22742 if (align_bytes == 0)
22743 count_exp = force_reg (counter_mode (count_exp), count_exp);
22745 gcc_assert (desired_align >= 1 && align >= 1);
22747 /* Ensure that the alignment prologue won't copy past the end of the block. */
22748 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22750 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22751 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22752 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
22753 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22755 if (count)
22757 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22759 /* If main algorithm works on QImode, no epilogue is needed.
22760 For small sizes just don't align anything. */
22761 if (size_needed == 1)
22762 desired_align = align;
22763 else
22764 goto epilogue;
22767 else
22769 label = gen_label_rtx ();
22770 emit_cmp_and_jump_insns (count_exp,
22771 GEN_INT (epilogue_size_needed),
22772 LTU, 0, counter_mode (count_exp), 1, label);
22773 if (expected_size == -1 || expected_size < epilogue_size_needed)
22774 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22775 else
22776 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22780 /* Emit code to decide at runtime whether a library call or inline code should be
22781 used. */
22782 if (dynamic_check != -1)
22784 if (CONST_INT_P (count_exp))
22786 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22788 emit_block_move_via_libcall (dst, src, count_exp, false);
22789 count_exp = const0_rtx;
22790 goto epilogue;
22793 else
22795 rtx hot_label = gen_label_rtx ();
22796 jump_around_label = gen_label_rtx ();
22797 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22798 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22799 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22800 emit_block_move_via_libcall (dst, src, count_exp, false);
22801 emit_jump (jump_around_label);
22802 emit_label (hot_label);
22806 /* Step 2: Alignment prologue. */
22808 if (desired_align > align)
22810 if (align_bytes == 0)
22812 /* Except for the first move in the epilogue, we no longer know
22813 the constant offset in the aliasing info. It doesn't seem worth
22814 the pain to maintain it for the first move, so throw away
22815 the info early. */
22816 src = change_address (src, BLKmode, srcreg);
22817 dst = change_address (dst, BLKmode, destreg);
22818 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22819 desired_align);
22821 else
22823 /* If we know how many bytes need to be stored before dst is
22824 sufficiently aligned, maintain aliasing info accurately. */
22825 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22826 desired_align, align_bytes);
22827 count_exp = plus_constant (counter_mode (count_exp),
22828 count_exp, -align_bytes);
22829 count -= align_bytes;
22831 if (need_zero_guard
22832 && (count < (unsigned HOST_WIDE_INT) size_needed
22833 || (align_bytes == 0
22834 && count < ((unsigned HOST_WIDE_INT) size_needed
22835 + desired_align - align))))
22837 /* It is possible that we copied enough so the main loop will not
22838 execute. */
22839 gcc_assert (size_needed > 1);
22840 if (label == NULL_RTX)
22841 label = gen_label_rtx ();
22842 emit_cmp_and_jump_insns (count_exp,
22843 GEN_INT (size_needed),
22844 LTU, 0, counter_mode (count_exp), 1, label);
22845 if (expected_size == -1
22846 || expected_size < (desired_align - align) / 2 + size_needed)
22847 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22848 else
22849 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22852 if (label && size_needed == 1)
22854 emit_label (label);
22855 LABEL_NUSES (label) = 1;
22856 label = NULL;
22857 epilogue_size_needed = 1;
22859 else if (label == NULL_RTX)
22860 epilogue_size_needed = size_needed;
22862 /* Step 3: Main loop. */
22864 switch (alg)
22866 case libcall:
22867 case no_stringop:
22868 gcc_unreachable ();
22869 case loop_1_byte:
22870 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22871 count_exp, QImode, 1, expected_size);
22872 break;
22873 case loop:
22874 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22875 count_exp, word_mode, 1, expected_size);
22876 break;
22877 case unrolled_loop:
22878 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22879 registers for 4 temporaries anyway. */
22880 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22881 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22882 expected_size);
22883 break;
22884 case rep_prefix_8_byte:
22885 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22886 DImode);
22887 break;
22888 case rep_prefix_4_byte:
22889 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22890 SImode);
22891 break;
22892 case rep_prefix_1_byte:
22893 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22894 QImode);
22895 break;
22897 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22898 if (CONST_INT_P (count_exp))
22900 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22901 (count / size_needed) * size_needed);
22902 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22903 (count / size_needed) * size_needed);
22905 else
22907 src = change_address (src, BLKmode, srcreg);
22908 dst = change_address (dst, BLKmode, destreg);
22911 /* Step 4: Epilogue to copy the remaining bytes. */
22912 epilogue:
22913 if (label)
22915 /* When the main loop is done, COUNT_EXP might hold the original count,
22916 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22917 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22918 bytes. Compensate if needed. */
22920 if (size_needed < epilogue_size_needed)
22922 tmp =
22923 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22924 GEN_INT (size_needed - 1), count_exp, 1,
22925 OPTAB_DIRECT);
22926 if (tmp != count_exp)
22927 emit_move_insn (count_exp, tmp);
22929 emit_label (label);
22930 LABEL_NUSES (label) = 1;
22933 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22934 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22935 epilogue_size_needed);
22936 if (jump_around_label)
22937 emit_label (jump_around_label);
22938 return true;
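/* Illustrative sketch (not part of GCC): the overall shape that the
   expander above gives a variable-sized copy, written in plain C.
   SIZE_NEEDED is the chunk size of the main loop (step 3); because the
   epilogue size is kept a power of two, the leftover byte count equals
   COUNT & (SIZE_NEEDED - 1), which is exactly what step 4 computes into
   count_exp.  All names below are local to the sketch.  */

static void
sketch_movmem (unsigned char *dst, const unsigned char *src,
               unsigned long count, unsigned long size_needed)
{
  /* Step 3: the main loop handles count / size_needed whole chunks.  */
  unsigned long main_bytes = count / size_needed * size_needed;
  unsigned long i;

  for (i = 0; i < main_bytes; i += size_needed)
    __builtin_memcpy (dst + i, src + i, size_needed);

  /* Step 4: the epilogue handles the remaining count & (size_needed - 1)
     bytes (size_needed is a power of two).  */
  for (i = main_bytes; i < count; i++)
    dst[i] = src[i];
}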
22941 /* Helper function for memset. For the QImode value 0xXY produce
22942 0xXYXYXYXY of the width specified by MODE. This is essentially
22943 a * 0x10101010, but we can do slightly better than
22944 synth_mult by unwinding the sequence by hand on CPUs with
22945 a slow multiply. */
22946 static rtx
22947 promote_duplicated_reg (enum machine_mode mode, rtx val)
22949 enum machine_mode valmode = GET_MODE (val);
22950 rtx tmp;
22951 int nops = mode == DImode ? 3 : 2;
22953 gcc_assert (mode == SImode || mode == DImode);
22954 if (val == const0_rtx)
22955 return copy_to_mode_reg (mode, const0_rtx);
22956 if (CONST_INT_P (val))
22958 HOST_WIDE_INT v = INTVAL (val) & 255;
22960 v |= v << 8;
22961 v |= v << 16;
22962 if (mode == DImode)
22963 v |= (v << 16) << 16;
22964 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22967 if (valmode == VOIDmode)
22968 valmode = QImode;
22969 if (valmode != QImode)
22970 val = gen_lowpart (QImode, val);
22971 if (mode == QImode)
22972 return val;
22973 if (!TARGET_PARTIAL_REG_STALL)
22974 nops--;
22975 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22976 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22977 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22978 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22980 rtx reg = convert_modes (mode, QImode, val, true);
22981 tmp = promote_duplicated_reg (mode, const1_rtx);
22982 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22983 OPTAB_DIRECT);
22985 else
22987 rtx reg = convert_modes (mode, QImode, val, true);
22989 if (!TARGET_PARTIAL_REG_STALL)
22990 if (mode == SImode)
22991 emit_insn (gen_movsi_insv_1 (reg, reg));
22992 else
22993 emit_insn (gen_movdi_insv_1 (reg, reg));
22994 else
22996 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22997 NULL, 1, OPTAB_DIRECT);
22998 reg =
22999 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23001 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23002 NULL, 1, OPTAB_DIRECT);
23003 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23004 if (mode == SImode)
23005 return reg;
23006 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23007 NULL, 1, OPTAB_DIRECT);
23008 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23009 return reg;
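/* Illustrative sketch (not part of GCC): what promote_duplicated_reg
   computes for a non-constant byte, shown in plain C for the DImode case.
   The shift/OR chain matches the sequence emitted above and is equivalent
   to multiplying the byte by 0x0101010101010101.  */

static unsigned long long
sketch_splat_byte (unsigned char b)
{
  unsigned long long v = b;

  v |= v << 8;    /* 0x00XY     -> 0xXYXY     */
  v |= v << 16;   /* 0xXYXY     -> 0xXYXYXYXY */
  v |= v << 32;   /* 0xXYXYXYXY -> 0xXYXYXYXYXYXYXYXY */
  return v;       /* == b * 0x0101010101010101ULL */
}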
23013 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23014 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23015 alignment from ALIGN to DESIRED_ALIGN. */
23016 static rtx
23017 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23019 rtx promoted_val;
23021 if (TARGET_64BIT
23022 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23023 promoted_val = promote_duplicated_reg (DImode, val);
23024 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23025 promoted_val = promote_duplicated_reg (SImode, val);
23026 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23027 promoted_val = promote_duplicated_reg (HImode, val);
23028 else
23029 promoted_val = val;
23031 return promoted_val;
23034 /* Expand a string set operation (memset/bzero). Use i386 string operations
23035 when profitable. See the expand_movmem comment for an explanation of the
23036 individual steps performed. */
23037 bool
23038 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23039 rtx expected_align_exp, rtx expected_size_exp)
23041 rtx destreg;
23042 rtx label = NULL;
23043 rtx tmp;
23044 rtx jump_around_label = NULL;
23045 HOST_WIDE_INT align = 1;
23046 unsigned HOST_WIDE_INT count = 0;
23047 HOST_WIDE_INT expected_size = -1;
23048 int size_needed = 0, epilogue_size_needed;
23049 int desired_align = 0, align_bytes = 0;
23050 enum stringop_alg alg;
23051 rtx promoted_val = NULL;
23052 bool force_loopy_epilogue = false;
23053 int dynamic_check;
23054 bool need_zero_guard = false;
23055 bool noalign;
23057 if (CONST_INT_P (align_exp))
23058 align = INTVAL (align_exp);
23059 /* i386 can do misaligned access at a reasonably increased cost. */
23060 if (CONST_INT_P (expected_align_exp)
23061 && INTVAL (expected_align_exp) > align)
23062 align = INTVAL (expected_align_exp);
23063 if (CONST_INT_P (count_exp))
23064 count = expected_size = INTVAL (count_exp);
23065 if (CONST_INT_P (expected_size_exp) && count == 0)
23066 expected_size = INTVAL (expected_size_exp);
23068 /* Make sure we don't need to care about overflow later on. */
23069 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23070 return false;
23072 /* Step 0: Decide on preferred algorithm, desired alignment and
23073 size of chunks to be copied by main loop. */
23075 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23076 desired_align = decide_alignment (align, alg, expected_size);
23078 if (!TARGET_ALIGN_STRINGOPS || noalign)
23079 align = desired_align;
23081 if (alg == libcall)
23082 return false;
23083 gcc_assert (alg != no_stringop);
23084 if (!count)
23085 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23086 destreg = copy_addr_to_reg (XEXP (dst, 0));
23087 switch (alg)
23089 case libcall:
23090 case no_stringop:
23091 gcc_unreachable ();
23092 case loop:
23093 need_zero_guard = true;
23094 size_needed = GET_MODE_SIZE (word_mode);
23095 break;
23096 case unrolled_loop:
23097 need_zero_guard = true;
23098 size_needed = GET_MODE_SIZE (word_mode) * 4;
23099 break;
23100 case rep_prefix_8_byte:
23101 size_needed = 8;
23102 break;
23103 case rep_prefix_4_byte:
23104 size_needed = 4;
23105 break;
23106 case rep_prefix_1_byte:
23107 size_needed = 1;
23108 break;
23109 case loop_1_byte:
23110 need_zero_guard = true;
23111 size_needed = 1;
23112 break;
23114 epilogue_size_needed = size_needed;
23116 /* Step 1: Prologue guard. */
23118 /* Alignment code needs count to be in register. */
23119 if (CONST_INT_P (count_exp) && desired_align > align)
23121 if (INTVAL (count_exp) > desired_align
23122 && INTVAL (count_exp) > size_needed)
23124 align_bytes
23125 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23126 if (align_bytes <= 0)
23127 align_bytes = 0;
23128 else
23129 align_bytes = desired_align - align_bytes;
23131 if (align_bytes == 0)
23133 enum machine_mode mode = SImode;
23134 if (TARGET_64BIT && (count & ~0xffffffff))
23135 mode = DImode;
23136 count_exp = force_reg (mode, count_exp);
23139 /* Do the cheap promotion to allow better CSE across the
23140 main loop and epilogue (i.e., one load of the big constant in
23141 front of all the code). */
23142 if (CONST_INT_P (val_exp))
23143 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23144 desired_align, align);
23145 /* Ensure that alignment prologue won't copy past end of block. */
23146 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23148 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23149 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23150 Make sure it is a power of 2. */
23151 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23153 /* To improve performance of small blocks, we jump around the VAL
23154 promoting code. This means that if the promoted VAL is not constant,
23155 we might not use it in the epilogue and have to use the byte
23156 loop variant. */
23157 if (epilogue_size_needed > 2 && !promoted_val)
23158 force_loopy_epilogue = true;
23159 if (count)
23161 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23163 /* If the main algorithm works on QImode, no epilogue is needed.
23164 For small sizes just don't align anything. */
23165 if (size_needed == 1)
23166 desired_align = align;
23167 else
23168 goto epilogue;
23171 else
23173 label = gen_label_rtx ();
23174 emit_cmp_and_jump_insns (count_exp,
23175 GEN_INT (epilogue_size_needed),
23176 LTU, 0, counter_mode (count_exp), 1, label);
23177 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23178 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23179 else
23180 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23183 if (dynamic_check != -1)
23185 rtx hot_label = gen_label_rtx ();
23186 jump_around_label = gen_label_rtx ();
23187 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23188 LEU, 0, counter_mode (count_exp), 1, hot_label);
23189 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23190 set_storage_via_libcall (dst, count_exp, val_exp, false);
23191 emit_jump (jump_around_label);
23192 emit_label (hot_label);
23195 /* Step 2: Alignment prologue. */
23197 /* Do the expensive promotion once we have branched off the small blocks. */
23198 if (!promoted_val)
23199 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23200 desired_align, align);
23201 gcc_assert (desired_align >= 1 && align >= 1);
23203 if (desired_align > align)
23205 if (align_bytes == 0)
23207 /* Except for the first move in the epilogue, we no longer know
23208 the constant offset in the aliasing info. It does not seem worth
23209 the pain to maintain it for the first move, so throw away
23210 the info early. */
23211 dst = change_address (dst, BLKmode, destreg);
23212 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23213 desired_align);
23215 else
23217 /* If we know how many bytes need to be stored before dst is
23218 sufficiently aligned, maintain aliasing info accurately. */
23219 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23220 desired_align, align_bytes);
23221 count_exp = plus_constant (counter_mode (count_exp),
23222 count_exp, -align_bytes);
23223 count -= align_bytes;
23225 if (need_zero_guard
23226 && (count < (unsigned HOST_WIDE_INT) size_needed
23227 || (align_bytes == 0
23228 && count < ((unsigned HOST_WIDE_INT) size_needed
23229 + desired_align - align))))
23231 /* It is possible that we copied enough so the main loop will not
23232 execute. */
23233 gcc_assert (size_needed > 1);
23234 if (label == NULL_RTX)
23235 label = gen_label_rtx ();
23236 emit_cmp_and_jump_insns (count_exp,
23237 GEN_INT (size_needed),
23238 LTU, 0, counter_mode (count_exp), 1, label);
23239 if (expected_size == -1
23240 || expected_size < (desired_align - align) / 2 + size_needed)
23241 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23242 else
23243 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23246 if (label && size_needed == 1)
23248 emit_label (label);
23249 LABEL_NUSES (label) = 1;
23250 label = NULL;
23251 promoted_val = val_exp;
23252 epilogue_size_needed = 1;
23254 else if (label == NULL_RTX)
23255 epilogue_size_needed = size_needed;
23257 /* Step 3: Main loop. */
23259 switch (alg)
23261 case libcall:
23262 case no_stringop:
23263 gcc_unreachable ();
23264 case loop_1_byte:
23265 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23266 count_exp, QImode, 1, expected_size);
23267 break;
23268 case loop:
23269 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23270 count_exp, word_mode, 1, expected_size);
23271 break;
23272 case unrolled_loop:
23273 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23274 count_exp, word_mode, 4, expected_size);
23275 break;
23276 case rep_prefix_8_byte:
23277 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23278 DImode, val_exp);
23279 break;
23280 case rep_prefix_4_byte:
23281 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23282 SImode, val_exp);
23283 break;
23284 case rep_prefix_1_byte:
23285 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23286 QImode, val_exp);
23287 break;
23289 /* Properly adjust the offset of the dest memory for aliasing. */
23290 if (CONST_INT_P (count_exp))
23291 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23292 (count / size_needed) * size_needed);
23293 else
23294 dst = change_address (dst, BLKmode, destreg);
23296 /* Step 4: Epilogue to copy the remaining bytes. */
23298 if (label)
23300 /* When the main loop is done, COUNT_EXP might hold the original count,
23301 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23302 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23303 bytes. Compensate if needed. */
23305 if (size_needed < epilogue_size_needed)
23307 tmp =
23308 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23309 GEN_INT (size_needed - 1), count_exp, 1,
23310 OPTAB_DIRECT);
23311 if (tmp != count_exp)
23312 emit_move_insn (count_exp, tmp);
23314 emit_label (label);
23315 LABEL_NUSES (label) = 1;
23317 epilogue:
23318 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23320 if (force_loopy_epilogue)
23321 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23322 epilogue_size_needed);
23323 else
23324 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23325 epilogue_size_needed);
23327 if (jump_around_label)
23328 emit_label (jump_around_label);
23329 return true;
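/* Illustrative sketch (not part of GCC): the shape ix86_expand_setmem gives
   a variable-sized memset when the value has been promoted to word width.
   The main loop stores the splatted value a word at a time; when no
   promoted constant is available for the epilogue, a byte loop is used
   instead (the force_loopy_epilogue case above).  Names are local to the
   sketch.  */

static void
sketch_setmem (unsigned char *dst, unsigned char val, unsigned long count)
{
  unsigned long long splat = val * 0x0101010101010101ULL;
  unsigned long i = 0;

  /* Main loop: store 8-byte chunks of the promoted value.  */
  for (; i + 8 <= count; i += 8)
    __builtin_memcpy (dst + i, &splat, 8);

  /* Epilogue: the remaining count & 7 bytes, here as a byte loop.  */
  for (; i < count; i++)
    dst[i] = val;
}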
23332 /* Expand the appropriate insns for doing strlen if not just doing
23333 repnz; scasb
23335 out = result, initialized with the start address
23336 align_rtx = alignment of the address.
23337 scratch = scratch register, initialized with the start address when
23338 not aligned, otherwise undefined
23340 This is just the body. It needs the initializations mentioned above and
23341 some address computing at the end. These things are done in i386.md. */
23343 static void
23344 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23346 int align;
23347 rtx tmp;
23348 rtx align_2_label = NULL_RTX;
23349 rtx align_3_label = NULL_RTX;
23350 rtx align_4_label = gen_label_rtx ();
23351 rtx end_0_label = gen_label_rtx ();
23352 rtx mem;
23353 rtx tmpreg = gen_reg_rtx (SImode);
23354 rtx scratch = gen_reg_rtx (SImode);
23355 rtx cmp;
23357 align = 0;
23358 if (CONST_INT_P (align_rtx))
23359 align = INTVAL (align_rtx);
23361 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23363 /* Is there a known alignment and is it less than 4? */
23364 if (align < 4)
23366 rtx scratch1 = gen_reg_rtx (Pmode);
23367 emit_move_insn (scratch1, out);
23368 /* Is there a known alignment and is it not 2? */
23369 if (align != 2)
23371 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23372 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23374 /* Leave just the 3 lower bits. */
23375 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23376 NULL_RTX, 0, OPTAB_WIDEN);
23378 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23379 Pmode, 1, align_4_label);
23380 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23381 Pmode, 1, align_2_label);
23382 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23383 Pmode, 1, align_3_label);
23385 else
23387 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23388 check whether it is aligned to 4 bytes. */
23390 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23391 NULL_RTX, 0, OPTAB_WIDEN);
23393 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23394 Pmode, 1, align_4_label);
23397 mem = change_address (src, QImode, out);
23399 /* Now compare the bytes. */
23401 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23402 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23403 QImode, 1, end_0_label);
23405 /* Increment the address. */
23406 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23408 /* Not needed with an alignment of 2. */
23409 if (align != 2)
23411 emit_label (align_2_label);
23413 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23414 end_0_label);
23416 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23418 emit_label (align_3_label);
23421 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23422 end_0_label);
23424 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23427 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23428 align this loop: it only makes the program larger and does not
23429 improve speed. */
23430 emit_label (align_4_label);
23432 mem = change_address (src, SImode, out);
23433 emit_move_insn (scratch, mem);
23434 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23436 /* This formula yields a nonzero result iff one of the bytes is zero.
23437 This saves three branches inside the loop and many cycles. */
23439 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23440 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23441 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23442 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23443 gen_int_mode (0x80808080, SImode)));
23444 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23445 align_4_label);
23447 if (TARGET_CMOVE)
23449 rtx reg = gen_reg_rtx (SImode);
23450 rtx reg2 = gen_reg_rtx (Pmode);
23451 emit_move_insn (reg, tmpreg);
23452 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23454 /* If zero is not in the first two bytes, move two bytes forward. */
23455 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23456 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23457 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23458 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23459 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23460 reg,
23461 tmpreg)));
23462 /* Emit lea manually to avoid clobbering of flags. */
23463 emit_insn (gen_rtx_SET (SImode, reg2,
23464 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23466 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23467 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23468 emit_insn (gen_rtx_SET (VOIDmode, out,
23469 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23470 reg2,
23471 out)));
23473 else
23475 rtx end_2_label = gen_label_rtx ();
23476 /* Is zero in the first two bytes? */
23478 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23479 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23480 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23481 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23482 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23483 pc_rtx);
23484 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23485 JUMP_LABEL (tmp) = end_2_label;
23487 /* Not in the first two. Move two bytes forward. */
23488 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23489 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23491 emit_label (end_2_label);
23495 /* Avoid branch in fixing the byte. */
23496 tmpreg = gen_lowpart (QImode, tmpreg);
23497 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23498 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23499 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23500 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23502 emit_label (end_0_label);
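/* Illustrative sketch (not part of GCC): the zero-byte test used by the
   4-byte loop above, in plain C.  For a 32-bit word X the value
   (X - 0x01010101) & ~X & 0x80808080 is nonzero iff some byte of X is
   zero, so the loop can test four bytes with a single branch.  */

static int
sketch_has_zero_byte (unsigned int x)
{
  unsigned int tmp = x - 0x01010101U;   /* each zero byte wraps and sets its high bit */

  tmp &= ~x;                            /* ignore bytes whose high bit was already set */
  return (tmp & 0x80808080U) != 0;      /* nonzero iff some byte of X was zero */
}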
23505 /* Expand strlen. */
23507 bool
23508 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23510 rtx addr, scratch1, scratch2, scratch3, scratch4;
23512 /* The generic case of the strlen expander is long. Avoid expanding
23513 it unless TARGET_INLINE_ALL_STRINGOPS. */
23515 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23516 && !TARGET_INLINE_ALL_STRINGOPS
23517 && !optimize_insn_for_size_p ()
23518 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23519 return false;
23521 addr = force_reg (Pmode, XEXP (src, 0));
23522 scratch1 = gen_reg_rtx (Pmode);
23524 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23525 && !optimize_insn_for_size_p ())
23527 /* It seems that some optimizer does not combine a call like
23528 foo (strlen (bar), strlen (bar));
23529 when the move and the subtraction are done here. It does calculate
23530 the length just once when these instructions are done inside of
23531 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
23532 and this uses one fewer register for the lifetime of
23533 output_strlen_unroll(), this is better. */
23535 emit_move_insn (out, addr);
23537 ix86_expand_strlensi_unroll_1 (out, src, align);
23539 /* strlensi_unroll_1 returns the address of the zero at the end of
23540 the string, like memchr(), so compute the length by subtracting
23541 the start address. */
23542 emit_insn (ix86_gen_sub3 (out, out, addr));
23544 else
23546 rtx unspec;
23548 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23549 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23550 return false;
23552 scratch2 = gen_reg_rtx (Pmode);
23553 scratch3 = gen_reg_rtx (Pmode);
23554 scratch4 = force_reg (Pmode, constm1_rtx);
23556 emit_move_insn (scratch3, addr);
23557 eoschar = force_reg (QImode, eoschar);
23559 src = replace_equiv_address_nv (src, scratch3);
23561 /* If .md starts supporting :P, this can be done in .md. */
23562 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23563 scratch4), UNSPEC_SCAS);
23564 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23565 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23566 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23568 return true;
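/* Illustrative sketch (not part of GCC): the arithmetic behind the
   rep-prefixed branch above.  ECX is preloaded with -1 and "repnz scasb"
   decrements it once per byte scanned, including the terminating zero, so
   for a string of length N it ends up holding -(N + 2).  The two insns
   emitted above then recover N: ~(-(N + 2)) == N + 1, and adding -1 gives
   N.  A plain C check of that identity follows; names are local to the
   sketch.  */

static unsigned long
sketch_scas_length (unsigned long n)
{
  long ecx = -1;                         /* scratch4: initial count          */

  ecx -= (long) (n + 1);                 /* what repnz scasb leaves in ECX   */
  return (unsigned long) (~ecx + (-1));  /* one's complement, then add -1    */
}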
23571 /* For a given symbol (function), construct code to compute the address of
23572 its PLT entry in the large x86-64 PIC model. */
23573 static rtx
23574 construct_plt_address (rtx symbol)
23576 rtx tmp, unspec;
23578 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23579 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23580 gcc_assert (Pmode == DImode);
23582 tmp = gen_reg_rtx (Pmode);
23583 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23585 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23586 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23587 return tmp;
23591 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23592 rtx callarg2,
23593 rtx pop, bool sibcall)
23595 /* We need to represent that SI and DI registers are clobbered
23596 by SYSV calls. */
23597 static int clobbered_registers[] = {
23598 XMM6_REG, XMM7_REG, XMM8_REG,
23599 XMM9_REG, XMM10_REG, XMM11_REG,
23600 XMM12_REG, XMM13_REG, XMM14_REG,
23601 XMM15_REG, SI_REG, DI_REG
23603 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23604 rtx use = NULL, call;
23605 unsigned int vec_len;
23607 if (pop == const0_rtx)
23608 pop = NULL;
23609 gcc_assert (!TARGET_64BIT || !pop);
23611 if (TARGET_MACHO && !TARGET_64BIT)
23613 #if TARGET_MACHO
23614 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23615 fnaddr = machopic_indirect_call_target (fnaddr);
23616 #endif
23618 else
23620 /* Static functions and indirect calls don't need the pic register. */
23621 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23622 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23623 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23624 use_reg (&use, pic_offset_table_rtx);
23627 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23629 rtx al = gen_rtx_REG (QImode, AX_REG);
23630 emit_move_insn (al, callarg2);
23631 use_reg (&use, al);
23634 if (ix86_cmodel == CM_LARGE_PIC
23635 && MEM_P (fnaddr)
23636 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23637 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23638 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23639 else if (sibcall
23640 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23641 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23643 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23644 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23647 vec_len = 0;
23648 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23649 if (retval)
23650 call = gen_rtx_SET (VOIDmode, retval, call);
23651 vec[vec_len++] = call;
23653 if (pop)
23655 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23656 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23657 vec[vec_len++] = pop;
23660 if (TARGET_64BIT_MS_ABI
23661 && (!callarg2 || INTVAL (callarg2) != -2))
23663 unsigned i;
23665 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23666 UNSPEC_MS_TO_SYSV_CALL);
23668 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23669 vec[vec_len++]
23670 = gen_rtx_CLOBBER (VOIDmode,
23671 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23672 ? TImode : DImode,
23673 clobbered_registers[i]));
23676 if (vec_len > 1)
23677 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23678 call = emit_call_insn (call);
23679 if (use)
23680 CALL_INSN_FUNCTION_USAGE (call) = use;
23682 return call;
23685 /* Output the assembly for a call instruction. */
23687 const char *
23688 ix86_output_call_insn (rtx insn, rtx call_op)
23690 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23691 bool seh_nop_p = false;
23692 const char *xasm;
23694 if (SIBLING_CALL_P (insn))
23696 if (direct_p)
23697 xasm = "jmp\t%P0";
23698 /* SEH epilogue detection requires the indirect branch case
23699 to include REX.W. */
23700 else if (TARGET_SEH)
23701 xasm = "rex.W jmp %A0";
23702 else
23703 xasm = "jmp\t%A0";
23705 output_asm_insn (xasm, &call_op);
23706 return "";
23709 /* SEH unwinding can require an extra nop to be emitted in several
23710 circumstances. Determine if we have one of those. */
23711 if (TARGET_SEH)
23713 rtx i;
23715 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23717 /* If we get to another real insn, we don't need the nop. */
23718 if (INSN_P (i))
23719 break;
23721 /* If we get to the epilogue note, prevent a catch region from
23722 being adjacent to the standard epilogue sequence. If non-
23723 call-exceptions, we'll have done this during epilogue emission. */
23724 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23725 && !flag_non_call_exceptions
23726 && !can_throw_internal (insn))
23728 seh_nop_p = true;
23729 break;
23733 /* If we didn't find a real insn following the call, prevent the
23734 unwinder from looking into the next function. */
23735 if (i == NULL)
23736 seh_nop_p = true;
23739 if (direct_p)
23740 xasm = "call\t%P0";
23741 else
23742 xasm = "call\t%A0";
23744 output_asm_insn (xasm, &call_op);
23746 if (seh_nop_p)
23747 return "nop";
23749 return "";
23752 /* Clear stack slot assignments remembered from previous functions.
23753 This is called from INIT_EXPANDERS once before RTL is emitted for each
23754 function. */
23756 static struct machine_function *
23757 ix86_init_machine_status (void)
23759 struct machine_function *f;
23761 f = ggc_alloc_cleared_machine_function ();
23762 f->use_fast_prologue_epilogue_nregs = -1;
23763 f->call_abi = ix86_abi;
23765 return f;
23768 /* Return a MEM corresponding to a stack slot with mode MODE.
23769 Allocate a new slot if necessary.
23771 The RTL for a function can have several slots available: N is
23772 which slot to use. */
23775 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23777 struct stack_local_entry *s;
23779 gcc_assert (n < MAX_386_STACK_LOCALS);
23781 for (s = ix86_stack_locals; s; s = s->next)
23782 if (s->mode == mode && s->n == n)
23783 return validize_mem (copy_rtx (s->rtl));
23785 s = ggc_alloc_stack_local_entry ();
23786 s->n = n;
23787 s->mode = mode;
23788 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23790 s->next = ix86_stack_locals;
23791 ix86_stack_locals = s;
23792 return validize_mem (s->rtl);
23795 static void
23796 ix86_instantiate_decls (void)
23798 struct stack_local_entry *s;
23800 for (s = ix86_stack_locals; s; s = s->next)
23801 if (s->rtl != NULL_RTX)
23802 instantiate_decl_rtl (s->rtl);
23805 /* Calculate the length of the memory address in the instruction encoding.
23806 It includes the addr32 prefix but not the one-byte modrm, opcode,
23807 or other prefixes. We never generate an addr32 prefix for an LEA insn. */
23810 memory_address_length (rtx addr, bool lea)
23812 struct ix86_address parts;
23813 rtx base, index, disp;
23814 int len;
23815 int ok;
23817 if (GET_CODE (addr) == PRE_DEC
23818 || GET_CODE (addr) == POST_INC
23819 || GET_CODE (addr) == PRE_MODIFY
23820 || GET_CODE (addr) == POST_MODIFY)
23821 return 0;
23823 ok = ix86_decompose_address (addr, &parts);
23824 gcc_assert (ok);
23826 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23828 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23829 if (TARGET_64BIT && !lea
23830 && (SImode_address_operand (addr, VOIDmode)
23831 || (parts.base && GET_MODE (parts.base) == SImode)
23832 || (parts.index && GET_MODE (parts.index) == SImode)))
23833 len++;
23835 base = parts.base;
23836 index = parts.index;
23837 disp = parts.disp;
23839 if (base && GET_CODE (base) == SUBREG)
23840 base = SUBREG_REG (base);
23841 if (index && GET_CODE (index) == SUBREG)
23842 index = SUBREG_REG (index);
23844 gcc_assert (base == NULL_RTX || REG_P (base));
23845 gcc_assert (index == NULL_RTX || REG_P (index));
23847 /* Rule of thumb:
23848 - esp as the base always wants an index,
23849 - ebp as the base always wants a displacement,
23850 - r12 as the base always wants an index,
23851 - r13 as the base always wants a displacement. */
23853 /* Register Indirect. */
23854 if (base && !index && !disp)
23856 /* esp (for its index) and ebp (for its displacement) need
23857 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23858 code. */
23859 if (base == arg_pointer_rtx
23860 || base == frame_pointer_rtx
23861 || REGNO (base) == SP_REG
23862 || REGNO (base) == BP_REG
23863 || REGNO (base) == R12_REG
23864 || REGNO (base) == R13_REG)
23865 len++;
23868 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23869 is not disp32, but disp32(%rip), so for disp32
23870 SIB byte is needed, unless print_operand_address
23871 optimizes it into disp32(%rip) or (%rip) is implied
23872 by UNSPEC. */
23873 else if (disp && !base && !index)
23875 len += 4;
23876 if (TARGET_64BIT)
23878 rtx symbol = disp;
23880 if (GET_CODE (disp) == CONST)
23881 symbol = XEXP (disp, 0);
23882 if (GET_CODE (symbol) == PLUS
23883 && CONST_INT_P (XEXP (symbol, 1)))
23884 symbol = XEXP (symbol, 0);
23886 if (GET_CODE (symbol) != LABEL_REF
23887 && (GET_CODE (symbol) != SYMBOL_REF
23888 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23889 && (GET_CODE (symbol) != UNSPEC
23890 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23891 && XINT (symbol, 1) != UNSPEC_PCREL
23892 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23893 len++;
23896 else
23898 /* Find the length of the displacement constant. */
23899 if (disp)
23901 if (base && satisfies_constraint_K (disp))
23902 len += 1;
23903 else
23904 len += 4;
23906 /* ebp always wants a displacement. Similarly r13. */
23907 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23908 len++;
23910 /* An index requires the two-byte modrm form.... */
23911 if (index
23912 /* ...like esp (or r12), which always wants an index. */
23913 || base == arg_pointer_rtx
23914 || base == frame_pointer_rtx
23915 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23916 len++;
23919 return len;
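/* Illustrative sketch (not part of GCC): the core byte counting done by
   memory_address_length for the common cases, once the address has been
   decomposed.  It follows the rule of thumb above: ESP/R12 as a base need
   a SIB byte, EBP/R13 as a base need a displacement.  Segment and addr32
   prefixes, handled above, are omitted, and all names are local to the
   sketch.  */

static int
sketch_address_bytes (int has_base, int base_is_sp_or_r12,
                      int base_is_bp_or_r13, int has_index,
                      int has_disp, int disp_fits_8bit)
{
  int len = 0;

  /* Register indirect: only ESP/R12 (SIB) and EBP/R13 (disp8) cost extra.  */
  if (has_base && !has_index && !has_disp)
    return (base_is_sp_or_r12 || base_is_bp_or_r13) ? 1 : 0;

  /* Absolute disp32 (the extra byte needed in 64-bit mode is omitted).  */
  if (has_disp && !has_base && !has_index)
    return 4;

  if (has_disp)
    len += (has_base && disp_fits_8bit) ? 1 : 4;
  else if (has_base && base_is_bp_or_r13)
    len += 1;                  /* EBP/R13 always take a displacement.  */

  if (has_index || (has_base && base_is_sp_or_r12))
    len += 1;                  /* SIB byte.  */

  return len;
}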
23922 /* Compute the default value for the "length_immediate" attribute. When
23923 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
23925 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23927 int len = 0;
23928 int i;
23929 extract_insn_cached (insn);
23930 for (i = recog_data.n_operands - 1; i >= 0; --i)
23931 if (CONSTANT_P (recog_data.operand[i]))
23933 enum attr_mode mode = get_attr_mode (insn);
23935 gcc_assert (!len);
23936 if (shortform && CONST_INT_P (recog_data.operand[i]))
23938 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23939 switch (mode)
23941 case MODE_QI:
23942 len = 1;
23943 continue;
23944 case MODE_HI:
23945 ival = trunc_int_for_mode (ival, HImode);
23946 break;
23947 case MODE_SI:
23948 ival = trunc_int_for_mode (ival, SImode);
23949 break;
23950 default:
23951 break;
23953 if (IN_RANGE (ival, -128, 127))
23955 len = 1;
23956 continue;
23959 switch (mode)
23961 case MODE_QI:
23962 len = 1;
23963 break;
23964 case MODE_HI:
23965 len = 2;
23966 break;
23967 case MODE_SI:
23968 len = 4;
23969 break;
23970 /* Immediates for DImode instructions are encoded
23971 as 32-bit sign-extended values. */
23972 case MODE_DI:
23973 len = 4;
23974 break;
23975 default:
23976 fatal_insn ("unknown insn mode", insn);
23979 return len;
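/* Illustrative sketch (not part of GCC): the immediate sizes computed by
   ix86_attr_length_immediate_default, as a plain C helper.  WIDTH_BYTES is
   the operand mode size (1, 2, 4 or 8); DImode immediates are encoded as
   32-bit sign-extended values, and the short form uses one byte when the
   (mode-truncated) value fits in [-128, 127].  The truncation step is
   omitted here, and all names are local to the sketch.  */

static int
sketch_immediate_bytes (long long ival, int width_bytes, int shortform)
{
  if (width_bytes == 1)
    return 1;
  if (shortform && ival >= -128 && ival <= 127)
    return 1;                  /* 8-bit immediate alternative */
  if (width_bytes == 2)
    return 2;
  return 4;                    /* SImode, and DImode via sign extension */
}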
23982 /* Compute default value for "length_address" attribute. */
23984 ix86_attr_length_address_default (rtx insn)
23986 int i;
23988 if (get_attr_type (insn) == TYPE_LEA)
23990 rtx set = PATTERN (insn), addr;
23992 if (GET_CODE (set) == PARALLEL)
23993 set = XVECEXP (set, 0, 0);
23995 gcc_assert (GET_CODE (set) == SET);
23997 addr = SET_SRC (set);
23999 return memory_address_length (addr, true);
24002 extract_insn_cached (insn);
24003 for (i = recog_data.n_operands - 1; i >= 0; --i)
24004 if (MEM_P (recog_data.operand[i]))
24006 constrain_operands_cached (reload_completed);
24007 if (which_alternative != -1)
24009 const char *constraints = recog_data.constraints[i];
24010 int alt = which_alternative;
24012 while (*constraints == '=' || *constraints == '+')
24013 constraints++;
24014 while (alt-- > 0)
24015 while (*constraints++ != ',')
24017 /* Skip ignored operands. */
24018 if (*constraints == 'X')
24019 continue;
24021 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24023 return 0;
24026 /* Compute default value for "length_vex" attribute. It includes
24027 2 or 3 byte VEX prefix and 1 opcode byte. */
24030 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24032 int i;
24034 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
24035 requires the 3-byte VEX prefix. */
24036 if (!has_0f_opcode || has_vex_w)
24037 return 3 + 1;
24039 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24040 if (!TARGET_64BIT)
24041 return 2 + 1;
24043 extract_insn_cached (insn);
24045 for (i = recog_data.n_operands - 1; i >= 0; --i)
24046 if (REG_P (recog_data.operand[i]))
24048 /* REX.W bit uses 3 byte VEX prefix. */
24049 if (GET_MODE (recog_data.operand[i]) == DImode
24050 && GENERAL_REG_P (recog_data.operand[i]))
24051 return 3 + 1;
24053 else
24055 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24056 if (MEM_P (recog_data.operand[i])
24057 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24058 return 3 + 1;
24061 return 2 + 1;
24064 /* Return the maximum number of instructions a cpu can issue. */
24066 static int
24067 ix86_issue_rate (void)
24069 switch (ix86_tune)
24071 case PROCESSOR_PENTIUM:
24072 case PROCESSOR_ATOM:
24073 case PROCESSOR_K6:
24074 case PROCESSOR_BTVER2:
24075 return 2;
24077 case PROCESSOR_PENTIUMPRO:
24078 case PROCESSOR_PENTIUM4:
24079 case PROCESSOR_CORE2:
24080 case PROCESSOR_COREI7:
24081 case PROCESSOR_HASWELL:
24082 case PROCESSOR_ATHLON:
24083 case PROCESSOR_K8:
24084 case PROCESSOR_AMDFAM10:
24085 case PROCESSOR_NOCONA:
24086 case PROCESSOR_GENERIC32:
24087 case PROCESSOR_GENERIC64:
24088 case PROCESSOR_BDVER1:
24089 case PROCESSOR_BDVER2:
24090 case PROCESSOR_BDVER3:
24091 case PROCESSOR_BTVER1:
24092 return 3;
24094 default:
24095 return 1;
24099 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24100 set by DEP_INSN and nothing else set by DEP_INSN. */
24102 static bool
24103 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24105 rtx set, set2;
24107 /* Simplify the test for uninteresting insns. */
24108 if (insn_type != TYPE_SETCC
24109 && insn_type != TYPE_ICMOV
24110 && insn_type != TYPE_FCMOV
24111 && insn_type != TYPE_IBR)
24112 return false;
24114 if ((set = single_set (dep_insn)) != 0)
24116 set = SET_DEST (set);
24117 set2 = NULL_RTX;
24119 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24120 && XVECLEN (PATTERN (dep_insn), 0) == 2
24121 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24122 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24124 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24125 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24127 else
24128 return false;
24130 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24131 return false;
24133 /* This test is true if the dependent insn reads the flags but
24134 not any other potentially set register. */
24135 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24136 return false;
24138 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24139 return false;
24141 return true;
24144 /* Return true iff USE_INSN has a memory address with operands set by
24145 SET_INSN. */
24147 bool
24148 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24150 int i;
24151 extract_insn_cached (use_insn);
24152 for (i = recog_data.n_operands - 1; i >= 0; --i)
24153 if (MEM_P (recog_data.operand[i]))
24155 rtx addr = XEXP (recog_data.operand[i], 0);
24156 return modified_in_p (addr, set_insn) != 0;
24158 return false;
24161 static int
24162 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24164 enum attr_type insn_type, dep_insn_type;
24165 enum attr_memory memory;
24166 rtx set, set2;
24167 int dep_insn_code_number;
24169 /* Anti and output dependencies have zero cost on all CPUs. */
24170 if (REG_NOTE_KIND (link) != 0)
24171 return 0;
24173 dep_insn_code_number = recog_memoized (dep_insn);
24175 /* If we can't recognize the insns, we can't really do anything. */
24176 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24177 return cost;
24179 insn_type = get_attr_type (insn);
24180 dep_insn_type = get_attr_type (dep_insn);
24182 switch (ix86_tune)
24184 case PROCESSOR_PENTIUM:
24185 /* Address Generation Interlock adds a cycle of latency. */
24186 if (insn_type == TYPE_LEA)
24188 rtx addr = PATTERN (insn);
24190 if (GET_CODE (addr) == PARALLEL)
24191 addr = XVECEXP (addr, 0, 0);
24193 gcc_assert (GET_CODE (addr) == SET);
24195 addr = SET_SRC (addr);
24196 if (modified_in_p (addr, dep_insn))
24197 cost += 1;
24199 else if (ix86_agi_dependent (dep_insn, insn))
24200 cost += 1;
24202 /* ??? Compares pair with jump/setcc. */
24203 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24204 cost = 0;
24206 /* Floating point stores require value to be ready one cycle earlier. */
24207 if (insn_type == TYPE_FMOV
24208 && get_attr_memory (insn) == MEMORY_STORE
24209 && !ix86_agi_dependent (dep_insn, insn))
24210 cost += 1;
24211 break;
24213 case PROCESSOR_PENTIUMPRO:
24214 memory = get_attr_memory (insn);
24216 /* INT->FP conversion is expensive. */
24217 if (get_attr_fp_int_src (dep_insn))
24218 cost += 5;
24220 /* There is one cycle extra latency between an FP op and a store. */
24221 if (insn_type == TYPE_FMOV
24222 && (set = single_set (dep_insn)) != NULL_RTX
24223 && (set2 = single_set (insn)) != NULL_RTX
24224 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24225 && MEM_P (SET_DEST (set2)))
24226 cost += 1;
24228 /* Model the ability of the reorder buffer to hide the latency of a load
24229 by executing it in parallel with the previous instruction when the
24230 previous instruction is not needed to compute the address. */
24231 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24232 && !ix86_agi_dependent (dep_insn, insn))
24234 /* Claim that moves take one cycle, as the core can issue one load
24235 at a time and the next load can start a cycle later. */
24236 if (dep_insn_type == TYPE_IMOV
24237 || dep_insn_type == TYPE_FMOV)
24238 cost = 1;
24239 else if (cost > 1)
24240 cost--;
24242 break;
24244 case PROCESSOR_K6:
24245 memory = get_attr_memory (insn);
24247 /* The esp dependency is resolved before the instruction is really
24248 finished. */
24249 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24250 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24251 return 1;
24253 /* INT->FP conversion is expensive. */
24254 if (get_attr_fp_int_src (dep_insn))
24255 cost += 5;
24257 /* Model the ability of the reorder buffer to hide the latency of a load
24258 by executing it in parallel with the previous instruction when the
24259 previous instruction is not needed to compute the address. */
24260 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24261 && !ix86_agi_dependent (dep_insn, insn))
24263 /* Claim that moves take one cycle, as the core can issue one load
24264 at a time and the next load can start a cycle later. */
24265 if (dep_insn_type == TYPE_IMOV
24266 || dep_insn_type == TYPE_FMOV)
24267 cost = 1;
24268 else if (cost > 2)
24269 cost -= 2;
24270 else
24271 cost = 1;
24273 break;
24275 case PROCESSOR_ATHLON:
24276 case PROCESSOR_K8:
24277 case PROCESSOR_AMDFAM10:
24278 case PROCESSOR_BDVER1:
24279 case PROCESSOR_BDVER2:
24280 case PROCESSOR_BDVER3:
24281 case PROCESSOR_BTVER1:
24282 case PROCESSOR_BTVER2:
24283 case PROCESSOR_ATOM:
24284 case PROCESSOR_GENERIC32:
24285 case PROCESSOR_GENERIC64:
24286 memory = get_attr_memory (insn);
24288 /* Model the ability of the reorder buffer to hide the latency of a load
24289 by executing it in parallel with the previous instruction when the
24290 previous instruction is not needed to compute the address. */
24291 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24292 && !ix86_agi_dependent (dep_insn, insn))
24294 enum attr_unit unit = get_attr_unit (insn);
24295 int loadcost = 3;
24297 /* Because of the difference between the lengths of the integer and
24298 floating unit pipeline preparation stages, the memory operands
24299 for floating point are cheaper.
24301 ??? For Athlon the difference is most probably 2. */
24302 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24303 loadcost = 3;
24304 else
24305 loadcost = TARGET_ATHLON ? 2 : 0;
24307 if (cost >= loadcost)
24308 cost -= loadcost;
24309 else
24310 cost = 0;
24313 default:
24314 break;
24317 return cost;
24320 /* How many alternative schedules to try. This should be as wide as the
24321 scheduling freedom in the DFA, but no wider. Making this value too
24322 large results in extra work for the scheduler. */
24324 static int
24325 ia32_multipass_dfa_lookahead (void)
24327 switch (ix86_tune)
24329 case PROCESSOR_PENTIUM:
24330 return 2;
24332 case PROCESSOR_PENTIUMPRO:
24333 case PROCESSOR_K6:
24334 return 1;
24336 case PROCESSOR_CORE2:
24337 case PROCESSOR_COREI7:
24338 case PROCESSOR_HASWELL:
24339 case PROCESSOR_ATOM:
24340 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24341 as the number of instructions that can be executed in a cycle, i.e.,
24342 issue_rate. I wonder why the tuning for many CPUs does not do this. */
24343 if (reload_completed)
24344 return ix86_issue_rate ();
24345 /* Don't use lookahead for pre-reload schedule to save compile time. */
24346 return 0;
24348 default:
24349 return 0;
24353 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24354 execution. It is applied if
24355 (1) an IMUL instruction is on the top of the list;
24356 (2) there exists exactly one producer of an independent IMUL instruction
24357 in the ready list;
24358 in which case (3) the found producer is put on the top of the ready list.
24359 Returns the issue rate. */
24361 static int
24362 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24363 int clock_var ATTRIBUTE_UNUSED)
24365 static int issue_rate = -1;
24366 int n_ready = *pn_ready;
24367 rtx insn, insn1, insn2;
24368 int i;
24369 sd_iterator_def sd_it;
24370 dep_t dep;
24371 int index = -1;
24373 /* Set up issue rate. */
24374 issue_rate = ix86_issue_rate();
24376 /* Do reordering for Atom only. */
24377 if (ix86_tune != PROCESSOR_ATOM)
24378 return issue_rate;
24379 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24380 if (!reload_completed)
24381 return issue_rate;
24382 /* Nothing to do if ready list contains only 1 instruction. */
24383 if (n_ready <= 1)
24384 return issue_rate;
24386 /* Check that IMUL instruction is on the top of ready list. */
24387 insn = ready[n_ready - 1];
24388 if (!NONDEBUG_INSN_P (insn))
24389 return issue_rate;
24390 insn = PATTERN (insn);
24391 if (GET_CODE (insn) == PARALLEL)
24392 insn = XVECEXP (insn, 0, 0);
24393 if (GET_CODE (insn) != SET)
24394 return issue_rate;
24395 if (!(GET_CODE (SET_SRC (insn)) == MULT
24396 && GET_MODE (SET_SRC (insn)) == SImode))
24397 return issue_rate;
24399 /* Search for producer of independent IMUL instruction. */
24400 for (i = n_ready - 2; i>= 0; i--)
24402 insn = ready[i];
24403 if (!NONDEBUG_INSN_P (insn))
24404 continue;
24405 /* Skip IMUL instruction. */
24406 insn2 = PATTERN (insn);
24407 if (GET_CODE (insn2) == PARALLEL)
24408 insn2 = XVECEXP (insn2, 0, 0);
24409 if (GET_CODE (insn2) == SET
24410 && GET_CODE (SET_SRC (insn2)) == MULT
24411 && GET_MODE (SET_SRC (insn2)) == SImode)
24412 continue;
24414 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24416 rtx con;
24417 con = DEP_CON (dep);
24418 if (!NONDEBUG_INSN_P (con))
24419 continue;
24420 insn1 = PATTERN (con);
24421 if (GET_CODE (insn1) == PARALLEL)
24422 insn1 = XVECEXP (insn1, 0, 0);
24424 if (GET_CODE (insn1) == SET
24425 && GET_CODE (SET_SRC (insn1)) == MULT
24426 && GET_MODE (SET_SRC (insn1)) == SImode)
24428 sd_iterator_def sd_it1;
24429 dep_t dep1;
24430 /* Check that there is no other producer for this IMUL. */
24431 index = i;
24432 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24434 rtx pro;
24435 pro = DEP_PRO (dep1);
24436 if (!NONDEBUG_INSN_P (pro))
24437 continue;
24438 if (pro != insn)
24439 index = -1;
24441 if (index >= 0)
24442 break;
24445 if (index >= 0)
24446 break;
24448 if (index < 0)
24449 return issue_rate; /* Didn't find IMUL producer. */
24451 if (sched_verbose > 1)
24452 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24453 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24455 /* Put IMUL producer (ready[index]) at the top of ready list. */
24456 insn1= ready[index];
24457 for (i = index; i < n_ready - 1; i++)
24458 ready[i] = ready[i + 1];
24459 ready[n_ready - 1] = insn1;
24461 return issue_rate;
24464 static bool
24465 ix86_class_likely_spilled_p (reg_class_t);
24467 /* Return true if the lhs of INSN is a HW function argument register, and
24468 set IS_SPILLED to true if it is a likely-spilled HW register. */
24469 static bool
24470 insn_is_function_arg (rtx insn, bool* is_spilled)
24472 rtx dst;
24474 if (!NONDEBUG_INSN_P (insn))
24475 return false;
24476 /* Call instructions are not movable; ignore them. */
24477 if (CALL_P (insn))
24478 return false;
24479 insn = PATTERN (insn);
24480 if (GET_CODE (insn) == PARALLEL)
24481 insn = XVECEXP (insn, 0, 0);
24482 if (GET_CODE (insn) != SET)
24483 return false;
24484 dst = SET_DEST (insn);
24485 if (REG_P (dst) && HARD_REGISTER_P (dst)
24486 && ix86_function_arg_regno_p (REGNO (dst)))
24488 /* Is it likely spilled HW register? */
24489 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24490 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24491 *is_spilled = true;
24492 return true;
24494 return false;
24497 /* Add output dependencies for a chain of adjacent function arguments, but
24498 only if there is a move to a likely-spilled HW register. Return the first
24499 argument if at least one dependence was added, or NULL otherwise. */
24500 static rtx
24501 add_parameter_dependencies (rtx call, rtx head)
24503 rtx insn;
24504 rtx last = call;
24505 rtx first_arg = NULL;
24506 bool is_spilled = false;
24508 head = PREV_INSN (head);
24510 /* Find the argument-passing instruction nearest to the call. */
24511 while (true)
24513 last = PREV_INSN (last);
24514 if (last == head)
24515 return NULL;
24516 if (!NONDEBUG_INSN_P (last))
24517 continue;
24518 if (insn_is_function_arg (last, &is_spilled))
24519 break;
24520 return NULL;
24523 first_arg = last;
24524 while (true)
24526 insn = PREV_INSN (last);
24527 if (!INSN_P (insn))
24528 break;
24529 if (insn == head)
24530 break;
24531 if (!NONDEBUG_INSN_P (insn))
24533 last = insn;
24534 continue;
24536 if (insn_is_function_arg (insn, &is_spilled))
24538 /* Add an output dependence between two function arguments if the chain
24539 of output arguments contains likely-spilled HW registers. */
24540 if (is_spilled)
24541 add_dependence (last, insn, REG_DEP_OUTPUT);
24542 first_arg = last = insn;
24544 else
24545 break;
24547 if (!is_spilled)
24548 return NULL;
24549 return first_arg;
24552 /* Add output or anti dependency from insn to first_arg to restrict its code
24553 motion. */
24554 static void
24555 avoid_func_arg_motion (rtx first_arg, rtx insn)
24557 rtx set;
24558 rtx tmp;
24560 set = single_set (insn);
24561 if (!set)
24562 return;
24563 tmp = SET_DEST (set);
24564 if (REG_P (tmp))
24566 /* Add output dependency to the first function argument. */
24567 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24568 return;
24570 /* Add anti dependency. */
24571 add_dependence (first_arg, insn, REG_DEP_ANTI);
24574 /* Avoid cross-block motion of a function argument by adding a dependency
24575 from the first non-jump instruction in BB. */
24576 static void
24577 add_dependee_for_func_arg (rtx arg, basic_block bb)
24579 rtx insn = BB_END (bb);
24581 while (insn)
24583 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24585 rtx set = single_set (insn);
24586 if (set)
24588 avoid_func_arg_motion (arg, insn);
24589 return;
24592 if (insn == BB_HEAD (bb))
24593 return;
24594 insn = PREV_INSN (insn);
24598 /* Hook for pre-reload schedule - avoid motion of function arguments
24599 passed in likely spilled HW registers. */
24600 static void
24601 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24603 rtx insn;
24604 rtx first_arg = NULL;
24605 if (reload_completed)
24606 return;
24607 while (head != tail && DEBUG_INSN_P (head))
24608 head = NEXT_INSN (head);
24609 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24610 if (INSN_P (insn) && CALL_P (insn))
24612 first_arg = add_parameter_dependencies (insn, head);
24613 if (first_arg)
24615 /* Add a dependee for the first argument to predecessors, but only
24616 if the region contains more than one block. */
24617 basic_block bb = BLOCK_FOR_INSN (insn);
24618 int rgn = CONTAINING_RGN (bb->index);
24619 int nr_blks = RGN_NR_BLOCKS (rgn);
24620 /* Skip trivial regions and region head blocks that can have
24621 predecessors outside of region. */
24622 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24624 edge e;
24625 edge_iterator ei;
24626 /* Assume that region is SCC, i.e. all immediate predecessors
24627 of non-head block are in the same region. */
24628 FOR_EACH_EDGE (e, ei, bb->preds)
24630 /* Avoid creating loop-carried dependencies by using the
24631 topological ordering of the region. */
24632 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24633 add_dependee_for_func_arg (first_arg, e->src);
24636 insn = first_arg;
24637 if (insn == head)
24638 break;
24641 else if (first_arg)
24642 avoid_func_arg_motion (first_arg, insn);
24645 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
24646 HW registers to the maximum, to schedule them as soon as possible. These are
24647 moves from function argument registers at the top of the function entry
24648 and moves from function return value registers after a call. */
24649 static int
24650 ix86_adjust_priority (rtx insn, int priority)
24652 rtx set;
24654 if (reload_completed)
24655 return priority;
24657 if (!NONDEBUG_INSN_P (insn))
24658 return priority;
24660 set = single_set (insn);
24661 if (set)
24663 rtx tmp = SET_SRC (set);
24664 if (REG_P (tmp)
24665 && HARD_REGISTER_P (tmp)
24666 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24667 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24668 return current_sched_info->sched_max_insns_priority;
24671 return priority;
24674 /* Model the decoder of Core 2/i7.
24675 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24676 track the instruction fetch block boundaries and make sure that long
24677 (9+ byte) instructions are assigned to D0. */
24679 /* Maximum length of an insn that can be handled by
24680 a secondary decoder unit. '8' for Core 2/i7. */
24681 static int core2i7_secondary_decoder_max_insn_size;
24683 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24684 '16' for Core 2/i7. */
24685 static int core2i7_ifetch_block_size;
24687 /* Maximum number of instructions decoder can handle per cycle.
24688 '6' for Core 2/i7. */
24689 static int core2i7_ifetch_block_max_insns;
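/* Illustrative sketch (not part of GCC): the decoder constraint that the
   hooks below enforce, as a plain C predicate using the Core 2/i7 values
   set in ix86_sched_init_global (16-byte ifetch block, 6 insns per cycle,
   8-byte limit for the secondary decoders).  Names are local to the
   sketch.  */

static int
sketch_decoder_accepts (int insn_size, int block_len, int block_n_insns,
                        int first_cycle_insn_p)
{
  if (!first_cycle_insn_p && insn_size > 8)
    return 0;                  /* too long for a secondary decoder         */
  if (block_len + insn_size > 16)
    return 0;                  /* would not fit into the ifetch block      */
  if (block_n_insns + 1 > 6)
    return 0;                  /* the decoders are already full this cycle */
  return 1;
}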
24691 typedef struct ix86_first_cycle_multipass_data_ *
24692 ix86_first_cycle_multipass_data_t;
24693 typedef const struct ix86_first_cycle_multipass_data_ *
24694 const_ix86_first_cycle_multipass_data_t;
24696 /* A variable to store target state across calls to max_issue within
24697 one cycle. */
24698 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24699 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24701 /* Initialize DATA. */
24702 static void
24703 core2i7_first_cycle_multipass_init (void *_data)
24705 ix86_first_cycle_multipass_data_t data
24706 = (ix86_first_cycle_multipass_data_t) _data;
24708 data->ifetch_block_len = 0;
24709 data->ifetch_block_n_insns = 0;
24710 data->ready_try_change = NULL;
24711 data->ready_try_change_size = 0;
24714 /* Advancing the cycle; reset ifetch block counts. */
24715 static void
24716 core2i7_dfa_post_advance_cycle (void)
24718 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24720 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24722 data->ifetch_block_len = 0;
24723 data->ifetch_block_n_insns = 0;
24726 static int min_insn_size (rtx);
24728 /* Filter out insns from ready_try that the core will not be able to issue
24729 on current cycle due to decoder. */
24730 static void
24731 core2i7_first_cycle_multipass_filter_ready_try
24732 (const_ix86_first_cycle_multipass_data_t data,
24733 char *ready_try, int n_ready, bool first_cycle_insn_p)
24735 while (n_ready--)
24737 rtx insn;
24738 int insn_size;
24740 if (ready_try[n_ready])
24741 continue;
24743 insn = get_ready_element (n_ready);
24744 insn_size = min_insn_size (insn);
24746 if (/* If this insn is too long for a secondary decoder ... */
24747 (!first_cycle_insn_p
24748 && insn_size > core2i7_secondary_decoder_max_insn_size)
24749 /* ... or it would not fit into the ifetch block ... */
24750 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24751 /* ... or the decoder is full already ... */
24752 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24753 /* ... mask the insn out. */
24755 ready_try[n_ready] = 1;
24757 if (data->ready_try_change)
24758 bitmap_set_bit (data->ready_try_change, n_ready);
24763 /* Prepare for a new round of multipass lookahead scheduling. */
24764 static void
24765 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24766 bool first_cycle_insn_p)
24768 ix86_first_cycle_multipass_data_t data
24769 = (ix86_first_cycle_multipass_data_t) _data;
24770 const_ix86_first_cycle_multipass_data_t prev_data
24771 = ix86_first_cycle_multipass_data;
24773 /* Restore the state from the end of the previous round. */
24774 data->ifetch_block_len = prev_data->ifetch_block_len;
24775 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24777 /* Filter instructions that cannot be issued on current cycle due to
24778 decoder restrictions. */
24779 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24780 first_cycle_insn_p);
24783 /* INSN is being issued in current solution. Account for its impact on
24784 the decoder model. */
24785 static void
24786 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24787 rtx insn, const void *_prev_data)
24789 ix86_first_cycle_multipass_data_t data
24790 = (ix86_first_cycle_multipass_data_t) _data;
24791 const_ix86_first_cycle_multipass_data_t prev_data
24792 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24794 int insn_size = min_insn_size (insn);
24796 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24797 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24798 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24799 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24801 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24802 if (!data->ready_try_change)
24804 data->ready_try_change = sbitmap_alloc (n_ready);
24805 data->ready_try_change_size = n_ready;
24807 else if (data->ready_try_change_size < n_ready)
24809 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24810 n_ready, 0);
24811 data->ready_try_change_size = n_ready;
24813 bitmap_clear (data->ready_try_change);
24815 /* Filter out insns from ready_try that the core will not be able to issue
24816 on the current cycle due to decoder restrictions. */
24817 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24818 false);
24821 /* Revert the effect on ready_try. */
24822 static void
24823 core2i7_first_cycle_multipass_backtrack (const void *_data,
24824 char *ready_try,
24825 int n_ready ATTRIBUTE_UNUSED)
24827 const_ix86_first_cycle_multipass_data_t data
24828 = (const_ix86_first_cycle_multipass_data_t) _data;
24829 unsigned int i = 0;
24830 sbitmap_iterator sbi;
24832 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24833 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24835 ready_try[i] = 0;
24839 /* Save the result of multipass lookahead scheduling for the next round. */
24840 static void
24841 core2i7_first_cycle_multipass_end (const void *_data)
24843 const_ix86_first_cycle_multipass_data_t data
24844 = (const_ix86_first_cycle_multipass_data_t) _data;
24845 ix86_first_cycle_multipass_data_t next_data
24846 = ix86_first_cycle_multipass_data;
24848 if (data != NULL)
24850 next_data->ifetch_block_len = data->ifetch_block_len;
24851 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24855 /* Deallocate target data. */
24856 static void
24857 core2i7_first_cycle_multipass_fini (void *_data)
24859 ix86_first_cycle_multipass_data_t data
24860 = (ix86_first_cycle_multipass_data_t) _data;
24862 if (data->ready_try_change)
24864 sbitmap_free (data->ready_try_change);
24865 data->ready_try_change = NULL;
24866 data->ready_try_change_size = 0;
24870 /* Prepare for scheduling pass. */
24871 static void
24872 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24873 int verbose ATTRIBUTE_UNUSED,
24874 int max_uid ATTRIBUTE_UNUSED)
24876 /* Install scheduling hooks for current CPU. Some of these hooks are used
24877 in time-critical parts of the scheduler, so we only set them up when
24878 they are actually used. */
24879 switch (ix86_tune)
24881 case PROCESSOR_CORE2:
24882 case PROCESSOR_COREI7:
24883 case PROCESSOR_HASWELL:
24884 /* Do not perform multipass scheduling for the pre-reload schedule
24885 to save compile time. */
24886 if (reload_completed)
24887 {
24888 targetm.sched.dfa_post_advance_cycle
24889 = core2i7_dfa_post_advance_cycle;
24890 targetm.sched.first_cycle_multipass_init
24891 = core2i7_first_cycle_multipass_init;
24892 targetm.sched.first_cycle_multipass_begin
24893 = core2i7_first_cycle_multipass_begin;
24894 targetm.sched.first_cycle_multipass_issue
24895 = core2i7_first_cycle_multipass_issue;
24896 targetm.sched.first_cycle_multipass_backtrack
24897 = core2i7_first_cycle_multipass_backtrack;
24898 targetm.sched.first_cycle_multipass_end
24899 = core2i7_first_cycle_multipass_end;
24900 targetm.sched.first_cycle_multipass_fini
24901 = core2i7_first_cycle_multipass_fini;
24903 /* Set decoder parameters. */
24904 core2i7_secondary_decoder_max_insn_size = 8;
24905 core2i7_ifetch_block_size = 16;
24906 core2i7_ifetch_block_max_insns = 6;
24907 break;
24908 }
24909 /* ... Fall through ... */
24910 default:
24911 targetm.sched.dfa_post_advance_cycle = NULL;
24912 targetm.sched.first_cycle_multipass_init = NULL;
24913 targetm.sched.first_cycle_multipass_begin = NULL;
24914 targetm.sched.first_cycle_multipass_issue = NULL;
24915 targetm.sched.first_cycle_multipass_backtrack = NULL;
24916 targetm.sched.first_cycle_multipass_end = NULL;
24917 targetm.sched.first_cycle_multipass_fini = NULL;
24918 break;
24919 }
24920 }
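/* A worked example of the decoder model above (illustrative only, not part
   of the original source): with core2i7_ifetch_block_size == 16 and
   core2i7_ifetch_block_max_insns == 6, three 5-byte insns plus one 1-byte
   insn fill a 16-byte ifetch block (5 + 5 + 5 + 1 == 16), so
   core2i7_first_cycle_multipass_filter_ready_try masks any further insn out
   of ready_try for that cycle.  Likewise, on a non-first pass an insn longer
   than core2i7_secondary_decoder_max_insn_size (8 bytes) is masked out,
   since only the first decoder can handle it.  */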
24923 /* Compute the alignment given to a constant that is being placed in memory.
24924 EXP is the constant and ALIGN is the alignment that the object would
24925 ordinarily have.
24926 The value of this function is used instead of that alignment to align
24927 the object. */
24929 int
24930 ix86_constant_alignment (tree exp, int align)
24931 {
24932 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24933 || TREE_CODE (exp) == INTEGER_CST)
24935 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24936 return 64;
24937 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24938 return 128;
24940 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24941 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24942 return BITS_PER_WORD;
24944 return align;
24945 }
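/* Illustrative example (not part of the original source): on IA-32, where a
   double would ordinarily get only 4-byte alignment, a DFmode literal such
   as 3.141592653589793 emitted to the constant pool is given 64-bit
   alignment by the DFmode case above (this hook is reached through the
   CONSTANT_ALIGNMENT macro), so the value can be loaded with an aligned
   8-byte access.  */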
24947 /* Compute the alignment for a static variable.
24948 TYPE is the data type, and ALIGN is the alignment that
24949 the object would ordinarily have. The value of this function is used
24950 instead of that alignment to align the object. */
24952 int
24953 ix86_data_alignment (tree type, int align)
24954 {
24955 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24957 if (AGGREGATE_TYPE_P (type)
24958 && TYPE_SIZE (type)
24959 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24960 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24961 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24962 && align < max_align)
24963 align = max_align;
24965 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24966 to a 16-byte boundary. */
24967 if (TARGET_64BIT)
24969 if (AGGREGATE_TYPE_P (type)
24970 && TYPE_SIZE (type)
24971 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24972 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24973 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24974 return 128;
24977 if (TREE_CODE (type) == ARRAY_TYPE)
24979 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24980 return 64;
24981 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24982 return 128;
24984 else if (TREE_CODE (type) == COMPLEX_TYPE)
24987 if (TYPE_MODE (type) == DCmode && align < 64)
24988 return 64;
24989 if ((TYPE_MODE (type) == XCmode
24990 || TYPE_MODE (type) == TCmode) && align < 128)
24991 return 128;
24993 else if ((TREE_CODE (type) == RECORD_TYPE
24994 || TREE_CODE (type) == UNION_TYPE
24995 || TREE_CODE (type) == QUAL_UNION_TYPE)
24996 && TYPE_FIELDS (type))
24998 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24999 return 64;
25000 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25001 return 128;
25003 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25004 || TREE_CODE (type) == INTEGER_TYPE)
25006 if (TYPE_MODE (type) == DFmode && align < 64)
25007 return 64;
25008 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25009 return 128;
25012 return align;
25013 }
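/* Illustrative example (not part of the original source): on x86-64 a
   file-scope object such as

     static char buf[16];

   has a TYPE_SIZE of 128 bits, so the x86-64 branch above raises its
   alignment to 128 bits and the object is emitted with .align 16, which in
   turn lets aligned 16-byte SSE accesses be used on it.  */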
25015 /* Compute the alignment for a local variable or a stack slot. EXP is
25016 the data type or decl itself, MODE is the widest mode available and
25017 ALIGN is the alignment that the object would ordinarily have. The
25018 value of this macro is used instead of that alignment to align the
25019 object. */
25021 unsigned int
25022 ix86_local_alignment (tree exp, enum machine_mode mode,
25023 unsigned int align)
25025 tree type, decl;
25027 if (exp && DECL_P (exp))
25029 type = TREE_TYPE (exp);
25030 decl = exp;
25032 else
25034 type = exp;
25035 decl = NULL;
25038 /* Don't do dynamic stack realignment for long long objects with
25039 -mpreferred-stack-boundary=2. */
25040 if (!TARGET_64BIT
25041 && align == 64
25042 && ix86_preferred_stack_boundary < 64
25043 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25044 && (!type || !TYPE_USER_ALIGN (type))
25045 && (!decl || !DECL_USER_ALIGN (decl)))
25046 align = 32;
25048 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25049 register in MODE. We will return the largest alignment of XF
25050 and DF. */
25051 if (!type)
25053 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25054 align = GET_MODE_ALIGNMENT (DFmode);
25055 return align;
25058 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
25059 to a 16-byte boundary. The exact wording is:
25061 An array uses the same alignment as its elements, except that a local or
25062 global array variable of length at least 16 bytes or
25063 a C99 variable-length array variable always has alignment of at least 16 bytes.
25065 This was added to allow use of aligned SSE instructions on arrays. The
25066 rule is meant for static storage (where the compiler cannot do the analysis
25067 by itself). We follow it for automatic variables only when convenient:
25068 we fully control everything in the function being compiled, and functions
25069 from other units cannot rely on the alignment.
25071 Exclude the va_list type. It is the common case of a local array where
25072 we cannot benefit from the alignment. */
25073 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25074 && TARGET_SSE)
25076 if (AGGREGATE_TYPE_P (type)
25077 && (va_list_type_node == NULL_TREE
25078 || (TYPE_MAIN_VARIANT (type)
25079 != TYPE_MAIN_VARIANT (va_list_type_node)))
25080 && TYPE_SIZE (type)
25081 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25082 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25083 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25084 return 128;
25086 if (TREE_CODE (type) == ARRAY_TYPE)
25088 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25089 return 64;
25090 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25091 return 128;
25093 else if (TREE_CODE (type) == COMPLEX_TYPE)
25095 if (TYPE_MODE (type) == DCmode && align < 64)
25096 return 64;
25097 if ((TYPE_MODE (type) == XCmode
25098 || TYPE_MODE (type) == TCmode) && align < 128)
25099 return 128;
25101 else if ((TREE_CODE (type) == RECORD_TYPE
25102 || TREE_CODE (type) == UNION_TYPE
25103 || TREE_CODE (type) == QUAL_UNION_TYPE)
25104 && TYPE_FIELDS (type))
25106 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25107 return 64;
25108 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25109 return 128;
25111 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25112 || TREE_CODE (type) == INTEGER_TYPE)
25115 if (TYPE_MODE (type) == DFmode && align < 64)
25116 return 64;
25117 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25118 return 128;
25120 return align;
25123 /* Compute the minimum required alignment for dynamic stack realignment
25124 purposes for a local variable, parameter or a stack slot. EXP is
25125 the data type or decl itself, MODE is its mode and ALIGN is the
25126 alignment that the object would ordinarily have. */
25128 unsigned int
25129 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25130 unsigned int align)
25132 tree type, decl;
25134 if (exp && DECL_P (exp))
25136 type = TREE_TYPE (exp);
25137 decl = exp;
25139 else
25141 type = exp;
25142 decl = NULL;
25145 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25146 return align;
25148 /* Don't do dynamic stack realignment for long long objects with
25149 -mpreferred-stack-boundary=2. */
25150 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25151 && (!type || !TYPE_USER_ALIGN (type))
25152 && (!decl || !DECL_USER_ALIGN (decl)))
25153 return 32;
25155 return align;
25156 }
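/* Illustrative example (not part of the original source): when compiling
   with -m32 -mpreferred-stack-boundary=2 the stack is only guaranteed to be
   4-byte aligned, so for a local

     long long counter;

   the function above reports a minimum alignment of 32 bits instead of 64,
   which keeps a single long long from forcing dynamic realignment of the
   whole stack frame.  */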
25158 /* Find a location for the static chain incoming to a nested function.
25159 This is a register, unless all free registers are used by arguments. */
25161 static rtx
25162 ix86_static_chain (const_tree fndecl, bool incoming_p)
25164 unsigned regno;
25166 if (!DECL_STATIC_CHAIN (fndecl))
25167 return NULL;
25169 if (TARGET_64BIT)
25171 /* We always use R10 in 64-bit mode. */
25172 regno = R10_REG;
25174 else
25176 tree fntype;
25177 unsigned int ccvt;
25179 /* By default in 32-bit mode we use ECX to pass the static chain. */
25180 regno = CX_REG;
25182 fntype = TREE_TYPE (fndecl);
25183 ccvt = ix86_get_callcvt (fntype);
25184 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25186 /* Fastcall functions use ecx/edx for arguments, which leaves
25187 us with EAX for the static chain.
25188 Thiscall functions use ecx for arguments, which also
25189 leaves us with EAX for the static chain. */
25190 regno = AX_REG;
25192 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25194 /* Thiscall functions use ecx for arguments, which leaves
25195 us with EAX and EDX for the static chain.
25196 For ABI compatibility we use EAX. */
25197 regno = AX_REG;
25199 else if (ix86_function_regparm (fntype, fndecl) == 3)
25201 /* For regparm 3, we have no free call-clobbered registers in
25202 which to store the static chain. In order to implement this,
25203 we have the trampoline push the static chain to the stack.
25204 However, we can't push a value below the return address when
25205 we call the nested function directly, so we have to use an
25206 alternate entry point. For this we use ESI, and have the
25207 alternate entry point push ESI, so that things appear the
25208 same once we're executing the nested function. */
25209 if (incoming_p)
25211 if (fndecl == current_function_decl)
25212 ix86_static_chain_on_stack = true;
25213 return gen_frame_mem (SImode,
25214 plus_constant (Pmode,
25215 arg_pointer_rtx, -8));
25217 regno = SI_REG;
25221 return gen_rtx_REG (Pmode, regno);
25222 }
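/* Illustrative example (not part of the original source): for a GNU C
   nested function such as

     void outer (void)
     {
       int x = 42;
       int inner (int y) { return x + y; }   /* needs the static chain */
       use (inner);                          /* 'use' is hypothetical */
     }

   the static chain pointer for 'inner' arrives in ECX on IA-32 and in R10
   on x86-64, unless the calling convention already claims those registers
   as described in the cases above.  */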
25224 /* Emit RTL insns to initialize the variable parts of a trampoline.
25225 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25226 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25227 to be passed to the target function. */
25229 static void
25230 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25232 rtx mem, fnaddr;
25233 int opcode;
25234 int offset = 0;
25236 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25238 if (TARGET_64BIT)
25240 int size;
25242 /* Load the function address into r11. Try to load the address using
25243 the shorter movl instead of movabs. We may want to support
25244 movq for kernel mode, but the kernel does not use trampolines at
25245 the moment. FNADDR is a 32-bit address and may not be in
25246 DImode when ptr_mode == SImode. Always use movl in this
25247 case. */
25248 if (ptr_mode == SImode
25249 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25251 fnaddr = copy_addr_to_reg (fnaddr);
25253 mem = adjust_address (m_tramp, HImode, offset);
25254 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25256 mem = adjust_address (m_tramp, SImode, offset + 2);
25257 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25258 offset += 6;
25260 else
25262 mem = adjust_address (m_tramp, HImode, offset);
25263 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25265 mem = adjust_address (m_tramp, DImode, offset + 2);
25266 emit_move_insn (mem, fnaddr);
25267 offset += 10;
25270 /* Load static chain using movabs to r10. Use the shorter movl
25271 instead of movabs when ptr_mode == SImode. */
25272 if (ptr_mode == SImode)
25274 opcode = 0xba41;
25275 size = 6;
25277 else
25279 opcode = 0xba49;
25280 size = 10;
25283 mem = adjust_address (m_tramp, HImode, offset);
25284 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25286 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25287 emit_move_insn (mem, chain_value);
25288 offset += size;
25290 /* Jump to r11; the last (unused) byte is a nop, only there to
25291 pad the write out to a single 32-bit store. */
25292 mem = adjust_address (m_tramp, SImode, offset);
25293 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25294 offset += 4;
25296 else
25298 rtx disp, chain;
25300 /* Depending on the static chain location, either load a register
25301 with a constant, or push the constant to the stack. All of the
25302 instructions are the same size. */
25303 chain = ix86_static_chain (fndecl, true);
25304 if (REG_P (chain))
25306 switch (REGNO (chain))
25308 case AX_REG:
25309 opcode = 0xb8; break;
25310 case CX_REG:
25311 opcode = 0xb9; break;
25312 default:
25313 gcc_unreachable ();
25316 else
25317 opcode = 0x68;
25319 mem = adjust_address (m_tramp, QImode, offset);
25320 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25322 mem = adjust_address (m_tramp, SImode, offset + 1);
25323 emit_move_insn (mem, chain_value);
25324 offset += 5;
25326 mem = adjust_address (m_tramp, QImode, offset);
25327 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25329 mem = adjust_address (m_tramp, SImode, offset + 1);
25331 /* Compute offset from the end of the jmp to the target function.
25332 In the case in which the trampoline stores the static chain on
25333 the stack, we need to skip the first insn which pushes the
25334 (call-saved) register static chain; this push is 1 byte. */
25335 offset += 5;
25336 disp = expand_binop (SImode, sub_optab, fnaddr,
25337 plus_constant (Pmode, XEXP (m_tramp, 0),
25338 offset - (MEM_P (chain) ? 1 : 0)),
25339 NULL_RTX, 1, OPTAB_DIRECT);
25340 emit_move_insn (mem, disp);
25343 gcc_assert (offset <= TRAMPOLINE_SIZE);
25345 #ifdef HAVE_ENABLE_EXECUTE_STACK
25346 #ifdef CHECK_EXECUTE_STACK_ENABLED
25347 if (CHECK_EXECUTE_STACK_ENABLED)
25348 #endif
25349 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25350 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25351 #endif
25352 }
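/* Byte layout of the 64-bit trampoline emitted above (illustrative sketch,
   reconstructed from the opcodes used in ix86_trampoline_init; assumes
   ptr_mode == DImode):

     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3                jmpq   *%r11
     90                      nop   (pads the last write to a 32-bit store)

   In the 32-bit case the trampoline is instead a mov-immediate (or push) of
   the static chain followed by a relative jmp to the target function.  */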
25354 /* The following file contains several enumerations and data structures
25355 built from the definitions in i386-builtin-types.def. */
25357 #include "i386-builtin-types.inc"
25359 /* Table for the ix86 builtin non-function types. */
25360 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25362 /* Retrieve an element from the above table, building some of
25363 the types lazily. */
25365 static tree
25366 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25368 unsigned int index;
25369 tree type, itype;
25371 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25373 type = ix86_builtin_type_tab[(int) tcode];
25374 if (type != NULL)
25375 return type;
25377 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25378 if (tcode <= IX86_BT_LAST_VECT)
25380 enum machine_mode mode;
25382 index = tcode - IX86_BT_LAST_PRIM - 1;
25383 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25384 mode = ix86_builtin_type_vect_mode[index];
25386 type = build_vector_type_for_mode (itype, mode);
25388 else
25390 int quals;
25392 index = tcode - IX86_BT_LAST_VECT - 1;
25393 if (tcode <= IX86_BT_LAST_PTR)
25394 quals = TYPE_UNQUALIFIED;
25395 else
25396 quals = TYPE_QUAL_CONST;
25398 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25399 if (quals != TYPE_UNQUALIFIED)
25400 itype = build_qualified_type (itype, quals);
25402 type = build_pointer_type (itype);
25405 ix86_builtin_type_tab[(int) tcode] = type;
25406 return type;
25409 /* Table for the ix86 builtin function types. */
25410 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25412 /* Retrieve an element from the above table, building some of
25413 the types lazily. */
25415 static tree
25416 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25418 tree type;
25420 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25422 type = ix86_builtin_func_type_tab[(int) tcode];
25423 if (type != NULL)
25424 return type;
25426 if (tcode <= IX86_BT_LAST_FUNC)
25428 unsigned start = ix86_builtin_func_start[(int) tcode];
25429 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25430 tree rtype, atype, args = void_list_node;
25431 unsigned i;
25433 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25434 for (i = after - 1; i > start; --i)
25436 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25437 args = tree_cons (NULL, atype, args);
25440 type = build_function_type (rtype, args);
25442 else
25444 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25445 enum ix86_builtin_func_type icode;
25447 icode = ix86_builtin_func_alias_base[index];
25448 type = ix86_get_builtin_func_type (icode);
25451 ix86_builtin_func_type_tab[(int) tcode] = type;
25452 return type;
25456 /* Codes for all the SSE/MMX builtins. */
25457 enum ix86_builtins
25458 {
25459 IX86_BUILTIN_ADDPS,
25460 IX86_BUILTIN_ADDSS,
25461 IX86_BUILTIN_DIVPS,
25462 IX86_BUILTIN_DIVSS,
25463 IX86_BUILTIN_MULPS,
25464 IX86_BUILTIN_MULSS,
25465 IX86_BUILTIN_SUBPS,
25466 IX86_BUILTIN_SUBSS,
25468 IX86_BUILTIN_CMPEQPS,
25469 IX86_BUILTIN_CMPLTPS,
25470 IX86_BUILTIN_CMPLEPS,
25471 IX86_BUILTIN_CMPGTPS,
25472 IX86_BUILTIN_CMPGEPS,
25473 IX86_BUILTIN_CMPNEQPS,
25474 IX86_BUILTIN_CMPNLTPS,
25475 IX86_BUILTIN_CMPNLEPS,
25476 IX86_BUILTIN_CMPNGTPS,
25477 IX86_BUILTIN_CMPNGEPS,
25478 IX86_BUILTIN_CMPORDPS,
25479 IX86_BUILTIN_CMPUNORDPS,
25480 IX86_BUILTIN_CMPEQSS,
25481 IX86_BUILTIN_CMPLTSS,
25482 IX86_BUILTIN_CMPLESS,
25483 IX86_BUILTIN_CMPNEQSS,
25484 IX86_BUILTIN_CMPNLTSS,
25485 IX86_BUILTIN_CMPNLESS,
25486 IX86_BUILTIN_CMPNGTSS,
25487 IX86_BUILTIN_CMPNGESS,
25488 IX86_BUILTIN_CMPORDSS,
25489 IX86_BUILTIN_CMPUNORDSS,
25491 IX86_BUILTIN_COMIEQSS,
25492 IX86_BUILTIN_COMILTSS,
25493 IX86_BUILTIN_COMILESS,
25494 IX86_BUILTIN_COMIGTSS,
25495 IX86_BUILTIN_COMIGESS,
25496 IX86_BUILTIN_COMINEQSS,
25497 IX86_BUILTIN_UCOMIEQSS,
25498 IX86_BUILTIN_UCOMILTSS,
25499 IX86_BUILTIN_UCOMILESS,
25500 IX86_BUILTIN_UCOMIGTSS,
25501 IX86_BUILTIN_UCOMIGESS,
25502 IX86_BUILTIN_UCOMINEQSS,
25504 IX86_BUILTIN_CVTPI2PS,
25505 IX86_BUILTIN_CVTPS2PI,
25506 IX86_BUILTIN_CVTSI2SS,
25507 IX86_BUILTIN_CVTSI642SS,
25508 IX86_BUILTIN_CVTSS2SI,
25509 IX86_BUILTIN_CVTSS2SI64,
25510 IX86_BUILTIN_CVTTPS2PI,
25511 IX86_BUILTIN_CVTTSS2SI,
25512 IX86_BUILTIN_CVTTSS2SI64,
25514 IX86_BUILTIN_MAXPS,
25515 IX86_BUILTIN_MAXSS,
25516 IX86_BUILTIN_MINPS,
25517 IX86_BUILTIN_MINSS,
25519 IX86_BUILTIN_LOADUPS,
25520 IX86_BUILTIN_STOREUPS,
25521 IX86_BUILTIN_MOVSS,
25523 IX86_BUILTIN_MOVHLPS,
25524 IX86_BUILTIN_MOVLHPS,
25525 IX86_BUILTIN_LOADHPS,
25526 IX86_BUILTIN_LOADLPS,
25527 IX86_BUILTIN_STOREHPS,
25528 IX86_BUILTIN_STORELPS,
25530 IX86_BUILTIN_MASKMOVQ,
25531 IX86_BUILTIN_MOVMSKPS,
25532 IX86_BUILTIN_PMOVMSKB,
25534 IX86_BUILTIN_MOVNTPS,
25535 IX86_BUILTIN_MOVNTQ,
25537 IX86_BUILTIN_LOADDQU,
25538 IX86_BUILTIN_STOREDQU,
25540 IX86_BUILTIN_PACKSSWB,
25541 IX86_BUILTIN_PACKSSDW,
25542 IX86_BUILTIN_PACKUSWB,
25544 IX86_BUILTIN_PADDB,
25545 IX86_BUILTIN_PADDW,
25546 IX86_BUILTIN_PADDD,
25547 IX86_BUILTIN_PADDQ,
25548 IX86_BUILTIN_PADDSB,
25549 IX86_BUILTIN_PADDSW,
25550 IX86_BUILTIN_PADDUSB,
25551 IX86_BUILTIN_PADDUSW,
25552 IX86_BUILTIN_PSUBB,
25553 IX86_BUILTIN_PSUBW,
25554 IX86_BUILTIN_PSUBD,
25555 IX86_BUILTIN_PSUBQ,
25556 IX86_BUILTIN_PSUBSB,
25557 IX86_BUILTIN_PSUBSW,
25558 IX86_BUILTIN_PSUBUSB,
25559 IX86_BUILTIN_PSUBUSW,
25561 IX86_BUILTIN_PAND,
25562 IX86_BUILTIN_PANDN,
25563 IX86_BUILTIN_POR,
25564 IX86_BUILTIN_PXOR,
25566 IX86_BUILTIN_PAVGB,
25567 IX86_BUILTIN_PAVGW,
25569 IX86_BUILTIN_PCMPEQB,
25570 IX86_BUILTIN_PCMPEQW,
25571 IX86_BUILTIN_PCMPEQD,
25572 IX86_BUILTIN_PCMPGTB,
25573 IX86_BUILTIN_PCMPGTW,
25574 IX86_BUILTIN_PCMPGTD,
25576 IX86_BUILTIN_PMADDWD,
25578 IX86_BUILTIN_PMAXSW,
25579 IX86_BUILTIN_PMAXUB,
25580 IX86_BUILTIN_PMINSW,
25581 IX86_BUILTIN_PMINUB,
25583 IX86_BUILTIN_PMULHUW,
25584 IX86_BUILTIN_PMULHW,
25585 IX86_BUILTIN_PMULLW,
25587 IX86_BUILTIN_PSADBW,
25588 IX86_BUILTIN_PSHUFW,
25590 IX86_BUILTIN_PSLLW,
25591 IX86_BUILTIN_PSLLD,
25592 IX86_BUILTIN_PSLLQ,
25593 IX86_BUILTIN_PSRAW,
25594 IX86_BUILTIN_PSRAD,
25595 IX86_BUILTIN_PSRLW,
25596 IX86_BUILTIN_PSRLD,
25597 IX86_BUILTIN_PSRLQ,
25598 IX86_BUILTIN_PSLLWI,
25599 IX86_BUILTIN_PSLLDI,
25600 IX86_BUILTIN_PSLLQI,
25601 IX86_BUILTIN_PSRAWI,
25602 IX86_BUILTIN_PSRADI,
25603 IX86_BUILTIN_PSRLWI,
25604 IX86_BUILTIN_PSRLDI,
25605 IX86_BUILTIN_PSRLQI,
25607 IX86_BUILTIN_PUNPCKHBW,
25608 IX86_BUILTIN_PUNPCKHWD,
25609 IX86_BUILTIN_PUNPCKHDQ,
25610 IX86_BUILTIN_PUNPCKLBW,
25611 IX86_BUILTIN_PUNPCKLWD,
25612 IX86_BUILTIN_PUNPCKLDQ,
25614 IX86_BUILTIN_SHUFPS,
25616 IX86_BUILTIN_RCPPS,
25617 IX86_BUILTIN_RCPSS,
25618 IX86_BUILTIN_RSQRTPS,
25619 IX86_BUILTIN_RSQRTPS_NR,
25620 IX86_BUILTIN_RSQRTSS,
25621 IX86_BUILTIN_RSQRTF,
25622 IX86_BUILTIN_SQRTPS,
25623 IX86_BUILTIN_SQRTPS_NR,
25624 IX86_BUILTIN_SQRTSS,
25626 IX86_BUILTIN_UNPCKHPS,
25627 IX86_BUILTIN_UNPCKLPS,
25629 IX86_BUILTIN_ANDPS,
25630 IX86_BUILTIN_ANDNPS,
25631 IX86_BUILTIN_ORPS,
25632 IX86_BUILTIN_XORPS,
25634 IX86_BUILTIN_EMMS,
25635 IX86_BUILTIN_LDMXCSR,
25636 IX86_BUILTIN_STMXCSR,
25637 IX86_BUILTIN_SFENCE,
25639 IX86_BUILTIN_FXSAVE,
25640 IX86_BUILTIN_FXRSTOR,
25641 IX86_BUILTIN_FXSAVE64,
25642 IX86_BUILTIN_FXRSTOR64,
25644 IX86_BUILTIN_XSAVE,
25645 IX86_BUILTIN_XRSTOR,
25646 IX86_BUILTIN_XSAVE64,
25647 IX86_BUILTIN_XRSTOR64,
25649 IX86_BUILTIN_XSAVEOPT,
25650 IX86_BUILTIN_XSAVEOPT64,
25652 /* 3DNow! Original */
25653 IX86_BUILTIN_FEMMS,
25654 IX86_BUILTIN_PAVGUSB,
25655 IX86_BUILTIN_PF2ID,
25656 IX86_BUILTIN_PFACC,
25657 IX86_BUILTIN_PFADD,
25658 IX86_BUILTIN_PFCMPEQ,
25659 IX86_BUILTIN_PFCMPGE,
25660 IX86_BUILTIN_PFCMPGT,
25661 IX86_BUILTIN_PFMAX,
25662 IX86_BUILTIN_PFMIN,
25663 IX86_BUILTIN_PFMUL,
25664 IX86_BUILTIN_PFRCP,
25665 IX86_BUILTIN_PFRCPIT1,
25666 IX86_BUILTIN_PFRCPIT2,
25667 IX86_BUILTIN_PFRSQIT1,
25668 IX86_BUILTIN_PFRSQRT,
25669 IX86_BUILTIN_PFSUB,
25670 IX86_BUILTIN_PFSUBR,
25671 IX86_BUILTIN_PI2FD,
25672 IX86_BUILTIN_PMULHRW,
25674 /* 3DNow! Athlon Extensions */
25675 IX86_BUILTIN_PF2IW,
25676 IX86_BUILTIN_PFNACC,
25677 IX86_BUILTIN_PFPNACC,
25678 IX86_BUILTIN_PI2FW,
25679 IX86_BUILTIN_PSWAPDSI,
25680 IX86_BUILTIN_PSWAPDSF,
25682 /* SSE2 */
25683 IX86_BUILTIN_ADDPD,
25684 IX86_BUILTIN_ADDSD,
25685 IX86_BUILTIN_DIVPD,
25686 IX86_BUILTIN_DIVSD,
25687 IX86_BUILTIN_MULPD,
25688 IX86_BUILTIN_MULSD,
25689 IX86_BUILTIN_SUBPD,
25690 IX86_BUILTIN_SUBSD,
25692 IX86_BUILTIN_CMPEQPD,
25693 IX86_BUILTIN_CMPLTPD,
25694 IX86_BUILTIN_CMPLEPD,
25695 IX86_BUILTIN_CMPGTPD,
25696 IX86_BUILTIN_CMPGEPD,
25697 IX86_BUILTIN_CMPNEQPD,
25698 IX86_BUILTIN_CMPNLTPD,
25699 IX86_BUILTIN_CMPNLEPD,
25700 IX86_BUILTIN_CMPNGTPD,
25701 IX86_BUILTIN_CMPNGEPD,
25702 IX86_BUILTIN_CMPORDPD,
25703 IX86_BUILTIN_CMPUNORDPD,
25704 IX86_BUILTIN_CMPEQSD,
25705 IX86_BUILTIN_CMPLTSD,
25706 IX86_BUILTIN_CMPLESD,
25707 IX86_BUILTIN_CMPNEQSD,
25708 IX86_BUILTIN_CMPNLTSD,
25709 IX86_BUILTIN_CMPNLESD,
25710 IX86_BUILTIN_CMPORDSD,
25711 IX86_BUILTIN_CMPUNORDSD,
25713 IX86_BUILTIN_COMIEQSD,
25714 IX86_BUILTIN_COMILTSD,
25715 IX86_BUILTIN_COMILESD,
25716 IX86_BUILTIN_COMIGTSD,
25717 IX86_BUILTIN_COMIGESD,
25718 IX86_BUILTIN_COMINEQSD,
25719 IX86_BUILTIN_UCOMIEQSD,
25720 IX86_BUILTIN_UCOMILTSD,
25721 IX86_BUILTIN_UCOMILESD,
25722 IX86_BUILTIN_UCOMIGTSD,
25723 IX86_BUILTIN_UCOMIGESD,
25724 IX86_BUILTIN_UCOMINEQSD,
25726 IX86_BUILTIN_MAXPD,
25727 IX86_BUILTIN_MAXSD,
25728 IX86_BUILTIN_MINPD,
25729 IX86_BUILTIN_MINSD,
25731 IX86_BUILTIN_ANDPD,
25732 IX86_BUILTIN_ANDNPD,
25733 IX86_BUILTIN_ORPD,
25734 IX86_BUILTIN_XORPD,
25736 IX86_BUILTIN_SQRTPD,
25737 IX86_BUILTIN_SQRTSD,
25739 IX86_BUILTIN_UNPCKHPD,
25740 IX86_BUILTIN_UNPCKLPD,
25742 IX86_BUILTIN_SHUFPD,
25744 IX86_BUILTIN_LOADUPD,
25745 IX86_BUILTIN_STOREUPD,
25746 IX86_BUILTIN_MOVSD,
25748 IX86_BUILTIN_LOADHPD,
25749 IX86_BUILTIN_LOADLPD,
25751 IX86_BUILTIN_CVTDQ2PD,
25752 IX86_BUILTIN_CVTDQ2PS,
25754 IX86_BUILTIN_CVTPD2DQ,
25755 IX86_BUILTIN_CVTPD2PI,
25756 IX86_BUILTIN_CVTPD2PS,
25757 IX86_BUILTIN_CVTTPD2DQ,
25758 IX86_BUILTIN_CVTTPD2PI,
25760 IX86_BUILTIN_CVTPI2PD,
25761 IX86_BUILTIN_CVTSI2SD,
25762 IX86_BUILTIN_CVTSI642SD,
25764 IX86_BUILTIN_CVTSD2SI,
25765 IX86_BUILTIN_CVTSD2SI64,
25766 IX86_BUILTIN_CVTSD2SS,
25767 IX86_BUILTIN_CVTSS2SD,
25768 IX86_BUILTIN_CVTTSD2SI,
25769 IX86_BUILTIN_CVTTSD2SI64,
25771 IX86_BUILTIN_CVTPS2DQ,
25772 IX86_BUILTIN_CVTPS2PD,
25773 IX86_BUILTIN_CVTTPS2DQ,
25775 IX86_BUILTIN_MOVNTI,
25776 IX86_BUILTIN_MOVNTI64,
25777 IX86_BUILTIN_MOVNTPD,
25778 IX86_BUILTIN_MOVNTDQ,
25780 IX86_BUILTIN_MOVQ128,
25782 /* SSE2 MMX */
25783 IX86_BUILTIN_MASKMOVDQU,
25784 IX86_BUILTIN_MOVMSKPD,
25785 IX86_BUILTIN_PMOVMSKB128,
25787 IX86_BUILTIN_PACKSSWB128,
25788 IX86_BUILTIN_PACKSSDW128,
25789 IX86_BUILTIN_PACKUSWB128,
25791 IX86_BUILTIN_PADDB128,
25792 IX86_BUILTIN_PADDW128,
25793 IX86_BUILTIN_PADDD128,
25794 IX86_BUILTIN_PADDQ128,
25795 IX86_BUILTIN_PADDSB128,
25796 IX86_BUILTIN_PADDSW128,
25797 IX86_BUILTIN_PADDUSB128,
25798 IX86_BUILTIN_PADDUSW128,
25799 IX86_BUILTIN_PSUBB128,
25800 IX86_BUILTIN_PSUBW128,
25801 IX86_BUILTIN_PSUBD128,
25802 IX86_BUILTIN_PSUBQ128,
25803 IX86_BUILTIN_PSUBSB128,
25804 IX86_BUILTIN_PSUBSW128,
25805 IX86_BUILTIN_PSUBUSB128,
25806 IX86_BUILTIN_PSUBUSW128,
25808 IX86_BUILTIN_PAND128,
25809 IX86_BUILTIN_PANDN128,
25810 IX86_BUILTIN_POR128,
25811 IX86_BUILTIN_PXOR128,
25813 IX86_BUILTIN_PAVGB128,
25814 IX86_BUILTIN_PAVGW128,
25816 IX86_BUILTIN_PCMPEQB128,
25817 IX86_BUILTIN_PCMPEQW128,
25818 IX86_BUILTIN_PCMPEQD128,
25819 IX86_BUILTIN_PCMPGTB128,
25820 IX86_BUILTIN_PCMPGTW128,
25821 IX86_BUILTIN_PCMPGTD128,
25823 IX86_BUILTIN_PMADDWD128,
25825 IX86_BUILTIN_PMAXSW128,
25826 IX86_BUILTIN_PMAXUB128,
25827 IX86_BUILTIN_PMINSW128,
25828 IX86_BUILTIN_PMINUB128,
25830 IX86_BUILTIN_PMULUDQ,
25831 IX86_BUILTIN_PMULUDQ128,
25832 IX86_BUILTIN_PMULHUW128,
25833 IX86_BUILTIN_PMULHW128,
25834 IX86_BUILTIN_PMULLW128,
25836 IX86_BUILTIN_PSADBW128,
25837 IX86_BUILTIN_PSHUFHW,
25838 IX86_BUILTIN_PSHUFLW,
25839 IX86_BUILTIN_PSHUFD,
25841 IX86_BUILTIN_PSLLDQI128,
25842 IX86_BUILTIN_PSLLWI128,
25843 IX86_BUILTIN_PSLLDI128,
25844 IX86_BUILTIN_PSLLQI128,
25845 IX86_BUILTIN_PSRAWI128,
25846 IX86_BUILTIN_PSRADI128,
25847 IX86_BUILTIN_PSRLDQI128,
25848 IX86_BUILTIN_PSRLWI128,
25849 IX86_BUILTIN_PSRLDI128,
25850 IX86_BUILTIN_PSRLQI128,
25852 IX86_BUILTIN_PSLLDQ128,
25853 IX86_BUILTIN_PSLLW128,
25854 IX86_BUILTIN_PSLLD128,
25855 IX86_BUILTIN_PSLLQ128,
25856 IX86_BUILTIN_PSRAW128,
25857 IX86_BUILTIN_PSRAD128,
25858 IX86_BUILTIN_PSRLW128,
25859 IX86_BUILTIN_PSRLD128,
25860 IX86_BUILTIN_PSRLQ128,
25862 IX86_BUILTIN_PUNPCKHBW128,
25863 IX86_BUILTIN_PUNPCKHWD128,
25864 IX86_BUILTIN_PUNPCKHDQ128,
25865 IX86_BUILTIN_PUNPCKHQDQ128,
25866 IX86_BUILTIN_PUNPCKLBW128,
25867 IX86_BUILTIN_PUNPCKLWD128,
25868 IX86_BUILTIN_PUNPCKLDQ128,
25869 IX86_BUILTIN_PUNPCKLQDQ128,
25871 IX86_BUILTIN_CLFLUSH,
25872 IX86_BUILTIN_MFENCE,
25873 IX86_BUILTIN_LFENCE,
25874 IX86_BUILTIN_PAUSE,
25876 IX86_BUILTIN_BSRSI,
25877 IX86_BUILTIN_BSRDI,
25878 IX86_BUILTIN_RDPMC,
25879 IX86_BUILTIN_RDTSC,
25880 IX86_BUILTIN_RDTSCP,
25881 IX86_BUILTIN_ROLQI,
25882 IX86_BUILTIN_ROLHI,
25883 IX86_BUILTIN_RORQI,
25884 IX86_BUILTIN_RORHI,
25886 /* SSE3. */
25887 IX86_BUILTIN_ADDSUBPS,
25888 IX86_BUILTIN_HADDPS,
25889 IX86_BUILTIN_HSUBPS,
25890 IX86_BUILTIN_MOVSHDUP,
25891 IX86_BUILTIN_MOVSLDUP,
25892 IX86_BUILTIN_ADDSUBPD,
25893 IX86_BUILTIN_HADDPD,
25894 IX86_BUILTIN_HSUBPD,
25895 IX86_BUILTIN_LDDQU,
25897 IX86_BUILTIN_MONITOR,
25898 IX86_BUILTIN_MWAIT,
25900 /* SSSE3. */
25901 IX86_BUILTIN_PHADDW,
25902 IX86_BUILTIN_PHADDD,
25903 IX86_BUILTIN_PHADDSW,
25904 IX86_BUILTIN_PHSUBW,
25905 IX86_BUILTIN_PHSUBD,
25906 IX86_BUILTIN_PHSUBSW,
25907 IX86_BUILTIN_PMADDUBSW,
25908 IX86_BUILTIN_PMULHRSW,
25909 IX86_BUILTIN_PSHUFB,
25910 IX86_BUILTIN_PSIGNB,
25911 IX86_BUILTIN_PSIGNW,
25912 IX86_BUILTIN_PSIGND,
25913 IX86_BUILTIN_PALIGNR,
25914 IX86_BUILTIN_PABSB,
25915 IX86_BUILTIN_PABSW,
25916 IX86_BUILTIN_PABSD,
25918 IX86_BUILTIN_PHADDW128,
25919 IX86_BUILTIN_PHADDD128,
25920 IX86_BUILTIN_PHADDSW128,
25921 IX86_BUILTIN_PHSUBW128,
25922 IX86_BUILTIN_PHSUBD128,
25923 IX86_BUILTIN_PHSUBSW128,
25924 IX86_BUILTIN_PMADDUBSW128,
25925 IX86_BUILTIN_PMULHRSW128,
25926 IX86_BUILTIN_PSHUFB128,
25927 IX86_BUILTIN_PSIGNB128,
25928 IX86_BUILTIN_PSIGNW128,
25929 IX86_BUILTIN_PSIGND128,
25930 IX86_BUILTIN_PALIGNR128,
25931 IX86_BUILTIN_PABSB128,
25932 IX86_BUILTIN_PABSW128,
25933 IX86_BUILTIN_PABSD128,
25935 /* AMDFAM10 - SSE4A New Instructions. */
25936 IX86_BUILTIN_MOVNTSD,
25937 IX86_BUILTIN_MOVNTSS,
25938 IX86_BUILTIN_EXTRQI,
25939 IX86_BUILTIN_EXTRQ,
25940 IX86_BUILTIN_INSERTQI,
25941 IX86_BUILTIN_INSERTQ,
25943 /* SSE4.1. */
25944 IX86_BUILTIN_BLENDPD,
25945 IX86_BUILTIN_BLENDPS,
25946 IX86_BUILTIN_BLENDVPD,
25947 IX86_BUILTIN_BLENDVPS,
25948 IX86_BUILTIN_PBLENDVB128,
25949 IX86_BUILTIN_PBLENDW128,
25951 IX86_BUILTIN_DPPD,
25952 IX86_BUILTIN_DPPS,
25954 IX86_BUILTIN_INSERTPS128,
25956 IX86_BUILTIN_MOVNTDQA,
25957 IX86_BUILTIN_MPSADBW128,
25958 IX86_BUILTIN_PACKUSDW128,
25959 IX86_BUILTIN_PCMPEQQ,
25960 IX86_BUILTIN_PHMINPOSUW128,
25962 IX86_BUILTIN_PMAXSB128,
25963 IX86_BUILTIN_PMAXSD128,
25964 IX86_BUILTIN_PMAXUD128,
25965 IX86_BUILTIN_PMAXUW128,
25967 IX86_BUILTIN_PMINSB128,
25968 IX86_BUILTIN_PMINSD128,
25969 IX86_BUILTIN_PMINUD128,
25970 IX86_BUILTIN_PMINUW128,
25972 IX86_BUILTIN_PMOVSXBW128,
25973 IX86_BUILTIN_PMOVSXBD128,
25974 IX86_BUILTIN_PMOVSXBQ128,
25975 IX86_BUILTIN_PMOVSXWD128,
25976 IX86_BUILTIN_PMOVSXWQ128,
25977 IX86_BUILTIN_PMOVSXDQ128,
25979 IX86_BUILTIN_PMOVZXBW128,
25980 IX86_BUILTIN_PMOVZXBD128,
25981 IX86_BUILTIN_PMOVZXBQ128,
25982 IX86_BUILTIN_PMOVZXWD128,
25983 IX86_BUILTIN_PMOVZXWQ128,
25984 IX86_BUILTIN_PMOVZXDQ128,
25986 IX86_BUILTIN_PMULDQ128,
25987 IX86_BUILTIN_PMULLD128,
25989 IX86_BUILTIN_ROUNDSD,
25990 IX86_BUILTIN_ROUNDSS,
25992 IX86_BUILTIN_ROUNDPD,
25993 IX86_BUILTIN_ROUNDPS,
25995 IX86_BUILTIN_FLOORPD,
25996 IX86_BUILTIN_CEILPD,
25997 IX86_BUILTIN_TRUNCPD,
25998 IX86_BUILTIN_RINTPD,
25999 IX86_BUILTIN_ROUNDPD_AZ,
26001 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26002 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26003 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26005 IX86_BUILTIN_FLOORPS,
26006 IX86_BUILTIN_CEILPS,
26007 IX86_BUILTIN_TRUNCPS,
26008 IX86_BUILTIN_RINTPS,
26009 IX86_BUILTIN_ROUNDPS_AZ,
26011 IX86_BUILTIN_FLOORPS_SFIX,
26012 IX86_BUILTIN_CEILPS_SFIX,
26013 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26015 IX86_BUILTIN_PTESTZ,
26016 IX86_BUILTIN_PTESTC,
26017 IX86_BUILTIN_PTESTNZC,
26019 IX86_BUILTIN_VEC_INIT_V2SI,
26020 IX86_BUILTIN_VEC_INIT_V4HI,
26021 IX86_BUILTIN_VEC_INIT_V8QI,
26022 IX86_BUILTIN_VEC_EXT_V2DF,
26023 IX86_BUILTIN_VEC_EXT_V2DI,
26024 IX86_BUILTIN_VEC_EXT_V4SF,
26025 IX86_BUILTIN_VEC_EXT_V4SI,
26026 IX86_BUILTIN_VEC_EXT_V8HI,
26027 IX86_BUILTIN_VEC_EXT_V2SI,
26028 IX86_BUILTIN_VEC_EXT_V4HI,
26029 IX86_BUILTIN_VEC_EXT_V16QI,
26030 IX86_BUILTIN_VEC_SET_V2DI,
26031 IX86_BUILTIN_VEC_SET_V4SF,
26032 IX86_BUILTIN_VEC_SET_V4SI,
26033 IX86_BUILTIN_VEC_SET_V8HI,
26034 IX86_BUILTIN_VEC_SET_V4HI,
26035 IX86_BUILTIN_VEC_SET_V16QI,
26037 IX86_BUILTIN_VEC_PACK_SFIX,
26038 IX86_BUILTIN_VEC_PACK_SFIX256,
26040 /* SSE4.2. */
26041 IX86_BUILTIN_CRC32QI,
26042 IX86_BUILTIN_CRC32HI,
26043 IX86_BUILTIN_CRC32SI,
26044 IX86_BUILTIN_CRC32DI,
26046 IX86_BUILTIN_PCMPESTRI128,
26047 IX86_BUILTIN_PCMPESTRM128,
26048 IX86_BUILTIN_PCMPESTRA128,
26049 IX86_BUILTIN_PCMPESTRC128,
26050 IX86_BUILTIN_PCMPESTRO128,
26051 IX86_BUILTIN_PCMPESTRS128,
26052 IX86_BUILTIN_PCMPESTRZ128,
26053 IX86_BUILTIN_PCMPISTRI128,
26054 IX86_BUILTIN_PCMPISTRM128,
26055 IX86_BUILTIN_PCMPISTRA128,
26056 IX86_BUILTIN_PCMPISTRC128,
26057 IX86_BUILTIN_PCMPISTRO128,
26058 IX86_BUILTIN_PCMPISTRS128,
26059 IX86_BUILTIN_PCMPISTRZ128,
26061 IX86_BUILTIN_PCMPGTQ,
26063 /* AES instructions */
26064 IX86_BUILTIN_AESENC128,
26065 IX86_BUILTIN_AESENCLAST128,
26066 IX86_BUILTIN_AESDEC128,
26067 IX86_BUILTIN_AESDECLAST128,
26068 IX86_BUILTIN_AESIMC128,
26069 IX86_BUILTIN_AESKEYGENASSIST128,
26071 /* PCLMUL instruction */
26072 IX86_BUILTIN_PCLMULQDQ128,
26074 /* AVX */
26075 IX86_BUILTIN_ADDPD256,
26076 IX86_BUILTIN_ADDPS256,
26077 IX86_BUILTIN_ADDSUBPD256,
26078 IX86_BUILTIN_ADDSUBPS256,
26079 IX86_BUILTIN_ANDPD256,
26080 IX86_BUILTIN_ANDPS256,
26081 IX86_BUILTIN_ANDNPD256,
26082 IX86_BUILTIN_ANDNPS256,
26083 IX86_BUILTIN_BLENDPD256,
26084 IX86_BUILTIN_BLENDPS256,
26085 IX86_BUILTIN_BLENDVPD256,
26086 IX86_BUILTIN_BLENDVPS256,
26087 IX86_BUILTIN_DIVPD256,
26088 IX86_BUILTIN_DIVPS256,
26089 IX86_BUILTIN_DPPS256,
26090 IX86_BUILTIN_HADDPD256,
26091 IX86_BUILTIN_HADDPS256,
26092 IX86_BUILTIN_HSUBPD256,
26093 IX86_BUILTIN_HSUBPS256,
26094 IX86_BUILTIN_MAXPD256,
26095 IX86_BUILTIN_MAXPS256,
26096 IX86_BUILTIN_MINPD256,
26097 IX86_BUILTIN_MINPS256,
26098 IX86_BUILTIN_MULPD256,
26099 IX86_BUILTIN_MULPS256,
26100 IX86_BUILTIN_ORPD256,
26101 IX86_BUILTIN_ORPS256,
26102 IX86_BUILTIN_SHUFPD256,
26103 IX86_BUILTIN_SHUFPS256,
26104 IX86_BUILTIN_SUBPD256,
26105 IX86_BUILTIN_SUBPS256,
26106 IX86_BUILTIN_XORPD256,
26107 IX86_BUILTIN_XORPS256,
26108 IX86_BUILTIN_CMPSD,
26109 IX86_BUILTIN_CMPSS,
26110 IX86_BUILTIN_CMPPD,
26111 IX86_BUILTIN_CMPPS,
26112 IX86_BUILTIN_CMPPD256,
26113 IX86_BUILTIN_CMPPS256,
26114 IX86_BUILTIN_CVTDQ2PD256,
26115 IX86_BUILTIN_CVTDQ2PS256,
26116 IX86_BUILTIN_CVTPD2PS256,
26117 IX86_BUILTIN_CVTPS2DQ256,
26118 IX86_BUILTIN_CVTPS2PD256,
26119 IX86_BUILTIN_CVTTPD2DQ256,
26120 IX86_BUILTIN_CVTPD2DQ256,
26121 IX86_BUILTIN_CVTTPS2DQ256,
26122 IX86_BUILTIN_EXTRACTF128PD256,
26123 IX86_BUILTIN_EXTRACTF128PS256,
26124 IX86_BUILTIN_EXTRACTF128SI256,
26125 IX86_BUILTIN_VZEROALL,
26126 IX86_BUILTIN_VZEROUPPER,
26127 IX86_BUILTIN_VPERMILVARPD,
26128 IX86_BUILTIN_VPERMILVARPS,
26129 IX86_BUILTIN_VPERMILVARPD256,
26130 IX86_BUILTIN_VPERMILVARPS256,
26131 IX86_BUILTIN_VPERMILPD,
26132 IX86_BUILTIN_VPERMILPS,
26133 IX86_BUILTIN_VPERMILPD256,
26134 IX86_BUILTIN_VPERMILPS256,
26135 IX86_BUILTIN_VPERMIL2PD,
26136 IX86_BUILTIN_VPERMIL2PS,
26137 IX86_BUILTIN_VPERMIL2PD256,
26138 IX86_BUILTIN_VPERMIL2PS256,
26139 IX86_BUILTIN_VPERM2F128PD256,
26140 IX86_BUILTIN_VPERM2F128PS256,
26141 IX86_BUILTIN_VPERM2F128SI256,
26142 IX86_BUILTIN_VBROADCASTSS,
26143 IX86_BUILTIN_VBROADCASTSD256,
26144 IX86_BUILTIN_VBROADCASTSS256,
26145 IX86_BUILTIN_VBROADCASTPD256,
26146 IX86_BUILTIN_VBROADCASTPS256,
26147 IX86_BUILTIN_VINSERTF128PD256,
26148 IX86_BUILTIN_VINSERTF128PS256,
26149 IX86_BUILTIN_VINSERTF128SI256,
26150 IX86_BUILTIN_LOADUPD256,
26151 IX86_BUILTIN_LOADUPS256,
26152 IX86_BUILTIN_STOREUPD256,
26153 IX86_BUILTIN_STOREUPS256,
26154 IX86_BUILTIN_LDDQU256,
26155 IX86_BUILTIN_MOVNTDQ256,
26156 IX86_BUILTIN_MOVNTPD256,
26157 IX86_BUILTIN_MOVNTPS256,
26158 IX86_BUILTIN_LOADDQU256,
26159 IX86_BUILTIN_STOREDQU256,
26160 IX86_BUILTIN_MASKLOADPD,
26161 IX86_BUILTIN_MASKLOADPS,
26162 IX86_BUILTIN_MASKSTOREPD,
26163 IX86_BUILTIN_MASKSTOREPS,
26164 IX86_BUILTIN_MASKLOADPD256,
26165 IX86_BUILTIN_MASKLOADPS256,
26166 IX86_BUILTIN_MASKSTOREPD256,
26167 IX86_BUILTIN_MASKSTOREPS256,
26168 IX86_BUILTIN_MOVSHDUP256,
26169 IX86_BUILTIN_MOVSLDUP256,
26170 IX86_BUILTIN_MOVDDUP256,
26172 IX86_BUILTIN_SQRTPD256,
26173 IX86_BUILTIN_SQRTPS256,
26174 IX86_BUILTIN_SQRTPS_NR256,
26175 IX86_BUILTIN_RSQRTPS256,
26176 IX86_BUILTIN_RSQRTPS_NR256,
26178 IX86_BUILTIN_RCPPS256,
26180 IX86_BUILTIN_ROUNDPD256,
26181 IX86_BUILTIN_ROUNDPS256,
26183 IX86_BUILTIN_FLOORPD256,
26184 IX86_BUILTIN_CEILPD256,
26185 IX86_BUILTIN_TRUNCPD256,
26186 IX86_BUILTIN_RINTPD256,
26187 IX86_BUILTIN_ROUNDPD_AZ256,
26189 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26190 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26191 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26193 IX86_BUILTIN_FLOORPS256,
26194 IX86_BUILTIN_CEILPS256,
26195 IX86_BUILTIN_TRUNCPS256,
26196 IX86_BUILTIN_RINTPS256,
26197 IX86_BUILTIN_ROUNDPS_AZ256,
26199 IX86_BUILTIN_FLOORPS_SFIX256,
26200 IX86_BUILTIN_CEILPS_SFIX256,
26201 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26203 IX86_BUILTIN_UNPCKHPD256,
26204 IX86_BUILTIN_UNPCKLPD256,
26205 IX86_BUILTIN_UNPCKHPS256,
26206 IX86_BUILTIN_UNPCKLPS256,
26208 IX86_BUILTIN_SI256_SI,
26209 IX86_BUILTIN_PS256_PS,
26210 IX86_BUILTIN_PD256_PD,
26211 IX86_BUILTIN_SI_SI256,
26212 IX86_BUILTIN_PS_PS256,
26213 IX86_BUILTIN_PD_PD256,
26215 IX86_BUILTIN_VTESTZPD,
26216 IX86_BUILTIN_VTESTCPD,
26217 IX86_BUILTIN_VTESTNZCPD,
26218 IX86_BUILTIN_VTESTZPS,
26219 IX86_BUILTIN_VTESTCPS,
26220 IX86_BUILTIN_VTESTNZCPS,
26221 IX86_BUILTIN_VTESTZPD256,
26222 IX86_BUILTIN_VTESTCPD256,
26223 IX86_BUILTIN_VTESTNZCPD256,
26224 IX86_BUILTIN_VTESTZPS256,
26225 IX86_BUILTIN_VTESTCPS256,
26226 IX86_BUILTIN_VTESTNZCPS256,
26227 IX86_BUILTIN_PTESTZ256,
26228 IX86_BUILTIN_PTESTC256,
26229 IX86_BUILTIN_PTESTNZC256,
26231 IX86_BUILTIN_MOVMSKPD256,
26232 IX86_BUILTIN_MOVMSKPS256,
26234 /* AVX2 */
26235 IX86_BUILTIN_MPSADBW256,
26236 IX86_BUILTIN_PABSB256,
26237 IX86_BUILTIN_PABSW256,
26238 IX86_BUILTIN_PABSD256,
26239 IX86_BUILTIN_PACKSSDW256,
26240 IX86_BUILTIN_PACKSSWB256,
26241 IX86_BUILTIN_PACKUSDW256,
26242 IX86_BUILTIN_PACKUSWB256,
26243 IX86_BUILTIN_PADDB256,
26244 IX86_BUILTIN_PADDW256,
26245 IX86_BUILTIN_PADDD256,
26246 IX86_BUILTIN_PADDQ256,
26247 IX86_BUILTIN_PADDSB256,
26248 IX86_BUILTIN_PADDSW256,
26249 IX86_BUILTIN_PADDUSB256,
26250 IX86_BUILTIN_PADDUSW256,
26251 IX86_BUILTIN_PALIGNR256,
26252 IX86_BUILTIN_AND256I,
26253 IX86_BUILTIN_ANDNOT256I,
26254 IX86_BUILTIN_PAVGB256,
26255 IX86_BUILTIN_PAVGW256,
26256 IX86_BUILTIN_PBLENDVB256,
26257 IX86_BUILTIN_PBLENDVW256,
26258 IX86_BUILTIN_PCMPEQB256,
26259 IX86_BUILTIN_PCMPEQW256,
26260 IX86_BUILTIN_PCMPEQD256,
26261 IX86_BUILTIN_PCMPEQQ256,
26262 IX86_BUILTIN_PCMPGTB256,
26263 IX86_BUILTIN_PCMPGTW256,
26264 IX86_BUILTIN_PCMPGTD256,
26265 IX86_BUILTIN_PCMPGTQ256,
26266 IX86_BUILTIN_PHADDW256,
26267 IX86_BUILTIN_PHADDD256,
26268 IX86_BUILTIN_PHADDSW256,
26269 IX86_BUILTIN_PHSUBW256,
26270 IX86_BUILTIN_PHSUBD256,
26271 IX86_BUILTIN_PHSUBSW256,
26272 IX86_BUILTIN_PMADDUBSW256,
26273 IX86_BUILTIN_PMADDWD256,
26274 IX86_BUILTIN_PMAXSB256,
26275 IX86_BUILTIN_PMAXSW256,
26276 IX86_BUILTIN_PMAXSD256,
26277 IX86_BUILTIN_PMAXUB256,
26278 IX86_BUILTIN_PMAXUW256,
26279 IX86_BUILTIN_PMAXUD256,
26280 IX86_BUILTIN_PMINSB256,
26281 IX86_BUILTIN_PMINSW256,
26282 IX86_BUILTIN_PMINSD256,
26283 IX86_BUILTIN_PMINUB256,
26284 IX86_BUILTIN_PMINUW256,
26285 IX86_BUILTIN_PMINUD256,
26286 IX86_BUILTIN_PMOVMSKB256,
26287 IX86_BUILTIN_PMOVSXBW256,
26288 IX86_BUILTIN_PMOVSXBD256,
26289 IX86_BUILTIN_PMOVSXBQ256,
26290 IX86_BUILTIN_PMOVSXWD256,
26291 IX86_BUILTIN_PMOVSXWQ256,
26292 IX86_BUILTIN_PMOVSXDQ256,
26293 IX86_BUILTIN_PMOVZXBW256,
26294 IX86_BUILTIN_PMOVZXBD256,
26295 IX86_BUILTIN_PMOVZXBQ256,
26296 IX86_BUILTIN_PMOVZXWD256,
26297 IX86_BUILTIN_PMOVZXWQ256,
26298 IX86_BUILTIN_PMOVZXDQ256,
26299 IX86_BUILTIN_PMULDQ256,
26300 IX86_BUILTIN_PMULHRSW256,
26301 IX86_BUILTIN_PMULHUW256,
26302 IX86_BUILTIN_PMULHW256,
26303 IX86_BUILTIN_PMULLW256,
26304 IX86_BUILTIN_PMULLD256,
26305 IX86_BUILTIN_PMULUDQ256,
26306 IX86_BUILTIN_POR256,
26307 IX86_BUILTIN_PSADBW256,
26308 IX86_BUILTIN_PSHUFB256,
26309 IX86_BUILTIN_PSHUFD256,
26310 IX86_BUILTIN_PSHUFHW256,
26311 IX86_BUILTIN_PSHUFLW256,
26312 IX86_BUILTIN_PSIGNB256,
26313 IX86_BUILTIN_PSIGNW256,
26314 IX86_BUILTIN_PSIGND256,
26315 IX86_BUILTIN_PSLLDQI256,
26316 IX86_BUILTIN_PSLLWI256,
26317 IX86_BUILTIN_PSLLW256,
26318 IX86_BUILTIN_PSLLDI256,
26319 IX86_BUILTIN_PSLLD256,
26320 IX86_BUILTIN_PSLLQI256,
26321 IX86_BUILTIN_PSLLQ256,
26322 IX86_BUILTIN_PSRAWI256,
26323 IX86_BUILTIN_PSRAW256,
26324 IX86_BUILTIN_PSRADI256,
26325 IX86_BUILTIN_PSRAD256,
26326 IX86_BUILTIN_PSRLDQI256,
26327 IX86_BUILTIN_PSRLWI256,
26328 IX86_BUILTIN_PSRLW256,
26329 IX86_BUILTIN_PSRLDI256,
26330 IX86_BUILTIN_PSRLD256,
26331 IX86_BUILTIN_PSRLQI256,
26332 IX86_BUILTIN_PSRLQ256,
26333 IX86_BUILTIN_PSUBB256,
26334 IX86_BUILTIN_PSUBW256,
26335 IX86_BUILTIN_PSUBD256,
26336 IX86_BUILTIN_PSUBQ256,
26337 IX86_BUILTIN_PSUBSB256,
26338 IX86_BUILTIN_PSUBSW256,
26339 IX86_BUILTIN_PSUBUSB256,
26340 IX86_BUILTIN_PSUBUSW256,
26341 IX86_BUILTIN_PUNPCKHBW256,
26342 IX86_BUILTIN_PUNPCKHWD256,
26343 IX86_BUILTIN_PUNPCKHDQ256,
26344 IX86_BUILTIN_PUNPCKHQDQ256,
26345 IX86_BUILTIN_PUNPCKLBW256,
26346 IX86_BUILTIN_PUNPCKLWD256,
26347 IX86_BUILTIN_PUNPCKLDQ256,
26348 IX86_BUILTIN_PUNPCKLQDQ256,
26349 IX86_BUILTIN_PXOR256,
26350 IX86_BUILTIN_MOVNTDQA256,
26351 IX86_BUILTIN_VBROADCASTSS_PS,
26352 IX86_BUILTIN_VBROADCASTSS_PS256,
26353 IX86_BUILTIN_VBROADCASTSD_PD256,
26354 IX86_BUILTIN_VBROADCASTSI256,
26355 IX86_BUILTIN_PBLENDD256,
26356 IX86_BUILTIN_PBLENDD128,
26357 IX86_BUILTIN_PBROADCASTB256,
26358 IX86_BUILTIN_PBROADCASTW256,
26359 IX86_BUILTIN_PBROADCASTD256,
26360 IX86_BUILTIN_PBROADCASTQ256,
26361 IX86_BUILTIN_PBROADCASTB128,
26362 IX86_BUILTIN_PBROADCASTW128,
26363 IX86_BUILTIN_PBROADCASTD128,
26364 IX86_BUILTIN_PBROADCASTQ128,
26365 IX86_BUILTIN_VPERMVARSI256,
26366 IX86_BUILTIN_VPERMDF256,
26367 IX86_BUILTIN_VPERMVARSF256,
26368 IX86_BUILTIN_VPERMDI256,
26369 IX86_BUILTIN_VPERMTI256,
26370 IX86_BUILTIN_VEXTRACT128I256,
26371 IX86_BUILTIN_VINSERT128I256,
26372 IX86_BUILTIN_MASKLOADD,
26373 IX86_BUILTIN_MASKLOADQ,
26374 IX86_BUILTIN_MASKLOADD256,
26375 IX86_BUILTIN_MASKLOADQ256,
26376 IX86_BUILTIN_MASKSTORED,
26377 IX86_BUILTIN_MASKSTOREQ,
26378 IX86_BUILTIN_MASKSTORED256,
26379 IX86_BUILTIN_MASKSTOREQ256,
26380 IX86_BUILTIN_PSLLVV4DI,
26381 IX86_BUILTIN_PSLLVV2DI,
26382 IX86_BUILTIN_PSLLVV8SI,
26383 IX86_BUILTIN_PSLLVV4SI,
26384 IX86_BUILTIN_PSRAVV8SI,
26385 IX86_BUILTIN_PSRAVV4SI,
26386 IX86_BUILTIN_PSRLVV4DI,
26387 IX86_BUILTIN_PSRLVV2DI,
26388 IX86_BUILTIN_PSRLVV8SI,
26389 IX86_BUILTIN_PSRLVV4SI,
26391 IX86_BUILTIN_GATHERSIV2DF,
26392 IX86_BUILTIN_GATHERSIV4DF,
26393 IX86_BUILTIN_GATHERDIV2DF,
26394 IX86_BUILTIN_GATHERDIV4DF,
26395 IX86_BUILTIN_GATHERSIV4SF,
26396 IX86_BUILTIN_GATHERSIV8SF,
26397 IX86_BUILTIN_GATHERDIV4SF,
26398 IX86_BUILTIN_GATHERDIV8SF,
26399 IX86_BUILTIN_GATHERSIV2DI,
26400 IX86_BUILTIN_GATHERSIV4DI,
26401 IX86_BUILTIN_GATHERDIV2DI,
26402 IX86_BUILTIN_GATHERDIV4DI,
26403 IX86_BUILTIN_GATHERSIV4SI,
26404 IX86_BUILTIN_GATHERSIV8SI,
26405 IX86_BUILTIN_GATHERDIV4SI,
26406 IX86_BUILTIN_GATHERDIV8SI,
26408 /* Alternate 4 element gather for the vectorizer where
26409 all operands are 32-byte wide. */
26410 IX86_BUILTIN_GATHERALTSIV4DF,
26411 IX86_BUILTIN_GATHERALTDIV8SF,
26412 IX86_BUILTIN_GATHERALTSIV4DI,
26413 IX86_BUILTIN_GATHERALTDIV8SI,
26415 /* TFmode support builtins. */
26416 IX86_BUILTIN_INFQ,
26417 IX86_BUILTIN_HUGE_VALQ,
26418 IX86_BUILTIN_FABSQ,
26419 IX86_BUILTIN_COPYSIGNQ,
26421 /* Vectorizer support builtins. */
26422 IX86_BUILTIN_CPYSGNPS,
26423 IX86_BUILTIN_CPYSGNPD,
26424 IX86_BUILTIN_CPYSGNPS256,
26425 IX86_BUILTIN_CPYSGNPD256,
26427 /* FMA4 instructions. */
26428 IX86_BUILTIN_VFMADDSS,
26429 IX86_BUILTIN_VFMADDSD,
26430 IX86_BUILTIN_VFMADDPS,
26431 IX86_BUILTIN_VFMADDPD,
26432 IX86_BUILTIN_VFMADDPS256,
26433 IX86_BUILTIN_VFMADDPD256,
26434 IX86_BUILTIN_VFMADDSUBPS,
26435 IX86_BUILTIN_VFMADDSUBPD,
26436 IX86_BUILTIN_VFMADDSUBPS256,
26437 IX86_BUILTIN_VFMADDSUBPD256,
26439 /* FMA3 instructions. */
26440 IX86_BUILTIN_VFMADDSS3,
26441 IX86_BUILTIN_VFMADDSD3,
26443 /* XOP instructions. */
26444 IX86_BUILTIN_VPCMOV,
26445 IX86_BUILTIN_VPCMOV_V2DI,
26446 IX86_BUILTIN_VPCMOV_V4SI,
26447 IX86_BUILTIN_VPCMOV_V8HI,
26448 IX86_BUILTIN_VPCMOV_V16QI,
26449 IX86_BUILTIN_VPCMOV_V4SF,
26450 IX86_BUILTIN_VPCMOV_V2DF,
26451 IX86_BUILTIN_VPCMOV256,
26452 IX86_BUILTIN_VPCMOV_V4DI256,
26453 IX86_BUILTIN_VPCMOV_V8SI256,
26454 IX86_BUILTIN_VPCMOV_V16HI256,
26455 IX86_BUILTIN_VPCMOV_V32QI256,
26456 IX86_BUILTIN_VPCMOV_V8SF256,
26457 IX86_BUILTIN_VPCMOV_V4DF256,
26459 IX86_BUILTIN_VPPERM,
26461 IX86_BUILTIN_VPMACSSWW,
26462 IX86_BUILTIN_VPMACSWW,
26463 IX86_BUILTIN_VPMACSSWD,
26464 IX86_BUILTIN_VPMACSWD,
26465 IX86_BUILTIN_VPMACSSDD,
26466 IX86_BUILTIN_VPMACSDD,
26467 IX86_BUILTIN_VPMACSSDQL,
26468 IX86_BUILTIN_VPMACSSDQH,
26469 IX86_BUILTIN_VPMACSDQL,
26470 IX86_BUILTIN_VPMACSDQH,
26471 IX86_BUILTIN_VPMADCSSWD,
26472 IX86_BUILTIN_VPMADCSWD,
26474 IX86_BUILTIN_VPHADDBW,
26475 IX86_BUILTIN_VPHADDBD,
26476 IX86_BUILTIN_VPHADDBQ,
26477 IX86_BUILTIN_VPHADDWD,
26478 IX86_BUILTIN_VPHADDWQ,
26479 IX86_BUILTIN_VPHADDDQ,
26480 IX86_BUILTIN_VPHADDUBW,
26481 IX86_BUILTIN_VPHADDUBD,
26482 IX86_BUILTIN_VPHADDUBQ,
26483 IX86_BUILTIN_VPHADDUWD,
26484 IX86_BUILTIN_VPHADDUWQ,
26485 IX86_BUILTIN_VPHADDUDQ,
26486 IX86_BUILTIN_VPHSUBBW,
26487 IX86_BUILTIN_VPHSUBWD,
26488 IX86_BUILTIN_VPHSUBDQ,
26490 IX86_BUILTIN_VPROTB,
26491 IX86_BUILTIN_VPROTW,
26492 IX86_BUILTIN_VPROTD,
26493 IX86_BUILTIN_VPROTQ,
26494 IX86_BUILTIN_VPROTB_IMM,
26495 IX86_BUILTIN_VPROTW_IMM,
26496 IX86_BUILTIN_VPROTD_IMM,
26497 IX86_BUILTIN_VPROTQ_IMM,
26499 IX86_BUILTIN_VPSHLB,
26500 IX86_BUILTIN_VPSHLW,
26501 IX86_BUILTIN_VPSHLD,
26502 IX86_BUILTIN_VPSHLQ,
26503 IX86_BUILTIN_VPSHAB,
26504 IX86_BUILTIN_VPSHAW,
26505 IX86_BUILTIN_VPSHAD,
26506 IX86_BUILTIN_VPSHAQ,
26508 IX86_BUILTIN_VFRCZSS,
26509 IX86_BUILTIN_VFRCZSD,
26510 IX86_BUILTIN_VFRCZPS,
26511 IX86_BUILTIN_VFRCZPD,
26512 IX86_BUILTIN_VFRCZPS256,
26513 IX86_BUILTIN_VFRCZPD256,
26515 IX86_BUILTIN_VPCOMEQUB,
26516 IX86_BUILTIN_VPCOMNEUB,
26517 IX86_BUILTIN_VPCOMLTUB,
26518 IX86_BUILTIN_VPCOMLEUB,
26519 IX86_BUILTIN_VPCOMGTUB,
26520 IX86_BUILTIN_VPCOMGEUB,
26521 IX86_BUILTIN_VPCOMFALSEUB,
26522 IX86_BUILTIN_VPCOMTRUEUB,
26524 IX86_BUILTIN_VPCOMEQUW,
26525 IX86_BUILTIN_VPCOMNEUW,
26526 IX86_BUILTIN_VPCOMLTUW,
26527 IX86_BUILTIN_VPCOMLEUW,
26528 IX86_BUILTIN_VPCOMGTUW,
26529 IX86_BUILTIN_VPCOMGEUW,
26530 IX86_BUILTIN_VPCOMFALSEUW,
26531 IX86_BUILTIN_VPCOMTRUEUW,
26533 IX86_BUILTIN_VPCOMEQUD,
26534 IX86_BUILTIN_VPCOMNEUD,
26535 IX86_BUILTIN_VPCOMLTUD,
26536 IX86_BUILTIN_VPCOMLEUD,
26537 IX86_BUILTIN_VPCOMGTUD,
26538 IX86_BUILTIN_VPCOMGEUD,
26539 IX86_BUILTIN_VPCOMFALSEUD,
26540 IX86_BUILTIN_VPCOMTRUEUD,
26542 IX86_BUILTIN_VPCOMEQUQ,
26543 IX86_BUILTIN_VPCOMNEUQ,
26544 IX86_BUILTIN_VPCOMLTUQ,
26545 IX86_BUILTIN_VPCOMLEUQ,
26546 IX86_BUILTIN_VPCOMGTUQ,
26547 IX86_BUILTIN_VPCOMGEUQ,
26548 IX86_BUILTIN_VPCOMFALSEUQ,
26549 IX86_BUILTIN_VPCOMTRUEUQ,
26551 IX86_BUILTIN_VPCOMEQB,
26552 IX86_BUILTIN_VPCOMNEB,
26553 IX86_BUILTIN_VPCOMLTB,
26554 IX86_BUILTIN_VPCOMLEB,
26555 IX86_BUILTIN_VPCOMGTB,
26556 IX86_BUILTIN_VPCOMGEB,
26557 IX86_BUILTIN_VPCOMFALSEB,
26558 IX86_BUILTIN_VPCOMTRUEB,
26560 IX86_BUILTIN_VPCOMEQW,
26561 IX86_BUILTIN_VPCOMNEW,
26562 IX86_BUILTIN_VPCOMLTW,
26563 IX86_BUILTIN_VPCOMLEW,
26564 IX86_BUILTIN_VPCOMGTW,
26565 IX86_BUILTIN_VPCOMGEW,
26566 IX86_BUILTIN_VPCOMFALSEW,
26567 IX86_BUILTIN_VPCOMTRUEW,
26569 IX86_BUILTIN_VPCOMEQD,
26570 IX86_BUILTIN_VPCOMNED,
26571 IX86_BUILTIN_VPCOMLTD,
26572 IX86_BUILTIN_VPCOMLED,
26573 IX86_BUILTIN_VPCOMGTD,
26574 IX86_BUILTIN_VPCOMGED,
26575 IX86_BUILTIN_VPCOMFALSED,
26576 IX86_BUILTIN_VPCOMTRUED,
26578 IX86_BUILTIN_VPCOMEQQ,
26579 IX86_BUILTIN_VPCOMNEQ,
26580 IX86_BUILTIN_VPCOMLTQ,
26581 IX86_BUILTIN_VPCOMLEQ,
26582 IX86_BUILTIN_VPCOMGTQ,
26583 IX86_BUILTIN_VPCOMGEQ,
26584 IX86_BUILTIN_VPCOMFALSEQ,
26585 IX86_BUILTIN_VPCOMTRUEQ,
26587 /* LWP instructions. */
26588 IX86_BUILTIN_LLWPCB,
26589 IX86_BUILTIN_SLWPCB,
26590 IX86_BUILTIN_LWPVAL32,
26591 IX86_BUILTIN_LWPVAL64,
26592 IX86_BUILTIN_LWPINS32,
26593 IX86_BUILTIN_LWPINS64,
26595 IX86_BUILTIN_CLZS,
26597 /* RTM */
26598 IX86_BUILTIN_XBEGIN,
26599 IX86_BUILTIN_XEND,
26600 IX86_BUILTIN_XABORT,
26601 IX86_BUILTIN_XTEST,
26603 /* BMI instructions. */
26604 IX86_BUILTIN_BEXTR32,
26605 IX86_BUILTIN_BEXTR64,
26606 IX86_BUILTIN_CTZS,
26608 /* TBM instructions. */
26609 IX86_BUILTIN_BEXTRI32,
26610 IX86_BUILTIN_BEXTRI64,
26612 /* BMI2 instructions. */
26613 IX86_BUILTIN_BZHI32,
26614 IX86_BUILTIN_BZHI64,
26615 IX86_BUILTIN_PDEP32,
26616 IX86_BUILTIN_PDEP64,
26617 IX86_BUILTIN_PEXT32,
26618 IX86_BUILTIN_PEXT64,
26620 /* ADX instructions. */
26621 IX86_BUILTIN_ADDCARRYX32,
26622 IX86_BUILTIN_ADDCARRYX64,
26624 /* FSGSBASE instructions. */
26625 IX86_BUILTIN_RDFSBASE32,
26626 IX86_BUILTIN_RDFSBASE64,
26627 IX86_BUILTIN_RDGSBASE32,
26628 IX86_BUILTIN_RDGSBASE64,
26629 IX86_BUILTIN_WRFSBASE32,
26630 IX86_BUILTIN_WRFSBASE64,
26631 IX86_BUILTIN_WRGSBASE32,
26632 IX86_BUILTIN_WRGSBASE64,
26634 /* RDRND instructions. */
26635 IX86_BUILTIN_RDRAND16_STEP,
26636 IX86_BUILTIN_RDRAND32_STEP,
26637 IX86_BUILTIN_RDRAND64_STEP,
26639 /* RDSEED instructions. */
26640 IX86_BUILTIN_RDSEED16_STEP,
26641 IX86_BUILTIN_RDSEED32_STEP,
26642 IX86_BUILTIN_RDSEED64_STEP,
26644 /* F16C instructions. */
26645 IX86_BUILTIN_CVTPH2PS,
26646 IX86_BUILTIN_CVTPH2PS256,
26647 IX86_BUILTIN_CVTPS2PH,
26648 IX86_BUILTIN_CVTPS2PH256,
26650 /* CFString built-in for darwin */
26651 IX86_BUILTIN_CFSTRING,
26653 /* Builtins to get CPU type and supported features. */
26654 IX86_BUILTIN_CPU_INIT,
26655 IX86_BUILTIN_CPU_IS,
26656 IX86_BUILTIN_CPU_SUPPORTS,
26658 IX86_BUILTIN_MAX
26659 };
26661 /* Table for the ix86 builtin decls. */
26662 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26664 /* Table of all of the builtin functions that are possible with different ISAs
26665 but are waiting to be built until a function is declared to use that
26666 ISA. */
26667 struct builtin_isa {
26668 const char *name; /* function name */
26669 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26670 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26671 bool const_p; /* true if the declaration is constant */
26672 bool set_and_not_built_p;
26673 };
26675 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26678 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26679 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26680 function decl in the ix86_builtins array. Returns the function decl or
26681 NULL_TREE, if the builtin was not added.
26683 If the front end has a special hook for builtin functions, delay adding
26684 builtin functions that aren't in the current ISA until the ISA is changed
26685 with function-specific optimization. Doing so can save about 300K for the
26686 default compiler. When the builtin is expanded, check at that time whether
26687 it is valid.
26689 If the front end doesn't have a special hook, record all builtins, even
26690 those not in the current ISA, in case the user uses function-specific
26691 options for a different ISA, so that we don't get scope
26692 errors if a builtin is added in the middle of a function scope. */
26694 static inline tree
26695 def_builtin (HOST_WIDE_INT mask, const char *name,
26696 enum ix86_builtin_func_type tcode,
26697 enum ix86_builtins code)
26699 tree decl = NULL_TREE;
26701 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26703 ix86_builtins_isa[(int) code].isa = mask;
26705 mask &= ~OPTION_MASK_ISA_64BIT;
26706 if (mask == 0
26707 || (mask & ix86_isa_flags) != 0
26708 || (lang_hooks.builtin_function
26709 == lang_hooks.builtin_function_ext_scope))
26712 tree type = ix86_get_builtin_func_type (tcode);
26713 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26714 NULL, NULL_TREE);
26715 ix86_builtins[(int) code] = decl;
26716 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26718 else
26720 ix86_builtins[(int) code] = NULL_TREE;
26721 ix86_builtins_isa[(int) code].tcode = tcode;
26722 ix86_builtins_isa[(int) code].name = name;
26723 ix86_builtins_isa[(int) code].const_p = false;
26724 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26728 return decl;
26731 /* Like def_builtin, but also marks the function decl "const". */
26733 static inline tree
26734 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26735 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26737 tree decl = def_builtin (mask, name, tcode, code);
26738 if (decl)
26739 TREE_READONLY (decl) = 1;
26740 else
26741 ix86_builtins_isa[(int) code].const_p = true;
26743 return decl;
26744 }
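/* Illustrative usage (not part of the original source; the builtin name and
   type code shown are the ones used for the SSE addps builtin elsewhere in
   this file, so treat the exact parameters as an assumption):

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
                        V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   This registers the declaration immediately when SSE is enabled, or records
   it in ix86_builtins_isa so that ix86_add_new_builtins can create it later
   if SSE is turned on by a target attribute or pragma.  */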
26746 /* Add any new builtin functions for a given ISA that may not have been
26747 declared. This saves a bit of space compared to adding all of the
26748 declarations to the tree, even if we didn't use them. */
26750 static void
26751 ix86_add_new_builtins (HOST_WIDE_INT isa)
26753 int i;
26755 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26757 if ((ix86_builtins_isa[i].isa & isa) != 0
26758 && ix86_builtins_isa[i].set_and_not_built_p)
26760 tree decl, type;
26762 /* Don't define the builtin again. */
26763 ix86_builtins_isa[i].set_and_not_built_p = false;
26765 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26766 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26767 type, i, BUILT_IN_MD, NULL,
26768 NULL_TREE);
26770 ix86_builtins[i] = decl;
26771 if (ix86_builtins_isa[i].const_p)
26772 TREE_READONLY (decl) = 1;
26777 /* Bits for builtin_description.flag. */
26779 /* Set when we don't support the comparison natively, and should
26780 swap_comparison in order to support it. */
26781 #define BUILTIN_DESC_SWAP_OPERANDS 1
26783 struct builtin_description
26784 {
26785 const HOST_WIDE_INT mask;
26786 const enum insn_code icode;
26787 const char *const name;
26788 const enum ix86_builtins code;
26789 const enum rtx_code comparison;
26790 const int flag;
26791 };
26793 static const struct builtin_description bdesc_comi[] =
26794 {
26795 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26796 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26797 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26798 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26799 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26800 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26801 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26802 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26803 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26804 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26805 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26806 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26811 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26812 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26817 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26819 };
26821 static const struct builtin_description bdesc_pcmpestr[] =
26822 {
26823 /* SSE4.2 */
26824 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26825 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26826 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26827 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26828 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26829 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26830 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26831 };
26833 static const struct builtin_description bdesc_pcmpistr[] =
26834 {
26835 /* SSE4.2 */
26836 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26837 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26838 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26839 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26840 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26841 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26842 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26843 };
26845 /* Special builtins with variable number of arguments. */
26846 static const struct builtin_description bdesc_special_args[] =
26847 {
26848 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26849 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26850 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26852 /* MMX */
26853 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26855 /* 3DNow! */
26856 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26858 /* FXSR, XSAVE and XSAVEOPT */
26859 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26860 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26861 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26862 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26863 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26865 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26866 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26867 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26868 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26869 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26871 /* SSE */
26872 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26873 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26874 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26876 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26877 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26878 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26879 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26881 /* SSE or 3DNow!A */
26882 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26883 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26885 /* SSE2 */
26886 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26887 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26888 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26890 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26892 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26893 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26895 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26898 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26900 /* SSE3 */
26901 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26903 /* SSE4.1 */
26904 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26906 /* SSE4A */
26907 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26908 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26910 /* AVX */
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26912 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26915 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26916 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26917 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26918 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26922 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26923 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26926 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26928 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26930 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26932 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26935 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26941 /* AVX2 */
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26952 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26953 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26954 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26955 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26956 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26957 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26959 /* FSGSBASE */
26960 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26961 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26962 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26963 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26964 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26965 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26966 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26967 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26969 /* RTM */
26970 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26971 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26972 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26973 };
26975 /* Builtins with variable number of arguments. */
26976 static const struct builtin_description bdesc_args[] =
26977 {
26978 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26979 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26980 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26981 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26982 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26983 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26984 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26986 /* MMX */
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27023 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27027 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27043 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27046 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27048 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27050 /* 3DNow! */
27051 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27052 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27053 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27054 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27056 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27057 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27058 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27059 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27060 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27061 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27062 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27063 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27064 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27065 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27066 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27067 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27068 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27070 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27072 /* 3DNow!A */
27073 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27074 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27075 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27076 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27077 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27078 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27080 /* SSE */
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27089 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27092 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27131 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27140 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27142 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27144 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27147 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27148 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27150 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27152 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27154 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27157 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27159 /* SSE, MMX or 3DNow!A */
27160 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27161 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27162 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27164 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27165 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27166 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27167 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27169 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27170 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27172 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27174 /* SSE2 */
27175 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27193 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27194 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27212 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27310 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27311 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27317 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27329 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27332 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27333 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27344 /* SSE2 MMX */
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27346 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27348 /* SSE3 */
27349 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27350 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27352 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27353 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27354 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27355 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27356 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27357 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27359 /* SSSE3 */
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27362 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27364 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27365 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27367 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27368 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27369 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27370 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27371 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27372 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27373 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27374 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27375 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27377 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27379 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27392 /* SSSE3. */
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27394 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27396 /* SSE4.1 */
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27403 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27404 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27405 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27406 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27408 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27409 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27410 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27411 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27420 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27435 /* SSE4.1 */
27436 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27437 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27438 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27439 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27441 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27442 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27443 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27444 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27446 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27447 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27449 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27450 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27452 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27453 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27454 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27455 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27458 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27461 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27463 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27464 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27465 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27467 /* SSE4.2 */
27468 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27469 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27470 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27471 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27472 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27474 /* SSE4A */
27475 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27476 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27477 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27478 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27480 /* AES */
27481 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27482 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27484 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27485 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27486 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27487 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27489 /* PCLMUL */
27490 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27492 /* AVX */
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27606 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27607 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27633 /* AVX2 */
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
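/* For illustration only (not part of this file): each entry above binds a
   GCC builtin name to an insn pattern and a function-type code.  A minimal
   user-level sketch of how one of the AVX2 variable-shift builtins is
   reached, assuming <immintrin.h> and -mavx2:

     #include <immintrin.h>
     __m256i shift_each (__m256i v, __m256i counts)
     {
       // _mm256_srlv_epi32 expands to __builtin_ia32_psrlv8si.
       return _mm256_srlv_epi32 (v, counts);
     }
*/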
27781 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27783 /* BMI */
27784 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27785 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27786 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27788 /* TBM */
27789 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27790 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27792 /* F16C */
27793 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27794 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27795 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27796 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27798 /* BMI2 */
27799 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27800 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27801 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27802 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27803 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27804 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
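/* Illustrative sketch (hypothetical user code, not part of this file): the
   BMI2 bit deposit/extract builtins above are normally reached through the
   <x86intrin.h> wrappers, assuming -mbmi2:

     #include <x86intrin.h>
     unsigned pack_masked_bits (unsigned src, unsigned mask)
     {
       // _pext_u32 expands to __builtin_ia32_pext_si; the pdep
       // counterpart is _pdep_u32 / __builtin_ia32_pdep_si.
       return _pext_u32 (src, mask);
     }
*/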
27807 /* FMA4 and XOP. */
27808 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27809 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27810 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27811 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27812 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27813 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27814 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27815 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27816 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27817 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27818 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27819 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27820 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27821 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27822 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27823 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27824 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27825 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27826 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27827 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27828 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27829 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27830 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27831 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27832 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27833 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27834 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27835 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27836 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27837 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27838 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27839 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27840 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27841 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27842 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27843 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27844 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27845 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27846 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27847 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27848 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27849 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27850 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27851 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27852 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27853 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27854 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27855 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27856 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27857 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27858 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27859 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
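/* The MULTI_ARG_* names above are shorthand for ix86_builtin_func_type
   values; e.g. MULTI_ARG_3_SF is V4SF_FTYPE_V4SF_V4SF_V4SF, i.e. three
   __m128 operands and an __m128 result.  An illustrative use of one of the
   FMA4 entries below, assuming <x86intrin.h> and -mfma4 (user code, not
   part of this file):

     #include <x86intrin.h>
     __m128 madd (__m128 a, __m128 b, __m128 c)
     {
       // _mm_macc_ps expands to __builtin_ia32_vfmaddps (a * b + c).
       return _mm_macc_ps (a, b, c);
     }
*/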
27861 static const struct builtin_description bdesc_multi_arg[] =
27863 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27864 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27865 UNKNOWN, (int)MULTI_ARG_3_SF },
27866 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27867 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27868 UNKNOWN, (int)MULTI_ARG_3_DF },
27870 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27871 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27872 UNKNOWN, (int)MULTI_ARG_3_SF },
27873 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27874 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27875 UNKNOWN, (int)MULTI_ARG_3_DF },
27877 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27878 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27879 UNKNOWN, (int)MULTI_ARG_3_SF },
27880 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27881 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27882 UNKNOWN, (int)MULTI_ARG_3_DF },
27883 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27884 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27885 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27886 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27887 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27888 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27890 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27891 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27892 UNKNOWN, (int)MULTI_ARG_3_SF },
27893 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27894 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27895 UNKNOWN, (int)MULTI_ARG_3_DF },
27896 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27897 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27898 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27899 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27900 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27901 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
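/* Illustrative sketch for the XOP entries above (hypothetical user code,
   not part of this file; assumes <x86intrin.h> and -mxop).  Note that the
   vpcom* comparison entries all reuse one maskcmp pattern and differ only
   in the rtx comparison code stored in the descriptor.

     #include <x86intrin.h>
     __m128i select_bits (__m128i a, __m128i b, __m128i mask)
     {
       // _mm_cmov_si128 expands to __builtin_ia32_vpcmov:
       // result = (a & mask) | (b & ~mask), bit by bit.
       return _mm_cmov_si128 (a, b, mask);
     }
*/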
28063 /* TM vector builtins. */
28065 /* Reuse the existing x86-specific `struct builtin_description' because
28066 it is convenient; add casts to make the TM builtin codes fit. */
28067 static const struct builtin_description bdesc_tm[] =
28069 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28070 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28071 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28072 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28073 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28074 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28075 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28077 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28078 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28079 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28080 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28081 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28082 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28083 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28085 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28086 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28087 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28088 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28089 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28090 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28091 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28093 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28094 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28095 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
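/* Note on the table above (illustrative, not part of this file): the
   CODE_FOR_nothing entries are never expanded as insns.  Instead,
   ix86_init_tm_builtins below registers each one under its
   "__builtin__ITM_*" name and, by stripping the "__builtin_" prefix,
   makes calls lower to the matching libitm entry point, e.g.

     // "__builtin__ITM_WM128" -> _ITM_WM128, roughly
     //   void _ITM_WM128 (__m128 *ptr, __m128 val);   // assumed signature
*/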
28098 /* TM callbacks. */
28100 /* Return the builtin decl needed to load a vector of TYPE. */
28102 static tree
28103 ix86_builtin_tm_load (tree type)
28105 if (TREE_CODE (type) == VECTOR_TYPE)
28107 switch (tree_low_cst (TYPE_SIZE (type), 1))
28109 case 64:
28110 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28111 case 128:
28112 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28113 case 256:
28114 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28117 return NULL_TREE;
28120 /* Return the builtin decl needed to store a vector of TYPE. */
28122 static tree
28123 ix86_builtin_tm_store (tree type)
28125 if (TREE_CODE (type) == VECTOR_TYPE)
28127 switch (tree_low_cst (TYPE_SIZE (type), 1))
28129 case 64:
28130 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28131 case 128:
28132 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28133 case 256:
28134 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28137 return NULL_TREE;
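/* A minimal sketch of how the two callbacks above behave (illustrative
   only): for a 64-, 128- or 256-bit vector type they return the matching
   TM load/store decl registered below, and NULL_TREE otherwise, e.g.

     // tree v4sf = build_vector_type (float_type_node, 4);
     // ix86_builtin_tm_load (v4sf)
     //   == builtin_decl_explicit (BUILT_IN_TM_LOAD_M128)
     // ix86_builtin_tm_load (double_type_node) == NULL_TREE
*/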
28140 /* Initialize the transactional memory vector load/store builtins. */
28142 static void
28143 ix86_init_tm_builtins (void)
28145 enum ix86_builtin_func_type ftype;
28146 const struct builtin_description *d;
28147 size_t i;
28148 tree decl;
28149 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28150 tree attrs_log, attrs_type_log;
28152 if (!flag_tm)
28153 return;
28155 /* If there are no builtins defined, we must be compiling in a
28156 language without trans-mem support. */
28157 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28158 return;
28160 /* Use whatever attributes a normal TM load has. */
28161 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28162 attrs_load = DECL_ATTRIBUTES (decl);
28163 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28164 /* Use whatever attributes a normal TM store has. */
28165 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28166 attrs_store = DECL_ATTRIBUTES (decl);
28167 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28168 /* Use whatever attributes a normal TM log has. */
28169 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28170 attrs_log = DECL_ATTRIBUTES (decl);
28171 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28173 for (i = 0, d = bdesc_tm;
28174 i < ARRAY_SIZE (bdesc_tm);
28175 i++, d++)
28177 if ((d->mask & ix86_isa_flags) != 0
28178 || (lang_hooks.builtin_function
28179 == lang_hooks.builtin_function_ext_scope))
28181 tree type, attrs, attrs_type;
28182 enum built_in_function code = (enum built_in_function) d->code;
28184 ftype = (enum ix86_builtin_func_type) d->flag;
28185 type = ix86_get_builtin_func_type (ftype);
28187 if (BUILTIN_TM_LOAD_P (code))
28189 attrs = attrs_load;
28190 attrs_type = attrs_type_load;
28192 else if (BUILTIN_TM_STORE_P (code))
28194 attrs = attrs_store;
28195 attrs_type = attrs_type_store;
28197 else
28199 attrs = attrs_log;
28200 attrs_type = attrs_type_log;
28202 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28203 /* The builtin without the prefix for
28204 calling it directly. */
28205 d->name + strlen ("__builtin_"),
28206 attrs);
28207 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28208 set the TYPE_ATTRIBUTES. */
28209 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28211 set_builtin_decl (code, decl, false);
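/* Illustrative result of the loop above (assuming -mavx and a language
   with transactional-memory support): the AVX rows of bdesc_tm become
   builtins whose library name drops the "__builtin_" prefix, so a call to
   __builtin__ITM_RM256 is emitted as a call to the libitm routine
   _ITM_RM256, with attributes copied from BUILT_IN_TM_LOAD_1 and the
   V8SF_FTYPE_PCV8SF signature, roughly

     // extern __m256 _ITM_RM256 (const __m256 *);   // assumed prototype
*/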
28216 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28217 not in the current target ISA, so that the user can compile particular
28218 modules with target-specific options that differ from the command-line
28219 options. */
28220 static void
28221 ix86_init_mmx_sse_builtins (void)
28223 const struct builtin_description * d;
28224 enum ix86_builtin_func_type ftype;
28225 size_t i;
28227 /* Add all special builtins with variable number of operands. */
28228 for (i = 0, d = bdesc_special_args;
28229 i < ARRAY_SIZE (bdesc_special_args);
28230 i++, d++)
28232 if (d->name == 0)
28233 continue;
28235 ftype = (enum ix86_builtin_func_type) d->flag;
28236 def_builtin (d->mask, d->name, ftype, d->code);
28239 /* Add all builtins with variable number of operands. */
28240 for (i = 0, d = bdesc_args;
28241 i < ARRAY_SIZE (bdesc_args);
28242 i++, d++)
28244 if (d->name == 0)
28245 continue;
28247 ftype = (enum ix86_builtin_func_type) d->flag;
28248 def_builtin_const (d->mask, d->name, ftype, d->code);
28251 /* pcmpestr[im] insns. */
28252 for (i = 0, d = bdesc_pcmpestr;
28253 i < ARRAY_SIZE (bdesc_pcmpestr);
28254 i++, d++)
28256 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28257 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28258 else
28259 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28260 def_builtin_const (d->mask, d->name, ftype, d->code);
28263 /* pcmpistr[im] insns. */
28264 for (i = 0, d = bdesc_pcmpistr;
28265 i < ARRAY_SIZE (bdesc_pcmpistr);
28266 i++, d++)
28268 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28269 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28270 else
28271 ftype = INT_FTYPE_V16QI_V16QI_INT;
28272 def_builtin_const (d->mask, d->name, ftype, d->code);
28275 /* comi/ucomi insns. */
28276 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28278 if (d->mask == OPTION_MASK_ISA_SSE2)
28279 ftype = INT_FTYPE_V2DF_V2DF;
28280 else
28281 ftype = INT_FTYPE_V4SF_V4SF;
28282 def_builtin_const (d->mask, d->name, ftype, d->code);
28285 /* SSE */
28286 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28287 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28288 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28289 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28291 /* SSE or 3DNow!A */
28292 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28293 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28294 IX86_BUILTIN_MASKMOVQ);
28296 /* SSE2 */
28297 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28298 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28300 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28301 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28302 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28303 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28305 /* SSE3. */
28306 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28307 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28308 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28309 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28311 /* AES */
28312 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28313 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28314 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28315 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28316 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28317 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28318 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28319 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28320 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28321 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28322 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28323 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28325 /* PCLMUL */
28326 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28327 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28329 /* RDRND */
28330 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28331 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28332 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28333 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28334 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28335 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28336 IX86_BUILTIN_RDRAND64_STEP);
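/* Illustrative use of the RDRND step builtins above (hypothetical user
   code, not part of this file; assumes <immintrin.h> and -mrdrnd).  The
   return value reflects the carry flag, i.e. nonzero on success:

     #include <immintrin.h>
     int get_random (unsigned int *out)
     {
       // _rdrand32_step expands to __builtin_ia32_rdrand32_step.
       return _rdrand32_step (out);
     }
*/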
28338 /* AVX2 */
28339 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28340 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28341 IX86_BUILTIN_GATHERSIV2DF);
28343 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28344 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28345 IX86_BUILTIN_GATHERSIV4DF);
28347 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28348 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28349 IX86_BUILTIN_GATHERDIV2DF);
28351 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28352 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28353 IX86_BUILTIN_GATHERDIV4DF);
28355 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28356 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28357 IX86_BUILTIN_GATHERSIV4SF);
28359 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28360 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28361 IX86_BUILTIN_GATHERSIV8SF);
28363 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28364 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28365 IX86_BUILTIN_GATHERDIV4SF);
28367 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28368 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28369 IX86_BUILTIN_GATHERDIV8SF);
28371 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28372 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28373 IX86_BUILTIN_GATHERSIV2DI);
28375 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28376 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28377 IX86_BUILTIN_GATHERSIV4DI);
28379 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28380 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28381 IX86_BUILTIN_GATHERDIV2DI);
28383 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28384 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28385 IX86_BUILTIN_GATHERDIV4DI);
28387 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28388 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28389 IX86_BUILTIN_GATHERSIV4SI);
28391 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28392 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28393 IX86_BUILTIN_GATHERSIV8SI);
28395 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28396 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28397 IX86_BUILTIN_GATHERDIV4SI);
28399 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28400 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28401 IX86_BUILTIN_GATHERDIV8SI);
28403 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28404 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28405 IX86_BUILTIN_GATHERALTSIV4DF);
28407 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28408 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28409 IX86_BUILTIN_GATHERALTDIV8SF);
28411 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28412 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28413 IX86_BUILTIN_GATHERALTSIV4DI);
28415 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28416 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28417 IX86_BUILTIN_GATHERALTDIV8SI);
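/* Illustrative use of the AVX2 gather builtins defined above (hypothetical
   user code, not part of this file; assumes <immintrin.h> and -mavx2).
   The "siv"/"div" halves of the names distinguish 32-bit from 64-bit
   index vectors.

     #include <immintrin.h>
     __m256d gather4 (const double *base, __m128i idx)
     {
       // _mm256_i32gather_pd uses __builtin_ia32_gathersiv4df with a
       // full mask; the last argument is the index scale in bytes.
       return _mm256_i32gather_pd (base, idx, 8);
     }
*/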
28419 /* RTM. */
28420 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28421 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28423 /* MMX access to the vec_init patterns. */
28424 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28425 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28427 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28428 V4HI_FTYPE_HI_HI_HI_HI,
28429 IX86_BUILTIN_VEC_INIT_V4HI);
28431 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28432 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28433 IX86_BUILTIN_VEC_INIT_V8QI);
28435 /* Access to the vec_extract patterns. */
28436 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28437 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28438 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28439 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28440 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28441 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28442 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28443 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28444 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28445 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28447 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28448 "__builtin_ia32_vec_ext_v4hi",
28449 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28451 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28452 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28454 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28455 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28457 /* Access to the vec_set patterns. */
28458 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28459 "__builtin_ia32_vec_set_v2di",
28460 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28462 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28463 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28465 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28466 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28468 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28469 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28471 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28472 "__builtin_ia32_vec_set_v4hi",
28473 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28475 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28476 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28478 /* RDSEED */
28479 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28480 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28481 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28482 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28483 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28484 "__builtin_ia32_rdseed_di_step",
28485 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28487 /* ADCX */
28488 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28489 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28490 def_builtin (OPTION_MASK_ISA_64BIT,
28491 "__builtin_ia32_addcarryx_u64",
28492 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28493 IX86_BUILTIN_ADDCARRYX64);
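/* Illustrative use of the add-with-carry builtins above (hypothetical user
   code, not part of this file; assumes <x86intrin.h>).  Note the 32-bit
   variant is registered with mask 0, i.e. it is not gated on -madx:

     #include <x86intrin.h>
     unsigned char add64 (unsigned a_lo, unsigned a_hi,
                          unsigned b_lo, unsigned b_hi,
                          unsigned out[2])
     {
       // _addcarryx_u32 expands to __builtin_ia32_addcarryx_u32.
       unsigned char c = _addcarryx_u32 (0, a_lo, b_lo, &out[0]);
       return _addcarryx_u32 (c, a_hi, b_hi, &out[1]);
     }
*/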
28495 /* Add the multi-argument (FMA, FMA4 and XOP) builtins. */
28496 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28498 if (d->name == 0)
28499 continue;
28501 ftype = (enum ix86_builtin_func_type) d->flag;
28502 def_builtin_const (d->mask, d->name, ftype, d->code);
28506 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28507 to return a pointer to VERSION_DECL if the outcome of the expression
28508 formed by PREDICATE_CHAIN is true. This function will be called during
28509 version dispatch to decide which function version to execute. It returns
28510 the basic block at the end, to which more conditions can be added. */
28512 static basic_block
28513 add_condition_to_bb (tree function_decl, tree version_decl,
28514 tree predicate_chain, basic_block new_bb)
28516 gimple return_stmt;
28517 tree convert_expr, result_var;
28518 gimple convert_stmt;
28519 gimple call_cond_stmt;
28520 gimple if_else_stmt;
28522 basic_block bb1, bb2, bb3;
28523 edge e12, e23;
28525 tree cond_var, and_expr_var = NULL_TREE;
28526 gimple_seq gseq;
28528 tree predicate_decl, predicate_arg;
28530 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28532 gcc_assert (new_bb != NULL);
28533 gseq = bb_seq (new_bb);
28536 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28537 build_fold_addr_expr (version_decl));
28538 result_var = create_tmp_var (ptr_type_node, NULL);
28539 convert_stmt = gimple_build_assign (result_var, convert_expr);
28540 return_stmt = gimple_build_return (result_var);
28542 if (predicate_chain == NULL_TREE)
28544 gimple_seq_add_stmt (&gseq, convert_stmt);
28545 gimple_seq_add_stmt (&gseq, return_stmt);
28546 set_bb_seq (new_bb, gseq);
28547 gimple_set_bb (convert_stmt, new_bb);
28548 gimple_set_bb (return_stmt, new_bb);
28549 pop_cfun ();
28550 return new_bb;
28553 while (predicate_chain != NULL)
28555 cond_var = create_tmp_var (integer_type_node, NULL);
28556 predicate_decl = TREE_PURPOSE (predicate_chain);
28557 predicate_arg = TREE_VALUE (predicate_chain);
28558 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28559 gimple_call_set_lhs (call_cond_stmt, cond_var);
28561 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28562 gimple_set_bb (call_cond_stmt, new_bb);
28563 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28565 predicate_chain = TREE_CHAIN (predicate_chain);
28567 if (and_expr_var == NULL)
28568 and_expr_var = cond_var;
28569 else
28571 gimple assign_stmt;
28572 /* Use MIN_EXPR to check whether any integer in the chain is zero:
28573 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
28574 assign_stmt = gimple_build_assign (and_expr_var,
28575 build2 (MIN_EXPR, integer_type_node,
28576 cond_var, and_expr_var));
28578 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28579 gimple_set_bb (assign_stmt, new_bb);
28580 gimple_seq_add_stmt (&gseq, assign_stmt);
28584 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28585 integer_zero_node,
28586 NULL_TREE, NULL_TREE);
28587 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28588 gimple_set_bb (if_else_stmt, new_bb);
28589 gimple_seq_add_stmt (&gseq, if_else_stmt);
28591 gimple_seq_add_stmt (&gseq, convert_stmt);
28592 gimple_seq_add_stmt (&gseq, return_stmt);
28593 set_bb_seq (new_bb, gseq);
28595 bb1 = new_bb;
28596 e12 = split_block (bb1, if_else_stmt);
28597 bb2 = e12->dest;
28598 e12->flags &= ~EDGE_FALLTHRU;
28599 e12->flags |= EDGE_TRUE_VALUE;
28601 e23 = split_block (bb2, return_stmt);
28603 gimple_set_bb (convert_stmt, bb2);
28604 gimple_set_bb (return_stmt, bb2);
28606 bb3 = e23->dest;
28607 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28609 remove_edge (e23);
28610 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28612 pop_cfun ();
28614 return bb3;
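/* Shape of the code built above for one version guarded by two features
   (illustrative GIMPLE-like pseudocode only; the names are hypothetical):

     cond_1 = __builtin_cpu_supports ("avx2");
     cond_2 = __builtin_cpu_supports ("popcnt");
     and_1 = MIN_EXPR <cond_2, cond_1>;       // nonzero only if both hold
     if (and_1 > 0)
       return (void *) &foo.avx2_version;     // converted VERSION_DECL
     // otherwise control continues in the returned block (bb3),
     // where the caller appends the next condition.
*/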
28617 /* This parses the arguments of the "target" attribute on DECL and
28618 determines the right builtin predicates to match the platform
28619 specification. It returns the priority value for this version decl.
28620 If PREDICATE_LIST is not NULL, it stores there the list of cpu features
28621 that need to be checked before dispatching this function. */
28623 static unsigned int
28624 get_builtin_code_for_version (tree decl, tree *predicate_list)
28626 tree attrs;
28627 struct cl_target_option cur_target;
28628 tree target_node;
28629 struct cl_target_option *new_target;
28630 const char *arg_str = NULL;
28631 const char *attrs_str = NULL;
28632 char *tok_str = NULL;
28633 char *token;
28635 /* Priority of i386 features, greater value is higher priority. This is
28636 used to decide the order in which function dispatch must happen. For
28637 instance, a version specialized for SSE4.2 should be checked for dispatch
28638 before a version for SSE3, as SSE4.2 implies SSE3. */
28639 enum feature_priority
28641 P_ZERO = 0,
28642 P_MMX,
28643 P_SSE,
28644 P_SSE2,
28645 P_SSE3,
28646 P_SSSE3,
28647 P_PROC_SSSE3,
28648 P_SSE4_a,
28649 P_PROC_SSE4_a,
28650 P_SSE4_1,
28651 P_SSE4_2,
28652 P_PROC_SSE4_2,
28653 P_POPCNT,
28654 P_AVX,
28655 P_AVX2,
28656 P_FMA,
28657 P_PROC_FMA
28660 enum feature_priority priority = P_ZERO;
28662 /* These are the target attribute strings for which a dispatcher is
28663 available, from fold_builtin_cpu. */
28665 static struct _feature_list
28667 const char *const name;
28668 const enum feature_priority priority;
28670 const feature_list[] =
28672 {"mmx", P_MMX},
28673 {"sse", P_SSE},
28674 {"sse2", P_SSE2},
28675 {"sse3", P_SSE3},
28676 {"ssse3", P_SSSE3},
28677 {"sse4.1", P_SSE4_1},
28678 {"sse4.2", P_SSE4_2},
28679 {"popcnt", P_POPCNT},
28680 {"avx", P_AVX},
28681 {"avx2", P_AVX2}
28685 static unsigned int NUM_FEATURES
28686 = sizeof (feature_list) / sizeof (struct _feature_list);
28688 unsigned int i;
28690 tree predicate_chain = NULL_TREE;
28691 tree predicate_decl, predicate_arg;
28693 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28694 gcc_assert (attrs != NULL);
28696 attrs = TREE_VALUE (TREE_VALUE (attrs));
28698 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28699 attrs_str = TREE_STRING_POINTER (attrs);
28701 /* Return priority zero for default function. */
28702 if (strcmp (attrs_str, "default") == 0)
28703 return 0;
28705 /* Handle arch= if specified. For priority, set it to be 1 more than
28706 the best instruction set the processor can handle. For instance, if
28707 there is a version for atom and a version for ssse3 (the highest ISA
28708 priority for atom), the atom version must be checked for dispatch
28709 before the ssse3 version. */
28710 if (strstr (attrs_str, "arch=") != NULL)
28712 cl_target_option_save (&cur_target, &global_options);
28713 target_node = ix86_valid_target_attribute_tree (attrs);
28715 gcc_assert (target_node);
28716 new_target = TREE_TARGET_OPTION (target_node);
28717 gcc_assert (new_target);
28719 if (new_target->arch_specified && new_target->arch > 0)
28721 switch (new_target->arch)
28723 case PROCESSOR_CORE2:
28724 arg_str = "core2";
28725 priority = P_PROC_SSSE3;
28726 break;
28727 case PROCESSOR_COREI7:
28728 arg_str = "corei7";
28729 priority = P_PROC_SSE4_2;
28730 break;
28731 case PROCESSOR_ATOM:
28732 arg_str = "atom";
28733 priority = P_PROC_SSSE3;
28734 break;
28735 case PROCESSOR_AMDFAM10:
28736 arg_str = "amdfam10h";
28737 priority = P_PROC_SSE4_a;
28738 break;
28739 case PROCESSOR_BDVER1:
28740 arg_str = "bdver1";
28741 priority = P_PROC_FMA;
28742 break;
28743 case PROCESSOR_BDVER2:
28744 arg_str = "bdver2";
28745 priority = P_PROC_FMA;
28746 break;
28750 cl_target_option_restore (&global_options, &cur_target);
28752 if (predicate_list && arg_str == NULL)
28754 error_at (DECL_SOURCE_LOCATION (decl),
28755 "No dispatcher found for the versioning attributes");
28756 return 0;
28759 if (predicate_list)
28761 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28762 /* For a C string literal the length includes the trailing NULL. */
28763 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28764 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28765 predicate_chain);
28769 /* Process feature name. */
28770 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28771 strcpy (tok_str, attrs_str);
28772 token = strtok (tok_str, ",");
28773 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28775 while (token != NULL)
28777 /* Do not process "arch=" */
28778 if (strncmp (token, "arch=", 5) == 0)
28780 token = strtok (NULL, ",");
28781 continue;
28783 for (i = 0; i < NUM_FEATURES; ++i)
28785 if (strcmp (token, feature_list[i].name) == 0)
28787 if (predicate_list)
28789 predicate_arg = build_string_literal (
28790 strlen (feature_list[i].name) + 1,
28791 feature_list[i].name);
28792 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28793 predicate_chain);
28795 /* Find the maximum priority feature. */
28796 if (feature_list[i].priority > priority)
28797 priority = feature_list[i].priority;
28799 break;
28802 if (predicate_list && i == NUM_FEATURES)
28804 error_at (DECL_SOURCE_LOCATION (decl),
28805 "No dispatcher found for %s", token);
28806 return 0;
28808 token = strtok (NULL, ",");
28810 free (tok_str);
28812 if (predicate_list && predicate_chain == NULL_TREE)
28814 error_at (DECL_SOURCE_LOCATION (decl),
28815 "No dispatcher found for the versioning attributes : %s",
28816 attrs_str);
28817 return 0;
28819 else if (predicate_list)
28821 predicate_chain = nreverse (predicate_chain);
28822 *predicate_list = predicate_chain;
28825 return priority;
28828 /* This compares the priority of target features in function DECL1
28829 and DECL2. It returns positive value if DECL1 is higher priority,
28830 negative value if DECL2 is higher priority and 0 if they are the
28831 same. */
28833 static int
28834 ix86_compare_version_priority (tree decl1, tree decl2)
28836 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
28837 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
28839 return (int)priority1 - (int)priority2;
28842 /* V1 and V2 point to function versions with different priorities
28843 based on the target ISA. This function compares their priorities. */
28845 static int
28846 feature_compare (const void *v1, const void *v2)
28848 typedef struct _function_version_info
28850 tree version_decl;
28851 tree predicate_chain;
28852 unsigned int dispatch_priority;
28853 } function_version_info;
28855 const function_version_info c1 = *(const function_version_info *)v1;
28856 const function_version_info c2 = *(const function_version_info *)v2;
28857 return (c2.dispatch_priority - c1.dispatch_priority);
28860 /* This function generates the dispatch function for
28861 multi-versioned functions. DISPATCH_DECL is the function which will
28862 contain the dispatch logic. FNDECLS are the function choices for
28863 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28864 in DISPATCH_DECL in which the dispatch code is generated. */
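/* Conceptually, the dispatch code added to EMPTY_BB behaves like the
   following sketch (hypothetical versions of a function foo; the real code
   is emitted as GIMPLE, not C, and the resolver returns the address of the
   chosen version):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       return &foo_avx2;
     if (__builtin_cpu_supports ("ssse3"))
       return &foo_ssse3;
     return &foo_default;
*/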
28866 static int
28867 dispatch_function_versions (tree dispatch_decl,
28868 void *fndecls_p,
28869 basic_block *empty_bb)
28871 tree default_decl;
28872 gimple ifunc_cpu_init_stmt;
28873 gimple_seq gseq;
28874 int ix;
28875 tree ele;
28876 vec<tree> *fndecls;
28877 unsigned int num_versions = 0;
28878 unsigned int actual_versions = 0;
28879 unsigned int i;
28881 struct _function_version_info
28883 tree version_decl;
28884 tree predicate_chain;
28885 unsigned int dispatch_priority;
28886 }*function_version_info;
28888 gcc_assert (dispatch_decl != NULL
28889 && fndecls_p != NULL
28890 && empty_bb != NULL);
28892 /* fndecls_p is actually a vector. */
28893 fndecls = static_cast<vec<tree> *> (fndecls_p);
28895 /* At least one more version other than the default. */
28896 num_versions = fndecls->length ();
28897 gcc_assert (num_versions >= 2);
28899 function_version_info = (struct _function_version_info *)
28900 XNEWVEC (struct _function_version_info, (num_versions - 1));
28902 /* The first version in the vector is the default decl. */
28903 default_decl = (*fndecls)[0];
28905 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28907 gseq = bb_seq (*empty_bb);
28908 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28909 constructors, so explicitly call __builtin_cpu_init here. */
28910 ifunc_cpu_init_stmt = gimple_build_call_vec (
28911 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28912 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28913 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28914 set_bb_seq (*empty_bb, gseq);
28916 pop_cfun ();
28919 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28921 tree version_decl = ele;
28922 tree predicate_chain = NULL_TREE;
28923 unsigned int priority;
28924 /* Get attribute string, parse it and find the right predicate decl.
28925 The predicate function could be a lengthy combination of many
28926 features, like arch-type and various isa-variants. */
28927 priority = get_builtin_code_for_version (version_decl,
28928 &predicate_chain);
28930 if (predicate_chain == NULL_TREE)
28931 continue;
28933 actual_versions++;
28934 function_version_info [ix - 1].version_decl = version_decl;
28935 function_version_info [ix - 1].predicate_chain = predicate_chain;
28936 function_version_info [ix - 1].dispatch_priority = priority;
28939 /* Sort the versions according to descending order of dispatch priority. The
28940 priority is based on the ISA. This is not a perfect solution. There
28941 could still be ambiguity. If more than one function version is suitable
28942 to execute, which one should be dispatched? In the future, allow the user
28943 to specify a dispatch priority next to the version. */
28944 qsort (function_version_info, actual_versions,
28945 sizeof (struct _function_version_info), feature_compare);
28947 for (i = 0; i < actual_versions; ++i)
28948 *empty_bb = add_condition_to_bb (dispatch_decl,
28949 function_version_info[i].version_decl,
28950 function_version_info[i].predicate_chain,
28951 *empty_bb);
28953 /* Dispatch the default version at the end. */
28954 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28955 NULL, *empty_bb);
28957 free (function_version_info);
28958 return 0;
28961 /* Comparator function, used by qsort, to sort the specification
28962 strings of the "target" attribute. */
28964 static int
28965 attr_strcmp (const void *v1, const void *v2)
28967 const char *c1 = *(char *const*)v1;
28968 const char *c2 = *(char *const*)v2;
28969 return strcmp (c1, c2);
28972 /* ARGLIST is the argument to target attribute. This function tokenizes
28973 the comma separated arguments, sorts them and returns a string which
28974 is a unique identifier for the comma separated arguments. It also
28975 replaces non-identifier characters "=,-" with "_". */
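/* For example, the (hypothetical) argument list "avx,arch=corei7" is first
   flattened to "avx,arch_corei7" ('=' replaced by '_'), then tokenized at
   the commas, sorted, and rejoined with '_', yielding "arch_corei7_avx".  */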
28977 static char *
28978 sorted_attr_string (tree arglist)
28980 tree arg;
28981 size_t str_len_sum = 0;
28982 char **args = NULL;
28983 char *attr_str, *ret_str;
28984 char *attr = NULL;
28985 unsigned int argnum = 1;
28986 unsigned int i;
28988 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
28990 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
28991 size_t len = strlen (str);
28992 str_len_sum += len + 1;
28993 if (arg != arglist)
28994 argnum++;
28995 for (i = 0; i < strlen (str); i++)
28996 if (str[i] == ',')
28997 argnum++;
29000 attr_str = XNEWVEC (char, str_len_sum);
29001 str_len_sum = 0;
29002 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29004 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29005 size_t len = strlen (str);
29006 memcpy (attr_str + str_len_sum, str, len);
29007 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29008 str_len_sum += len + 1;
29011 /* Replace "=,-" with "_". */
29012 for (i = 0; i < strlen (attr_str); i++)
29013 if (attr_str[i] == '=' || attr_str[i] == '-')
29014 attr_str[i] = '_';
29016 if (argnum == 1)
29017 return attr_str;
29019 args = XNEWVEC (char *, argnum);
29021 i = 0;
29022 attr = strtok (attr_str, ",");
29023 while (attr != NULL)
29025 args[i] = attr;
29026 i++;
29027 attr = strtok (NULL, ",");
29030 qsort (args, argnum, sizeof (char *), attr_strcmp);
29032 ret_str = XNEWVEC (char, str_len_sum);
29033 str_len_sum = 0;
29034 for (i = 0; i < argnum; i++)
29036 size_t len = strlen (args[i]);
29037 memcpy (ret_str + str_len_sum, args[i], len);
29038 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29039 str_len_sum += len + 1;
29042 XDELETEVEC (args);
29043 XDELETEVEC (attr_str);
29044 return ret_str;
29047 /* This function changes the assembler name for functions that are
29048 versions. If DECL is a function version and has a "target"
29049 attribute, it appends the attribute string to its assembler name. */
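/* For example, a version of a (hypothetical) function foo declared with
   __attribute__ ((target ("arch=corei7"))) is renamed "foo.arch_corei7",
   while the "default" version keeps its original assembler name.  */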
29051 static tree
29052 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29054 tree version_attr;
29055 const char *orig_name, *version_string;
29056 char *attr_str, *assembler_name;
29058 if (DECL_DECLARED_INLINE_P (decl)
29059 && lookup_attribute ("gnu_inline",
29060 DECL_ATTRIBUTES (decl)))
29061 error_at (DECL_SOURCE_LOCATION (decl),
29062 "Function versions cannot be marked as gnu_inline,"
29063 " bodies have to be generated");
29065 if (DECL_VIRTUAL_P (decl)
29066 || DECL_VINDEX (decl))
29067 sorry ("Virtual function multiversioning not supported");
29069 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29071 /* target attribute string cannot be NULL. */
29072 gcc_assert (version_attr != NULL_TREE);
29074 orig_name = IDENTIFIER_POINTER (id);
29075 version_string
29076 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29078 if (strcmp (version_string, "default") == 0)
29079 return id;
29081 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29082 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29084 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29086 /* Allow assembler name to be modified if already set. */
29087 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29088 SET_DECL_RTL (decl, NULL);
29090 tree ret = get_identifier (assembler_name);
29091 XDELETEVEC (attr_str);
29092 XDELETEVEC (assembler_name);
29093 return ret;
29096 /* This function returns true if FN1 and FN2 are versions of the same function,
29097 that is, the target strings of the function decls are different. This assumes
29098 that FN1 and FN2 have the same signature. */
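/* E.g. two (hypothetical) declarations of foo carrying target ("avx") and
   target ("sse4.2") are distinct versions, whereas target ("avx,popcnt")
   and target ("popcnt,avx") normalize to the same sorted string and are
   therefore not treated as different versions.  */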
29100 static bool
29101 ix86_function_versions (tree fn1, tree fn2)
29103 tree attr1, attr2;
29104 char *target1, *target2;
29105 bool result;
29107 if (TREE_CODE (fn1) != FUNCTION_DECL
29108 || TREE_CODE (fn2) != FUNCTION_DECL)
29109 return false;
29111 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29112 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29114 /* At least one function decl should have the target attribute specified. */
29115 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29116 return false;
29118 /* Diagnose missing target attribute if one of the decls is already
29119 multi-versioned. */
29120 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29122 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29124 if (attr2 != NULL_TREE)
29126 tree tem = fn1;
29127 fn1 = fn2;
29128 fn2 = tem;
29129 attr1 = attr2;
29131 error_at (DECL_SOURCE_LOCATION (fn2),
29132 "missing %<target%> attribute for multi-versioned %D",
29133 fn2);
29134 error_at (DECL_SOURCE_LOCATION (fn1),
29135 "previous declaration of %D", fn1);
29136 /* Prevent diagnosing of the same error multiple times. */
29137 DECL_ATTRIBUTES (fn2)
29138 = tree_cons (get_identifier ("target"),
29139 copy_node (TREE_VALUE (attr1)),
29140 DECL_ATTRIBUTES (fn2));
29142 return false;
29145 target1 = sorted_attr_string (TREE_VALUE (attr1));
29146 target2 = sorted_attr_string (TREE_VALUE (attr2));
29148 /* The sorted target strings must be different for fn1 and fn2
29149 to be versions. */
29150 if (strcmp (target1, target2) == 0)
29151 result = false;
29152 else
29153 result = true;
29155 XDELETEVEC (target1);
29156 XDELETEVEC (target2);
29158 return result;
29161 static tree
29162 ix86_mangle_decl_assembler_name (tree decl, tree id)
29164 /* For function version, add the target suffix to the assembler name. */
29165 if (TREE_CODE (decl) == FUNCTION_DECL
29166 && DECL_FUNCTION_VERSIONED (decl))
29167 id = ix86_mangle_function_version_assembler_name (decl, id);
29168 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29169 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29170 #endif
29172 return id;
29175 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29176 is true, append the full path name of the source file. */
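/* E.g. make_name (foo_decl, "resolver", false) would produce "foo.resolver",
   and with make_unique a per-translation-unit string is inserted as well,
   giving something like "foo.<unique>.resolver" (names illustrative).  */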
29178 static char *
29179 make_name (tree decl, const char *suffix, bool make_unique)
29181 char *global_var_name;
29182 int name_len;
29183 const char *name;
29184 const char *unique_name = NULL;
29186 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29188 /* Get a unique name that can be used globally without any chances
29189 of collision at link time. */
29190 if (make_unique)
29191 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29193 name_len = strlen (name) + strlen (suffix) + 2;
29195 if (make_unique)
29196 name_len += strlen (unique_name) + 1;
29197 global_var_name = XNEWVEC (char, name_len);
29199 /* Use '.' to concatenate names as it is demangler friendly. */
29200 if (make_unique)
29201 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29202 suffix);
29203 else
29204 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29206 return global_var_name;
29209 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29211 /* Make a dispatcher declaration for the multi-versioned function DECL.
29212 Calls to DECL function will be replaced with calls to the dispatcher
29213 by the front-end. Return the decl created. */
29215 static tree
29216 make_dispatcher_decl (const tree decl)
29218 tree func_decl;
29219 char *func_name;
29220 tree fn_type, func_type;
29221 bool is_uniq = false;
29223 if (TREE_PUBLIC (decl) == 0)
29224 is_uniq = true;
29226 func_name = make_name (decl, "ifunc", is_uniq);
29228 fn_type = TREE_TYPE (decl);
29229 func_type = build_function_type (TREE_TYPE (fn_type),
29230 TYPE_ARG_TYPES (fn_type));
29232 func_decl = build_fn_decl (func_name, func_type);
29233 XDELETEVEC (func_name);
29234 TREE_USED (func_decl) = 1;
29235 DECL_CONTEXT (func_decl) = NULL_TREE;
29236 DECL_INITIAL (func_decl) = error_mark_node;
29237 DECL_ARTIFICIAL (func_decl) = 1;
29238 /* Mark this func as external; the resolver will flip it again if
29239 it gets generated. */
29240 DECL_EXTERNAL (func_decl) = 1;
29241 /* IFUNCs have to be externally visible. */
29242 TREE_PUBLIC (func_decl) = 1;
29244 return func_decl;
29247 #endif
29249 /* Returns true if DECL is multi-versioned and is the default function,
29250 that is, it is not tagged with a target-specific optimization. */
29252 static bool
29253 is_function_default_version (const tree decl)
29255 if (TREE_CODE (decl) != FUNCTION_DECL
29256 || !DECL_FUNCTION_VERSIONED (decl))
29257 return false;
29258 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29259 gcc_assert (attr);
29260 attr = TREE_VALUE (TREE_VALUE (attr));
29261 return (TREE_CODE (attr) == STRING_CST
29262 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29265 /* Make a dispatcher declaration for the multi-versioned function DECL.
29266 Calls to DECL function will be replaced with calls to the dispatcher
29267 by the front-end. Returns the decl of the dispatcher function. */
29269 static tree
29270 ix86_get_function_versions_dispatcher (void *decl)
29272 tree fn = (tree) decl;
29273 struct cgraph_node *node = NULL;
29274 struct cgraph_node *default_node = NULL;
29275 struct cgraph_function_version_info *node_v = NULL;
29276 struct cgraph_function_version_info *first_v = NULL;
29278 tree dispatch_decl = NULL;
29280 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29281 struct cgraph_function_version_info *it_v = NULL;
29282 struct cgraph_node *dispatcher_node = NULL;
29283 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29284 #endif
29286 struct cgraph_function_version_info *default_version_info = NULL;
29288 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29290 node = cgraph_get_node (fn);
29291 gcc_assert (node != NULL);
29293 node_v = get_cgraph_node_version (node);
29294 gcc_assert (node_v != NULL);
29296 if (node_v->dispatcher_resolver != NULL)
29297 return node_v->dispatcher_resolver;
29299 /* Find the default version and make it the first node. */
29300 first_v = node_v;
29301 /* Go to the beginning of the chain. */
29302 while (first_v->prev != NULL)
29303 first_v = first_v->prev;
29304 default_version_info = first_v;
29305 while (default_version_info != NULL)
29307 if (is_function_default_version
29308 (default_version_info->this_node->symbol.decl))
29309 break;
29310 default_version_info = default_version_info->next;
29313 /* If there is no default node, just return NULL. */
29314 if (default_version_info == NULL)
29315 return NULL;
29317 /* Make default info the first node. */
29318 if (first_v != default_version_info)
29320 default_version_info->prev->next = default_version_info->next;
29321 if (default_version_info->next)
29322 default_version_info->next->prev = default_version_info->prev;
29323 first_v->prev = default_version_info;
29324 default_version_info->next = first_v;
29325 default_version_info->prev = NULL;
29328 default_node = default_version_info->this_node;
29330 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29331 /* Right now, the dispatching is done via ifunc. */
29332 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29334 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29335 gcc_assert (dispatcher_node != NULL);
29336 dispatcher_node->dispatcher_function = 1;
29337 dispatcher_version_info
29338 = insert_new_cgraph_node_version (dispatcher_node);
29339 dispatcher_version_info->next = default_version_info;
29340 dispatcher_node->local.finalized = 1;
29342 /* Set the dispatcher for all the versions. */
29343 it_v = default_version_info;
29344 while (it_v != NULL)
29346 it_v->dispatcher_resolver = dispatch_decl;
29347 it_v = it_v->next;
29349 #else
29350 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29351 "multiversioning needs ifunc which is not supported "
29352 "in this configuration");
29353 #endif
29354 return dispatch_decl;
29357 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29358 it to CHAIN. */
29360 static tree
29361 make_attribute (const char *name, const char *arg_name, tree chain)
29363 tree attr_name;
29364 tree attr_arg_name;
29365 tree attr_args;
29366 tree attr;
29368 attr_name = get_identifier (name);
29369 attr_arg_name = build_string (strlen (arg_name), arg_name);
29370 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29371 attr = tree_cons (attr_name, attr_args, chain);
29372 return attr;
29375 /* Make the resolver function decl to dispatch the versions of
29376 a multi-versioned function, DEFAULT_DECL. Create an
29377 empty basic block in the resolver and store the pointer in
29378 EMPTY_BB. Return the decl of the resolver function. */
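/* The net effect corresponds to the GNU ifunc idiom sketched below
   (hypothetical names; the real decls are built programmatically here):

     void *foo_resolver (void);                /* body generated later */
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));
*/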
29380 static tree
29381 make_resolver_func (const tree default_decl,
29382 const tree dispatch_decl,
29383 basic_block *empty_bb)
29385 char *resolver_name;
29386 tree decl, type, decl_name, t;
29387 bool is_uniq = false;
29389 /* IFUNCs have to be globally visible. So, if the default_decl is
29390 not, then the name of the IFUNC should be made unique. */
29391 if (TREE_PUBLIC (default_decl) == 0)
29392 is_uniq = true;
29394 /* Append the filename to the resolver function if the versions are
29395 not externally visible. This is because the resolver function has
29396 to be externally visible for the loader to find it. So, appending
29397 the filename will prevent conflicts with a resolver function from
29398 another module which is based on the same version name. */
29399 resolver_name = make_name (default_decl, "resolver", is_uniq);
29401 /* The resolver function should return a (void *). */
29402 type = build_function_type_list (ptr_type_node, NULL_TREE);
29404 decl = build_fn_decl (resolver_name, type);
29405 decl_name = get_identifier (resolver_name);
29406 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29408 DECL_NAME (decl) = decl_name;
29409 TREE_USED (decl) = 1;
29410 DECL_ARTIFICIAL (decl) = 1;
29411 DECL_IGNORED_P (decl) = 0;
29412 /* IFUNC resolvers have to be externally visible. */
29413 TREE_PUBLIC (decl) = 1;
29414 DECL_UNINLINABLE (decl) = 0;
29416 /* Resolver is not external, body is generated. */
29417 DECL_EXTERNAL (decl) = 0;
29418 DECL_EXTERNAL (dispatch_decl) = 0;
29420 DECL_CONTEXT (decl) = NULL_TREE;
29421 DECL_INITIAL (decl) = make_node (BLOCK);
29422 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29424 if (DECL_COMDAT_GROUP (default_decl)
29425 || TREE_PUBLIC (default_decl))
29427 /* In this case, each translation unit with a call to this
29428 versioned function will put out a resolver. Ensure it
29429 is comdat to keep just one copy. */
29430 DECL_COMDAT (decl) = 1;
29431 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29433 /* Build result decl and add to function_decl. */
29434 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29435 DECL_ARTIFICIAL (t) = 1;
29436 DECL_IGNORED_P (t) = 1;
29437 DECL_RESULT (decl) = t;
29439 gimplify_function_tree (decl);
29440 push_cfun (DECL_STRUCT_FUNCTION (decl));
29441 *empty_bb = init_lowered_empty_function (decl, false);
29443 cgraph_add_new_function (decl, true);
29444 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29446 pop_cfun ();
29448 gcc_assert (dispatch_decl != NULL);
29449 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29450 DECL_ATTRIBUTES (dispatch_decl)
29451 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29453 /* Create the alias for dispatch to resolver here. */
29454 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29455 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29456 XDELETEVEC (resolver_name);
29457 return decl;
29460 /* Generate the dispatching code body to dispatch multi-versioned function
29461 DECL. The target hook is called to process the "target" attributes and
29462 provide the code to dispatch the right function at run-time. NODE points
29463 to the dispatcher decl whose body will be created. */
29465 static tree
29466 ix86_generate_version_dispatcher_body (void *node_p)
29468 tree resolver_decl;
29469 basic_block empty_bb;
29470 vec<tree> fn_ver_vec = vNULL;
29471 tree default_ver_decl;
29472 struct cgraph_node *versn;
29473 struct cgraph_node *node;
29475 struct cgraph_function_version_info *node_version_info = NULL;
29476 struct cgraph_function_version_info *versn_info = NULL;
29478 node = (cgraph_node *)node_p;
29480 node_version_info = get_cgraph_node_version (node);
29481 gcc_assert (node->dispatcher_function
29482 && node_version_info != NULL);
29484 if (node_version_info->dispatcher_resolver)
29485 return node_version_info->dispatcher_resolver;
29487 /* The first version in the chain corresponds to the default version. */
29488 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29490 /* node is going to be an alias, so remove the finalized bit. */
29491 node->local.finalized = false;
29493 resolver_decl = make_resolver_func (default_ver_decl,
29494 node->symbol.decl, &empty_bb);
29496 node_version_info->dispatcher_resolver = resolver_decl;
29498 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29500 fn_ver_vec.create (2);
29502 for (versn_info = node_version_info->next; versn_info;
29503 versn_info = versn_info->next)
29505 versn = versn_info->this_node;
29506 /* Check for virtual functions here again, as by this time it should
29507 have been determined if this function needs a vtable index or
29508 not. This happens for methods in derived classes that override
29509 virtual methods in base classes but are not explicitly marked as
29510 virtual. */
29511 if (DECL_VINDEX (versn->symbol.decl))
29512 sorry ("Virtual function multiversioning not supported");
29514 fn_ver_vec.safe_push (versn->symbol.decl);
29517 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29518 fn_ver_vec.release ();
29519 rebuild_cgraph_edges ();
29520 pop_cfun ();
29521 return resolver_decl;
29523 /* This builds the processor_model struct type defined in
29524 libgcc/config/i386/cpuinfo.c */
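/* That struct looks roughly like this (a sketch of the definition in
   libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/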
29526 static tree
29527 build_processor_model_struct (void)
29529 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29530 "__cpu_features"};
29531 tree field = NULL_TREE, field_chain = NULL_TREE;
29532 int i;
29533 tree type = make_node (RECORD_TYPE);
29535 /* The first 3 fields are unsigned int. */
29536 for (i = 0; i < 3; ++i)
29538 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29539 get_identifier (field_name[i]), unsigned_type_node);
29540 if (field_chain != NULL_TREE)
29541 DECL_CHAIN (field) = field_chain;
29542 field_chain = field;
29545 /* The last field is an array of unsigned integers of size one. */
29546 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29547 get_identifier (field_name[3]),
29548 build_array_type (unsigned_type_node,
29549 build_index_type (size_one_node)));
29550 if (field_chain != NULL_TREE)
29551 DECL_CHAIN (field) = field_chain;
29552 field_chain = field;
29554 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29555 return type;
29558 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29560 static tree
29561 make_var_decl (tree type, const char *name)
29563 tree new_decl;
29565 new_decl = build_decl (UNKNOWN_LOCATION,
29566 VAR_DECL,
29567 get_identifier (name),
29568 type);
29570 DECL_EXTERNAL (new_decl) = 1;
29571 TREE_STATIC (new_decl) = 1;
29572 TREE_PUBLIC (new_decl) = 1;
29573 DECL_INITIAL (new_decl) = 0;
29574 DECL_ARTIFICIAL (new_decl) = 0;
29575 DECL_PRESERVE_P (new_decl) = 1;
29577 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29578 assemble_variable (new_decl, 0, 0, 0);
29580 return new_decl;
29583 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29584 into an integer defined in libgcc/config/i386/cpuinfo.c */
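/* For instance, __builtin_cpu_supports ("avx") folds to an expression
   equivalent to

     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX))

   and __builtin_cpu_is ("corei7") folds to an equality test of
   __cpu_model.__cpu_type against the matching M_* value (shown here as
   illustrative C; the actual result is built as trees below).  */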
29586 static tree
29587 fold_builtin_cpu (tree fndecl, tree *args)
29589 unsigned int i;
29590 enum ix86_builtins fn_code = (enum ix86_builtins)
29591 DECL_FUNCTION_CODE (fndecl);
29592 tree param_string_cst = NULL;
29594 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29595 enum processor_features
29597 F_CMOV = 0,
29598 F_MMX,
29599 F_POPCNT,
29600 F_SSE,
29601 F_SSE2,
29602 F_SSE3,
29603 F_SSSE3,
29604 F_SSE4_1,
29605 F_SSE4_2,
29606 F_AVX,
29607 F_AVX2,
29608 F_MAX
29611 /* These are the values for vendor types and cpu types and subtypes
29612 in cpuinfo.c. CPU types and subtypes should have the corresponding
29613 start value subtracted before use.
29614 enum processor_model
29616 M_INTEL = 1,
29617 M_AMD,
29618 M_CPU_TYPE_START,
29619 M_INTEL_ATOM,
29620 M_INTEL_CORE2,
29621 M_INTEL_COREI7,
29622 M_AMDFAM10H,
29623 M_AMDFAM15H,
29624 M_CPU_SUBTYPE_START,
29625 M_INTEL_COREI7_NEHALEM,
29626 M_INTEL_COREI7_WESTMERE,
29627 M_INTEL_COREI7_SANDYBRIDGE,
29628 M_AMDFAM10H_BARCELONA,
29629 M_AMDFAM10H_SHANGHAI,
29630 M_AMDFAM10H_ISTANBUL,
29631 M_AMDFAM15H_BDVER1,
29632 M_AMDFAM15H_BDVER2,
29633 M_AMDFAM15H_BDVER3
29636 static struct _arch_names_table
29638 const char *const name;
29639 const enum processor_model model;
29641 const arch_names_table[] =
29643 {"amd", M_AMD},
29644 {"intel", M_INTEL},
29645 {"atom", M_INTEL_ATOM},
29646 {"core2", M_INTEL_CORE2},
29647 {"corei7", M_INTEL_COREI7},
29648 {"nehalem", M_INTEL_COREI7_NEHALEM},
29649 {"westmere", M_INTEL_COREI7_WESTMERE},
29650 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29651 {"amdfam10h", M_AMDFAM10H},
29652 {"barcelona", M_AMDFAM10H_BARCELONA},
29653 {"shanghai", M_AMDFAM10H_SHANGHAI},
29654 {"istanbul", M_AMDFAM10H_ISTANBUL},
29655 {"amdfam15h", M_AMDFAM15H},
29656 {"bdver1", M_AMDFAM15H_BDVER1},
29657 {"bdver2", M_AMDFAM15H_BDVER2},
29658 {"bdver3", M_AMDFAM15H_BDVER3},
29661 static struct _isa_names_table
29663 const char *const name;
29664 const enum processor_features feature;
29666 const isa_names_table[] =
29668 {"cmov", F_CMOV},
29669 {"mmx", F_MMX},
29670 {"popcnt", F_POPCNT},
29671 {"sse", F_SSE},
29672 {"sse2", F_SSE2},
29673 {"sse3", F_SSE3},
29674 {"ssse3", F_SSSE3},
29675 {"sse4.1", F_SSE4_1},
29676 {"sse4.2", F_SSE4_2},
29677 {"avx", F_AVX},
29678 {"avx2", F_AVX2}
29681 tree __processor_model_type = build_processor_model_struct ();
29682 tree __cpu_model_var = make_var_decl (__processor_model_type,
29683 "__cpu_model");
29685 gcc_assert ((args != NULL) && (*args != NULL));
29687 param_string_cst = *args;
29688 while (param_string_cst
29689 && TREE_CODE (param_string_cst) != STRING_CST)
29691 /* *args must be an expr that can contain other EXPRs leading to a
29692 STRING_CST. */
29693 if (!EXPR_P (param_string_cst))
29695 error ("Parameter to builtin must be a string constant or literal");
29696 return integer_zero_node;
29698 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29701 gcc_assert (param_string_cst);
29703 if (fn_code == IX86_BUILTIN_CPU_IS)
29705 tree ref;
29706 tree field;
29707 tree final;
29709 unsigned int field_val = 0;
29710 unsigned int NUM_ARCH_NAMES
29711 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29713 for (i = 0; i < NUM_ARCH_NAMES; i++)
29714 if (strcmp (arch_names_table[i].name,
29715 TREE_STRING_POINTER (param_string_cst)) == 0)
29716 break;
29718 if (i == NUM_ARCH_NAMES)
29720 error ("Parameter to builtin not valid: %s",
29721 TREE_STRING_POINTER (param_string_cst));
29722 return integer_zero_node;
29725 field = TYPE_FIELDS (__processor_model_type);
29726 field_val = arch_names_table[i].model;
29728 /* CPU types are stored in the next field. */
29729 if (field_val > M_CPU_TYPE_START
29730 && field_val < M_CPU_SUBTYPE_START)
29732 field = DECL_CHAIN (field);
29733 field_val -= M_CPU_TYPE_START;
29736 /* CPU subtypes are stored in the next field. */
29737 if (field_val > M_CPU_SUBTYPE_START)
29739 field = DECL_CHAIN (DECL_CHAIN (field));
29740 field_val -= M_CPU_SUBTYPE_START;
29743 /* Get the appropriate field in __cpu_model. */
29744 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29745 field, NULL_TREE);
29747 /* Check the value. */
29748 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29749 build_int_cstu (unsigned_type_node, field_val));
29750 return build1 (CONVERT_EXPR, integer_type_node, final);
29752 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29754 tree ref;
29755 tree array_elt;
29756 tree field;
29757 tree final;
29759 unsigned int field_val = 0;
29760 unsigned int NUM_ISA_NAMES
29761 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29763 for (i = 0; i < NUM_ISA_NAMES; i++)
29764 if (strcmp (isa_names_table[i].name,
29765 TREE_STRING_POINTER (param_string_cst)) == 0)
29766 break;
29768 if (i == NUM_ISA_NAMES)
29770 error ("Parameter to builtin not valid: %s",
29771 TREE_STRING_POINTER (param_string_cst));
29772 return integer_zero_node;
29775 field = TYPE_FIELDS (__processor_model_type);
29776 /* Get the last field, which is __cpu_features. */
29777 while (DECL_CHAIN (field))
29778 field = DECL_CHAIN (field);
29780 /* Get the appropriate field: __cpu_model.__cpu_features */
29781 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29782 field, NULL_TREE);
29784 /* Access the 0th element of __cpu_features array. */
29785 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29786 integer_zero_node, NULL_TREE, NULL_TREE);
29788 field_val = (1 << isa_names_table[i].feature);
29789 /* Return __cpu_model.__cpu_features[0] & field_val */
29790 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29791 build_int_cstu (unsigned_type_node, field_val));
29792 return build1 (CONVERT_EXPR, integer_type_node, final);
29794 gcc_unreachable ();
29797 static tree
29798 ix86_fold_builtin (tree fndecl, int n_args,
29799 tree *args, bool ignore ATTRIBUTE_UNUSED)
29801 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29803 enum ix86_builtins fn_code = (enum ix86_builtins)
29804 DECL_FUNCTION_CODE (fndecl);
29805 if (fn_code == IX86_BUILTIN_CPU_IS
29806 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29808 gcc_assert (n_args == 1);
29809 return fold_builtin_cpu (fndecl, args);
29813 #ifdef SUBTARGET_FOLD_BUILTIN
29814 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29815 #endif
29817 return NULL_TREE;
29820 /* Make builtins to detect cpu type and features supported. NAME is
29821 the builtin name, CODE is the builtin code, and FTYPE is the function
29822 type of the builtin. */
29824 static void
29825 make_cpu_type_builtin (const char* name, int code,
29826 enum ix86_builtin_func_type ftype, bool is_const)
29828 tree decl;
29829 tree type;
29831 type = ix86_get_builtin_func_type (ftype);
29832 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29833 NULL, NULL_TREE);
29834 gcc_assert (decl != NULL_TREE);
29835 ix86_builtins[(int) code] = decl;
29836 TREE_READONLY (decl) = is_const;
29839 /* Make builtins to get CPU type and features supported. The created
29840 builtins are:
29842 __builtin_cpu_init (), to detect cpu type and features,
29843 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29844 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
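/* A minimal usage sketch (hypothetical user code, not part of this file):

     if (__builtin_cpu_supports ("sse4.2"))
       use_sse42_path ();
     else if (__builtin_cpu_is ("atom"))
       use_atom_path ();
     else
       use_generic_path ();

   __builtin_cpu_init () only needs to be called explicitly from code that
   can run before the normal constructors, such as IFUNC resolvers.  */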
29847 static void
29848 ix86_init_platform_type_builtins (void)
29850 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29851 INT_FTYPE_VOID, false);
29852 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29853 INT_FTYPE_PCCHAR, true);
29854 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29855 INT_FTYPE_PCCHAR, true);
29858 /* Internal method for ix86_init_builtins. */
29860 static void
29861 ix86_init_builtins_va_builtins_abi (void)
29863 tree ms_va_ref, sysv_va_ref;
29864 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29865 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29866 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29867 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29869 if (!TARGET_64BIT)
29870 return;
29871 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29872 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29873 ms_va_ref = build_reference_type (ms_va_list_type_node);
29874 sysv_va_ref =
29875 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29877 fnvoid_va_end_ms =
29878 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29879 fnvoid_va_start_ms =
29880 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29881 fnvoid_va_end_sysv =
29882 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29883 fnvoid_va_start_sysv =
29884 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29885 NULL_TREE);
29886 fnvoid_va_copy_ms =
29887 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29888 NULL_TREE);
29889 fnvoid_va_copy_sysv =
29890 build_function_type_list (void_type_node, sysv_va_ref,
29891 sysv_va_ref, NULL_TREE);
29893 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29894 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29895 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29896 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29897 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29898 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29899 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29900 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29901 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29902 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29903 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29904 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29907 static void
29908 ix86_init_builtin_types (void)
29910 tree float128_type_node, float80_type_node;
29912 /* The __float80 type. */
29913 float80_type_node = long_double_type_node;
29914 if (TYPE_MODE (float80_type_node) != XFmode)
29916 /* The __float80 type. */
29917 float80_type_node = make_node (REAL_TYPE);
29919 TYPE_PRECISION (float80_type_node) = 80;
29920 layout_type (float80_type_node);
29922 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29924 /* The __float128 type. */
29925 float128_type_node = make_node (REAL_TYPE);
29926 TYPE_PRECISION (float128_type_node) = 128;
29927 layout_type (float128_type_node);
29928 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29930 /* This macro is built by i386-builtin-types.awk. */
29931 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29934 static void
29935 ix86_init_builtins (void)
29937 tree t;
29939 ix86_init_builtin_types ();
29941 /* Builtins to get CPU type and features. */
29942 ix86_init_platform_type_builtins ();
29944 /* TFmode support builtins. */
29945 def_builtin_const (0, "__builtin_infq",
29946 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29947 def_builtin_const (0, "__builtin_huge_valq",
29948 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29950 /* We will expand them to a normal call if SSE isn't available, since
29951 they are used by libgcc. */
29952 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29953 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29954 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29955 TREE_READONLY (t) = 1;
29956 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29958 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29959 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29960 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29961 TREE_READONLY (t) = 1;
29962 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29964 ix86_init_tm_builtins ();
29965 ix86_init_mmx_sse_builtins ();
29967 if (TARGET_LP64)
29968 ix86_init_builtins_va_builtins_abi ();
29970 #ifdef SUBTARGET_INIT_BUILTINS
29971 SUBTARGET_INIT_BUILTINS;
29972 #endif
29975 /* Return the ix86 builtin for CODE. */
29977 static tree
29978 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29980 if (code >= IX86_BUILTIN_MAX)
29981 return error_mark_node;
29983 return ix86_builtins[code];
29986 /* Errors in the source file can cause expand_expr to return const0_rtx
29987 where we expect a vector. To avoid crashing, use one of the vector
29988 clear instructions. */
29989 static rtx
29990 safe_vector_operand (rtx x, enum machine_mode mode)
29992 if (x == const0_rtx)
29993 x = CONST0_RTX (mode);
29994 return x;
29997 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29999 static rtx
30000 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30002 rtx pat;
30003 tree arg0 = CALL_EXPR_ARG (exp, 0);
30004 tree arg1 = CALL_EXPR_ARG (exp, 1);
30005 rtx op0 = expand_normal (arg0);
30006 rtx op1 = expand_normal (arg1);
30007 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30008 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30009 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30011 if (VECTOR_MODE_P (mode0))
30012 op0 = safe_vector_operand (op0, mode0);
30013 if (VECTOR_MODE_P (mode1))
30014 op1 = safe_vector_operand (op1, mode1);
30016 if (optimize || !target
30017 || GET_MODE (target) != tmode
30018 || !insn_data[icode].operand[0].predicate (target, tmode))
30019 target = gen_reg_rtx (tmode);
30021 if (GET_MODE (op1) == SImode && mode1 == TImode)
30023 rtx x = gen_reg_rtx (V4SImode);
30024 emit_insn (gen_sse2_loadd (x, op1));
30025 op1 = gen_lowpart (TImode, x);
30028 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30029 op0 = copy_to_mode_reg (mode0, op0);
30030 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30031 op1 = copy_to_mode_reg (mode1, op1);
30033 pat = GEN_FCN (icode) (target, op0, op1);
30034 if (! pat)
30035 return 0;
30037 emit_insn (pat);
30039 return target;
30042 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30044 static rtx
30045 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30046 enum ix86_builtin_func_type m_type,
30047 enum rtx_code sub_code)
30049 rtx pat;
30050 int i;
30051 int nargs;
30052 bool comparison_p = false;
30053 bool tf_p = false;
30054 bool last_arg_constant = false;
30055 int num_memory = 0;
30056 struct {
30057 rtx op;
30058 enum machine_mode mode;
30059 } args[4];
30061 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30063 switch (m_type)
30065 case MULTI_ARG_4_DF2_DI_I:
30066 case MULTI_ARG_4_DF2_DI_I1:
30067 case MULTI_ARG_4_SF2_SI_I:
30068 case MULTI_ARG_4_SF2_SI_I1:
30069 nargs = 4;
30070 last_arg_constant = true;
30071 break;
30073 case MULTI_ARG_3_SF:
30074 case MULTI_ARG_3_DF:
30075 case MULTI_ARG_3_SF2:
30076 case MULTI_ARG_3_DF2:
30077 case MULTI_ARG_3_DI:
30078 case MULTI_ARG_3_SI:
30079 case MULTI_ARG_3_SI_DI:
30080 case MULTI_ARG_3_HI:
30081 case MULTI_ARG_3_HI_SI:
30082 case MULTI_ARG_3_QI:
30083 case MULTI_ARG_3_DI2:
30084 case MULTI_ARG_3_SI2:
30085 case MULTI_ARG_3_HI2:
30086 case MULTI_ARG_3_QI2:
30087 nargs = 3;
30088 break;
30090 case MULTI_ARG_2_SF:
30091 case MULTI_ARG_2_DF:
30092 case MULTI_ARG_2_DI:
30093 case MULTI_ARG_2_SI:
30094 case MULTI_ARG_2_HI:
30095 case MULTI_ARG_2_QI:
30096 nargs = 2;
30097 break;
30099 case MULTI_ARG_2_DI_IMM:
30100 case MULTI_ARG_2_SI_IMM:
30101 case MULTI_ARG_2_HI_IMM:
30102 case MULTI_ARG_2_QI_IMM:
30103 nargs = 2;
30104 last_arg_constant = true;
30105 break;
30107 case MULTI_ARG_1_SF:
30108 case MULTI_ARG_1_DF:
30109 case MULTI_ARG_1_SF2:
30110 case MULTI_ARG_1_DF2:
30111 case MULTI_ARG_1_DI:
30112 case MULTI_ARG_1_SI:
30113 case MULTI_ARG_1_HI:
30114 case MULTI_ARG_1_QI:
30115 case MULTI_ARG_1_SI_DI:
30116 case MULTI_ARG_1_HI_DI:
30117 case MULTI_ARG_1_HI_SI:
30118 case MULTI_ARG_1_QI_DI:
30119 case MULTI_ARG_1_QI_SI:
30120 case MULTI_ARG_1_QI_HI:
30121 nargs = 1;
30122 break;
30124 case MULTI_ARG_2_DI_CMP:
30125 case MULTI_ARG_2_SI_CMP:
30126 case MULTI_ARG_2_HI_CMP:
30127 case MULTI_ARG_2_QI_CMP:
30128 nargs = 2;
30129 comparison_p = true;
30130 break;
30132 case MULTI_ARG_2_SF_TF:
30133 case MULTI_ARG_2_DF_TF:
30134 case MULTI_ARG_2_DI_TF:
30135 case MULTI_ARG_2_SI_TF:
30136 case MULTI_ARG_2_HI_TF:
30137 case MULTI_ARG_2_QI_TF:
30138 nargs = 2;
30139 tf_p = true;
30140 break;
30142 default:
30143 gcc_unreachable ();
30146 if (optimize || !target
30147 || GET_MODE (target) != tmode
30148 || !insn_data[icode].operand[0].predicate (target, tmode))
30149 target = gen_reg_rtx (tmode);
30151 gcc_assert (nargs <= 4);
30153 for (i = 0; i < nargs; i++)
30155 tree arg = CALL_EXPR_ARG (exp, i);
30156 rtx op = expand_normal (arg);
30157 int adjust = (comparison_p) ? 1 : 0;
30158 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30160 if (last_arg_constant && i == nargs - 1)
30162 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30164 enum insn_code new_icode = icode;
30165 switch (icode)
30167 case CODE_FOR_xop_vpermil2v2df3:
30168 case CODE_FOR_xop_vpermil2v4sf3:
30169 case CODE_FOR_xop_vpermil2v4df3:
30170 case CODE_FOR_xop_vpermil2v8sf3:
30171 error ("the last argument must be a 2-bit immediate");
30172 return gen_reg_rtx (tmode);
30173 case CODE_FOR_xop_rotlv2di3:
30174 new_icode = CODE_FOR_rotlv2di3;
30175 goto xop_rotl;
30176 case CODE_FOR_xop_rotlv4si3:
30177 new_icode = CODE_FOR_rotlv4si3;
30178 goto xop_rotl;
30179 case CODE_FOR_xop_rotlv8hi3:
30180 new_icode = CODE_FOR_rotlv8hi3;
30181 goto xop_rotl;
30182 case CODE_FOR_xop_rotlv16qi3:
30183 new_icode = CODE_FOR_rotlv16qi3;
30184 xop_rotl:
30185 if (CONST_INT_P (op))
30187 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30188 op = GEN_INT (INTVAL (op) & mask);
30189 gcc_checking_assert
30190 (insn_data[icode].operand[i + 1].predicate (op, mode));
30192 else
30194 gcc_checking_assert
30195 (nargs == 2
30196 && insn_data[new_icode].operand[0].mode == tmode
30197 && insn_data[new_icode].operand[1].mode == tmode
30198 && insn_data[new_icode].operand[2].mode == mode
30199 && insn_data[new_icode].operand[0].predicate
30200 == insn_data[icode].operand[0].predicate
30201 && insn_data[new_icode].operand[1].predicate
30202 == insn_data[icode].operand[1].predicate);
30203 icode = new_icode;
30204 goto non_constant;
30206 break;
30207 default:
30208 gcc_unreachable ();
30212 else
30214 non_constant:
30215 if (VECTOR_MODE_P (mode))
30216 op = safe_vector_operand (op, mode);
30218 /* If we aren't optimizing, only allow one memory operand to be
30219 generated. */
30220 if (memory_operand (op, mode))
30221 num_memory++;
30223 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30225 if (optimize
30226 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30227 || num_memory > 1)
30228 op = force_reg (mode, op);
30231 args[i].op = op;
30232 args[i].mode = mode;
30235 switch (nargs)
30237 case 1:
30238 pat = GEN_FCN (icode) (target, args[0].op);
30239 break;
30241 case 2:
30242 if (tf_p)
30243 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30244 GEN_INT ((int)sub_code));
30245 else if (! comparison_p)
30246 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30247 else
30249 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30250 args[0].op,
30251 args[1].op);
30253 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30255 break;
30257 case 3:
30258 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30259 break;
30261 case 4:
30262 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30263 break;
30265 default:
30266 gcc_unreachable ();
30269 if (! pat)
30270 return 0;
30272 emit_insn (pat);
30273 return target;
30276 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30277 insns with vec_merge. */
30279 static rtx
30280 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30281 rtx target)
30283 rtx pat;
30284 tree arg0 = CALL_EXPR_ARG (exp, 0);
30285 rtx op1, op0 = expand_normal (arg0);
30286 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30287 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30289 if (optimize || !target
30290 || GET_MODE (target) != tmode
30291 || !insn_data[icode].operand[0].predicate (target, tmode))
30292 target = gen_reg_rtx (tmode);
30294 if (VECTOR_MODE_P (mode0))
30295 op0 = safe_vector_operand (op0, mode0);
30297 if ((optimize && !register_operand (op0, mode0))
30298 || !insn_data[icode].operand[1].predicate (op0, mode0))
30299 op0 = copy_to_mode_reg (mode0, op0);
30301 op1 = op0;
30302 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30303 op1 = copy_to_mode_reg (mode0, op1);
30305 pat = GEN_FCN (icode) (target, op0, op1);
30306 if (! pat)
30307 return 0;
30308 emit_insn (pat);
30309 return target;
30312 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30314 static rtx
30315 ix86_expand_sse_compare (const struct builtin_description *d,
30316 tree exp, rtx target, bool swap)
30318 rtx pat;
30319 tree arg0 = CALL_EXPR_ARG (exp, 0);
30320 tree arg1 = CALL_EXPR_ARG (exp, 1);
30321 rtx op0 = expand_normal (arg0);
30322 rtx op1 = expand_normal (arg1);
30323 rtx op2;
30324 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30325 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30326 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30327 enum rtx_code comparison = d->comparison;
30329 if (VECTOR_MODE_P (mode0))
30330 op0 = safe_vector_operand (op0, mode0);
30331 if (VECTOR_MODE_P (mode1))
30332 op1 = safe_vector_operand (op1, mode1);
30334 /* Swap operands if we have a comparison that isn't available in
30335 hardware. */
30336 if (swap)
30338 rtx tmp = gen_reg_rtx (mode1);
30339 emit_move_insn (tmp, op1);
30340 op1 = op0;
30341 op0 = tmp;
30344 if (optimize || !target
30345 || GET_MODE (target) != tmode
30346 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30347 target = gen_reg_rtx (tmode);
30349 if ((optimize && !register_operand (op0, mode0))
30350 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30351 op0 = copy_to_mode_reg (mode0, op0);
30352 if ((optimize && !register_operand (op1, mode1))
30353 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30354 op1 = copy_to_mode_reg (mode1, op1);
30356 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30357 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30358 if (! pat)
30359 return 0;
30360 emit_insn (pat);
30361 return target;
30364 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30366 static rtx
30367 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30368 rtx target)
30370 rtx pat;
30371 tree arg0 = CALL_EXPR_ARG (exp, 0);
30372 tree arg1 = CALL_EXPR_ARG (exp, 1);
30373 rtx op0 = expand_normal (arg0);
30374 rtx op1 = expand_normal (arg1);
30375 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30376 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30377 enum rtx_code comparison = d->comparison;
30379 if (VECTOR_MODE_P (mode0))
30380 op0 = safe_vector_operand (op0, mode0);
30381 if (VECTOR_MODE_P (mode1))
30382 op1 = safe_vector_operand (op1, mode1);
30384 /* Swap operands if we have a comparison that isn't available in
30385 hardware. */
30386 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30388 rtx tmp = op1;
30389 op1 = op0;
30390 op0 = tmp;
30393 target = gen_reg_rtx (SImode);
30394 emit_move_insn (target, const0_rtx);
30395 target = gen_rtx_SUBREG (QImode, target, 0);
30397 if ((optimize && !register_operand (op0, mode0))
30398 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30399 op0 = copy_to_mode_reg (mode0, op0);
30400 if ((optimize && !register_operand (op1, mode1))
30401 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30402 op1 = copy_to_mode_reg (mode1, op1);
30404 pat = GEN_FCN (d->icode) (op0, op1);
30405 if (! pat)
30406 return 0;
30407 emit_insn (pat);
30408 emit_insn (gen_rtx_SET (VOIDmode,
30409 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30410 gen_rtx_fmt_ee (comparison, QImode,
30411 SET_DEST (pat),
30412 const0_rtx)));
30414 return SUBREG_REG (target);
30417 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30419 static rtx
30420 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30421 rtx target)
30423 rtx pat;
30424 tree arg0 = CALL_EXPR_ARG (exp, 0);
30425 rtx op1, op0 = expand_normal (arg0);
30426 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30427 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30429 if (optimize || target == 0
30430 || GET_MODE (target) != tmode
30431 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30432 target = gen_reg_rtx (tmode);
30434 if (VECTOR_MODE_P (mode0))
30435 op0 = safe_vector_operand (op0, mode0);
30437 if ((optimize && !register_operand (op0, mode0))
30438 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30439 op0 = copy_to_mode_reg (mode0, op0);
30441 op1 = GEN_INT (d->comparison);
30443 pat = GEN_FCN (d->icode) (target, op0, op1);
30444 if (! pat)
30445 return 0;
30446 emit_insn (pat);
30447 return target;
30450 static rtx
30451 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30452 tree exp, rtx target)
30454 rtx pat;
30455 tree arg0 = CALL_EXPR_ARG (exp, 0);
30456 tree arg1 = CALL_EXPR_ARG (exp, 1);
30457 rtx op0 = expand_normal (arg0);
30458 rtx op1 = expand_normal (arg1);
30459 rtx op2;
30460 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30461 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30462 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30464 if (optimize || target == 0
30465 || GET_MODE (target) != tmode
30466 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30467 target = gen_reg_rtx (tmode);
30469 op0 = safe_vector_operand (op0, mode0);
30470 op1 = safe_vector_operand (op1, mode1);
30472 if ((optimize && !register_operand (op0, mode0))
30473 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30474 op0 = copy_to_mode_reg (mode0, op0);
30475 if ((optimize && !register_operand (op1, mode1))
30476 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30477 op1 = copy_to_mode_reg (mode1, op1);
30479 op2 = GEN_INT (d->comparison);
30481 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30482 if (! pat)
30483 return 0;
30484 emit_insn (pat);
30485 return target;
30488 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30490 static rtx
30491 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30492 rtx target)
30494 rtx pat;
30495 tree arg0 = CALL_EXPR_ARG (exp, 0);
30496 tree arg1 = CALL_EXPR_ARG (exp, 1);
30497 rtx op0 = expand_normal (arg0);
30498 rtx op1 = expand_normal (arg1);
30499 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30500 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30501 enum rtx_code comparison = d->comparison;
30503 if (VECTOR_MODE_P (mode0))
30504 op0 = safe_vector_operand (op0, mode0);
30505 if (VECTOR_MODE_P (mode1))
30506 op1 = safe_vector_operand (op1, mode1);
30508 target = gen_reg_rtx (SImode);
30509 emit_move_insn (target, const0_rtx);
30510 target = gen_rtx_SUBREG (QImode, target, 0);
30512 if ((optimize && !register_operand (op0, mode0))
30513 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30514 op0 = copy_to_mode_reg (mode0, op0);
30515 if ((optimize && !register_operand (op1, mode1))
30516 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30517 op1 = copy_to_mode_reg (mode1, op1);
30519 pat = GEN_FCN (d->icode) (op0, op1);
30520 if (! pat)
30521 return 0;
30522 emit_insn (pat);
30523 emit_insn (gen_rtx_SET (VOIDmode,
30524 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30525 gen_rtx_fmt_ee (comparison, QImode,
30526 SET_DEST (pat),
30527 const0_rtx)));
30529 return SUBREG_REG (target);
30532 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30534 static rtx
30535 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30536 tree exp, rtx target)
30538 rtx pat;
30539 tree arg0 = CALL_EXPR_ARG (exp, 0);
30540 tree arg1 = CALL_EXPR_ARG (exp, 1);
30541 tree arg2 = CALL_EXPR_ARG (exp, 2);
30542 tree arg3 = CALL_EXPR_ARG (exp, 3);
30543 tree arg4 = CALL_EXPR_ARG (exp, 4);
30544 rtx scratch0, scratch1;
30545 rtx op0 = expand_normal (arg0);
30546 rtx op1 = expand_normal (arg1);
30547 rtx op2 = expand_normal (arg2);
30548 rtx op3 = expand_normal (arg3);
30549 rtx op4 = expand_normal (arg4);
30550 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30552 tmode0 = insn_data[d->icode].operand[0].mode;
30553 tmode1 = insn_data[d->icode].operand[1].mode;
30554 modev2 = insn_data[d->icode].operand[2].mode;
30555 modei3 = insn_data[d->icode].operand[3].mode;
30556 modev4 = insn_data[d->icode].operand[4].mode;
30557 modei5 = insn_data[d->icode].operand[5].mode;
30558 modeimm = insn_data[d->icode].operand[6].mode;
30560 if (VECTOR_MODE_P (modev2))
30561 op0 = safe_vector_operand (op0, modev2);
30562 if (VECTOR_MODE_P (modev4))
30563 op2 = safe_vector_operand (op2, modev4);
30565 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30566 op0 = copy_to_mode_reg (modev2, op0);
30567 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30568 op1 = copy_to_mode_reg (modei3, op1);
30569 if ((optimize && !register_operand (op2, modev4))
30570 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30571 op2 = copy_to_mode_reg (modev4, op2);
30572 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30573 op3 = copy_to_mode_reg (modei5, op3);
30575 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30577 error ("the fifth argument must be an 8-bit immediate");
30578 return const0_rtx;
30581 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30583 if (optimize || !target
30584 || GET_MODE (target) != tmode0
30585 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30586 target = gen_reg_rtx (tmode0);
30588 scratch1 = gen_reg_rtx (tmode1);
30590 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30592 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30594 if (optimize || !target
30595 || GET_MODE (target) != tmode1
30596 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30597 target = gen_reg_rtx (tmode1);
30599 scratch0 = gen_reg_rtx (tmode0);
30601 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30603 else
30605 gcc_assert (d->flag);
30607 scratch0 = gen_reg_rtx (tmode0);
30608 scratch1 = gen_reg_rtx (tmode1);
30610 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30613 if (! pat)
30614 return 0;
30616 emit_insn (pat);
30618 if (d->flag)
30620 target = gen_reg_rtx (SImode);
30621 emit_move_insn (target, const0_rtx);
30622 target = gen_rtx_SUBREG (QImode, target, 0);
30624 emit_insn
30625 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30626 gen_rtx_fmt_ee (EQ, QImode,
30627 gen_rtx_REG ((enum machine_mode) d->flag,
30628 FLAGS_REG),
30629 const0_rtx)));
30630 return SUBREG_REG (target);
30632 else
30633 return target;
30637 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30639 static rtx
30640 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30641 tree exp, rtx target)
30643 rtx pat;
30644 tree arg0 = CALL_EXPR_ARG (exp, 0);
30645 tree arg1 = CALL_EXPR_ARG (exp, 1);
30646 tree arg2 = CALL_EXPR_ARG (exp, 2);
30647 rtx scratch0, scratch1;
30648 rtx op0 = expand_normal (arg0);
30649 rtx op1 = expand_normal (arg1);
30650 rtx op2 = expand_normal (arg2);
30651 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30653 tmode0 = insn_data[d->icode].operand[0].mode;
30654 tmode1 = insn_data[d->icode].operand[1].mode;
30655 modev2 = insn_data[d->icode].operand[2].mode;
30656 modev3 = insn_data[d->icode].operand[3].mode;
30657 modeimm = insn_data[d->icode].operand[4].mode;
30659 if (VECTOR_MODE_P (modev2))
30660 op0 = safe_vector_operand (op0, modev2);
30661 if (VECTOR_MODE_P (modev3))
30662 op1 = safe_vector_operand (op1, modev3);
30664 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30665 op0 = copy_to_mode_reg (modev2, op0);
30666 if ((optimize && !register_operand (op1, modev3))
30667 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30668 op1 = copy_to_mode_reg (modev3, op1);
30670 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30672 error ("the third argument must be an 8-bit immediate");
30673 return const0_rtx;
30676 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30678 if (optimize || !target
30679 || GET_MODE (target) != tmode0
30680 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30681 target = gen_reg_rtx (tmode0);
30683 scratch1 = gen_reg_rtx (tmode1);
30685 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30687 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30689 if (optimize || !target
30690 || GET_MODE (target) != tmode1
30691 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30692 target = gen_reg_rtx (tmode1);
30694 scratch0 = gen_reg_rtx (tmode0);
30696 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30698 else
30700 gcc_assert (d->flag);
30702 scratch0 = gen_reg_rtx (tmode0);
30703 scratch1 = gen_reg_rtx (tmode1);
30705 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30708 if (! pat)
30709 return 0;
30711 emit_insn (pat);
30713 if (d->flag)
30715 target = gen_reg_rtx (SImode);
30716 emit_move_insn (target, const0_rtx);
30717 target = gen_rtx_SUBREG (QImode, target, 0);
30719 emit_insn
30720 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30721 gen_rtx_fmt_ee (EQ, QImode,
30722 gen_rtx_REG ((enum machine_mode) d->flag,
30723 FLAGS_REG),
30724 const0_rtx)));
30725 return SUBREG_REG (target);
30727 else
30728 return target;
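/* Illustrative usage only: <smmintrin.h> maps the string-compare intrinsics
   onto these builtins, roughly

       int idx   = __builtin_ia32_pcmpistri128 ((__v16qi) a, (__v16qi) b, 0x0c);
       __m128i m = (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi) a, (__v16qi) b, 0x0c);

   where a, b and the 0x0c mode byte are made-up example values.  The
   flag-reading variants take the same operands but return a 0/1 value
   derived from the flags register, which is the D->FLAG path above.  */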
30731 /* Subroutine of ix86_expand_builtin to take care of insns with
30732 variable number of operands. */
30734 static rtx
30735 ix86_expand_args_builtin (const struct builtin_description *d,
30736 tree exp, rtx target)
30738 rtx pat, real_target;
30739 unsigned int i, nargs;
30740 unsigned int nargs_constant = 0;
30741 int num_memory = 0;
30742 struct
30744 rtx op;
30745 enum machine_mode mode;
30746 } args[4];
30747 bool last_arg_count = false;
30748 enum insn_code icode = d->icode;
30749 const struct insn_data_d *insn_p = &insn_data[icode];
30750 enum machine_mode tmode = insn_p->operand[0].mode;
30751 enum machine_mode rmode = VOIDmode;
30752 bool swap = false;
30753 enum rtx_code comparison = d->comparison;
30755 switch ((enum ix86_builtin_func_type) d->flag)
30757 case V2DF_FTYPE_V2DF_ROUND:
30758 case V4DF_FTYPE_V4DF_ROUND:
30759 case V4SF_FTYPE_V4SF_ROUND:
30760 case V8SF_FTYPE_V8SF_ROUND:
30761 case V4SI_FTYPE_V4SF_ROUND:
30762 case V8SI_FTYPE_V8SF_ROUND:
30763 return ix86_expand_sse_round (d, exp, target);
30764 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30765 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30766 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30767 case INT_FTYPE_V8SF_V8SF_PTEST:
30768 case INT_FTYPE_V4DI_V4DI_PTEST:
30769 case INT_FTYPE_V4DF_V4DF_PTEST:
30770 case INT_FTYPE_V4SF_V4SF_PTEST:
30771 case INT_FTYPE_V2DI_V2DI_PTEST:
30772 case INT_FTYPE_V2DF_V2DF_PTEST:
30773 return ix86_expand_sse_ptest (d, exp, target);
30774 case FLOAT128_FTYPE_FLOAT128:
30775 case FLOAT_FTYPE_FLOAT:
30776 case INT_FTYPE_INT:
30777 case UINT64_FTYPE_INT:
30778 case UINT16_FTYPE_UINT16:
30779 case INT64_FTYPE_INT64:
30780 case INT64_FTYPE_V4SF:
30781 case INT64_FTYPE_V2DF:
30782 case INT_FTYPE_V16QI:
30783 case INT_FTYPE_V8QI:
30784 case INT_FTYPE_V8SF:
30785 case INT_FTYPE_V4DF:
30786 case INT_FTYPE_V4SF:
30787 case INT_FTYPE_V2DF:
30788 case INT_FTYPE_V32QI:
30789 case V16QI_FTYPE_V16QI:
30790 case V8SI_FTYPE_V8SF:
30791 case V8SI_FTYPE_V4SI:
30792 case V8HI_FTYPE_V8HI:
30793 case V8HI_FTYPE_V16QI:
30794 case V8QI_FTYPE_V8QI:
30795 case V8SF_FTYPE_V8SF:
30796 case V8SF_FTYPE_V8SI:
30797 case V8SF_FTYPE_V4SF:
30798 case V8SF_FTYPE_V8HI:
30799 case V4SI_FTYPE_V4SI:
30800 case V4SI_FTYPE_V16QI:
30801 case V4SI_FTYPE_V4SF:
30802 case V4SI_FTYPE_V8SI:
30803 case V4SI_FTYPE_V8HI:
30804 case V4SI_FTYPE_V4DF:
30805 case V4SI_FTYPE_V2DF:
30806 case V4HI_FTYPE_V4HI:
30807 case V4DF_FTYPE_V4DF:
30808 case V4DF_FTYPE_V4SI:
30809 case V4DF_FTYPE_V4SF:
30810 case V4DF_FTYPE_V2DF:
30811 case V4SF_FTYPE_V4SF:
30812 case V4SF_FTYPE_V4SI:
30813 case V4SF_FTYPE_V8SF:
30814 case V4SF_FTYPE_V4DF:
30815 case V4SF_FTYPE_V8HI:
30816 case V4SF_FTYPE_V2DF:
30817 case V2DI_FTYPE_V2DI:
30818 case V2DI_FTYPE_V16QI:
30819 case V2DI_FTYPE_V8HI:
30820 case V2DI_FTYPE_V4SI:
30821 case V2DF_FTYPE_V2DF:
30822 case V2DF_FTYPE_V4SI:
30823 case V2DF_FTYPE_V4DF:
30824 case V2DF_FTYPE_V4SF:
30825 case V2DF_FTYPE_V2SI:
30826 case V2SI_FTYPE_V2SI:
30827 case V2SI_FTYPE_V4SF:
30828 case V2SI_FTYPE_V2SF:
30829 case V2SI_FTYPE_V2DF:
30830 case V2SF_FTYPE_V2SF:
30831 case V2SF_FTYPE_V2SI:
30832 case V32QI_FTYPE_V32QI:
30833 case V32QI_FTYPE_V16QI:
30834 case V16HI_FTYPE_V16HI:
30835 case V16HI_FTYPE_V8HI:
30836 case V8SI_FTYPE_V8SI:
30837 case V16HI_FTYPE_V16QI:
30838 case V8SI_FTYPE_V16QI:
30839 case V4DI_FTYPE_V16QI:
30840 case V8SI_FTYPE_V8HI:
30841 case V4DI_FTYPE_V8HI:
30842 case V4DI_FTYPE_V4SI:
30843 case V4DI_FTYPE_V2DI:
30844 nargs = 1;
30845 break;
30846 case V4SF_FTYPE_V4SF_VEC_MERGE:
30847 case V2DF_FTYPE_V2DF_VEC_MERGE:
30848 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30849 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30850 case V16QI_FTYPE_V16QI_V16QI:
30851 case V16QI_FTYPE_V8HI_V8HI:
30852 case V8QI_FTYPE_V8QI_V8QI:
30853 case V8QI_FTYPE_V4HI_V4HI:
30854 case V8HI_FTYPE_V8HI_V8HI:
30855 case V8HI_FTYPE_V16QI_V16QI:
30856 case V8HI_FTYPE_V4SI_V4SI:
30857 case V8SF_FTYPE_V8SF_V8SF:
30858 case V8SF_FTYPE_V8SF_V8SI:
30859 case V4SI_FTYPE_V4SI_V4SI:
30860 case V4SI_FTYPE_V8HI_V8HI:
30861 case V4SI_FTYPE_V4SF_V4SF:
30862 case V4SI_FTYPE_V2DF_V2DF:
30863 case V4HI_FTYPE_V4HI_V4HI:
30864 case V4HI_FTYPE_V8QI_V8QI:
30865 case V4HI_FTYPE_V2SI_V2SI:
30866 case V4DF_FTYPE_V4DF_V4DF:
30867 case V4DF_FTYPE_V4DF_V4DI:
30868 case V4SF_FTYPE_V4SF_V4SF:
30869 case V4SF_FTYPE_V4SF_V4SI:
30870 case V4SF_FTYPE_V4SF_V2SI:
30871 case V4SF_FTYPE_V4SF_V2DF:
30872 case V4SF_FTYPE_V4SF_DI:
30873 case V4SF_FTYPE_V4SF_SI:
30874 case V2DI_FTYPE_V2DI_V2DI:
30875 case V2DI_FTYPE_V16QI_V16QI:
30876 case V2DI_FTYPE_V4SI_V4SI:
30877 case V2UDI_FTYPE_V4USI_V4USI:
30878 case V2DI_FTYPE_V2DI_V16QI:
30879 case V2DI_FTYPE_V2DF_V2DF:
30880 case V2SI_FTYPE_V2SI_V2SI:
30881 case V2SI_FTYPE_V4HI_V4HI:
30882 case V2SI_FTYPE_V2SF_V2SF:
30883 case V2DF_FTYPE_V2DF_V2DF:
30884 case V2DF_FTYPE_V2DF_V4SF:
30885 case V2DF_FTYPE_V2DF_V2DI:
30886 case V2DF_FTYPE_V2DF_DI:
30887 case V2DF_FTYPE_V2DF_SI:
30888 case V2SF_FTYPE_V2SF_V2SF:
30889 case V1DI_FTYPE_V1DI_V1DI:
30890 case V1DI_FTYPE_V8QI_V8QI:
30891 case V1DI_FTYPE_V2SI_V2SI:
30892 case V32QI_FTYPE_V16HI_V16HI:
30893 case V16HI_FTYPE_V8SI_V8SI:
30894 case V32QI_FTYPE_V32QI_V32QI:
30895 case V16HI_FTYPE_V32QI_V32QI:
30896 case V16HI_FTYPE_V16HI_V16HI:
30897 case V8SI_FTYPE_V4DF_V4DF:
30898 case V8SI_FTYPE_V8SI_V8SI:
30899 case V8SI_FTYPE_V16HI_V16HI:
30900 case V4DI_FTYPE_V4DI_V4DI:
30901 case V4DI_FTYPE_V8SI_V8SI:
30902 case V4UDI_FTYPE_V8USI_V8USI:
30903 if (comparison == UNKNOWN)
30904 return ix86_expand_binop_builtin (icode, exp, target);
30905 nargs = 2;
30906 break;
30907 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30908 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30909 gcc_assert (comparison != UNKNOWN);
30910 nargs = 2;
30911 swap = true;
30912 break;
30913 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30914 case V16HI_FTYPE_V16HI_SI_COUNT:
30915 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30916 case V8SI_FTYPE_V8SI_SI_COUNT:
30917 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30918 case V4DI_FTYPE_V4DI_INT_COUNT:
30919 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30920 case V8HI_FTYPE_V8HI_SI_COUNT:
30921 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30922 case V4SI_FTYPE_V4SI_SI_COUNT:
30923 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30924 case V4HI_FTYPE_V4HI_SI_COUNT:
30925 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30926 case V2DI_FTYPE_V2DI_SI_COUNT:
30927 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30928 case V2SI_FTYPE_V2SI_SI_COUNT:
30929 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30930 case V1DI_FTYPE_V1DI_SI_COUNT:
30931 nargs = 2;
30932 last_arg_count = true;
30933 break;
30934 case UINT64_FTYPE_UINT64_UINT64:
30935 case UINT_FTYPE_UINT_UINT:
30936 case UINT_FTYPE_UINT_USHORT:
30937 case UINT_FTYPE_UINT_UCHAR:
30938 case UINT16_FTYPE_UINT16_INT:
30939 case UINT8_FTYPE_UINT8_INT:
30940 nargs = 2;
30941 break;
30942 case V2DI_FTYPE_V2DI_INT_CONVERT:
30943 nargs = 2;
30944 rmode = V1TImode;
30945 nargs_constant = 1;
30946 break;
30947 case V4DI_FTYPE_V4DI_INT_CONVERT:
30948 nargs = 2;
30949 rmode = V2TImode;
30950 nargs_constant = 1;
30951 break;
30952 case V8HI_FTYPE_V8HI_INT:
30953 case V8HI_FTYPE_V8SF_INT:
30954 case V8HI_FTYPE_V4SF_INT:
30955 case V8SF_FTYPE_V8SF_INT:
30956 case V4SI_FTYPE_V4SI_INT:
30957 case V4SI_FTYPE_V8SI_INT:
30958 case V4HI_FTYPE_V4HI_INT:
30959 case V4DF_FTYPE_V4DF_INT:
30960 case V4SF_FTYPE_V4SF_INT:
30961 case V4SF_FTYPE_V8SF_INT:
30962 case V2DI_FTYPE_V2DI_INT:
30963 case V2DF_FTYPE_V2DF_INT:
30964 case V2DF_FTYPE_V4DF_INT:
30965 case V16HI_FTYPE_V16HI_INT:
30966 case V8SI_FTYPE_V8SI_INT:
30967 case V4DI_FTYPE_V4DI_INT:
30968 case V2DI_FTYPE_V4DI_INT:
30969 nargs = 2;
30970 nargs_constant = 1;
30971 break;
30972 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30973 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30974 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30975 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30976 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30977 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30978 nargs = 3;
30979 break;
30980 case V32QI_FTYPE_V32QI_V32QI_INT:
30981 case V16HI_FTYPE_V16HI_V16HI_INT:
30982 case V16QI_FTYPE_V16QI_V16QI_INT:
30983 case V4DI_FTYPE_V4DI_V4DI_INT:
30984 case V8HI_FTYPE_V8HI_V8HI_INT:
30985 case V8SI_FTYPE_V8SI_V8SI_INT:
30986 case V8SI_FTYPE_V8SI_V4SI_INT:
30987 case V8SF_FTYPE_V8SF_V8SF_INT:
30988 case V8SF_FTYPE_V8SF_V4SF_INT:
30989 case V4SI_FTYPE_V4SI_V4SI_INT:
30990 case V4DF_FTYPE_V4DF_V4DF_INT:
30991 case V4DF_FTYPE_V4DF_V2DF_INT:
30992 case V4SF_FTYPE_V4SF_V4SF_INT:
30993 case V2DI_FTYPE_V2DI_V2DI_INT:
30994 case V4DI_FTYPE_V4DI_V2DI_INT:
30995 case V2DF_FTYPE_V2DF_V2DF_INT:
30996 nargs = 3;
30997 nargs_constant = 1;
30998 break;
30999 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31000 nargs = 3;
31001 rmode = V4DImode;
31002 nargs_constant = 1;
31003 break;
31004 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31005 nargs = 3;
31006 rmode = V2DImode;
31007 nargs_constant = 1;
31008 break;
31009 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31010 nargs = 3;
31011 rmode = DImode;
31012 nargs_constant = 1;
31013 break;
31014 case V2DI_FTYPE_V2DI_UINT_UINT:
31015 nargs = 3;
31016 nargs_constant = 2;
31017 break;
31018 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31019 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31020 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31021 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31022 nargs = 4;
31023 nargs_constant = 1;
31024 break;
31025 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31026 nargs = 4;
31027 nargs_constant = 2;
31028 break;
31029 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31030 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31031 nargs = 4;
31032 break;
31033 default:
31034 gcc_unreachable ();
31037 gcc_assert (nargs <= ARRAY_SIZE (args));
31039 if (comparison != UNKNOWN)
31041 gcc_assert (nargs == 2);
31042 return ix86_expand_sse_compare (d, exp, target, swap);
31045 if (rmode == VOIDmode || rmode == tmode)
31047 if (optimize
31048 || target == 0
31049 || GET_MODE (target) != tmode
31050 || !insn_p->operand[0].predicate (target, tmode))
31051 target = gen_reg_rtx (tmode);
31052 real_target = target;
31054 else
31056 target = gen_reg_rtx (rmode);
31057 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31060 for (i = 0; i < nargs; i++)
31062 tree arg = CALL_EXPR_ARG (exp, i);
31063 rtx op = expand_normal (arg);
31064 enum machine_mode mode = insn_p->operand[i + 1].mode;
31065 bool match = insn_p->operand[i + 1].predicate (op, mode);
31067 if (last_arg_count && (i + 1) == nargs)
31069 /* SIMD shift insns take either an 8-bit immediate or a
31070 register as the count, but the builtin functions take an int.
31071 If the count operand doesn't match, we put it in a register. */
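/* For example (illustrative only): the count argument of a shift builtin
   such as __builtin_ia32_pslldi128 is a plain int at the source level; if
   the RTX produced by expand_normal does not already satisfy the insn's
   count predicate (say, it has the wrong mode), the subreg/copy below
   rewrites it as an SImode register the shift pattern can accept.  */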
31072 if (!match)
31074 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31075 if (!insn_p->operand[i + 1].predicate (op, mode))
31076 op = copy_to_reg (op);
31079 else if ((nargs - i) <= nargs_constant)
31081 if (!match)
31082 switch (icode)
31084 case CODE_FOR_avx2_inserti128:
31085 case CODE_FOR_avx2_extracti128:
31086 error ("the last argument must be an 1-bit immediate");
31087 return const0_rtx;
31089 case CODE_FOR_sse4_1_roundsd:
31090 case CODE_FOR_sse4_1_roundss:
31092 case CODE_FOR_sse4_1_roundpd:
31093 case CODE_FOR_sse4_1_roundps:
31094 case CODE_FOR_avx_roundpd256:
31095 case CODE_FOR_avx_roundps256:
31097 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31098 case CODE_FOR_sse4_1_roundps_sfix:
31099 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31100 case CODE_FOR_avx_roundps_sfix256:
31102 case CODE_FOR_sse4_1_blendps:
31103 case CODE_FOR_avx_blendpd256:
31104 case CODE_FOR_avx_vpermilv4df:
31105 error ("the last argument must be a 4-bit immediate");
31106 return const0_rtx;
31108 case CODE_FOR_sse4_1_blendpd:
31109 case CODE_FOR_avx_vpermilv2df:
31110 case CODE_FOR_xop_vpermil2v2df3:
31111 case CODE_FOR_xop_vpermil2v4sf3:
31112 case CODE_FOR_xop_vpermil2v4df3:
31113 case CODE_FOR_xop_vpermil2v8sf3:
31114 error ("the last argument must be a 2-bit immediate");
31115 return const0_rtx;
31117 case CODE_FOR_avx_vextractf128v4df:
31118 case CODE_FOR_avx_vextractf128v8sf:
31119 case CODE_FOR_avx_vextractf128v8si:
31120 case CODE_FOR_avx_vinsertf128v4df:
31121 case CODE_FOR_avx_vinsertf128v8sf:
31122 case CODE_FOR_avx_vinsertf128v8si:
31123 error ("the last argument must be a 1-bit immediate");
31124 return const0_rtx;
31126 case CODE_FOR_avx_vmcmpv2df3:
31127 case CODE_FOR_avx_vmcmpv4sf3:
31128 case CODE_FOR_avx_cmpv2df3:
31129 case CODE_FOR_avx_cmpv4sf3:
31130 case CODE_FOR_avx_cmpv4df3:
31131 case CODE_FOR_avx_cmpv8sf3:
31132 error ("the last argument must be a 5-bit immediate");
31133 return const0_rtx;
31135 default:
31136 switch (nargs_constant)
31138 case 2:
31139 if ((nargs - i) == nargs_constant)
31141 error ("the next to last argument must be an 8-bit immediate");
31142 break;
31144 case 1:
31145 error ("the last argument must be an 8-bit immediate");
31146 break;
31147 default:
31148 gcc_unreachable ();
31150 return const0_rtx;
31153 else
31155 if (VECTOR_MODE_P (mode))
31156 op = safe_vector_operand (op, mode);
31158 /* If we aren't optimizing, only allow one memory operand to
31159 be generated. */
31160 if (memory_operand (op, mode))
31161 num_memory++;
31163 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31165 if (optimize || !match || num_memory > 1)
31166 op = copy_to_mode_reg (mode, op);
31168 else
31170 op = copy_to_reg (op);
31171 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31175 args[i].op = op;
31176 args[i].mode = mode;
31179 switch (nargs)
31181 case 1:
31182 pat = GEN_FCN (icode) (real_target, args[0].op);
31183 break;
31184 case 2:
31185 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31186 break;
31187 case 3:
31188 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31189 args[2].op);
31190 break;
31191 case 4:
31192 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31193 args[2].op, args[3].op);
31194 break;
31195 default:
31196 gcc_unreachable ();
31199 if (! pat)
31200 return 0;
31202 emit_insn (pat);
31203 return target;
31206 /* Subroutine of ix86_expand_builtin to take care of special insns
31207 with variable number of operands. */
31209 static rtx
31210 ix86_expand_special_args_builtin (const struct builtin_description *d,
31211 tree exp, rtx target)
31213 tree arg;
31214 rtx pat, op;
31215 unsigned int i, nargs, arg_adjust, memory;
31216 struct
31218 rtx op;
31219 enum machine_mode mode;
31220 } args[3];
31221 enum insn_code icode = d->icode;
31222 bool last_arg_constant = false;
31223 const struct insn_data_d *insn_p = &insn_data[icode];
31224 enum machine_mode tmode = insn_p->operand[0].mode;
31225 enum { load, store } klass;
31227 switch ((enum ix86_builtin_func_type) d->flag)
31229 case VOID_FTYPE_VOID:
31230 emit_insn (GEN_FCN (icode) (target));
31231 return 0;
31232 case VOID_FTYPE_UINT64:
31233 case VOID_FTYPE_UNSIGNED:
31234 nargs = 0;
31235 klass = store;
31236 memory = 0;
31237 break;
31239 case INT_FTYPE_VOID:
31240 case UINT64_FTYPE_VOID:
31241 case UNSIGNED_FTYPE_VOID:
31242 nargs = 0;
31243 klass = load;
31244 memory = 0;
31245 break;
31246 case UINT64_FTYPE_PUNSIGNED:
31247 case V2DI_FTYPE_PV2DI:
31248 case V4DI_FTYPE_PV4DI:
31249 case V32QI_FTYPE_PCCHAR:
31250 case V16QI_FTYPE_PCCHAR:
31251 case V8SF_FTYPE_PCV4SF:
31252 case V8SF_FTYPE_PCFLOAT:
31253 case V4SF_FTYPE_PCFLOAT:
31254 case V4DF_FTYPE_PCV2DF:
31255 case V4DF_FTYPE_PCDOUBLE:
31256 case V2DF_FTYPE_PCDOUBLE:
31257 case VOID_FTYPE_PVOID:
31258 nargs = 1;
31259 klass = load;
31260 memory = 0;
31261 break;
31262 case VOID_FTYPE_PV2SF_V4SF:
31263 case VOID_FTYPE_PV4DI_V4DI:
31264 case VOID_FTYPE_PV2DI_V2DI:
31265 case VOID_FTYPE_PCHAR_V32QI:
31266 case VOID_FTYPE_PCHAR_V16QI:
31267 case VOID_FTYPE_PFLOAT_V8SF:
31268 case VOID_FTYPE_PFLOAT_V4SF:
31269 case VOID_FTYPE_PDOUBLE_V4DF:
31270 case VOID_FTYPE_PDOUBLE_V2DF:
31271 case VOID_FTYPE_PLONGLONG_LONGLONG:
31272 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31273 case VOID_FTYPE_PINT_INT:
31274 nargs = 1;
31275 klass = store;
31276 /* Reserve memory operand for target. */
31277 memory = ARRAY_SIZE (args);
31278 break;
31279 case V4SF_FTYPE_V4SF_PCV2SF:
31280 case V2DF_FTYPE_V2DF_PCDOUBLE:
31281 nargs = 2;
31282 klass = load;
31283 memory = 1;
31284 break;
31285 case V8SF_FTYPE_PCV8SF_V8SI:
31286 case V4DF_FTYPE_PCV4DF_V4DI:
31287 case V4SF_FTYPE_PCV4SF_V4SI:
31288 case V2DF_FTYPE_PCV2DF_V2DI:
31289 case V8SI_FTYPE_PCV8SI_V8SI:
31290 case V4DI_FTYPE_PCV4DI_V4DI:
31291 case V4SI_FTYPE_PCV4SI_V4SI:
31292 case V2DI_FTYPE_PCV2DI_V2DI:
31293 nargs = 2;
31294 klass = load;
31295 memory = 0;
31296 break;
31297 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31298 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31299 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31300 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31301 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31302 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31303 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31304 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31305 nargs = 2;
31306 klass = store;
31307 /* Reserve memory operand for target. */
31308 memory = ARRAY_SIZE (args);
31309 break;
31310 case VOID_FTYPE_UINT_UINT_UINT:
31311 case VOID_FTYPE_UINT64_UINT_UINT:
31312 case UCHAR_FTYPE_UINT_UINT_UINT:
31313 case UCHAR_FTYPE_UINT64_UINT_UINT:
31314 nargs = 3;
31315 klass = load;
31316 memory = ARRAY_SIZE (args);
31317 last_arg_constant = true;
31318 break;
31319 default:
31320 gcc_unreachable ();
31323 gcc_assert (nargs <= ARRAY_SIZE (args));
31325 if (klass == store)
31327 arg = CALL_EXPR_ARG (exp, 0);
31328 op = expand_normal (arg);
31329 gcc_assert (target == 0);
31330 if (memory)
31332 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31333 target = gen_rtx_MEM (tmode, op);
31335 else
31336 target = force_reg (tmode, op);
31337 arg_adjust = 1;
31339 else
31341 arg_adjust = 0;
31342 if (optimize
31343 || target == 0
31344 || !register_operand (target, tmode)
31345 || GET_MODE (target) != tmode)
31346 target = gen_reg_rtx (tmode);
31349 for (i = 0; i < nargs; i++)
31351 enum machine_mode mode = insn_p->operand[i + 1].mode;
31352 bool match;
31354 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31355 op = expand_normal (arg);
31356 match = insn_p->operand[i + 1].predicate (op, mode);
31358 if (last_arg_constant && (i + 1) == nargs)
31360 if (!match)
31362 if (icode == CODE_FOR_lwp_lwpvalsi3
31363 || icode == CODE_FOR_lwp_lwpinssi3
31364 || icode == CODE_FOR_lwp_lwpvaldi3
31365 || icode == CODE_FOR_lwp_lwpinsdi3)
31366 error ("the last argument must be a 32-bit immediate");
31367 else
31368 error ("the last argument must be an 8-bit immediate");
31369 return const0_rtx;
31372 else
31374 if (i == memory)
31376 /* This must be the memory operand. */
31377 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31378 op = gen_rtx_MEM (mode, op);
31379 gcc_assert (GET_MODE (op) == mode
31380 || GET_MODE (op) == VOIDmode);
31382 else
31384 /* This must be a register. */
31385 if (VECTOR_MODE_P (mode))
31386 op = safe_vector_operand (op, mode);
31388 gcc_assert (GET_MODE (op) == mode
31389 || GET_MODE (op) == VOIDmode);
31390 op = copy_to_mode_reg (mode, op);
31394 args[i].op = op;
31395 args[i].mode = mode;
31398 switch (nargs)
31400 case 0:
31401 pat = GEN_FCN (icode) (target);
31402 break;
31403 case 1:
31404 pat = GEN_FCN (icode) (target, args[0].op);
31405 break;
31406 case 2:
31407 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31408 break;
31409 case 3:
31410 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31411 break;
31412 default:
31413 gcc_unreachable ();
31416 if (! pat)
31417 return 0;
31418 emit_insn (pat);
31419 return klass == store ? 0 : target;
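/* Illustrative mapping (not exhaustive): __builtin_ia32_loadupd256, of type
   V4DF_FTYPE_PCDOUBLE, goes through the "load" class above with its single
   pointer argument turned into the MEM source, while
   __builtin_ia32_storeupd256 (VOID_FTYPE_PDOUBLE_V4DF) takes the "store"
   class, where the pointer becomes the MEM target and the routine
   returns 0.  */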
31422 /* Return the integer constant in ARG. Constrain it to be in the range
31423 of the subparts of VEC_TYPE; issue an error if not. */
31425 static int
31426 get_element_number (tree vec_type, tree arg)
31428 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31430 if (!host_integerp (arg, 1)
31431 || (elt = tree_low_cst (arg, 1), elt > max))
31433 error ("selector must be an integer constant in the range 0..%wi", max);
31434 return 0;
31437 return elt;
31440 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31441 ix86_expand_vector_init. We DO have language-level syntax for this, in
31442 the form of (type){ init-list }. Except that since we can't place emms
31443 instructions from inside the compiler, we can't allow the use of MMX
31444 registers unless the user explicitly asks for it. So we do *not* define
31445 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31446 we have builtins invoked by mmintrin.h that give us license to emit
31447 these sorts of instructions. */
31449 static rtx
31450 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31452 enum machine_mode tmode = TYPE_MODE (type);
31453 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31454 int i, n_elt = GET_MODE_NUNITS (tmode);
31455 rtvec v = rtvec_alloc (n_elt);
31457 gcc_assert (VECTOR_MODE_P (tmode));
31458 gcc_assert (call_expr_nargs (exp) == n_elt);
31460 for (i = 0; i < n_elt; ++i)
31462 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31463 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31466 if (!target || !register_operand (target, tmode))
31467 target = gen_reg_rtx (tmode);
31469 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31470 return target;
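/* Illustrative usage only: these wrappers are what lets <mmintrin.h>
   write, roughly,

       _mm_set_pi32 (int __i1, int __i0)
       {
         return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
       }

   instead of the generic (__v2si) { __i0, __i1 } initializer, so MMX
   register code is only emitted where the user explicitly asked for it.  */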
31473 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31474 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31475 had a language-level syntax for referencing vector elements. */
31477 static rtx
31478 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31480 enum machine_mode tmode, mode0;
31481 tree arg0, arg1;
31482 int elt;
31483 rtx op0;
31485 arg0 = CALL_EXPR_ARG (exp, 0);
31486 arg1 = CALL_EXPR_ARG (exp, 1);
31488 op0 = expand_normal (arg0);
31489 elt = get_element_number (TREE_TYPE (arg0), arg1);
31491 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31492 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31493 gcc_assert (VECTOR_MODE_P (mode0));
31495 op0 = force_reg (mode0, op0);
31497 if (optimize || !target || !register_operand (target, tmode))
31498 target = gen_reg_rtx (tmode);
31500 ix86_expand_vector_extract (true, target, op0, elt);
31502 return target;
31505 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31506 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31507 a language-level syntax for referencing vector elements. */
31509 static rtx
31510 ix86_expand_vec_set_builtin (tree exp)
31512 enum machine_mode tmode, mode1;
31513 tree arg0, arg1, arg2;
31514 int elt;
31515 rtx op0, op1, target;
31517 arg0 = CALL_EXPR_ARG (exp, 0);
31518 arg1 = CALL_EXPR_ARG (exp, 1);
31519 arg2 = CALL_EXPR_ARG (exp, 2);
31521 tmode = TYPE_MODE (TREE_TYPE (arg0));
31522 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31523 gcc_assert (VECTOR_MODE_P (tmode));
31525 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31526 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31527 elt = get_element_number (TREE_TYPE (arg0), arg2);
31529 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31530 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31532 op0 = force_reg (tmode, op0);
31533 op1 = force_reg (mode1, op1);
31535 /* OP0 is the source of these builtin functions and shouldn't be
31536 modified. Create a copy, use it and return it as target. */
31537 target = gen_reg_rtx (tmode);
31538 emit_move_insn (target, op0);
31539 ix86_expand_vector_set (true, target, op1, elt);
31541 return target;
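/* Illustrative usage only: a vec_set wrapper such as
   __builtin_ia32_vec_set_v8hi, used by _mm_insert_epi16, returns a
   modified copy, e.g.

       __m128i w = _mm_insert_epi16 (v, 42, 3);    (v and w are made up)

   leaves V untouched, which is why the expander above copies OP0 into a
   fresh TARGET before calling ix86_expand_vector_set.  */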
31544 /* Expand an expression EXP that calls a built-in function,
31545 with result going to TARGET if that's convenient
31546 (and in mode MODE if that's convenient).
31547 SUBTARGET may be used as the target for computing one of EXP's operands.
31548 IGNORE is nonzero if the value is to be ignored. */
31550 static rtx
31551 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31552 enum machine_mode mode ATTRIBUTE_UNUSED,
31553 int ignore ATTRIBUTE_UNUSED)
31555 const struct builtin_description *d;
31556 size_t i;
31557 enum insn_code icode;
31558 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31559 tree arg0, arg1, arg2, arg3, arg4;
31560 rtx op0, op1, op2, op3, op4, pat, insn;
31561 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31562 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31564 /* For CPU builtins that can be folded, fold first and expand the fold. */
31565 switch (fcode)
31567 case IX86_BUILTIN_CPU_INIT:
31569 /* Make it call __cpu_indicator_init in libgcc. */
31570 tree call_expr, fndecl, type;
31571 type = build_function_type_list (integer_type_node, NULL_TREE);
31572 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31573 call_expr = build_call_expr (fndecl, 0);
31574 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31576 case IX86_BUILTIN_CPU_IS:
31577 case IX86_BUILTIN_CPU_SUPPORTS:
31579 tree arg0 = CALL_EXPR_ARG (exp, 0);
31580 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31581 gcc_assert (fold_expr != NULL_TREE);
31582 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
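/* Illustrative usage only: user code such as

       if (__builtin_cpu_supports ("avx2"))
         do_avx2_version ();            (do_avx2_version is a made-up function)

   is folded by fold_builtin_cpu into a test against the __cpu_model data
   that __cpu_indicator_init fills in at startup; the folded tree, not a
   call, is what gets expanded here.  */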
31586 /* Determine whether the builtin function is available under the current ISA.
31587 Originally the builtin was not created if it wasn't applicable to the
31588 current ISA based on the command line switches. With function specific
31589 options, we need to check in the context of the function making the call
31590 whether it is supported. */
31591 if (ix86_builtins_isa[fcode].isa
31592 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31594 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31595 NULL, (enum fpmath_unit) 0, false);
31597 if (!opts)
31598 error ("%qE needs unknown isa option", fndecl);
31599 else
31601 gcc_assert (opts != NULL);
31602 error ("%qE needs isa option %s", fndecl, opts);
31603 free (opts);
31605 return const0_rtx;
31608 switch (fcode)
31610 case IX86_BUILTIN_MASKMOVQ:
31611 case IX86_BUILTIN_MASKMOVDQU:
31612 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31613 ? CODE_FOR_mmx_maskmovq
31614 : CODE_FOR_sse2_maskmovdqu);
31615 /* Note the arg order is different from the operand order. */
31616 arg1 = CALL_EXPR_ARG (exp, 0);
31617 arg2 = CALL_EXPR_ARG (exp, 1);
31618 arg0 = CALL_EXPR_ARG (exp, 2);
31619 op0 = expand_normal (arg0);
31620 op1 = expand_normal (arg1);
31621 op2 = expand_normal (arg2);
31622 mode0 = insn_data[icode].operand[0].mode;
31623 mode1 = insn_data[icode].operand[1].mode;
31624 mode2 = insn_data[icode].operand[2].mode;
31626 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31627 op0 = gen_rtx_MEM (mode1, op0);
31629 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31630 op0 = copy_to_mode_reg (mode0, op0);
31631 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31632 op1 = copy_to_mode_reg (mode1, op1);
31633 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31634 op2 = copy_to_mode_reg (mode2, op2);
31635 pat = GEN_FCN (icode) (op0, op1, op2);
31636 if (! pat)
31637 return 0;
31638 emit_insn (pat);
31639 return 0;
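/* Illustrative only: <emmintrin.h> defines, roughly,

       _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__P)
       {
         __builtin_ia32_maskmovdqu ((__v16qi) __A, (__v16qi) __B, __P);
       }

   so the address is the builtin's last argument, while the insn pattern
   expects it first as the MEM destination; hence the arg/operand
   reshuffling noted above.  */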
31641 case IX86_BUILTIN_LDMXCSR:
31642 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31643 target = assign_386_stack_local (SImode, SLOT_TEMP);
31644 emit_move_insn (target, op0);
31645 emit_insn (gen_sse_ldmxcsr (target));
31646 return 0;
31648 case IX86_BUILTIN_STMXCSR:
31649 target = assign_386_stack_local (SImode, SLOT_TEMP);
31650 emit_insn (gen_sse_stmxcsr (target));
31651 return copy_to_mode_reg (SImode, target);
31653 case IX86_BUILTIN_CLFLUSH:
31654 arg0 = CALL_EXPR_ARG (exp, 0);
31655 op0 = expand_normal (arg0);
31656 icode = CODE_FOR_sse2_clflush;
31657 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31658 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31660 emit_insn (gen_sse2_clflush (op0));
31661 return 0;
31663 case IX86_BUILTIN_MONITOR:
31664 arg0 = CALL_EXPR_ARG (exp, 0);
31665 arg1 = CALL_EXPR_ARG (exp, 1);
31666 arg2 = CALL_EXPR_ARG (exp, 2);
31667 op0 = expand_normal (arg0);
31668 op1 = expand_normal (arg1);
31669 op2 = expand_normal (arg2);
31670 if (!REG_P (op0))
31671 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31672 if (!REG_P (op1))
31673 op1 = copy_to_mode_reg (SImode, op1);
31674 if (!REG_P (op2))
31675 op2 = copy_to_mode_reg (SImode, op2);
31676 emit_insn (ix86_gen_monitor (op0, op1, op2));
31677 return 0;
31679 case IX86_BUILTIN_MWAIT:
31680 arg0 = CALL_EXPR_ARG (exp, 0);
31681 arg1 = CALL_EXPR_ARG (exp, 1);
31682 op0 = expand_normal (arg0);
31683 op1 = expand_normal (arg1);
31684 if (!REG_P (op0))
31685 op0 = copy_to_mode_reg (SImode, op0);
31686 if (!REG_P (op1))
31687 op1 = copy_to_mode_reg (SImode, op1);
31688 emit_insn (gen_sse3_mwait (op0, op1));
31689 return 0;
31691 case IX86_BUILTIN_VEC_INIT_V2SI:
31692 case IX86_BUILTIN_VEC_INIT_V4HI:
31693 case IX86_BUILTIN_VEC_INIT_V8QI:
31694 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31696 case IX86_BUILTIN_VEC_EXT_V2DF:
31697 case IX86_BUILTIN_VEC_EXT_V2DI:
31698 case IX86_BUILTIN_VEC_EXT_V4SF:
31699 case IX86_BUILTIN_VEC_EXT_V4SI:
31700 case IX86_BUILTIN_VEC_EXT_V8HI:
31701 case IX86_BUILTIN_VEC_EXT_V2SI:
31702 case IX86_BUILTIN_VEC_EXT_V4HI:
31703 case IX86_BUILTIN_VEC_EXT_V16QI:
31704 return ix86_expand_vec_ext_builtin (exp, target);
31706 case IX86_BUILTIN_VEC_SET_V2DI:
31707 case IX86_BUILTIN_VEC_SET_V4SF:
31708 case IX86_BUILTIN_VEC_SET_V4SI:
31709 case IX86_BUILTIN_VEC_SET_V8HI:
31710 case IX86_BUILTIN_VEC_SET_V4HI:
31711 case IX86_BUILTIN_VEC_SET_V16QI:
31712 return ix86_expand_vec_set_builtin (exp);
31714 case IX86_BUILTIN_INFQ:
31715 case IX86_BUILTIN_HUGE_VALQ:
31717 REAL_VALUE_TYPE inf;
31718 rtx tmp;
31720 real_inf (&inf);
31721 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31723 tmp = validize_mem (force_const_mem (mode, tmp));
31725 if (target == 0)
31726 target = gen_reg_rtx (mode);
31728 emit_move_insn (target, tmp);
31729 return target;
31732 case IX86_BUILTIN_RDPMC:
31733 case IX86_BUILTIN_RDTSC:
31734 case IX86_BUILTIN_RDTSCP:
31736 op0 = gen_reg_rtx (DImode);
31737 op1 = gen_reg_rtx (DImode);
31739 if (fcode == IX86_BUILTIN_RDPMC)
31741 arg0 = CALL_EXPR_ARG (exp, 0);
31742 op2 = expand_normal (arg0);
31743 if (!register_operand (op2, SImode))
31744 op2 = copy_to_mode_reg (SImode, op2);
31746 insn = (TARGET_64BIT
31747 ? gen_rdpmc_rex64 (op0, op1, op2)
31748 : gen_rdpmc (op0, op2));
31749 emit_insn (insn);
31751 else if (fcode == IX86_BUILTIN_RDTSC)
31753 insn = (TARGET_64BIT
31754 ? gen_rdtsc_rex64 (op0, op1)
31755 : gen_rdtsc (op0));
31756 emit_insn (insn);
31758 else
31760 op2 = gen_reg_rtx (SImode);
31762 insn = (TARGET_64BIT
31763 ? gen_rdtscp_rex64 (op0, op1, op2)
31764 : gen_rdtscp (op0, op2));
31765 emit_insn (insn);
31767 arg0 = CALL_EXPR_ARG (exp, 0);
31768 op4 = expand_normal (arg0);
31769 if (!address_operand (op4, VOIDmode))
31771 op4 = convert_memory_address (Pmode, op4);
31772 op4 = copy_addr_to_reg (op4);
31774 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31777 if (target == 0)
31778 target = gen_reg_rtx (mode);
31780 if (TARGET_64BIT)
31782 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31783 op1, 1, OPTAB_DIRECT);
31784 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31785 op0, 1, OPTAB_DIRECT);
31788 emit_move_insn (target, op0);
31789 return target;
31791 case IX86_BUILTIN_FXSAVE:
31792 case IX86_BUILTIN_FXRSTOR:
31793 case IX86_BUILTIN_FXSAVE64:
31794 case IX86_BUILTIN_FXRSTOR64:
31795 switch (fcode)
31797 case IX86_BUILTIN_FXSAVE:
31798 icode = CODE_FOR_fxsave;
31799 break;
31800 case IX86_BUILTIN_FXRSTOR:
31801 icode = CODE_FOR_fxrstor;
31802 break;
31803 case IX86_BUILTIN_FXSAVE64:
31804 icode = CODE_FOR_fxsave64;
31805 break;
31806 case IX86_BUILTIN_FXRSTOR64:
31807 icode = CODE_FOR_fxrstor64;
31808 break;
31809 default:
31810 gcc_unreachable ();
31813 arg0 = CALL_EXPR_ARG (exp, 0);
31814 op0 = expand_normal (arg0);
31816 if (!address_operand (op0, VOIDmode))
31818 op0 = convert_memory_address (Pmode, op0);
31819 op0 = copy_addr_to_reg (op0);
31821 op0 = gen_rtx_MEM (BLKmode, op0);
31823 pat = GEN_FCN (icode) (op0);
31824 if (pat)
31825 emit_insn (pat);
31826 return 0;
31828 case IX86_BUILTIN_XSAVE:
31829 case IX86_BUILTIN_XRSTOR:
31830 case IX86_BUILTIN_XSAVE64:
31831 case IX86_BUILTIN_XRSTOR64:
31832 case IX86_BUILTIN_XSAVEOPT:
31833 case IX86_BUILTIN_XSAVEOPT64:
31834 arg0 = CALL_EXPR_ARG (exp, 0);
31835 arg1 = CALL_EXPR_ARG (exp, 1);
31836 op0 = expand_normal (arg0);
31837 op1 = expand_normal (arg1);
31839 if (!address_operand (op0, VOIDmode))
31841 op0 = convert_memory_address (Pmode, op0);
31842 op0 = copy_addr_to_reg (op0);
31844 op0 = gen_rtx_MEM (BLKmode, op0);
31846 op1 = force_reg (DImode, op1);
31848 if (TARGET_64BIT)
31850 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31851 NULL, 1, OPTAB_DIRECT);
31852 switch (fcode)
31854 case IX86_BUILTIN_XSAVE:
31855 icode = CODE_FOR_xsave_rex64;
31856 break;
31857 case IX86_BUILTIN_XRSTOR:
31858 icode = CODE_FOR_xrstor_rex64;
31859 break;
31860 case IX86_BUILTIN_XSAVE64:
31861 icode = CODE_FOR_xsave64;
31862 break;
31863 case IX86_BUILTIN_XRSTOR64:
31864 icode = CODE_FOR_xrstor64;
31865 break;
31866 case IX86_BUILTIN_XSAVEOPT:
31867 icode = CODE_FOR_xsaveopt_rex64;
31868 break;
31869 case IX86_BUILTIN_XSAVEOPT64:
31870 icode = CODE_FOR_xsaveopt64;
31871 break;
31872 default:
31873 gcc_unreachable ();
31876 op2 = gen_lowpart (SImode, op2);
31877 op1 = gen_lowpart (SImode, op1);
31878 pat = GEN_FCN (icode) (op0, op1, op2);
31880 else
31882 switch (fcode)
31884 case IX86_BUILTIN_XSAVE:
31885 icode = CODE_FOR_xsave;
31886 break;
31887 case IX86_BUILTIN_XRSTOR:
31888 icode = CODE_FOR_xrstor;
31889 break;
31890 case IX86_BUILTIN_XSAVEOPT:
31891 icode = CODE_FOR_xsaveopt;
31892 break;
31893 default:
31894 gcc_unreachable ();
31896 pat = GEN_FCN (icode) (op0, op1);
31899 if (pat)
31900 emit_insn (pat);
31901 return 0;
31903 case IX86_BUILTIN_LLWPCB:
31904 arg0 = CALL_EXPR_ARG (exp, 0);
31905 op0 = expand_normal (arg0);
31906 icode = CODE_FOR_lwp_llwpcb;
31907 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31908 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31909 emit_insn (gen_lwp_llwpcb (op0));
31910 return 0;
31912 case IX86_BUILTIN_SLWPCB:
31913 icode = CODE_FOR_lwp_slwpcb;
31914 if (!target
31915 || !insn_data[icode].operand[0].predicate (target, Pmode))
31916 target = gen_reg_rtx (Pmode);
31917 emit_insn (gen_lwp_slwpcb (target));
31918 return target;
31920 case IX86_BUILTIN_BEXTRI32:
31921 case IX86_BUILTIN_BEXTRI64:
31922 arg0 = CALL_EXPR_ARG (exp, 0);
31923 arg1 = CALL_EXPR_ARG (exp, 1);
31924 op0 = expand_normal (arg0);
31925 op1 = expand_normal (arg1);
31926 icode = (fcode == IX86_BUILTIN_BEXTRI32
31927 ? CODE_FOR_tbm_bextri_si
31928 : CODE_FOR_tbm_bextri_di);
31929 if (!CONST_INT_P (op1))
31931 error ("last argument must be an immediate");
31932 return const0_rtx;
31934 else
31936 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31937 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31938 op1 = GEN_INT (length);
31939 op2 = GEN_INT (lsb_index);
31940 pat = GEN_FCN (icode) (target, op0, op1, op2);
31941 if (pat)
31942 emit_insn (pat);
31943 return target;
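/* Illustrative only: <tbmintrin.h> builds the control immediate as
   (length << 8) | lsb_index, e.g.

       unsigned int field = __bextri_u32 (x, (8 << 8) | 4);   (x is made up)

   extracts 8 bits starting at bit 4; the code just above decomposes that
   immediate into separate length and start operands.  */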
31946 case IX86_BUILTIN_RDRAND16_STEP:
31947 icode = CODE_FOR_rdrandhi_1;
31948 mode0 = HImode;
31949 goto rdrand_step;
31951 case IX86_BUILTIN_RDRAND32_STEP:
31952 icode = CODE_FOR_rdrandsi_1;
31953 mode0 = SImode;
31954 goto rdrand_step;
31956 case IX86_BUILTIN_RDRAND64_STEP:
31957 icode = CODE_FOR_rdranddi_1;
31958 mode0 = DImode;
31960 rdrand_step:
31961 op0 = gen_reg_rtx (mode0);
31962 emit_insn (GEN_FCN (icode) (op0));
31964 arg0 = CALL_EXPR_ARG (exp, 0);
31965 op1 = expand_normal (arg0);
31966 if (!address_operand (op1, VOIDmode))
31968 op1 = convert_memory_address (Pmode, op1);
31969 op1 = copy_addr_to_reg (op1);
31971 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31973 op1 = gen_reg_rtx (SImode);
31974 emit_move_insn (op1, CONST1_RTX (SImode));
31976 /* Emit SImode conditional move. */
31977 if (mode0 == HImode)
31979 op2 = gen_reg_rtx (SImode);
31980 emit_insn (gen_zero_extendhisi2 (op2, op0));
31982 else if (mode0 == SImode)
31983 op2 = op0;
31984 else
31985 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31987 if (target == 0)
31988 target = gen_reg_rtx (SImode);
31990 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31991 const0_rtx);
31992 emit_insn (gen_rtx_SET (VOIDmode, target,
31993 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31994 return target;
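/* Illustrative usage only: <immintrin.h> exposes this as, e.g.,

       unsigned int val;                 (made-up local)
       if (_rdrand32_step (&val))
         consume (val);                  (consume is a made-up function)

   i.e. the random value is stored through the pointer argument and the
   return value is 1 when the carry flag says the value is valid, which is
   what the conditional move emitted above computes.  */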
31996 case IX86_BUILTIN_RDSEED16_STEP:
31997 icode = CODE_FOR_rdseedhi_1;
31998 mode0 = HImode;
31999 goto rdseed_step;
32001 case IX86_BUILTIN_RDSEED32_STEP:
32002 icode = CODE_FOR_rdseedsi_1;
32003 mode0 = SImode;
32004 goto rdseed_step;
32006 case IX86_BUILTIN_RDSEED64_STEP:
32007 icode = CODE_FOR_rdseeddi_1;
32008 mode0 = DImode;
32010 rdseed_step:
32011 op0 = gen_reg_rtx (mode0);
32012 emit_insn (GEN_FCN (icode) (op0));
32014 arg0 = CALL_EXPR_ARG (exp, 0);
32015 op1 = expand_normal (arg0);
32016 if (!address_operand (op1, VOIDmode))
32018 op1 = convert_memory_address (Pmode, op1);
32019 op1 = copy_addr_to_reg (op1);
32021 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32023 op2 = gen_reg_rtx (QImode);
32025 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32026 const0_rtx);
32027 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32029 if (target == 0)
32030 target = gen_reg_rtx (SImode);
32032 emit_insn (gen_zero_extendqisi2 (target, op2));
32033 return target;
32035 case IX86_BUILTIN_ADDCARRYX32:
32036 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32037 mode0 = SImode;
32038 goto addcarryx;
32040 case IX86_BUILTIN_ADDCARRYX64:
32041 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32042 mode0 = DImode;
32044 addcarryx:
32045 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32046 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32047 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32048 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32050 op0 = gen_reg_rtx (QImode);
32052 /* Generate CF from input operand. */
32053 op1 = expand_normal (arg0);
32054 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32055 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32057 /* Generate the ADCX (or fallback add-with-carry) instruction to compute X+Y+CF. */
32058 op2 = expand_normal (arg1);
32059 op3 = expand_normal (arg2);
32061 if (!REG_P (op2))
32062 op2 = copy_to_mode_reg (mode0, op2);
32063 if (!REG_P (op3))
32064 op3 = copy_to_mode_reg (mode0, op3);
32066 op0 = gen_reg_rtx (mode0);
32068 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32069 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32070 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32072 /* Store the result. */
32073 op4 = expand_normal (arg3);
32074 if (!address_operand (op4, VOIDmode))
32076 op4 = convert_memory_address (Pmode, op4);
32077 op4 = copy_addr_to_reg (op4);
32079 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32081 /* Return current CF value. */
32082 if (target == 0)
32083 target = gen_reg_rtx (QImode);
32085 PUT_MODE (pat, QImode);
32086 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32087 return target;
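/* Illustrative usage only: <adxintrin.h> exposes this as, e.g.,

       unsigned int sum;                                    (made-up locals)
       unsigned char c = _addcarryx_u32 (c_in, a, b, &sum);

   The incoming carry C_IN is materialized into CF, A+B+CF is stored
   through the last argument, and the resulting carry comes back as the
   return value; without ADX the plain add-with-carry pattern is used.  */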
32089 case IX86_BUILTIN_GATHERSIV2DF:
32090 icode = CODE_FOR_avx2_gathersiv2df;
32091 goto gather_gen;
32092 case IX86_BUILTIN_GATHERSIV4DF:
32093 icode = CODE_FOR_avx2_gathersiv4df;
32094 goto gather_gen;
32095 case IX86_BUILTIN_GATHERDIV2DF:
32096 icode = CODE_FOR_avx2_gatherdiv2df;
32097 goto gather_gen;
32098 case IX86_BUILTIN_GATHERDIV4DF:
32099 icode = CODE_FOR_avx2_gatherdiv4df;
32100 goto gather_gen;
32101 case IX86_BUILTIN_GATHERSIV4SF:
32102 icode = CODE_FOR_avx2_gathersiv4sf;
32103 goto gather_gen;
32104 case IX86_BUILTIN_GATHERSIV8SF:
32105 icode = CODE_FOR_avx2_gathersiv8sf;
32106 goto gather_gen;
32107 case IX86_BUILTIN_GATHERDIV4SF:
32108 icode = CODE_FOR_avx2_gatherdiv4sf;
32109 goto gather_gen;
32110 case IX86_BUILTIN_GATHERDIV8SF:
32111 icode = CODE_FOR_avx2_gatherdiv8sf;
32112 goto gather_gen;
32113 case IX86_BUILTIN_GATHERSIV2DI:
32114 icode = CODE_FOR_avx2_gathersiv2di;
32115 goto gather_gen;
32116 case IX86_BUILTIN_GATHERSIV4DI:
32117 icode = CODE_FOR_avx2_gathersiv4di;
32118 goto gather_gen;
32119 case IX86_BUILTIN_GATHERDIV2DI:
32120 icode = CODE_FOR_avx2_gatherdiv2di;
32121 goto gather_gen;
32122 case IX86_BUILTIN_GATHERDIV4DI:
32123 icode = CODE_FOR_avx2_gatherdiv4di;
32124 goto gather_gen;
32125 case IX86_BUILTIN_GATHERSIV4SI:
32126 icode = CODE_FOR_avx2_gathersiv4si;
32127 goto gather_gen;
32128 case IX86_BUILTIN_GATHERSIV8SI:
32129 icode = CODE_FOR_avx2_gathersiv8si;
32130 goto gather_gen;
32131 case IX86_BUILTIN_GATHERDIV4SI:
32132 icode = CODE_FOR_avx2_gatherdiv4si;
32133 goto gather_gen;
32134 case IX86_BUILTIN_GATHERDIV8SI:
32135 icode = CODE_FOR_avx2_gatherdiv8si;
32136 goto gather_gen;
32137 case IX86_BUILTIN_GATHERALTSIV4DF:
32138 icode = CODE_FOR_avx2_gathersiv4df;
32139 goto gather_gen;
32140 case IX86_BUILTIN_GATHERALTDIV8SF:
32141 icode = CODE_FOR_avx2_gatherdiv8sf;
32142 goto gather_gen;
32143 case IX86_BUILTIN_GATHERALTSIV4DI:
32144 icode = CODE_FOR_avx2_gathersiv4di;
32145 goto gather_gen;
32146 case IX86_BUILTIN_GATHERALTDIV8SI:
32147 icode = CODE_FOR_avx2_gatherdiv8si;
32148 goto gather_gen;
32150 gather_gen:
32151 arg0 = CALL_EXPR_ARG (exp, 0);
32152 arg1 = CALL_EXPR_ARG (exp, 1);
32153 arg2 = CALL_EXPR_ARG (exp, 2);
32154 arg3 = CALL_EXPR_ARG (exp, 3);
32155 arg4 = CALL_EXPR_ARG (exp, 4);
32156 op0 = expand_normal (arg0);
32157 op1 = expand_normal (arg1);
32158 op2 = expand_normal (arg2);
32159 op3 = expand_normal (arg3);
32160 op4 = expand_normal (arg4);
32161 /* Note the arg order is different from the operand order. */
32162 mode0 = insn_data[icode].operand[1].mode;
32163 mode2 = insn_data[icode].operand[3].mode;
32164 mode3 = insn_data[icode].operand[4].mode;
32165 mode4 = insn_data[icode].operand[5].mode;
32167 if (target == NULL_RTX
32168 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32169 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32170 else
32171 subtarget = target;
32173 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32174 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32176 rtx half = gen_reg_rtx (V4SImode);
32177 if (!nonimmediate_operand (op2, V8SImode))
32178 op2 = copy_to_mode_reg (V8SImode, op2);
32179 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32180 op2 = half;
32182 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32183 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32185 rtx (*gen) (rtx, rtx);
32186 rtx half = gen_reg_rtx (mode0);
32187 if (mode0 == V4SFmode)
32188 gen = gen_vec_extract_lo_v8sf;
32189 else
32190 gen = gen_vec_extract_lo_v8si;
32191 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32192 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32193 emit_insn (gen (half, op0));
32194 op0 = half;
32195 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32196 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32197 emit_insn (gen (half, op3));
32198 op3 = half;
32201 /* Force the memory operand to use only a base register here. But we
32202 don't want to do this to the memory operands of other builtin
32203 functions. */
32204 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32206 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32207 op0 = copy_to_mode_reg (mode0, op0);
32208 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32209 op1 = copy_to_mode_reg (Pmode, op1);
32210 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32211 op2 = copy_to_mode_reg (mode2, op2);
32212 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32213 op3 = copy_to_mode_reg (mode3, op3);
32214 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32216 error ("last argument must be scale 1, 2, 4, 8");
32217 return const0_rtx;
32220 /* Optimize. If mask is known to have all high bits set,
32221 replace op0 with pc_rtx to signal that the instruction
32222 overwrites the whole destination and doesn't use its
32223 previous contents. */
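/* For instance (illustrative only): a gather whose mask argument is a
   constant such as _mm256_set1_epi32 (-1) typically folds to a VECTOR_CST
   with the sign bit set in every element, so OP0 becomes pc_rtx below and
   the pattern need not blend with the destination's old contents.  */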
32224 if (optimize)
32226 if (TREE_CODE (arg3) == VECTOR_CST)
32228 unsigned int negative = 0;
32229 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32231 tree cst = VECTOR_CST_ELT (arg3, i);
32232 if (TREE_CODE (cst) == INTEGER_CST
32233 && tree_int_cst_sign_bit (cst))
32234 negative++;
32235 else if (TREE_CODE (cst) == REAL_CST
32236 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32237 negative++;
32239 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32240 op0 = pc_rtx;
32242 else if (TREE_CODE (arg3) == SSA_NAME)
32244 /* Also recognize when the mask is of the form:
32245 __v2df src = _mm_setzero_pd ();
32246 __v2df mask = _mm_cmpeq_pd (src, src);
32248 __v8sf src = _mm256_setzero_ps ();
32249 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32250 as that is a cheaper way to load all ones into
32251 a register than having to load a constant from
32252 memory. */
32253 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32254 if (is_gimple_call (def_stmt))
32256 tree fndecl = gimple_call_fndecl (def_stmt);
32257 if (fndecl
32258 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32259 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32261 case IX86_BUILTIN_CMPPD:
32262 case IX86_BUILTIN_CMPPS:
32263 case IX86_BUILTIN_CMPPD256:
32264 case IX86_BUILTIN_CMPPS256:
32265 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32266 break;
32267 /* FALLTHRU */
32268 case IX86_BUILTIN_CMPEQPD:
32269 case IX86_BUILTIN_CMPEQPS:
32270 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32271 && initializer_zerop (gimple_call_arg (def_stmt,
32272 1)))
32273 op0 = pc_rtx;
32274 break;
32275 default:
32276 break;
32282 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32283 if (! pat)
32284 return const0_rtx;
32285 emit_insn (pat);
32287 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32288 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32290 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32291 ? V4SFmode : V4SImode;
32292 if (target == NULL_RTX)
32293 target = gen_reg_rtx (tmode);
32294 if (tmode == V4SFmode)
32295 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32296 else
32297 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32299 else
32300 target = subtarget;
32302 return target;
32304 case IX86_BUILTIN_XABORT:
32305 icode = CODE_FOR_xabort;
32306 arg0 = CALL_EXPR_ARG (exp, 0);
32307 op0 = expand_normal (arg0);
32308 mode0 = insn_data[icode].operand[0].mode;
32309 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32311 error ("the xabort's argument must be an 8-bit immediate");
32312 return const0_rtx;
32314 emit_insn (gen_xabort (op0));
32315 return 0;
32317 default:
32318 break;
32321 for (i = 0, d = bdesc_special_args;
32322 i < ARRAY_SIZE (bdesc_special_args);
32323 i++, d++)
32324 if (d->code == fcode)
32325 return ix86_expand_special_args_builtin (d, exp, target);
32327 for (i = 0, d = bdesc_args;
32328 i < ARRAY_SIZE (bdesc_args);
32329 i++, d++)
32330 if (d->code == fcode)
32331 switch (fcode)
32333 case IX86_BUILTIN_FABSQ:
32334 case IX86_BUILTIN_COPYSIGNQ:
32335 if (!TARGET_SSE)
32336 /* Emit a normal call if SSE isn't available. */
32337 return expand_call (exp, target, ignore);
32338 default:
32339 return ix86_expand_args_builtin (d, exp, target);
32342 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32343 if (d->code == fcode)
32344 return ix86_expand_sse_comi (d, exp, target);
32346 for (i = 0, d = bdesc_pcmpestr;
32347 i < ARRAY_SIZE (bdesc_pcmpestr);
32348 i++, d++)
32349 if (d->code == fcode)
32350 return ix86_expand_sse_pcmpestr (d, exp, target);
32352 for (i = 0, d = bdesc_pcmpistr;
32353 i < ARRAY_SIZE (bdesc_pcmpistr);
32354 i++, d++)
32355 if (d->code == fcode)
32356 return ix86_expand_sse_pcmpistr (d, exp, target);
32358 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32359 if (d->code == fcode)
32360 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32361 (enum ix86_builtin_func_type)
32362 d->flag, d->comparison);
32364 gcc_unreachable ();
32367 /* Returns a function decl for a vectorized version of the builtin function
32368 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32369 if it is not available. */
32371 static tree
32372 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32373 tree type_in)
32375 enum machine_mode in_mode, out_mode;
32376 int in_n, out_n;
32377 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32379 if (TREE_CODE (type_out) != VECTOR_TYPE
32380 || TREE_CODE (type_in) != VECTOR_TYPE
32381 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32382 return NULL_TREE;
32384 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32385 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32386 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32387 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32389 switch (fn)
32391 case BUILT_IN_SQRT:
32392 if (out_mode == DFmode && in_mode == DFmode)
32394 if (out_n == 2 && in_n == 2)
32395 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32396 else if (out_n == 4 && in_n == 4)
32397 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32399 break;
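/* Illustrative only: when the vectorizer processes a loop such as

       for (i = 0; i < n; i++)               (made-up loop over doubles)
         out[i] = __builtin_sqrt (in[i]);

   and targets a 2 x DFmode vector, this hook hands back the decl for
   IX86_BUILTIN_SQRTPD so the scalar call becomes a single sqrtpd.  */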
32401 case BUILT_IN_SQRTF:
32402 if (out_mode == SFmode && in_mode == SFmode)
32404 if (out_n == 4 && in_n == 4)
32405 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32406 else if (out_n == 8 && in_n == 8)
32407 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32409 break;
32411 case BUILT_IN_IFLOOR:
32412 case BUILT_IN_LFLOOR:
32413 case BUILT_IN_LLFLOOR:
32414 /* The round insn does not trap on denormals. */
32415 if (flag_trapping_math || !TARGET_ROUND)
32416 break;
32418 if (out_mode == SImode && in_mode == DFmode)
32420 if (out_n == 4 && in_n == 2)
32421 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32422 else if (out_n == 8 && in_n == 4)
32423 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32425 break;
32427 case BUILT_IN_IFLOORF:
32428 case BUILT_IN_LFLOORF:
32429 case BUILT_IN_LLFLOORF:
32430 /* The round insn does not trap on denormals. */
32431 if (flag_trapping_math || !TARGET_ROUND)
32432 break;
32434 if (out_mode == SImode && in_mode == SFmode)
32436 if (out_n == 4 && in_n == 4)
32437 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32438 else if (out_n == 8 && in_n == 8)
32439 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32441 break;
32443 case BUILT_IN_ICEIL:
32444 case BUILT_IN_LCEIL:
32445 case BUILT_IN_LLCEIL:
32446 /* The round insn does not trap on denormals. */
32447 if (flag_trapping_math || !TARGET_ROUND)
32448 break;
32450 if (out_mode == SImode && in_mode == DFmode)
32452 if (out_n == 4 && in_n == 2)
32453 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32454 else if (out_n == 8 && in_n == 4)
32455 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32457 break;
32459 case BUILT_IN_ICEILF:
32460 case BUILT_IN_LCEILF:
32461 case BUILT_IN_LLCEILF:
32462 /* The round insn does not trap on denormals. */
32463 if (flag_trapping_math || !TARGET_ROUND)
32464 break;
32466 if (out_mode == SImode && in_mode == SFmode)
32468 if (out_n == 4 && in_n == 4)
32469 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32470 else if (out_n == 8 && in_n == 8)
32471 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32473 break;
32475 case BUILT_IN_IRINT:
32476 case BUILT_IN_LRINT:
32477 case BUILT_IN_LLRINT:
32478 if (out_mode == SImode && in_mode == DFmode)
32480 if (out_n == 4 && in_n == 2)
32481 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32482 else if (out_n == 8 && in_n == 4)
32483 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32485 break;
32487 case BUILT_IN_IRINTF:
32488 case BUILT_IN_LRINTF:
32489 case BUILT_IN_LLRINTF:
32490 if (out_mode == SImode && in_mode == SFmode)
32492 if (out_n == 4 && in_n == 4)
32493 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32494 else if (out_n == 8 && in_n == 8)
32495 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32497 break;
32499 case BUILT_IN_IROUND:
32500 case BUILT_IN_LROUND:
32501 case BUILT_IN_LLROUND:
32502 /* The round insn does not trap on denormals. */
32503 if (flag_trapping_math || !TARGET_ROUND)
32504 break;
32506 if (out_mode == SImode && in_mode == DFmode)
32508 if (out_n == 4 && in_n == 2)
32509 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32510 else if (out_n == 8 && in_n == 4)
32511 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32513 break;
32515 case BUILT_IN_IROUNDF:
32516 case BUILT_IN_LROUNDF:
32517 case BUILT_IN_LLROUNDF:
32518 /* The round insn does not trap on denormals. */
32519 if (flag_trapping_math || !TARGET_ROUND)
32520 break;
32522 if (out_mode == SImode && in_mode == SFmode)
32524 if (out_n == 4 && in_n == 4)
32525 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32526 else if (out_n == 8 && in_n == 8)
32527 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32529 break;
32531 case BUILT_IN_COPYSIGN:
32532 if (out_mode == DFmode && in_mode == DFmode)
32534 if (out_n == 2 && in_n == 2)
32535 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32536 else if (out_n == 4 && in_n == 4)
32537 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32539 break;
32541 case BUILT_IN_COPYSIGNF:
32542 if (out_mode == SFmode && in_mode == SFmode)
32544 if (out_n == 4 && in_n == 4)
32545 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32546 else if (out_n == 8 && in_n == 8)
32547 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32549 break;
32551 case BUILT_IN_FLOOR:
32552 /* The round insn does not trap on denormals. */
32553 if (flag_trapping_math || !TARGET_ROUND)
32554 break;
32556 if (out_mode == DFmode && in_mode == DFmode)
32558 if (out_n == 2 && in_n == 2)
32559 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32560 else if (out_n == 4 && in_n == 4)
32561 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32563 break;
32565 case BUILT_IN_FLOORF:
32566 /* The round insn does not trap on denormals. */
32567 if (flag_trapping_math || !TARGET_ROUND)
32568 break;
32570 if (out_mode == SFmode && in_mode == SFmode)
32572 if (out_n == 4 && in_n == 4)
32573 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32574 else if (out_n == 8 && in_n == 8)
32575 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32577 break;
32579 case BUILT_IN_CEIL:
32580 /* The round insn does not trap on denormals. */
32581 if (flag_trapping_math || !TARGET_ROUND)
32582 break;
32584 if (out_mode == DFmode && in_mode == DFmode)
32586 if (out_n == 2 && in_n == 2)
32587 return ix86_builtins[IX86_BUILTIN_CEILPD];
32588 else if (out_n == 4 && in_n == 4)
32589 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32591 break;
32593 case BUILT_IN_CEILF:
32594 /* The round insn does not trap on denormals. */
32595 if (flag_trapping_math || !TARGET_ROUND)
32596 break;
32598 if (out_mode == SFmode && in_mode == SFmode)
32600 if (out_n == 4 && in_n == 4)
32601 return ix86_builtins[IX86_BUILTIN_CEILPS];
32602 else if (out_n == 8 && in_n == 8)
32603 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32605 break;
32607 case BUILT_IN_TRUNC:
32608 /* The round insn does not trap on denormals. */
32609 if (flag_trapping_math || !TARGET_ROUND)
32610 break;
32612 if (out_mode == DFmode && in_mode == DFmode)
32614 if (out_n == 2 && in_n == 2)
32615 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32616 else if (out_n == 4 && in_n == 4)
32617 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32619 break;
32621 case BUILT_IN_TRUNCF:
32622 /* The round insn does not trap on denormals. */
32623 if (flag_trapping_math || !TARGET_ROUND)
32624 break;
32626 if (out_mode == SFmode && in_mode == SFmode)
32628 if (out_n == 4 && in_n == 4)
32629 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32630 else if (out_n == 8 && in_n == 8)
32631 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32633 break;
32635 case BUILT_IN_RINT:
32636 /* The round insn does not trap on denormals. */
32637 if (flag_trapping_math || !TARGET_ROUND)
32638 break;
32640 if (out_mode == DFmode && in_mode == DFmode)
32642 if (out_n == 2 && in_n == 2)
32643 return ix86_builtins[IX86_BUILTIN_RINTPD];
32644 else if (out_n == 4 && in_n == 4)
32645 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32647 break;
32649 case BUILT_IN_RINTF:
32650 /* The round insn does not trap on denormals. */
32651 if (flag_trapping_math || !TARGET_ROUND)
32652 break;
32654 if (out_mode == SFmode && in_mode == SFmode)
32656 if (out_n == 4 && in_n == 4)
32657 return ix86_builtins[IX86_BUILTIN_RINTPS];
32658 else if (out_n == 8 && in_n == 8)
32659 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32661 break;
32663 case BUILT_IN_ROUND:
32664 /* The round insn does not trap on denormals. */
32665 if (flag_trapping_math || !TARGET_ROUND)
32666 break;
32668 if (out_mode == DFmode && in_mode == DFmode)
32670 if (out_n == 2 && in_n == 2)
32671 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32672 else if (out_n == 4 && in_n == 4)
32673 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32675 break;
32677 case BUILT_IN_ROUNDF:
32678 /* The round insn does not trap on denormals. */
32679 if (flag_trapping_math || !TARGET_ROUND)
32680 break;
32682 if (out_mode == SFmode && in_mode == SFmode)
32684 if (out_n == 4 && in_n == 4)
32685 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32686 else if (out_n == 8 && in_n == 8)
32687 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32689 break;
32691 case BUILT_IN_FMA:
32692 if (out_mode == DFmode && in_mode == DFmode)
32694 if (out_n == 2 && in_n == 2)
32695 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32696 if (out_n == 4 && in_n == 4)
32697 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32699 break;
32701 case BUILT_IN_FMAF:
32702 if (out_mode == SFmode && in_mode == SFmode)
32704 if (out_n == 4 && in_n == 4)
32705 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32706 if (out_n == 8 && in_n == 8)
32707 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32709 break;
32711 default:
32712 break;
32715 /* Dispatch to a handler for a vectorization library. */
32716 if (ix86_veclib_handler)
32717 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32718 type_in);
32720 return NULL_TREE;
32723 /* Handler for an SVML-style interface to
32724 a library with vectorized intrinsics. */
32726 static tree
32727 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32729 char name[20];
32730 tree fntype, new_fndecl, args;
32731 unsigned arity;
32732 const char *bname;
32733 enum machine_mode el_mode, in_mode;
32734 int n, in_n;
32736 /* The SVML is suitable for unsafe math only. */
32737 if (!flag_unsafe_math_optimizations)
32738 return NULL_TREE;
32740 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32741 n = TYPE_VECTOR_SUBPARTS (type_out);
32742 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32743 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32744 if (el_mode != in_mode
32745 || n != in_n)
32746 return NULL_TREE;
32748 switch (fn)
32750 case BUILT_IN_EXP:
32751 case BUILT_IN_LOG:
32752 case BUILT_IN_LOG10:
32753 case BUILT_IN_POW:
32754 case BUILT_IN_TANH:
32755 case BUILT_IN_TAN:
32756 case BUILT_IN_ATAN:
32757 case BUILT_IN_ATAN2:
32758 case BUILT_IN_ATANH:
32759 case BUILT_IN_CBRT:
32760 case BUILT_IN_SINH:
32761 case BUILT_IN_SIN:
32762 case BUILT_IN_ASINH:
32763 case BUILT_IN_ASIN:
32764 case BUILT_IN_COSH:
32765 case BUILT_IN_COS:
32766 case BUILT_IN_ACOSH:
32767 case BUILT_IN_ACOS:
32768 if (el_mode != DFmode || n != 2)
32769 return NULL_TREE;
32770 break;
32772 case BUILT_IN_EXPF:
32773 case BUILT_IN_LOGF:
32774 case BUILT_IN_LOG10F:
32775 case BUILT_IN_POWF:
32776 case BUILT_IN_TANHF:
32777 case BUILT_IN_TANF:
32778 case BUILT_IN_ATANF:
32779 case BUILT_IN_ATAN2F:
32780 case BUILT_IN_ATANHF:
32781 case BUILT_IN_CBRTF:
32782 case BUILT_IN_SINHF:
32783 case BUILT_IN_SINF:
32784 case BUILT_IN_ASINHF:
32785 case BUILT_IN_ASINF:
32786 case BUILT_IN_COSHF:
32787 case BUILT_IN_COSF:
32788 case BUILT_IN_ACOSHF:
32789 case BUILT_IN_ACOSF:
32790 if (el_mode != SFmode || n != 4)
32791 return NULL_TREE;
32792 break;
32794 default:
32795 return NULL_TREE;
32798 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32800 if (fn == BUILT_IN_LOGF)
32801 strcpy (name, "vmlsLn4");
32802 else if (fn == BUILT_IN_LOG)
32803 strcpy (name, "vmldLn2");
32804 else if (n == 4)
32806 sprintf (name, "vmls%s", bname+10);
32807 name[strlen (name)-1] = '4';
32809 else
32810 sprintf (name, "vmld%s2", bname+10);
32812 /* Convert to uppercase. */
32813 name[4] &= ~0x20;
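/* For illustration, tracing the mangling above: "__builtin_" is 10
   characters, so bname+10 is the bare function name.  BUILT_IN_SINF
   (the n == 4 branch) gives "vmlssinf" -> "vmlssin4" -> "vmlsSin4",
   and the DFmode variant BUILT_IN_SIN gives "vmldsin2" -> "vmldSin2";
   the log functions are special-cased above because their SVML names
   use "Ln" rather than "Log".  */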
32815 arity = 0;
32816 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32817 args;
32818 args = TREE_CHAIN (args))
32819 arity++;
32821 if (arity == 1)
32822 fntype = build_function_type_list (type_out, type_in, NULL);
32823 else
32824 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32826 /* Build a function declaration for the vectorized function. */
32827 new_fndecl = build_decl (BUILTINS_LOCATION,
32828 FUNCTION_DECL, get_identifier (name), fntype);
32829 TREE_PUBLIC (new_fndecl) = 1;
32830 DECL_EXTERNAL (new_fndecl) = 1;
32831 DECL_IS_NOVOPS (new_fndecl) = 1;
32832 TREE_READONLY (new_fndecl) = 1;
32834 return new_fndecl;
32837 /* Handler for an ACML-style interface to
32838 a library with vectorized intrinsics. */
32840 static tree
32841 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32843 char name[20] = "__vr.._";
32844 tree fntype, new_fndecl, args;
32845 unsigned arity;
32846 const char *bname;
32847 enum machine_mode el_mode, in_mode;
32848 int n, in_n;
32850 /* The ACML is 64-bit only and suitable for unsafe math only, as
32851 it does not correctly support parts of IEEE arithmetic, such as
32852 denormals, with the required precision. */
32853 if (!TARGET_64BIT
32854 || !flag_unsafe_math_optimizations)
32855 return NULL_TREE;
32857 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32858 n = TYPE_VECTOR_SUBPARTS (type_out);
32859 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32860 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32861 if (el_mode != in_mode
32862 || n != in_n)
32863 return NULL_TREE;
32865 switch (fn)
32867 case BUILT_IN_SIN:
32868 case BUILT_IN_COS:
32869 case BUILT_IN_EXP:
32870 case BUILT_IN_LOG:
32871 case BUILT_IN_LOG2:
32872 case BUILT_IN_LOG10:
32873 name[4] = 'd';
32874 name[5] = '2';
32875 if (el_mode != DFmode
32876 || n != 2)
32877 return NULL_TREE;
32878 break;
32880 case BUILT_IN_SINF:
32881 case BUILT_IN_COSF:
32882 case BUILT_IN_EXPF:
32883 case BUILT_IN_POWF:
32884 case BUILT_IN_LOGF:
32885 case BUILT_IN_LOG2F:
32886 case BUILT_IN_LOG10F:
32887 name[4] = 's';
32888 name[5] = '4';
32889 if (el_mode != SFmode
32890 || n != 4)
32891 return NULL_TREE;
32892 break;
32894 default:
32895 return NULL_TREE;
32898 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32899 sprintf (name + 7, "%s", bname+10);
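/* For example: BUILT_IN_SIN fills the "__vr.._" template in as
   "__vrd2_" and then appends bname+10 ("sin"), giving "__vrd2_sin";
   BUILT_IN_SINF likewise gives "__vrs4_sinf".  These match the ACML
   vector entry-point naming scheme.  */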
32901 arity = 0;
32902 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32903 args;
32904 args = TREE_CHAIN (args))
32905 arity++;
32907 if (arity == 1)
32908 fntype = build_function_type_list (type_out, type_in, NULL);
32909 else
32910 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32912 /* Build a function declaration for the vectorized function. */
32913 new_fndecl = build_decl (BUILTINS_LOCATION,
32914 FUNCTION_DECL, get_identifier (name), fntype);
32915 TREE_PUBLIC (new_fndecl) = 1;
32916 DECL_EXTERNAL (new_fndecl) = 1;
32917 DECL_IS_NOVOPS (new_fndecl) = 1;
32918 TREE_READONLY (new_fndecl) = 1;
32920 return new_fndecl;
32923 /* Returns a decl of a function that implements gather load with
32924 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
32925 Return NULL_TREE if it is not available. */
32927 static tree
32928 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32929 const_tree index_type, int scale)
32931 bool si;
32932 enum ix86_builtins code;
32934 if (! TARGET_AVX2)
32935 return NULL_TREE;
32937 if ((TREE_CODE (index_type) != INTEGER_TYPE
32938 && !POINTER_TYPE_P (index_type))
32939 || (TYPE_MODE (index_type) != SImode
32940 && TYPE_MODE (index_type) != DImode))
32941 return NULL_TREE;
32943 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32944 return NULL_TREE;
32946 /* v*gather* insn sign extends index to pointer mode. */
32947 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32948 && TYPE_UNSIGNED (index_type))
32949 return NULL_TREE;
32951 if (scale <= 0
32952 || scale > 8
32953 || (scale & (scale - 1)) != 0)
32954 return NULL_TREE;
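/* Together these checks accept only the scales 1, 2, 4 and 8 that the
   gather insns' VSIB addressing supports: (scale & (scale - 1)) is
   nonzero exactly when more than one bit is set, e.g. 3 & 2 == 2 is
   rejected while 4 & 3 == 0 is accepted.  */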
32956 si = TYPE_MODE (index_type) == SImode;
32957 switch (TYPE_MODE (mem_vectype))
32959 case V2DFmode:
32960 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32961 break;
32962 case V4DFmode:
32963 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32964 break;
32965 case V2DImode:
32966 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32967 break;
32968 case V4DImode:
32969 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32970 break;
32971 case V4SFmode:
32972 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32973 break;
32974 case V8SFmode:
32975 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32976 break;
32977 case V4SImode:
32978 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32979 break;
32980 case V8SImode:
32981 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32982 break;
32983 default:
32984 return NULL_TREE;
32987 return ix86_builtins[code];
32990 /* Returns a decl of a target-specific builtin that implements the
32991 reciprocal of the function, or NULL_TREE if not available. */
32993 static tree
32994 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32995 bool sqrt ATTRIBUTE_UNUSED)
32997 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32998 && flag_finite_math_only && !flag_trapping_math
32999 && flag_unsafe_math_optimizations))
33000 return NULL_TREE;
33002 if (md_fn)
33003 /* Machine dependent builtins. */
33004 switch (fn)
33006 /* Vectorized version of sqrt to rsqrt conversion. */
33007 case IX86_BUILTIN_SQRTPS_NR:
33008 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33010 case IX86_BUILTIN_SQRTPS_NR256:
33011 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33013 default:
33014 return NULL_TREE;
33016 else
33017 /* Normal builtins. */
33018 switch (fn)
33020 /* Sqrt to rsqrt conversion. */
33021 case BUILT_IN_SQRTF:
33022 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33024 default:
33025 return NULL_TREE;
33029 /* Helper for avx_vpermilps256_operand et al. This is also used by
33030 the expansion functions to turn the parallel back into a mask.
33031 The return value is 0 for no match and the imm8+1 for a match. */
33033 int
33034 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33036 unsigned i, nelt = GET_MODE_NUNITS (mode);
33037 unsigned mask = 0;
33038 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33040 if (XVECLEN (par, 0) != (int) nelt)
33041 return 0;
33043 /* Validate that all of the elements are constants, and not totally
33044 out of range. Copy the data into an integral array to make the
33045 subsequent checks easier. */
33046 for (i = 0; i < nelt; ++i)
33048 rtx er = XVECEXP (par, 0, i);
33049 unsigned HOST_WIDE_INT ei;
33051 if (!CONST_INT_P (er))
33052 return 0;
33053 ei = INTVAL (er);
33054 if (ei >= nelt)
33055 return 0;
33056 ipar[i] = ei;
33059 switch (mode)
33061 case V4DFmode:
33062 /* In the 256-bit DFmode case, we can only move elements within
33063 a 128-bit lane. */
33064 for (i = 0; i < 2; ++i)
33066 if (ipar[i] >= 2)
33067 return 0;
33068 mask |= ipar[i] << i;
33070 for (i = 2; i < 4; ++i)
33072 if (ipar[i] < 2)
33073 return 0;
33074 mask |= (ipar[i] - 2) << i;
33076 break;
33078 case V8SFmode:
33079 /* In the 256-bit SFmode case, we have full freedom of movement
33080 within the low 128-bit lane, but the high 128-bit lane must
33081 mirror the exact same pattern. */
33082 for (i = 0; i < 4; ++i)
33083 if (ipar[i] + 4 != ipar[i + 4])
33084 return 0;
33085 nelt = 4;
33086 /* FALLTHRU */
33088 case V2DFmode:
33089 case V4SFmode:
33090 /* In the 128-bit case, we've full freedom in the placement of
33091 the elements from the source operand. */
33092 for (i = 0; i < nelt; ++i)
33093 mask |= ipar[i] << (i * (nelt / 2));
33094 break;
33096 default:
33097 gcc_unreachable ();
33100 /* Make sure success has a non-zero value by adding one. */
33101 return mask + 1;
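/* Example: for V8SFmode the parallel (1 0 3 2 5 4 7 6) passes the
   mirroring check (each high-lane index is the low-lane index plus 4)
   and the low lane encodes as 2-bit fields:
   1<<0 | 0<<2 | 3<<4 | 2<<6 == 0xb1, so the return value is 0xb1 + 1.
   For V4DFmode the parallel (1 0 3 2) encodes one bit per element and
   yields 0x5 + 1.  */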
33104 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33105 the expansion functions to turn the parallel back into a mask.
33106 The return value is 0 for no match and the imm8+1 for a match. */
33108 int
33109 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33111 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33112 unsigned mask = 0;
33113 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33115 if (XVECLEN (par, 0) != (int) nelt)
33116 return 0;
33118 /* Validate that all of the elements are constants, and not totally
33119 out of range. Copy the data into an integral array to make the
33120 subsequent checks easier. */
33121 for (i = 0; i < nelt; ++i)
33123 rtx er = XVECEXP (par, 0, i);
33124 unsigned HOST_WIDE_INT ei;
33126 if (!CONST_INT_P (er))
33127 return 0;
33128 ei = INTVAL (er);
33129 if (ei >= 2 * nelt)
33130 return 0;
33131 ipar[i] = ei;
33134 /* Validate that each half of the permute selects consecutive elements. */
33135 for (i = 0; i < nelt2 - 1; ++i)
33136 if (ipar[i] + 1 != ipar[i + 1])
33137 return 0;
33138 for (i = nelt2; i < nelt - 1; ++i)
33139 if (ipar[i] + 1 != ipar[i + 1])
33140 return 0;
33142 /* Reconstruct the mask. */
33143 for (i = 0; i < 2; ++i)
33145 unsigned e = ipar[i * nelt2];
33146 if (e % nelt2)
33147 return 0;
33148 e /= nelt2;
33149 mask |= e << (i * 4);
33152 /* Make sure success has a non-zero value by adding one. */
33153 return mask + 1;
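/* Example: for V8SFmode the parallel (8 9 10 11 0 1 2 3) selects the
   low 128-bit lane of the second operand followed by the low lane of
   the first.  The reconstructed selectors are 8/4 == 2 and 0/4 == 0,
   so mask == 0x02 and the return value is 0x03.  */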
33156 /* Store OPERAND to the memory after reload is completed. This means
33157 that we can't easily use assign_stack_local. */
33158 rtx
33159 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33161 rtx result;
33163 gcc_assert (reload_completed);
33164 if (ix86_using_red_zone ())
33166 result = gen_rtx_MEM (mode,
33167 gen_rtx_PLUS (Pmode,
33168 stack_pointer_rtx,
33169 GEN_INT (-RED_ZONE_SIZE)));
33170 emit_move_insn (result, operand);
33172 else if (TARGET_64BIT)
33174 switch (mode)
33176 case HImode:
33177 case SImode:
33178 operand = gen_lowpart (DImode, operand);
33179 /* FALLTHRU */
33180 case DImode:
33181 emit_insn (
33182 gen_rtx_SET (VOIDmode,
33183 gen_rtx_MEM (DImode,
33184 gen_rtx_PRE_DEC (DImode,
33185 stack_pointer_rtx)),
33186 operand));
33187 break;
33188 default:
33189 gcc_unreachable ();
33191 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33193 else
33195 switch (mode)
33197 case DImode:
33199 rtx operands[2];
33200 split_double_mode (mode, &operand, 1, operands, operands + 1);
33201 emit_insn (
33202 gen_rtx_SET (VOIDmode,
33203 gen_rtx_MEM (SImode,
33204 gen_rtx_PRE_DEC (Pmode,
33205 stack_pointer_rtx)),
33206 operands[1]));
33207 emit_insn (
33208 gen_rtx_SET (VOIDmode,
33209 gen_rtx_MEM (SImode,
33210 gen_rtx_PRE_DEC (Pmode,
33211 stack_pointer_rtx)),
33212 operands[0]));
33214 break;
33215 case HImode:
33216 /* Store HImodes as SImodes. */
33217 operand = gen_lowpart (SImode, operand);
33218 /* FALLTHRU */
33219 case SImode:
33220 emit_insn (
33221 gen_rtx_SET (VOIDmode,
33222 gen_rtx_MEM (GET_MODE (operand),
33223 gen_rtx_PRE_DEC (SImode,
33224 stack_pointer_rtx)),
33225 operand));
33226 break;
33227 default:
33228 gcc_unreachable ();
33230 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33232 return result;
33235 /* Free operand from the memory. */
33236 void
33237 ix86_free_from_memory (enum machine_mode mode)
33239 if (!ix86_using_red_zone ())
33241 int size;
33243 if (mode == DImode || TARGET_64BIT)
33244 size = 8;
33245 else
33246 size = 4;
33247 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33248 to pop or add instruction if registers are available. */
33249 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33250 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33251 GEN_INT (size))));
33255 /* Return a register priority for hard reg REGNO. */
33256 static int
33257 ix86_register_priority (int hard_regno)
33259 /* ebp and r13 as the base always want a displacement, r12 as the
33260 base always wants an index. So discourage their usage in an
33261 address. */
33262 if (hard_regno == R12_REG || hard_regno == R13_REG)
33263 return 0;
33264 if (hard_regno == BP_REG)
33265 return 1;
33266 /* New x86-64 int registers result in bigger code size. Discourage
33267 them. */
33268 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33269 return 2;
33270 /* New x86-64 SSE registers result in bigger code size. Discourage
33271 them. */
33272 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33273 return 2;
33274 /* Usage of AX register results in smaller code. Prefer it. */
33275 if (hard_regno == 0)
33276 return 4;
33277 return 3;
33280 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33282 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33283 QImode must go into class Q_REGS.
33284 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33285 movdf to do mem-to-mem moves through integer regs. */
33287 static reg_class_t
33288 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33290 enum machine_mode mode = GET_MODE (x);
33292 /* We're only allowed to return a subclass of CLASS. Many of the
33293 following checks fail for NO_REGS, so eliminate that early. */
33294 if (regclass == NO_REGS)
33295 return NO_REGS;
33297 /* All classes can load zeros. */
33298 if (x == CONST0_RTX (mode))
33299 return regclass;
33301 /* Force constants into memory if we are loading a (nonzero) constant into
33302 an MMX or SSE register. This is because there are no MMX/SSE instructions
33303 to load from a constant. */
33304 if (CONSTANT_P (x)
33305 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33306 return NO_REGS;
33308 /* Prefer SSE regs only, if we can use them for math. */
33309 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33310 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33312 /* Floating-point constants need more complex checks. */
33313 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33315 /* General regs can load everything. */
33316 if (reg_class_subset_p (regclass, GENERAL_REGS))
33317 return regclass;
33319 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33320 zero above. We only want to wind up preferring 80387 registers if
33321 we plan on doing computation with them. */
33322 if (TARGET_80387
33323 && standard_80387_constant_p (x) > 0)
33325 /* Limit class to non-sse. */
33326 if (regclass == FLOAT_SSE_REGS)
33327 return FLOAT_REGS;
33328 if (regclass == FP_TOP_SSE_REGS)
33329 return FP_TOP_REG;
33330 if (regclass == FP_SECOND_SSE_REGS)
33331 return FP_SECOND_REG;
33332 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33333 return regclass;
33336 return NO_REGS;
33339 /* Generally when we see PLUS here, it's the function invariant
33340 (plus soft-fp const_int). Which can only be computed into general
33341 regs. */
33342 if (GET_CODE (x) == PLUS)
33343 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33345 /* QImode constants are easy to load, but non-constant QImode data
33346 must go into Q_REGS. */
33347 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33349 if (reg_class_subset_p (regclass, Q_REGS))
33350 return regclass;
33351 if (reg_class_subset_p (Q_REGS, regclass))
33352 return Q_REGS;
33353 return NO_REGS;
33356 return regclass;
33359 /* Discourage putting floating-point values in SSE registers unless
33360 SSE math is being used, and likewise for the 387 registers. */
33361 static reg_class_t
33362 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33364 enum machine_mode mode = GET_MODE (x);
33366 /* Restrict the output reload class to the register bank that we are doing
33367 math on. If we would like not to return a subset of CLASS, reject this
33368 alternative: if reload cannot do this, it will still use its choice. */
33369 mode = GET_MODE (x);
33370 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33371 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33373 if (X87_FLOAT_MODE_P (mode))
33375 if (regclass == FP_TOP_SSE_REGS)
33376 return FP_TOP_REG;
33377 else if (regclass == FP_SECOND_SSE_REGS)
33378 return FP_SECOND_REG;
33379 else
33380 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33383 return regclass;
33386 static reg_class_t
33387 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33388 enum machine_mode mode, secondary_reload_info *sri)
33390 /* Double-word spills from general registers to non-offsettable memory
33391 references (zero-extended addresses) require special handling. */
33392 if (TARGET_64BIT
33393 && MEM_P (x)
33394 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33395 && rclass == GENERAL_REGS
33396 && !offsettable_memref_p (x))
33398 sri->icode = (in_p
33399 ? CODE_FOR_reload_noff_load
33400 : CODE_FOR_reload_noff_store);
33401 /* Add the cost of moving address to a temporary. */
33402 sri->extra_cost = 1;
33404 return NO_REGS;
33407 /* QImode spills from non-QI registers require an
33408 intermediate register on 32-bit targets. */
33409 if (!TARGET_64BIT
33410 && !in_p && mode == QImode
33411 && (rclass == GENERAL_REGS
33412 || rclass == LEGACY_REGS
33413 || rclass == NON_Q_REGS
33414 || rclass == SIREG
33415 || rclass == DIREG
33416 || rclass == INDEX_REGS))
33418 int regno;
33420 if (REG_P (x))
33421 regno = REGNO (x);
33422 else
33423 regno = -1;
33425 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33426 regno = true_regnum (x);
33428 /* Return Q_REGS if the operand is in memory. */
33429 if (regno == -1)
33430 return Q_REGS;
33433 /* This condition handles corner case where an expression involving
33434 pointers gets vectorized. We're trying to use the address of a
33435 stack slot as a vector initializer.
33437 (set (reg:V2DI 74 [ vect_cst_.2 ])
33438 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33440 Eventually frame gets turned into sp+offset like this:
33442 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33443 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33444 (const_int 392 [0x188]))))
33446 That later gets turned into:
33448 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33449 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33450 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33452 We'll have the following reload recorded:
33454 Reload 0: reload_in (DI) =
33455 (plus:DI (reg/f:DI 7 sp)
33456 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33457 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33458 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33459 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33460 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33461 reload_reg_rtx: (reg:V2DI 22 xmm1)
33463 Which isn't going to work since SSE instructions can't handle scalar
33464 additions. Returning GENERAL_REGS forces the addition into integer
33465 register and reload can handle subsequent reloads without problems. */
33467 if (in_p && GET_CODE (x) == PLUS
33468 && SSE_CLASS_P (rclass)
33469 && SCALAR_INT_MODE_P (mode))
33470 return GENERAL_REGS;
33472 return NO_REGS;
33475 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33477 static bool
33478 ix86_class_likely_spilled_p (reg_class_t rclass)
33480 switch (rclass)
33482 case AREG:
33483 case DREG:
33484 case CREG:
33485 case BREG:
33486 case AD_REGS:
33487 case SIREG:
33488 case DIREG:
33489 case SSE_FIRST_REG:
33490 case FP_TOP_REG:
33491 case FP_SECOND_REG:
33492 return true;
33494 default:
33495 break;
33498 return false;
33501 /* If we are copying between general and FP registers, we need a memory
33502 location. The same is true for SSE and MMX registers.
33504 To optimize register_move_cost performance, allow inline variant.
33506 The macro can't work reliably when one of the CLASSES is class containing
33507 registers from multiple units (SSE, MMX, integer). We avoid this by never
33508 combining those units in single alternative in the machine description.
33509 Ensure that this constraint holds to avoid unexpected surprises.
33511 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33512 enforce these sanity checks. */
33514 static inline bool
33515 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33516 enum machine_mode mode, int strict)
33518 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33519 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33520 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33521 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33522 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33523 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33525 gcc_assert (!strict || lra_in_progress);
33526 return true;
33529 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33530 return true;
33532 /* ??? This is a lie. We do have moves between mmx/general, and for
33533 mmx/sse2. But by saying we need secondary memory we discourage the
33534 register allocator from using the mmx registers unless needed. */
33535 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33536 return true;
33538 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33540 /* SSE1 doesn't have any direct moves from other classes. */
33541 if (!TARGET_SSE2)
33542 return true;
33544 /* If the target says that inter-unit moves are more expensive
33545 than moving through memory, then don't generate them. */
33546 if (!TARGET_INTER_UNIT_MOVES)
33547 return true;
33549 /* Between SSE and general, we have moves no larger than word size. */
33550 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33551 return true;
33554 return false;
33557 bool
33558 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33559 enum machine_mode mode, int strict)
33561 return inline_secondary_memory_needed (class1, class2, mode, strict);
33564 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33566 On the 80386, this is the size of MODE in words,
33567 except in the FP regs, where a single reg is always enough. */
33569 static unsigned char
33570 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33572 if (MAYBE_INTEGER_CLASS_P (rclass))
33574 if (mode == XFmode)
33575 return (TARGET_64BIT ? 2 : 3);
33576 else if (mode == XCmode)
33577 return (TARGET_64BIT ? 4 : 6);
33578 else
33579 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33581 else
33583 if (COMPLEX_MODE_P (mode))
33584 return 2;
33585 else
33586 return 1;
33590 /* Return true if the registers in CLASS cannot represent the change from
33591 modes FROM to TO. */
33593 bool
33594 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33595 enum reg_class regclass)
33597 if (from == to)
33598 return false;
33600 /* x87 registers can't do subreg at all, as all values are reformatted
33601 to extended precision. */
33602 if (MAYBE_FLOAT_CLASS_P (regclass))
33603 return true;
33605 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33607 /* Vector registers do not support QI or HImode loads. If we don't
33608 disallow a change to these modes, reload will assume it's ok to
33609 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33610 the vec_dupv4hi pattern. */
33611 if (GET_MODE_SIZE (from) < 4)
33612 return true;
33614 /* Vector registers do not support subreg with nonzero offsets, which
33615 are otherwise valid for integer registers. Since we can't see
33616 whether we have a nonzero offset from here, prohibit all
33617 nonparadoxical subregs changing size. */
33618 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33619 return true;
33622 return false;
33625 /* Return the cost of moving data of mode M between a
33626 register and memory. A value of 2 is the default; this cost is
33627 relative to those in `REGISTER_MOVE_COST'.
33629 This function is used extensively by register_move_cost, which is used
33630 to build tables at startup, so keep it inline here.
33631 When IN is 2, return maximum of in and out move cost.
33633 If moving between registers and memory is more expensive than
33634 between two registers, you should define this macro to express the
33635 relative cost.
33637 Also model the increased cost of moving QImode registers in
33638 non-Q_REGS classes. */
33640 static inline int
33641 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33642 int in)
33644 int cost;
33645 if (FLOAT_CLASS_P (regclass))
33647 int index;
33648 switch (mode)
33650 case SFmode:
33651 index = 0;
33652 break;
33653 case DFmode:
33654 index = 1;
33655 break;
33656 case XFmode:
33657 index = 2;
33658 break;
33659 default:
33660 return 100;
33662 if (in == 2)
33663 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33664 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33666 if (SSE_CLASS_P (regclass))
33668 int index;
33669 switch (GET_MODE_SIZE (mode))
33671 case 4:
33672 index = 0;
33673 break;
33674 case 8:
33675 index = 1;
33676 break;
33677 case 16:
33678 index = 2;
33679 break;
33680 default:
33681 return 100;
33683 if (in == 2)
33684 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33685 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33687 if (MMX_CLASS_P (regclass))
33689 int index;
33690 switch (GET_MODE_SIZE (mode))
33692 case 4:
33693 index = 0;
33694 break;
33695 case 8:
33696 index = 1;
33697 break;
33698 default:
33699 return 100;
33701 if (in == 2)
33702 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33703 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33705 switch (GET_MODE_SIZE (mode))
33707 case 1:
33708 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33710 if (!in)
33711 return ix86_cost->int_store[0];
33712 if (TARGET_PARTIAL_REG_DEPENDENCY
33713 && optimize_function_for_speed_p (cfun))
33714 cost = ix86_cost->movzbl_load;
33715 else
33716 cost = ix86_cost->int_load[0];
33717 if (in == 2)
33718 return MAX (cost, ix86_cost->int_store[0]);
33719 return cost;
33721 else
33723 if (in == 2)
33724 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33725 if (in)
33726 return ix86_cost->movzbl_load;
33727 else
33728 return ix86_cost->int_store[0] + 4;
33730 break;
33731 case 2:
33732 if (in == 2)
33733 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33734 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33735 default:
33736 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33737 if (mode == TFmode)
33738 mode = XFmode;
33739 if (in == 2)
33740 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33741 else if (in)
33742 cost = ix86_cost->int_load[2];
33743 else
33744 cost = ix86_cost->int_store[2];
33745 return (cost * (((int) GET_MODE_SIZE (mode)
33746 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
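/* For example, with in == 1 a DImode load into GENERAL_REGS on a
   32-bit target falls through to the default case above: the cost is
   int_load[2] scaled by (8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD,
   i.e. two word-sized moves.  */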
33750 static int
33751 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33752 bool in)
33754 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33758 /* Return the cost of moving data from a register in class CLASS1 to
33759 one in class CLASS2.
33761 It is not required that the cost always equal 2 when FROM is the same as TO;
33762 on some machines it is expensive to move between registers if they are not
33763 general registers. */
33765 static int
33766 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33767 reg_class_t class2_i)
33769 enum reg_class class1 = (enum reg_class) class1_i;
33770 enum reg_class class2 = (enum reg_class) class2_i;
33772 /* In case we require secondary memory, compute cost of the store followed
33773 by load. In order to avoid bad register allocation choices, we need
33774 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33776 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33778 int cost = 1;
33780 cost += inline_memory_move_cost (mode, class1, 2);
33781 cost += inline_memory_move_cost (mode, class2, 2);
33783 /* In case of copying from general_purpose_register we may emit multiple
33784 stores followed by single load causing memory size mismatch stall.
33785 Count this as arbitrarily high cost of 20. */
33786 if (targetm.class_max_nregs (class1, mode)
33787 > targetm.class_max_nregs (class2, mode))
33788 cost += 20;
33790 /* In the case of FP/MMX moves, the registers actually overlap, and we
33791 have to switch modes in order to treat them differently. */
33792 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33793 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33794 cost += 20;
33796 return cost;
33799 /* Moves between SSE/MMX and integer unit are expensive. */
33800 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33801 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33803 /* ??? By keeping returned value relatively high, we limit the number
33804 of moves between integer and MMX/SSE registers for all targets.
33805 Additionally, high value prevents problem with x86_modes_tieable_p(),
33806 where integer modes in MMX/SSE registers are not tieable
33807 because of missing QImode and HImode moves to, from or between
33808 MMX/SSE registers. */
33809 return MAX (8, ix86_cost->mmxsse_to_integer);
33811 if (MAYBE_FLOAT_CLASS_P (class1))
33812 return ix86_cost->fp_move;
33813 if (MAYBE_SSE_CLASS_P (class1))
33814 return ix86_cost->sse_move;
33815 if (MAYBE_MMX_CLASS_P (class1))
33816 return ix86_cost->mmx_move;
33817 return 2;
33820 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33821 MODE. */
33823 bool
33824 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33826 /* Flags and only flags can only hold CCmode values. */
33827 if (CC_REGNO_P (regno))
33828 return GET_MODE_CLASS (mode) == MODE_CC;
33829 if (GET_MODE_CLASS (mode) == MODE_CC
33830 || GET_MODE_CLASS (mode) == MODE_RANDOM
33831 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33832 return false;
33833 if (STACK_REGNO_P (regno))
33834 return VALID_FP_MODE_P (mode);
33835 if (SSE_REGNO_P (regno))
33837 /* We implement the move patterns for all vector modes into and
33838 out of SSE registers, even when no operation instructions
33839 are available. OImode move is available only when AVX is
33840 enabled. */
33841 return ((TARGET_AVX && mode == OImode)
33842 || VALID_AVX256_REG_MODE (mode)
33843 || VALID_SSE_REG_MODE (mode)
33844 || VALID_SSE2_REG_MODE (mode)
33845 || VALID_MMX_REG_MODE (mode)
33846 || VALID_MMX_REG_MODE_3DNOW (mode));
33848 if (MMX_REGNO_P (regno))
33850 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33851 so if the register is available at all, then we can move data of
33852 the given mode into or out of it. */
33853 return (VALID_MMX_REG_MODE (mode)
33854 || VALID_MMX_REG_MODE_3DNOW (mode));
33857 if (mode == QImode)
33859 /* Take care for QImode values - they can be in non-QI regs,
33860 but then they do cause partial register stalls. */
33861 if (TARGET_64BIT || QI_REGNO_P (regno))
33862 return true;
33863 if (!TARGET_PARTIAL_REG_STALL)
33864 return true;
33865 return !can_create_pseudo_p ();
33867 /* We handle both integer and floats in the general purpose registers. */
33868 else if (VALID_INT_MODE_P (mode))
33869 return true;
33870 else if (VALID_FP_MODE_P (mode))
33871 return true;
33872 else if (VALID_DFP_MODE_P (mode))
33873 return true;
33874 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33875 on to use that value in smaller contexts, this can easily force a
33876 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33877 supporting DImode, allow it. */
33878 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33879 return true;
33881 return false;
33884 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33885 tieable integer mode. */
33887 static bool
33888 ix86_tieable_integer_mode_p (enum machine_mode mode)
33890 switch (mode)
33892 case HImode:
33893 case SImode:
33894 return true;
33896 case QImode:
33897 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33899 case DImode:
33900 return TARGET_64BIT;
33902 default:
33903 return false;
33907 /* Return true if MODE1 is accessible in a register that can hold MODE2
33908 without copying. That is, all register classes that can hold MODE2
33909 can also hold MODE1. */
33911 bool
33912 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33914 if (mode1 == mode2)
33915 return true;
33917 if (ix86_tieable_integer_mode_p (mode1)
33918 && ix86_tieable_integer_mode_p (mode2))
33919 return true;
33921 /* MODE2 being XFmode implies fp stack or general regs, which means we
33922 can tie any smaller floating point modes to it. Note that we do not
33923 tie this with TFmode. */
33924 if (mode2 == XFmode)
33925 return mode1 == SFmode || mode1 == DFmode;
33927 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33928 that we can tie it with SFmode. */
33929 if (mode2 == DFmode)
33930 return mode1 == SFmode;
33932 /* If MODE2 is only appropriate for an SSE register, then tie with
33933 any other mode acceptable to SSE registers. */
33934 if (GET_MODE_SIZE (mode2) == 32
33935 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33936 return (GET_MODE_SIZE (mode1) == 32
33937 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33938 if (GET_MODE_SIZE (mode2) == 16
33939 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33940 return (GET_MODE_SIZE (mode1) == 16
33941 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33943 /* If MODE2 is appropriate for an MMX register, then tie
33944 with any other mode acceptable to MMX registers. */
33945 if (GET_MODE_SIZE (mode2) == 8
33946 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33947 return (GET_MODE_SIZE (mode1) == 8
33948 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33950 return false;
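/* For instance, V2DFmode and V4SFmode are both 16 bytes wide and both
   acceptable in SSE registers, so they tie with each other, while
   DImode (8 bytes) ties with neither.  SFmode ties with DFmode and
   XFmode through the explicit checks above.  */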
33953 /* Return the cost of moving between two registers of mode MODE. */
33955 static int
33956 ix86_set_reg_reg_cost (enum machine_mode mode)
33958 unsigned int units = UNITS_PER_WORD;
33960 switch (GET_MODE_CLASS (mode))
33962 default:
33963 break;
33965 case MODE_CC:
33966 units = GET_MODE_SIZE (CCmode);
33967 break;
33969 case MODE_FLOAT:
33970 if ((TARGET_SSE && mode == TFmode)
33971 || (TARGET_80387 && mode == XFmode)
33972 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33973 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33974 units = GET_MODE_SIZE (mode);
33975 break;
33977 case MODE_COMPLEX_FLOAT:
33978 if ((TARGET_SSE && mode == TCmode)
33979 || (TARGET_80387 && mode == XCmode)
33980 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33981 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33982 units = GET_MODE_SIZE (mode);
33983 break;
33985 case MODE_VECTOR_INT:
33986 case MODE_VECTOR_FLOAT:
33987 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33988 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33989 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33990 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33991 units = GET_MODE_SIZE (mode);
33994 /* Return the cost of moving between two registers of mode MODE,
33995 assuming that the move will be in pieces of at most UNITS bytes. */
33996 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
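/* Example: on a 32-bit target a DImode register-register copy costs
   COSTS_N_INSNS ((8 + 4 - 1) / 4) == COSTS_N_INSNS (2), while an
   SFmode copy costs COSTS_N_INSNS (1).  */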
33999 /* Compute a (partial) cost for rtx X. Return true if the complete
34000 cost has been computed, and false if subexpressions should be
34001 scanned. In either case, *TOTAL contains the cost result. */
34003 static bool
34004 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34005 bool speed)
34007 enum rtx_code code = (enum rtx_code) code_i;
34008 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34009 enum machine_mode mode = GET_MODE (x);
34010 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34012 switch (code)
34014 case SET:
34015 if (register_operand (SET_DEST (x), VOIDmode)
34016 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34018 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34019 return true;
34021 return false;
34023 case CONST_INT:
34024 case CONST:
34025 case LABEL_REF:
34026 case SYMBOL_REF:
34027 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34028 *total = 3;
34029 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34030 *total = 2;
34031 else if (flag_pic && SYMBOLIC_CONST (x)
34032 && (!TARGET_64BIT
34033 || (GET_CODE (x) != LABEL_REF
34034 && (GET_CODE (x) != SYMBOL_REF
34035 || !SYMBOL_REF_LOCAL_P (x)))))
34036 *total = 1;
34037 else
34038 *total = 0;
34039 return true;
34041 case CONST_DOUBLE:
34042 if (mode == VOIDmode)
34044 *total = 0;
34045 return true;
34047 switch (standard_80387_constant_p (x))
34049 case 1: /* 0.0 */
34050 *total = 1;
34051 return true;
34052 default: /* Other constants */
34053 *total = 2;
34054 return true;
34055 case 0:
34056 case -1:
34057 break;
34059 if (SSE_FLOAT_MODE_P (mode))
34061 case CONST_VECTOR:
34062 switch (standard_sse_constant_p (x))
34064 case 0:
34065 break;
34066 case 1: /* 0: xor eliminates false dependency */
34067 *total = 0;
34068 return true;
34069 default: /* -1: cmp contains false dependency */
34070 *total = 1;
34071 return true;
34074 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34075 it'll probably end up. Add a penalty for size. */
34076 *total = (COSTS_N_INSNS (1)
34077 + (flag_pic != 0 && !TARGET_64BIT)
34078 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34079 return true;
34081 case ZERO_EXTEND:
34082 /* The zero extension is often completely free on x86_64, so make
34083 it as cheap as possible. */
34084 if (TARGET_64BIT && mode == DImode
34085 && GET_MODE (XEXP (x, 0)) == SImode)
34086 *total = 1;
34087 else if (TARGET_ZERO_EXTEND_WITH_AND)
34088 *total = cost->add;
34089 else
34090 *total = cost->movzx;
34091 return false;
34093 case SIGN_EXTEND:
34094 *total = cost->movsx;
34095 return false;
34097 case ASHIFT:
34098 if (SCALAR_INT_MODE_P (mode)
34099 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34100 && CONST_INT_P (XEXP (x, 1)))
34102 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34103 if (value == 1)
34105 *total = cost->add;
34106 return false;
34108 if ((value == 2 || value == 3)
34109 && cost->lea <= cost->shift_const)
34111 *total = cost->lea;
34112 return false;
34115 /* FALLTHRU */
34117 case ROTATE:
34118 case ASHIFTRT:
34119 case LSHIFTRT:
34120 case ROTATERT:
34121 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34123 /* ??? Should be SSE vector operation cost. */
34124 /* At least for published AMD latencies, this really is the same
34125 as the latency for a simple fpu operation like fabs. */
34126 /* V*QImode is emulated with 1-11 insns. */
34127 if (mode == V16QImode || mode == V32QImode)
34129 int count = 11;
34130 if (TARGET_XOP && mode == V16QImode)
34132 /* For XOP we use vpshab, which requires a broadcast of the
34133 value to the variable shift insn. For constants this
34134 means a V16Q const in mem; even when we can perform the
34135 shift with one insn set the cost to prefer paddb. */
34136 if (CONSTANT_P (XEXP (x, 1)))
34138 *total = (cost->fabs
34139 + rtx_cost (XEXP (x, 0), code, 0, speed)
34140 + (speed ? 2 : COSTS_N_BYTES (16)));
34141 return true;
34143 count = 3;
34145 else if (TARGET_SSSE3)
34146 count = 7;
34147 *total = cost->fabs * count;
34149 else
34150 *total = cost->fabs;
34152 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34154 if (CONST_INT_P (XEXP (x, 1)))
34156 if (INTVAL (XEXP (x, 1)) > 32)
34157 *total = cost->shift_const + COSTS_N_INSNS (2);
34158 else
34159 *total = cost->shift_const * 2;
34161 else
34163 if (GET_CODE (XEXP (x, 1)) == AND)
34164 *total = cost->shift_var * 2;
34165 else
34166 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34169 else
34171 if (CONST_INT_P (XEXP (x, 1)))
34172 *total = cost->shift_const;
34173 else
34174 *total = cost->shift_var;
34176 return false;
34178 case FMA:
34180 rtx sub;
34182 gcc_assert (FLOAT_MODE_P (mode));
34183 gcc_assert (TARGET_FMA || TARGET_FMA4);
34185 /* ??? SSE scalar/vector cost should be used here. */
34186 /* ??? Bald assumption that fma has the same cost as fmul. */
34187 *total = cost->fmul;
34188 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34190 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34191 sub = XEXP (x, 0);
34192 if (GET_CODE (sub) == NEG)
34193 sub = XEXP (sub, 0);
34194 *total += rtx_cost (sub, FMA, 0, speed);
34196 sub = XEXP (x, 2);
34197 if (GET_CODE (sub) == NEG)
34198 sub = XEXP (sub, 0);
34199 *total += rtx_cost (sub, FMA, 2, speed);
34200 return true;
34203 case MULT:
34204 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34206 /* ??? SSE scalar cost should be used here. */
34207 *total = cost->fmul;
34208 return false;
34210 else if (X87_FLOAT_MODE_P (mode))
34212 *total = cost->fmul;
34213 return false;
34215 else if (FLOAT_MODE_P (mode))
34217 /* ??? SSE vector cost should be used here. */
34218 *total = cost->fmul;
34219 return false;
34221 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34223 /* V*QImode is emulated with 7-13 insns. */
34224 if (mode == V16QImode || mode == V32QImode)
34226 int extra = 11;
34227 if (TARGET_XOP && mode == V16QImode)
34228 extra = 5;
34229 else if (TARGET_SSSE3)
34230 extra = 6;
34231 *total = cost->fmul * 2 + cost->fabs * extra;
34233 /* V*DImode is emulated with 5-8 insns. */
34234 else if (mode == V2DImode || mode == V4DImode)
34236 if (TARGET_XOP && mode == V2DImode)
34237 *total = cost->fmul * 2 + cost->fabs * 3;
34238 else
34239 *total = cost->fmul * 3 + cost->fabs * 5;
34241 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34242 insns, including two PMULUDQ. */
34243 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34244 *total = cost->fmul * 2 + cost->fabs * 5;
34245 else
34246 *total = cost->fmul;
34247 return false;
34249 else
34251 rtx op0 = XEXP (x, 0);
34252 rtx op1 = XEXP (x, 1);
34253 int nbits;
34254 if (CONST_INT_P (XEXP (x, 1)))
34256 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34257 for (nbits = 0; value != 0; value &= value - 1)
34258 nbits++;
34260 else
34261 /* This is arbitrary. */
34262 nbits = 7;
34264 /* Compute costs correctly for widening multiplication. */
34265 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34266 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34267 == GET_MODE_SIZE (mode))
34269 int is_mulwiden = 0;
34270 enum machine_mode inner_mode = GET_MODE (op0);
34272 if (GET_CODE (op0) == GET_CODE (op1))
34273 is_mulwiden = 1, op1 = XEXP (op1, 0);
34274 else if (CONST_INT_P (op1))
34276 if (GET_CODE (op0) == SIGN_EXTEND)
34277 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34278 == INTVAL (op1);
34279 else
34280 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34283 if (is_mulwiden)
34284 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34287 *total = (cost->mult_init[MODE_INDEX (mode)]
34288 + nbits * cost->mult_bit
34289 + rtx_cost (op0, outer_code, opno, speed)
34290 + rtx_cost (op1, outer_code, opno, speed));
34292 return true;
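/* E.g. (mult:DI (zero_extend:DI (reg:SI)) (zero_extend:DI (reg:SI)))
   is recognized by the widening-multiplication check above and is
   costed using mult_init[MODE_INDEX (SImode)] rather than the full
   DImode multiply cost.  */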
34295 case DIV:
34296 case UDIV:
34297 case MOD:
34298 case UMOD:
34299 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34300 /* ??? SSE cost should be used here. */
34301 *total = cost->fdiv;
34302 else if (X87_FLOAT_MODE_P (mode))
34303 *total = cost->fdiv;
34304 else if (FLOAT_MODE_P (mode))
34305 /* ??? SSE vector cost should be used here. */
34306 *total = cost->fdiv;
34307 else
34308 *total = cost->divide[MODE_INDEX (mode)];
34309 return false;
34311 case PLUS:
34312 if (GET_MODE_CLASS (mode) == MODE_INT
34313 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34315 if (GET_CODE (XEXP (x, 0)) == PLUS
34316 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34317 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34318 && CONSTANT_P (XEXP (x, 1)))
34320 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34321 if (val == 2 || val == 4 || val == 8)
34323 *total = cost->lea;
34324 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34325 outer_code, opno, speed);
34326 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34327 outer_code, opno, speed);
34328 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34329 return true;
34332 else if (GET_CODE (XEXP (x, 0)) == MULT
34333 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34335 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34336 if (val == 2 || val == 4 || val == 8)
34338 *total = cost->lea;
34339 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34340 outer_code, opno, speed);
34341 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34342 return true;
34345 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34347 *total = cost->lea;
34348 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34349 outer_code, opno, speed);
34350 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34351 outer_code, opno, speed);
34352 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34353 return true;
34356 /* FALLTHRU */
34358 case MINUS:
34359 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34361 /* ??? SSE cost should be used here. */
34362 *total = cost->fadd;
34363 return false;
34365 else if (X87_FLOAT_MODE_P (mode))
34367 *total = cost->fadd;
34368 return false;
34370 else if (FLOAT_MODE_P (mode))
34372 /* ??? SSE vector cost should be used here. */
34373 *total = cost->fadd;
34374 return false;
34376 /* FALLTHRU */
34378 case AND:
34379 case IOR:
34380 case XOR:
34381 if (GET_MODE_CLASS (mode) == MODE_INT
34382 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34384 *total = (cost->add * 2
34385 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34386 << (GET_MODE (XEXP (x, 0)) != DImode))
34387 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34388 << (GET_MODE (XEXP (x, 1)) != DImode)));
34389 return true;
34391 /* FALLTHRU */
34393 case NEG:
34394 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34396 /* ??? SSE cost should be used here. */
34397 *total = cost->fchs;
34398 return false;
34400 else if (X87_FLOAT_MODE_P (mode))
34402 *total = cost->fchs;
34403 return false;
34405 else if (FLOAT_MODE_P (mode))
34407 /* ??? SSE vector cost should be used here. */
34408 *total = cost->fchs;
34409 return false;
34411 /* FALLTHRU */
34413 case NOT:
34414 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34416 /* ??? Should be SSE vector operation cost. */
34417 /* At least for published AMD latencies, this really is the same
34418 as the latency for a simple fpu operation like fabs. */
34419 *total = cost->fabs;
34421 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34422 *total = cost->add * 2;
34423 else
34424 *total = cost->add;
34425 return false;
34427 case COMPARE:
34428 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34429 && XEXP (XEXP (x, 0), 1) == const1_rtx
34430 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34431 && XEXP (x, 1) == const0_rtx)
34433 /* This kind of construct is implemented using test[bwl].
34434 Treat it as if we had an AND. */
34435 *total = (cost->add
34436 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34437 + rtx_cost (const1_rtx, outer_code, opno, speed));
34438 return true;
34440 return false;
34442 case FLOAT_EXTEND:
34443 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34444 *total = 0;
34445 return false;
34447 case ABS:
34448 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34449 /* ??? SSE cost should be used here. */
34450 *total = cost->fabs;
34451 else if (X87_FLOAT_MODE_P (mode))
34452 *total = cost->fabs;
34453 else if (FLOAT_MODE_P (mode))
34454 /* ??? SSE vector cost should be used here. */
34455 *total = cost->fabs;
34456 return false;
34458 case SQRT:
34459 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34460 /* ??? SSE cost should be used here. */
34461 *total = cost->fsqrt;
34462 else if (X87_FLOAT_MODE_P (mode))
34463 *total = cost->fsqrt;
34464 else if (FLOAT_MODE_P (mode))
34465 /* ??? SSE vector cost should be used here. */
34466 *total = cost->fsqrt;
34467 return false;
34469 case UNSPEC:
34470 if (XINT (x, 1) == UNSPEC_TP)
34471 *total = 0;
34472 return false;
34474 case VEC_SELECT:
34475 case VEC_CONCAT:
34476 case VEC_MERGE:
34477 case VEC_DUPLICATE:
34478 /* ??? Assume all of these vector manipulation patterns are
34479 recognizable. In which case they all pretty much have the
34480 same cost. */
34481 *total = cost->fabs;
34482 return true;
34484 default:
34485 return false;
34489 #if TARGET_MACHO
34491 static int current_machopic_label_num;
34493 /* Given a symbol name and its associated stub, write out the
34494 definition of the stub. */
34496 void
34497 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34499 unsigned int length;
34500 char *binder_name, *symbol_name, lazy_ptr_name[32];
34501 int label = ++current_machopic_label_num;
34503 /* For 64-bit we shouldn't get here. */
34504 gcc_assert (!TARGET_64BIT);
34506 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34507 symb = targetm.strip_name_encoding (symb);
34509 length = strlen (stub);
34510 binder_name = XALLOCAVEC (char, length + 32);
34511 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34513 length = strlen (symb);
34514 symbol_name = XALLOCAVEC (char, length + 32);
34515 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34517 sprintf (lazy_ptr_name, "L%d$lz", label);
34519 if (MACHOPIC_ATT_STUB)
34520 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34521 else if (MACHOPIC_PURE)
34522 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34523 else
34524 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34526 fprintf (file, "%s:\n", stub);
34527 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34529 if (MACHOPIC_ATT_STUB)
34531 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34533 else if (MACHOPIC_PURE)
34535 /* PIC stub. */
34536 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34537 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34538 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34539 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34540 label, lazy_ptr_name, label);
34541 fprintf (file, "\tjmp\t*%%ecx\n");
34543 else
34544 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34546 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34547 it needs no stub-binding-helper. */
34548 if (MACHOPIC_ATT_STUB)
34549 return;
34551 fprintf (file, "%s:\n", binder_name);
34553 if (MACHOPIC_PURE)
34555 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34556 fprintf (file, "\tpushl\t%%ecx\n");
34558 else
34559 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34561 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34563 /* N.B. Keep the correspondence of these
34564 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34565 old-pic/new-pic/non-pic stubs; altering this will break
34566 compatibility with existing dylibs. */
34567 if (MACHOPIC_PURE)
34569 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34570 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34572 else
34573 /* 16-byte -mdynamic-no-pic stub. */
34574 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34576 fprintf (file, "%s:\n", lazy_ptr_name);
34577 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34578 fprintf (file, ASM_LONG "%s\n", binder_name);
34580 #endif /* TARGET_MACHO */
34582 /* Order the registers for register allocator. */
34584 void
34585 x86_order_regs_for_local_alloc (void)
34587 int pos = 0;
34588 int i;
34590 /* First allocate the local general purpose registers. */
34591 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34592 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34593 reg_alloc_order [pos++] = i;
34595 /* Global general purpose registers. */
34596 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34597 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34598 reg_alloc_order [pos++] = i;
34600 /* x87 registers come first in case we are doing FP math
34601 using them. */
34602 if (!TARGET_SSE_MATH)
34603 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34604 reg_alloc_order [pos++] = i;
34606 /* SSE registers. */
34607 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34608 reg_alloc_order [pos++] = i;
34609 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34610 reg_alloc_order [pos++] = i;
34612 /* x87 registers. */
34613 if (TARGET_SSE_MATH)
34614 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34615 reg_alloc_order [pos++] = i;
34617 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34618 reg_alloc_order [pos++] = i;
34620 /* Initialize the rest of the array, as we do not allocate some registers
34621 at all. */
34622 while (pos < FIRST_PSEUDO_REGISTER)
34623 reg_alloc_order [pos++] = 0;
34626 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34627 in struct attribute_spec.handler. */
34628 static tree
34629 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34630 tree args,
34631 int flags ATTRIBUTE_UNUSED,
34632 bool *no_add_attrs)
34634 if (TREE_CODE (*node) != FUNCTION_TYPE
34635 && TREE_CODE (*node) != METHOD_TYPE
34636 && TREE_CODE (*node) != FIELD_DECL
34637 && TREE_CODE (*node) != TYPE_DECL)
34639 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34640 name);
34641 *no_add_attrs = true;
34642 return NULL_TREE;
34644 if (TARGET_64BIT)
34646 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34647 name);
34648 *no_add_attrs = true;
34649 return NULL_TREE;
34651 if (is_attribute_p ("callee_pop_aggregate_return", name))
34653 tree cst;
34655 cst = TREE_VALUE (args);
34656 if (TREE_CODE (cst) != INTEGER_CST)
34658 warning (OPT_Wattributes,
34659 "%qE attribute requires an integer constant argument",
34660 name);
34661 *no_add_attrs = true;
34663 else if (compare_tree_int (cst, 0) != 0
34664 && compare_tree_int (cst, 1) != 0)
34666 warning (OPT_Wattributes,
34667 "argument to %qE attribute is neither zero, nor one",
34668 name);
34669 *no_add_attrs = true;
34672 return NULL_TREE;
34675 return NULL_TREE;
34678 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34679 struct attribute_spec.handler. */
34680 static tree
34681 ix86_handle_abi_attribute (tree *node, tree name,
34682 tree args ATTRIBUTE_UNUSED,
34683 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34685 if (TREE_CODE (*node) != FUNCTION_TYPE
34686 && TREE_CODE (*node) != METHOD_TYPE
34687 && TREE_CODE (*node) != FIELD_DECL
34688 && TREE_CODE (*node) != TYPE_DECL)
34690 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34691 name);
34692 *no_add_attrs = true;
34693 return NULL_TREE;
34696 /* Can combine regparm with all attributes but fastcall. */
34697 if (is_attribute_p ("ms_abi", name))
34699 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34701 error ("ms_abi and sysv_abi attributes are not compatible");
34704 return NULL_TREE;
34706 else if (is_attribute_p ("sysv_abi", name))
34708 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34710 error ("ms_abi and sysv_abi attributes are not compatible");
34713 return NULL_TREE;
34716 return NULL_TREE;
34719 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34720 struct attribute_spec.handler. */
34721 static tree
34722 ix86_handle_struct_attribute (tree *node, tree name,
34723 tree args ATTRIBUTE_UNUSED,
34724 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34726 tree *type = NULL;
34727 if (DECL_P (*node))
34729 if (TREE_CODE (*node) == TYPE_DECL)
34730 type = &TREE_TYPE (*node);
34732 else
34733 type = node;
34735 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34737 warning (OPT_Wattributes, "%qE attribute ignored",
34738 name);
34739 *no_add_attrs = true;
34742 else if ((is_attribute_p ("ms_struct", name)
34743 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34744 || ((is_attribute_p ("gcc_struct", name)
34745 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34747 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34748 name);
34749 *no_add_attrs = true;
34752 return NULL_TREE;
34755 static tree
34756 ix86_handle_fndecl_attribute (tree *node, tree name,
34757 tree args ATTRIBUTE_UNUSED,
34758 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34760 if (TREE_CODE (*node) != FUNCTION_DECL)
34762 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34763 name);
34764 *no_add_attrs = true;
34766 return NULL_TREE;
34769 static bool
34770 ix86_ms_bitfield_layout_p (const_tree record_type)
34772 return ((TARGET_MS_BITFIELD_LAYOUT
34773 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34774 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34777 /* Returns an expression indicating where the this parameter is
34778 located on entry to the FUNCTION. */
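/* Sketch of the cases handled below.  For 64-bit, THIS is in the first
   integer parameter register, or the second when a hidden aggregate-return
   pointer comes first.  For 32-bit: fastcall puts THIS in %ecx (%edx with
   an aggregate-return pointer); thiscall uses %ecx, falling back to the
   stack slot at 4(%esp) in the aggregate-return case; regparm uses %eax
   (%edx with an aggregate-return pointer, or the stack if only one register
   is available); otherwise THIS lives at 4(%esp), or 8(%esp) when an
   aggregate-return pointer is passed first.  */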
34780 static rtx
34781 x86_this_parameter (tree function)
34783 tree type = TREE_TYPE (function);
34784 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34785 int nregs;
34787 if (TARGET_64BIT)
34789 const int *parm_regs;
34791 if (ix86_function_type_abi (type) == MS_ABI)
34792 parm_regs = x86_64_ms_abi_int_parameter_registers;
34793 else
34794 parm_regs = x86_64_int_parameter_registers;
34795 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34798 nregs = ix86_function_regparm (type, function);
34800 if (nregs > 0 && !stdarg_p (type))
34802 int regno;
34803 unsigned int ccvt = ix86_get_callcvt (type);
34805 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34806 regno = aggr ? DX_REG : CX_REG;
34807 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34809 regno = CX_REG;
34810 if (aggr)
34811 return gen_rtx_MEM (SImode,
34812 plus_constant (Pmode, stack_pointer_rtx, 4));
34814 else
34816 regno = AX_REG;
34817 if (aggr)
34819 regno = DX_REG;
34820 if (nregs == 1)
34821 return gen_rtx_MEM (SImode,
34822 plus_constant (Pmode,
34823 stack_pointer_rtx, 4));
34826 return gen_rtx_REG (SImode, regno);
34829 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34830 aggr ? 8 : 4));
34833 /* Determine whether x86_output_mi_thunk can succeed. */
34835 static bool
34836 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34837 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34838 HOST_WIDE_INT vcall_offset, const_tree function)
34840 /* 64-bit can handle anything. */
34841 if (TARGET_64BIT)
34842 return true;
34844 /* For 32-bit, everything's fine if we have one free register. */
34845 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34846 return true;
34848 /* Need a free register for vcall_offset. */
34849 if (vcall_offset)
34850 return false;
34852 /* Need a free register for GOT references. */
34853 if (flag_pic && !targetm.binds_local_p (function))
34854 return false;
34856 /* Otherwise ok. */
34857 return true;
34860 /* Output the assembler code for a thunk function. THUNK_DECL is the
34861 declaration for the thunk function itself, FUNCTION is the decl for
34862 the target function. DELTA is an immediate constant offset to be
34863 added to THIS. If VCALL_OFFSET is nonzero, the word at
34864 *(*this + vcall_offset) should be added to THIS. */
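/* Informally, the thunk emitted below behaves like:

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(*this + VCALL_OFFSET);
     goto FUNCTION;

   with THIS adjusted in place (in its register or stack slot) before the
   tail jump.  Pseudo-C reading aid only; the real work is the RTL
   generation that follows.  */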
34866 static void
34867 x86_output_mi_thunk (FILE *file,
34868 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34869 HOST_WIDE_INT vcall_offset, tree function)
34871 rtx this_param = x86_this_parameter (function);
34872 rtx this_reg, tmp, fnaddr;
34873 unsigned int tmp_regno;
34875 if (TARGET_64BIT)
34876 tmp_regno = R10_REG;
34877 else
34879 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34880 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34881 tmp_regno = AX_REG;
34882 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34883 tmp_regno = DX_REG;
34884 else
34885 tmp_regno = CX_REG;
34888 emit_note (NOTE_INSN_PROLOGUE_END);
34890 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34891 pull it in now and let DELTA benefit. */
34892 if (REG_P (this_param))
34893 this_reg = this_param;
34894 else if (vcall_offset)
34896 /* Put the this parameter into %eax. */
34897 this_reg = gen_rtx_REG (Pmode, AX_REG);
34898 emit_move_insn (this_reg, this_param);
34900 else
34901 this_reg = NULL_RTX;
34903 /* Adjust the this parameter by a fixed constant. */
34904 if (delta)
34906 rtx delta_rtx = GEN_INT (delta);
34907 rtx delta_dst = this_reg ? this_reg : this_param;
34909 if (TARGET_64BIT)
34911 if (!x86_64_general_operand (delta_rtx, Pmode))
34913 tmp = gen_rtx_REG (Pmode, tmp_regno);
34914 emit_move_insn (tmp, delta_rtx);
34915 delta_rtx = tmp;
34919 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34922 /* Adjust the this parameter by a value stored in the vtable. */
34923 if (vcall_offset)
34925 rtx vcall_addr, vcall_mem, this_mem;
34927 tmp = gen_rtx_REG (Pmode, tmp_regno);
34929 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34930 if (Pmode != ptr_mode)
34931 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34932 emit_move_insn (tmp, this_mem);
34934 /* Adjust the this parameter. */
34935 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34936 if (TARGET_64BIT
34937 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34939 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34940 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34941 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34944 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34945 if (Pmode != ptr_mode)
34946 emit_insn (gen_addsi_1_zext (this_reg,
34947 gen_rtx_REG (ptr_mode,
34948 REGNO (this_reg)),
34949 vcall_mem));
34950 else
34951 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34954 /* If necessary, drop THIS back to its stack slot. */
34955 if (this_reg && this_reg != this_param)
34956 emit_move_insn (this_param, this_reg);
34958 fnaddr = XEXP (DECL_RTL (function), 0);
34959 if (TARGET_64BIT)
34961 if (!flag_pic || targetm.binds_local_p (function)
34962 || cfun->machine->call_abi == MS_ABI)
34964 else
34966 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34967 tmp = gen_rtx_CONST (Pmode, tmp);
34968 fnaddr = gen_rtx_MEM (Pmode, tmp);
34971 else
34973 if (!flag_pic || targetm.binds_local_p (function))
34975 #if TARGET_MACHO
34976 else if (TARGET_MACHO)
34978 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34979 fnaddr = XEXP (fnaddr, 0);
34981 #endif /* TARGET_MACHO */
34982 else
34984 tmp = gen_rtx_REG (Pmode, CX_REG);
34985 output_set_got (tmp, NULL_RTX);
34987 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34988 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34989 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34993 /* Our sibling call patterns do not allow memories, because we have no
34994 predicate that can distinguish between frame and non-frame memory.
34995 For our purposes here, we can get away with (ab)using a jump pattern,
34996 because we're going to do no optimization. */
34997 if (MEM_P (fnaddr))
34998 emit_jump_insn (gen_indirect_jump (fnaddr));
34999 else
35001 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35002 fnaddr = legitimize_pic_address (fnaddr,
35003 gen_rtx_REG (Pmode, tmp_regno));
35005 if (!sibcall_insn_operand (fnaddr, word_mode))
35007 tmp = gen_rtx_REG (word_mode, tmp_regno);
35008 if (GET_MODE (fnaddr) != word_mode)
35009 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35010 emit_move_insn (tmp, fnaddr);
35011 fnaddr = tmp;
35014 tmp = gen_rtx_MEM (QImode, fnaddr);
35015 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35016 tmp = emit_call_insn (tmp);
35017 SIBLING_CALL_P (tmp) = 1;
35019 emit_barrier ();
35021 /* Emit just enough of rest_of_compilation to get the insns emitted.
35022 Note that use_thunk calls assemble_start_function et al. */
35023 tmp = get_insns ();
35024 shorten_branches (tmp);
35025 final_start_function (tmp, file, 1);
35026 final (tmp, file, 1);
35027 final_end_function ();
35030 static void
35031 x86_file_start (void)
35033 default_file_start ();
35034 #if TARGET_MACHO
35035 darwin_file_start ();
35036 #endif
35037 if (X86_FILE_START_VERSION_DIRECTIVE)
35038 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35039 if (X86_FILE_START_FLTUSED)
35040 fputs ("\t.global\t__fltused\n", asm_out_file);
35041 if (ix86_asm_dialect == ASM_INTEL)
35042 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35045 int
35046 x86_field_alignment (tree field, int computed)
35048 enum machine_mode mode;
35049 tree type = TREE_TYPE (field);
35051 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35052 return computed;
35053 mode = TYPE_MODE (strip_array_types (type));
35054 if (mode == DFmode || mode == DCmode
35055 || GET_MODE_CLASS (mode) == MODE_INT
35056 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35057 return MIN (32, computed);
35058 return computed;
35061 /* Output assembler code to FILE to increment profiler label # LABELNO
35062 for profiling a function entry. */
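/* Illustrative output (the exact form depends on the ABI, PIC and
   -mfentry):

     64-bit:          call mcount          or: call *mcount@GOTPCREL(%rip)
     32-bit, PIC:     call *mcount@GOT(%ebx)
     32-bit, non-PIC: call mcount

   optionally preceded by loading the address of the per-call-site profile
   counter, as emitted by the fprintf calls below.  */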
35063 void
35064 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35066 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35067 : MCOUNT_NAME);
35069 if (TARGET_64BIT)
35071 #ifndef NO_PROFILE_COUNTERS
35072 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35073 #endif
35075 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35076 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35077 else
35078 fprintf (file, "\tcall\t%s\n", mcount_name);
35080 else if (flag_pic)
35082 #ifndef NO_PROFILE_COUNTERS
35083 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35084 LPREFIX, labelno);
35085 #endif
35086 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35088 else
35090 #ifndef NO_PROFILE_COUNTERS
35091 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35092 LPREFIX, labelno);
35093 #endif
35094 fprintf (file, "\tcall\t%s\n", mcount_name);
35098 /* We don't have exact information about the insn sizes, but we may assume
35099 quite safely that we are informed about all 1 byte insns and memory
35100 address sizes. This is enough to eliminate unnecessary padding in
35101 99% of cases. */
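/* Note: the value returned below is a conservative lower bound on the
   insn's length in bytes.  It feeds ix86_avoid_jump_mispredicts, where a
   lower bound simply biases the pass toward emitting padding.
   (Descriptive note only.)  */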
35103 static int
35104 min_insn_size (rtx insn)
35106 int l = 0, len;
35108 if (!INSN_P (insn) || !active_insn_p (insn))
35109 return 0;
35111 /* Discard alignments we've emitted and jump table data. */
35112 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35113 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35114 return 0;
35115 if (JUMP_TABLE_DATA_P (insn))
35116 return 0;
35118 /* Important case - calls are always 5 bytes.
35119 It is common to have many calls in a row. */
35120 if (CALL_P (insn)
35121 && symbolic_reference_mentioned_p (PATTERN (insn))
35122 && !SIBLING_CALL_P (insn))
35123 return 5;
35124 len = get_attr_length (insn);
35125 if (len <= 1)
35126 return 1;
35128 /* For normal instructions we rely on get_attr_length being exact,
35129 with a few exceptions. */
35130 if (!JUMP_P (insn))
35132 enum attr_type type = get_attr_type (insn);
35134 switch (type)
35136 case TYPE_MULTI:
35137 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35138 || asm_noperands (PATTERN (insn)) >= 0)
35139 return 0;
35140 break;
35141 case TYPE_OTHER:
35142 case TYPE_FCMP:
35143 break;
35144 default:
35145 /* Otherwise trust get_attr_length. */
35146 return len;
35149 l = get_attr_length_address (insn);
35150 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35151 l = 4;
35153 if (l)
35154 return 1+l;
35155 else
35156 return 2;
35159 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35161 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
35162 window. */
35164 static void
35165 ix86_avoid_jump_mispredicts (void)
35167 rtx insn, start = get_insns ();
35168 int nbytes = 0, njumps = 0;
35169 int isjump = 0;
35171 /* Look for all minimal intervals of instructions containing 4 jumps.
35172 The intervals are bounded by START and INSN. NBYTES is the total
35173 size of the instructions in the interval, including INSN and not
35174 including START. When NBYTES is smaller than 16 bytes, it is possible
35175 that START and INSN end up in the same 16-byte window.
35177 The smallest offset in the window at which INSN can start is the case
35178 where START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
35179 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN). */
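/* Illustrative reading of the loop below: NBYTES counts the bytes of the
   insns strictly after START up to and including INSN, and NJUMPS the
   jump/call insns among them.  Whenever INSN would be the fourth jump and
   all four could still share one 16-byte window (NBYTES < 16), a pad of
   15 - NBYTES + sizeof (INSN) bytes is emitted before INSN so the
   assembler can push it into the next window.  */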
35181 for (insn = start; insn; insn = NEXT_INSN (insn))
35183 int min_size;
35185 if (LABEL_P (insn))
35187 int align = label_to_alignment (insn);
35188 int max_skip = label_to_max_skip (insn);
35190 if (max_skip > 15)
35191 max_skip = 15;
35192 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35193 already in the current 16 byte page, because otherwise
35194 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35195 bytes to reach 16 byte boundary. */
35196 if (align <= 0
35197 || (align <= 3 && max_skip != (1 << align) - 1))
35198 max_skip = 0;
35199 if (dump_file)
35200 fprintf (dump_file, "Label %i with max_skip %i\n",
35201 INSN_UID (insn), max_skip);
35202 if (max_skip)
35204 while (nbytes + max_skip >= 16)
35206 start = NEXT_INSN (start);
35207 if ((JUMP_P (start)
35208 && GET_CODE (PATTERN (start)) != ADDR_VEC
35209 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35210 || CALL_P (start))
35211 njumps--, isjump = 1;
35212 else
35213 isjump = 0;
35214 nbytes -= min_insn_size (start);
35217 continue;
35220 min_size = min_insn_size (insn);
35221 nbytes += min_size;
35222 if (dump_file)
35223 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35224 INSN_UID (insn), min_size);
35225 if ((JUMP_P (insn)
35226 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35227 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35228 || CALL_P (insn))
35229 njumps++;
35230 else
35231 continue;
35233 while (njumps > 3)
35235 start = NEXT_INSN (start);
35236 if ((JUMP_P (start)
35237 && GET_CODE (PATTERN (start)) != ADDR_VEC
35238 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35239 || CALL_P (start))
35240 njumps--, isjump = 1;
35241 else
35242 isjump = 0;
35243 nbytes -= min_insn_size (start);
35245 gcc_assert (njumps >= 0);
35246 if (dump_file)
35247 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35248 INSN_UID (start), INSN_UID (insn), nbytes);
35250 if (njumps == 3 && isjump && nbytes < 16)
35252 int padsize = 15 - nbytes + min_insn_size (insn);
35254 if (dump_file)
35255 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35256 INSN_UID (insn), padsize);
35257 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35261 #endif
35263 /* AMD Athlon works faster
35264 when RET is not the destination of a conditional jump or directly preceded
35265 by another jump instruction. We avoid the penalty by inserting a NOP just
35266 before the RET instruction in such cases. */
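/* Sketch of the pass below: for every block ending in a return (and not
   optimized for size), the return is rewritten to the "long" return
   pattern when it directly follows a conditional jump or a call, when it
   directly follows a label entered by a non-fallthru edge, or when the
   function body is otherwise empty.  The long form is the padded return
   the comment above refers to.  */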
35267 static void
35268 ix86_pad_returns (void)
35270 edge e;
35271 edge_iterator ei;
35273 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35275 basic_block bb = e->src;
35276 rtx ret = BB_END (bb);
35277 rtx prev;
35278 bool replace = false;
35280 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35281 || optimize_bb_for_size_p (bb))
35282 continue;
35283 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35284 if (active_insn_p (prev) || LABEL_P (prev))
35285 break;
35286 if (prev && LABEL_P (prev))
35288 edge e;
35289 edge_iterator ei;
35291 FOR_EACH_EDGE (e, ei, bb->preds)
35292 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35293 && !(e->flags & EDGE_FALLTHRU))
35294 replace = true;
35296 if (!replace)
35298 prev = prev_active_insn (ret);
35299 if (prev
35300 && ((JUMP_P (prev) && any_condjump_p (prev))
35301 || CALL_P (prev)))
35302 replace = true;
35303 /* Empty functions get a branch mispredict even when
35304 the jump destination is not visible to us. */
35305 if (!prev && !optimize_function_for_size_p (cfun))
35306 replace = true;
35308 if (replace)
35310 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35311 delete_insn (ret);
35316 /* Count the minimum number of instructions in BB. Return 4 if the
35317 number of instructions >= 4. */
35319 static int
35320 ix86_count_insn_bb (basic_block bb)
35322 rtx insn;
35323 int insn_count = 0;
35325 /* Count number of instructions in this block. Return 4 if the number
35326 of instructions >= 4. */
35327 FOR_BB_INSNS (bb, insn)
35329 /* This only happens in exit blocks. */
35330 if (JUMP_P (insn)
35331 && ANY_RETURN_P (PATTERN (insn)))
35332 break;
35334 if (NONDEBUG_INSN_P (insn)
35335 && GET_CODE (PATTERN (insn)) != USE
35336 && GET_CODE (PATTERN (insn)) != CLOBBER)
35338 insn_count++;
35339 if (insn_count >= 4)
35340 return insn_count;
35344 return insn_count;
35348 /* Count the minimum number of instructions in code path in BB.
35349 Return 4 if the number of instructions >= 4. */
35351 static int
35352 ix86_count_insn (basic_block bb)
35354 edge e;
35355 edge_iterator ei;
35356 int min_prev_count;
35358 /* Only bother counting instructions along paths with no
35359 more than 2 basic blocks between entry and exit. Given
35360 that BB has an edge to exit, determine if a predecessor
35361 of BB has an edge from entry. If so, compute the number
35362 of instructions in the predecessor block. If there
35363 happen to be multiple such blocks, compute the minimum. */
35364 min_prev_count = 4;
35365 FOR_EACH_EDGE (e, ei, bb->preds)
35367 edge prev_e;
35368 edge_iterator prev_ei;
35370 if (e->src == ENTRY_BLOCK_PTR)
35372 min_prev_count = 0;
35373 break;
35375 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35377 if (prev_e->src == ENTRY_BLOCK_PTR)
35379 int count = ix86_count_insn_bb (e->src);
35380 if (count < min_prev_count)
35381 min_prev_count = count;
35382 break;
35387 if (min_prev_count < 4)
35388 min_prev_count += ix86_count_insn_bb (bb);
35390 return min_prev_count;
35393 /* Pad short function to 4 instructions. */
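/* Sketch: for each block that ends in a return and lies on a path of at
   most two basic blocks from the function entry, ix86_count_insn gives
   the minimum number of real insns executed.  If that minimum is below 4,
   gen_nops is emitted just before the epilogue with 2 * (4 - count) NOPs,
   two NOPs being counted as one instruction.  (Summary of the code below;
   enabled via TARGET_PAD_SHORT_FUNCTION in ix86_reorg.)  */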
35395 static void
35396 ix86_pad_short_function (void)
35398 edge e;
35399 edge_iterator ei;
35401 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35403 rtx ret = BB_END (e->src);
35404 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35406 int insn_count = ix86_count_insn (e->src);
35408 /* Pad short function. */
35409 if (insn_count < 4)
35411 rtx insn = ret;
35413 /* Find epilogue. */
35414 while (insn
35415 && (!NOTE_P (insn)
35416 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35417 insn = PREV_INSN (insn);
35419 if (!insn)
35420 insn = ret;
35422 /* Two NOPs count as one instruction. */
35423 insn_count = 2 * (4 - insn_count);
35424 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35430 /* Implement machine specific optimizations. We implement padding of returns
35431 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
35432 static void
35433 ix86_reorg (void)
35435 /* We are freeing block_for_insn in the toplev to keep compatibility
35436 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35437 compute_bb_for_insn ();
35439 if (optimize && optimize_function_for_speed_p (cfun))
35441 if (TARGET_PAD_SHORT_FUNCTION)
35442 ix86_pad_short_function ();
35443 else if (TARGET_PAD_RETURNS)
35444 ix86_pad_returns ();
35445 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35446 if (TARGET_FOUR_JUMP_LIMIT)
35447 ix86_avoid_jump_mispredicts ();
35448 #endif
35452 /* Return nonzero when a QImode register that must be represented via a REX
35453 prefix is used. */
35454 bool
35455 x86_extended_QIreg_mentioned_p (rtx insn)
35457 int i;
35458 extract_insn_cached (insn);
35459 for (i = 0; i < recog_data.n_operands; i++)
35460 if (GENERAL_REG_P (recog_data.operand[i])
35461 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35462 return true;
35463 return false;
35466 /* Return nonzero when P points to a register encoded via a REX prefix.
35467 Called via for_each_rtx. */
35468 static int
35469 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35471 unsigned int regno;
35472 if (!REG_P (*p))
35473 return 0;
35474 regno = REGNO (*p);
35475 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35478 /* Return true when INSN mentions register that must be encoded using REX
35479 prefix. */
35480 bool
35481 x86_extended_reg_mentioned_p (rtx insn)
35483 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35484 extended_reg_mentioned_1, NULL);
35487 /* If profitable, negate (without causing overflow) integer constant
35488 of mode MODE at location LOC. Return true in this case. */
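/* Example of the rewrite performed below: "addl $-4, %eax" becomes
   "subl $4, %eax" once the constant is negated, and an operand of +128
   becomes -128 so it still fits in a sign-extended 8-bit immediate; the
   caller is expected to emit the complementary operation when this
   returns true.  (Illustrative note.)  */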
35489 bool
35490 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35492 HOST_WIDE_INT val;
35494 if (!CONST_INT_P (*loc))
35495 return false;
35497 switch (mode)
35499 case DImode:
35500 /* DImode x86_64 constants must fit in 32 bits. */
35501 gcc_assert (x86_64_immediate_operand (*loc, mode));
35503 mode = SImode;
35504 break;
35506 case SImode:
35507 case HImode:
35508 case QImode:
35509 break;
35511 default:
35512 gcc_unreachable ();
35515 /* Avoid overflows. */
35516 if (mode_signbit_p (mode, *loc))
35517 return false;
35519 val = INTVAL (*loc);
35521 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35522 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35523 if ((val < 0 && val != -128)
35524 || val == 128)
35526 *loc = GEN_INT (-val);
35527 return true;
35530 return false;
35533 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35534 optabs would emit if we didn't have TFmode patterns. */
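/* Pseudo-C sketch of the sequence emitted below:

     if ((signed) x >= 0)
       out = (FLOAT) x;
     else
       {
         t = (x >> 1) | (x & 1);     -- halve, keeping the low bit sticky
         out = (FLOAT) t;
         out = out + out;            -- undo the halving
       }

   The OR of the low bit keeps the final rounding correct.  */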
35536 void
35537 x86_emit_floatuns (rtx operands[2])
35539 rtx neglab, donelab, i0, i1, f0, in, out;
35540 enum machine_mode mode, inmode;
35542 inmode = GET_MODE (operands[1]);
35543 gcc_assert (inmode == SImode || inmode == DImode);
35545 out = operands[0];
35546 in = force_reg (inmode, operands[1]);
35547 mode = GET_MODE (out);
35548 neglab = gen_label_rtx ();
35549 donelab = gen_label_rtx ();
35550 f0 = gen_reg_rtx (mode);
35552 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35554 expand_float (out, in, 0);
35556 emit_jump_insn (gen_jump (donelab));
35557 emit_barrier ();
35559 emit_label (neglab);
35561 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35562 1, OPTAB_DIRECT);
35563 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35564 1, OPTAB_DIRECT);
35565 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35567 expand_float (f0, i0, 0);
35569 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35571 emit_label (donelab);
35574 /* AVX2 supports 32-byte integer vector operations,
35575 so the longest vector we have to handle is V32QImode. */
35576 #define MAX_VECT_LEN 32
35578 struct expand_vec_perm_d
35580 rtx target, op0, op1;
35581 unsigned char perm[MAX_VECT_LEN];
35582 enum machine_mode vmode;
35583 unsigned char nelt;
35584 bool one_operand_p;
35585 bool testing_p;
35588 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35589 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35590 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35592 /* Get a vector mode of the same size as the original but with elements
35593 twice as wide. This is only guaranteed to apply to integral vectors. */
35595 static inline enum machine_mode
35596 get_mode_wider_vector (enum machine_mode o)
35598 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35599 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35600 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35601 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35602 return n;
35605 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35606 with all elements equal to VAR. Return true if successful. */
35608 static bool
35609 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35610 rtx target, rtx val)
35612 bool ok;
35614 switch (mode)
35616 case V2SImode:
35617 case V2SFmode:
35618 if (!mmx_ok)
35619 return false;
35620 /* FALLTHRU */
35622 case V4DFmode:
35623 case V4DImode:
35624 case V8SFmode:
35625 case V8SImode:
35626 case V2DFmode:
35627 case V2DImode:
35628 case V4SFmode:
35629 case V4SImode:
35631 rtx insn, dup;
35633 /* First attempt to recognize VAL as-is. */
35634 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35635 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35636 if (recog_memoized (insn) < 0)
35638 rtx seq;
35639 /* If that fails, force VAL into a register. */
35641 start_sequence ();
35642 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35643 seq = get_insns ();
35644 end_sequence ();
35645 if (seq)
35646 emit_insn_before (seq, insn);
35648 ok = recog_memoized (insn) >= 0;
35649 gcc_assert (ok);
35652 return true;
35654 case V4HImode:
35655 if (!mmx_ok)
35656 return false;
35657 if (TARGET_SSE || TARGET_3DNOW_A)
35659 rtx x;
35661 val = gen_lowpart (SImode, val);
35662 x = gen_rtx_TRUNCATE (HImode, val);
35663 x = gen_rtx_VEC_DUPLICATE (mode, x);
35664 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35665 return true;
35667 goto widen;
35669 case V8QImode:
35670 if (!mmx_ok)
35671 return false;
35672 goto widen;
35674 case V8HImode:
35675 if (TARGET_SSE2)
35677 struct expand_vec_perm_d dperm;
35678 rtx tmp1, tmp2;
35680 permute:
35681 memset (&dperm, 0, sizeof (dperm));
35682 dperm.target = target;
35683 dperm.vmode = mode;
35684 dperm.nelt = GET_MODE_NUNITS (mode);
35685 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35686 dperm.one_operand_p = true;
35688 /* Extend to SImode using a paradoxical SUBREG. */
35689 tmp1 = gen_reg_rtx (SImode);
35690 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35692 /* Insert the SImode value as low element of a V4SImode vector. */
35693 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35694 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35696 ok = (expand_vec_perm_1 (&dperm)
35697 || expand_vec_perm_broadcast_1 (&dperm));
35698 gcc_assert (ok);
35699 return ok;
35701 goto widen;
35703 case V16QImode:
35704 if (TARGET_SSE2)
35705 goto permute;
35706 goto widen;
35708 widen:
35709 /* Replicate the value once into the next wider mode and recurse. */
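/* For instance (illustrative only): to splat a QImode value V across
   V8QImode, the code below first forms the HImode value V | (V << 8) and
   then recurses to splat that across V4HImode, halving the element count
   at each step until a directly supported duplicate pattern is reached.  */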
35711 enum machine_mode smode, wsmode, wvmode;
35712 rtx x;
35714 smode = GET_MODE_INNER (mode);
35715 wvmode = get_mode_wider_vector (mode);
35716 wsmode = GET_MODE_INNER (wvmode);
35718 val = convert_modes (wsmode, smode, val, true);
35719 x = expand_simple_binop (wsmode, ASHIFT, val,
35720 GEN_INT (GET_MODE_BITSIZE (smode)),
35721 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35722 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35724 x = gen_lowpart (wvmode, target);
35725 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35726 gcc_assert (ok);
35727 return ok;
35730 case V16HImode:
35731 case V32QImode:
35733 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35734 rtx x = gen_reg_rtx (hvmode);
35736 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35737 gcc_assert (ok);
35739 x = gen_rtx_VEC_CONCAT (mode, x, x);
35740 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35742 return true;
35744 default:
35745 return false;
35749 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35750 whose ONE_VAR element is VAR, and other elements are zero. Return true
35751 if successful. */
35753 static bool
35754 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35755 rtx target, rtx var, int one_var)
35757 enum machine_mode vsimode;
35758 rtx new_target;
35759 rtx x, tmp;
35760 bool use_vector_set = false;
35762 switch (mode)
35764 case V2DImode:
35765 /* For SSE4.1, we normally use vector set. But if the second
35766 element is zero and inter-unit moves are OK, we use movq
35767 instead. */
35768 use_vector_set = (TARGET_64BIT
35769 && TARGET_SSE4_1
35770 && !(TARGET_INTER_UNIT_MOVES
35771 && one_var == 0));
35772 break;
35773 case V16QImode:
35774 case V4SImode:
35775 case V4SFmode:
35776 use_vector_set = TARGET_SSE4_1;
35777 break;
35778 case V8HImode:
35779 use_vector_set = TARGET_SSE2;
35780 break;
35781 case V4HImode:
35782 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35783 break;
35784 case V32QImode:
35785 case V16HImode:
35786 case V8SImode:
35787 case V8SFmode:
35788 case V4DFmode:
35789 use_vector_set = TARGET_AVX;
35790 break;
35791 case V4DImode:
35792 /* Use ix86_expand_vector_set in 64bit mode only. */
35793 use_vector_set = TARGET_AVX && TARGET_64BIT;
35794 break;
35795 default:
35796 break;
35799 if (use_vector_set)
35801 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35802 var = force_reg (GET_MODE_INNER (mode), var);
35803 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35804 return true;
35807 switch (mode)
35809 case V2SFmode:
35810 case V2SImode:
35811 if (!mmx_ok)
35812 return false;
35813 /* FALLTHRU */
35815 case V2DFmode:
35816 case V2DImode:
35817 if (one_var != 0)
35818 return false;
35819 var = force_reg (GET_MODE_INNER (mode), var);
35820 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35821 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35822 return true;
35824 case V4SFmode:
35825 case V4SImode:
35826 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35827 new_target = gen_reg_rtx (mode);
35828 else
35829 new_target = target;
35830 var = force_reg (GET_MODE_INNER (mode), var);
35831 x = gen_rtx_VEC_DUPLICATE (mode, var);
35832 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35833 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35834 if (one_var != 0)
35836 /* We need to shuffle the value to the correct position, so
35837 create a new pseudo to store the intermediate result. */
35839 /* With SSE2, we can use the integer shuffle insns. */
35840 if (mode != V4SFmode && TARGET_SSE2)
35842 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35843 const1_rtx,
35844 GEN_INT (one_var == 1 ? 0 : 1),
35845 GEN_INT (one_var == 2 ? 0 : 1),
35846 GEN_INT (one_var == 3 ? 0 : 1)));
35847 if (target != new_target)
35848 emit_move_insn (target, new_target);
35849 return true;
35852 /* Otherwise convert the intermediate result to V4SFmode and
35853 use the SSE1 shuffle instructions. */
35854 if (mode != V4SFmode)
35856 tmp = gen_reg_rtx (V4SFmode);
35857 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35859 else
35860 tmp = new_target;
35862 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35863 const1_rtx,
35864 GEN_INT (one_var == 1 ? 0 : 1),
35865 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35866 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35868 if (mode != V4SFmode)
35869 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35870 else if (tmp != target)
35871 emit_move_insn (target, tmp);
35873 else if (target != new_target)
35874 emit_move_insn (target, new_target);
35875 return true;
35877 case V8HImode:
35878 case V16QImode:
35879 vsimode = V4SImode;
35880 goto widen;
35881 case V4HImode:
35882 case V8QImode:
35883 if (!mmx_ok)
35884 return false;
35885 vsimode = V2SImode;
35886 goto widen;
35887 widen:
35888 if (one_var != 0)
35889 return false;
35891 /* Zero extend the variable element to SImode and recurse. */
35892 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35894 x = gen_reg_rtx (vsimode);
35895 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35896 var, one_var))
35897 gcc_unreachable ();
35899 emit_move_insn (target, gen_lowpart (mode, x));
35900 return true;
35902 default:
35903 return false;
35907 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35908 consisting of the values in VALS. It is known that all elements
35909 except ONE_VAR are constants. Return true if successful. */
35911 static bool
35912 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35913 rtx target, rtx vals, int one_var)
35915 rtx var = XVECEXP (vals, 0, one_var);
35916 enum machine_mode wmode;
35917 rtx const_vec, x;
35919 const_vec = copy_rtx (vals);
35920 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35921 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35923 switch (mode)
35925 case V2DFmode:
35926 case V2DImode:
35927 case V2SFmode:
35928 case V2SImode:
35929 /* For the two element vectors, it's just as easy to use
35930 the general case. */
35931 return false;
35933 case V4DImode:
35934 /* Use ix86_expand_vector_set in 64bit mode only. */
35935 if (!TARGET_64BIT)
35936 return false;
35937 case V4DFmode:
35938 case V8SFmode:
35939 case V8SImode:
35940 case V16HImode:
35941 case V32QImode:
35942 case V4SFmode:
35943 case V4SImode:
35944 case V8HImode:
35945 case V4HImode:
35946 break;
35948 case V16QImode:
35949 if (TARGET_SSE4_1)
35950 break;
35951 wmode = V8HImode;
35952 goto widen;
35953 case V8QImode:
35954 wmode = V4HImode;
35955 goto widen;
35956 widen:
35957 /* There's no way to set one QImode entry easily. Combine
35958 the variable value with its adjacent constant value, and
35959 promote to an HImode set. */
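/* E.g. (illustrative) to set byte 5 of a V16QImode vector: the variable
   byte is paired with the constant byte 4 (one_var ^ 1), combined into
   the HImode value (byte5 << 8) | byte4 -- or the mirror image when the
   variable byte has an even index -- and then stored as HImode element 2
   (one_var >> 1) of the V8HImode view of the vector.  */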
35960 x = XVECEXP (vals, 0, one_var ^ 1);
35961 if (one_var & 1)
35963 var = convert_modes (HImode, QImode, var, true);
35964 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35965 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35966 x = GEN_INT (INTVAL (x) & 0xff);
35968 else
35970 var = convert_modes (HImode, QImode, var, true);
35971 x = gen_int_mode (INTVAL (x) << 8, HImode);
35973 if (x != const0_rtx)
35974 var = expand_simple_binop (HImode, IOR, var, x, var,
35975 1, OPTAB_LIB_WIDEN);
35977 x = gen_reg_rtx (wmode);
35978 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35979 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35981 emit_move_insn (target, gen_lowpart (mode, x));
35982 return true;
35984 default:
35985 return false;
35988 emit_move_insn (target, const_vec);
35989 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35990 return true;
35993 /* A subroutine of ix86_expand_vector_init_general. Use vector
35994 concatenate to handle the most general case: all values variable,
35995 and none identical. */
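/* E.g. (illustrative) a V8SImode build from eight scalar values proceeds
   pairwise: four V2SImode registers are initialized first, those are
   concatenated into two V4SImode halves, and a final VEC_CONCAT of the
   halves produces the V8SImode result.  Inputs are processed backward to
   help the register allocator (see the FIXME below).  */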
35997 static void
35998 ix86_expand_vector_init_concat (enum machine_mode mode,
35999 rtx target, rtx *ops, int n)
36001 enum machine_mode cmode, hmode = VOIDmode;
36002 rtx first[8], second[4];
36003 rtvec v;
36004 int i, j;
36006 switch (n)
36008 case 2:
36009 switch (mode)
36011 case V8SImode:
36012 cmode = V4SImode;
36013 break;
36014 case V8SFmode:
36015 cmode = V4SFmode;
36016 break;
36017 case V4DImode:
36018 cmode = V2DImode;
36019 break;
36020 case V4DFmode:
36021 cmode = V2DFmode;
36022 break;
36023 case V4SImode:
36024 cmode = V2SImode;
36025 break;
36026 case V4SFmode:
36027 cmode = V2SFmode;
36028 break;
36029 case V2DImode:
36030 cmode = DImode;
36031 break;
36032 case V2SImode:
36033 cmode = SImode;
36034 break;
36035 case V2DFmode:
36036 cmode = DFmode;
36037 break;
36038 case V2SFmode:
36039 cmode = SFmode;
36040 break;
36041 default:
36042 gcc_unreachable ();
36045 if (!register_operand (ops[1], cmode))
36046 ops[1] = force_reg (cmode, ops[1]);
36047 if (!register_operand (ops[0], cmode))
36048 ops[0] = force_reg (cmode, ops[0]);
36049 emit_insn (gen_rtx_SET (VOIDmode, target,
36050 gen_rtx_VEC_CONCAT (mode, ops[0],
36051 ops[1])));
36052 break;
36054 case 4:
36055 switch (mode)
36057 case V4DImode:
36058 cmode = V2DImode;
36059 break;
36060 case V4DFmode:
36061 cmode = V2DFmode;
36062 break;
36063 case V4SImode:
36064 cmode = V2SImode;
36065 break;
36066 case V4SFmode:
36067 cmode = V2SFmode;
36068 break;
36069 default:
36070 gcc_unreachable ();
36072 goto half;
36074 case 8:
36075 switch (mode)
36077 case V8SImode:
36078 cmode = V2SImode;
36079 hmode = V4SImode;
36080 break;
36081 case V8SFmode:
36082 cmode = V2SFmode;
36083 hmode = V4SFmode;
36084 break;
36085 default:
36086 gcc_unreachable ();
36088 goto half;
36090 half:
36091 /* FIXME: We process inputs backward to help RA. PR 36222. */
36092 i = n - 1;
36093 j = (n >> 1) - 1;
36094 for (; i > 0; i -= 2, j--)
36096 first[j] = gen_reg_rtx (cmode);
36097 v = gen_rtvec (2, ops[i - 1], ops[i]);
36098 ix86_expand_vector_init (false, first[j],
36099 gen_rtx_PARALLEL (cmode, v));
36102 n >>= 1;
36103 if (n > 2)
36105 gcc_assert (hmode != VOIDmode);
36106 for (i = j = 0; i < n; i += 2, j++)
36108 second[j] = gen_reg_rtx (hmode);
36109 ix86_expand_vector_init_concat (hmode, second [j],
36110 &first [i], 2);
36112 n >>= 1;
36113 ix86_expand_vector_init_concat (mode, target, second, n);
36115 else
36116 ix86_expand_vector_init_concat (mode, target, first, n);
36117 break;
36119 default:
36120 gcc_unreachable ();
36124 /* A subroutine of ix86_expand_vector_init_general. Use vector
36125 interleave to handle the most general case: all values variable,
36126 and none identical. */
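/* Sketch of the V8HImode path below (V16QImode adds one more interleave
   level): each pair of scalar elements 2i and 2i+1 is packed into lanes 0
   and 1 of a fresh vector via a V4SImode insert plus gen_load_even; the
   packed vectors are then combined with interleave-low operations of
   successively wider integer element modes until the whole vector is
   assembled.  */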
36128 static void
36129 ix86_expand_vector_init_interleave (enum machine_mode mode,
36130 rtx target, rtx *ops, int n)
36132 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36133 int i, j;
36134 rtx op0, op1;
36135 rtx (*gen_load_even) (rtx, rtx, rtx);
36136 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36137 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36139 switch (mode)
36141 case V8HImode:
36142 gen_load_even = gen_vec_setv8hi;
36143 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36144 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36145 inner_mode = HImode;
36146 first_imode = V4SImode;
36147 second_imode = V2DImode;
36148 third_imode = VOIDmode;
36149 break;
36150 case V16QImode:
36151 gen_load_even = gen_vec_setv16qi;
36152 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36153 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36154 inner_mode = QImode;
36155 first_imode = V8HImode;
36156 second_imode = V4SImode;
36157 third_imode = V2DImode;
36158 break;
36159 default:
36160 gcc_unreachable ();
36163 for (i = 0; i < n; i++)
36165 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36166 op0 = gen_reg_rtx (SImode);
36167 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36169 /* Insert the SImode value as low element of V4SImode vector. */
36170 op1 = gen_reg_rtx (V4SImode);
36171 op0 = gen_rtx_VEC_MERGE (V4SImode,
36172 gen_rtx_VEC_DUPLICATE (V4SImode,
36173 op0),
36174 CONST0_RTX (V4SImode),
36175 const1_rtx);
36176 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36178 /* Cast the V4SImode vector back to a vector in the original mode. */
36179 op0 = gen_reg_rtx (mode);
36180 emit_move_insn (op0, gen_lowpart (mode, op1));
36182 /* Load even elements into the second position. */
36183 emit_insn (gen_load_even (op0,
36184 force_reg (inner_mode,
36185 ops [i + i + 1]),
36186 const1_rtx));
36188 /* Cast vector to FIRST_IMODE vector. */
36189 ops[i] = gen_reg_rtx (first_imode);
36190 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36193 /* Interleave low FIRST_IMODE vectors. */
36194 for (i = j = 0; i < n; i += 2, j++)
36196 op0 = gen_reg_rtx (first_imode);
36197 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36199 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36200 ops[j] = gen_reg_rtx (second_imode);
36201 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36204 /* Interleave low SECOND_IMODE vectors. */
36205 switch (second_imode)
36207 case V4SImode:
36208 for (i = j = 0; i < n / 2; i += 2, j++)
36210 op0 = gen_reg_rtx (second_imode);
36211 emit_insn (gen_interleave_second_low (op0, ops[i],
36212 ops[i + 1]));
36214 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36215 vector. */
36216 ops[j] = gen_reg_rtx (third_imode);
36217 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36219 second_imode = V2DImode;
36220 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36221 /* FALLTHRU */
36223 case V2DImode:
36224 op0 = gen_reg_rtx (second_imode);
36225 emit_insn (gen_interleave_second_low (op0, ops[0],
36226 ops[1]));
36228 /* Cast the SECOND_IMODE vector back to a vector in the original
36229 mode. */
36230 emit_insn (gen_rtx_SET (VOIDmode, target,
36231 gen_lowpart (mode, op0)));
36232 break;
36234 default:
36235 gcc_unreachable ();
36239 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36240 all values variable, and none identical. */
36242 static void
36243 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36244 rtx target, rtx vals)
36246 rtx ops[32], op0, op1;
36247 enum machine_mode half_mode = VOIDmode;
36248 int n, i;
36250 switch (mode)
36252 case V2SFmode:
36253 case V2SImode:
36254 if (!mmx_ok && !TARGET_SSE)
36255 break;
36256 /* FALLTHRU */
36258 case V8SFmode:
36259 case V8SImode:
36260 case V4DFmode:
36261 case V4DImode:
36262 case V4SFmode:
36263 case V4SImode:
36264 case V2DFmode:
36265 case V2DImode:
36266 n = GET_MODE_NUNITS (mode);
36267 for (i = 0; i < n; i++)
36268 ops[i] = XVECEXP (vals, 0, i);
36269 ix86_expand_vector_init_concat (mode, target, ops, n);
36270 return;
36272 case V32QImode:
36273 half_mode = V16QImode;
36274 goto half;
36276 case V16HImode:
36277 half_mode = V8HImode;
36278 goto half;
36280 half:
36281 n = GET_MODE_NUNITS (mode);
36282 for (i = 0; i < n; i++)
36283 ops[i] = XVECEXP (vals, 0, i);
36284 op0 = gen_reg_rtx (half_mode);
36285 op1 = gen_reg_rtx (half_mode);
36286 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36287 n >> 2);
36288 ix86_expand_vector_init_interleave (half_mode, op1,
36289 &ops [n >> 1], n >> 2);
36290 emit_insn (gen_rtx_SET (VOIDmode, target,
36291 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36292 return;
36294 case V16QImode:
36295 if (!TARGET_SSE4_1)
36296 break;
36297 /* FALLTHRU */
36299 case V8HImode:
36300 if (!TARGET_SSE2)
36301 break;
36303 /* Don't use ix86_expand_vector_init_interleave if we can't
36304 move from GPR to SSE register directly. */
36305 if (!TARGET_INTER_UNIT_MOVES)
36306 break;
36308 n = GET_MODE_NUNITS (mode);
36309 for (i = 0; i < n; i++)
36310 ops[i] = XVECEXP (vals, 0, i);
36311 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36312 return;
36314 case V4HImode:
36315 case V8QImode:
36316 break;
36318 default:
36319 gcc_unreachable ();
36323 int i, j, n_elts, n_words, n_elt_per_word;
36324 enum machine_mode inner_mode;
36325 rtx words[4], shift;
36327 inner_mode = GET_MODE_INNER (mode);
36328 n_elts = GET_MODE_NUNITS (mode);
36329 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36330 n_elt_per_word = n_elts / n_words;
36331 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36333 for (i = 0; i < n_words; ++i)
36335 rtx word = NULL_RTX;
36337 for (j = 0; j < n_elt_per_word; ++j)
36339 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36340 elt = convert_modes (word_mode, inner_mode, elt, true);
36342 if (j == 0)
36343 word = elt;
36344 else
36346 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36347 word, 1, OPTAB_LIB_WIDEN);
36348 word = expand_simple_binop (word_mode, IOR, word, elt,
36349 word, 1, OPTAB_LIB_WIDEN);
36353 words[i] = word;
36356 if (n_words == 1)
36357 emit_move_insn (target, gen_lowpart (mode, words[0]));
36358 else if (n_words == 2)
36360 rtx tmp = gen_reg_rtx (mode);
36361 emit_clobber (tmp);
36362 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36363 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36364 emit_move_insn (target, tmp);
36366 else if (n_words == 4)
36368 rtx tmp = gen_reg_rtx (V4SImode);
36369 gcc_assert (word_mode == SImode);
36370 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36371 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36372 emit_move_insn (target, gen_lowpart (mode, tmp));
36374 else
36375 gcc_unreachable ();
36379 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36380 instructions unless MMX_OK is true. */
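/* Strategy selection below, in order of preference: an all-constant
   vector is loaded from the constant pool; a vector of identical elements
   is broadcast; a vector with exactly one variable element is built from
   a constant and patched with ix86_expand_vector_set; everything else
   goes through the general concat/interleave path.  (Summary of the code
   that follows.)  */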
36382 void
36383 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36385 enum machine_mode mode = GET_MODE (target);
36386 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36387 int n_elts = GET_MODE_NUNITS (mode);
36388 int n_var = 0, one_var = -1;
36389 bool all_same = true, all_const_zero = true;
36390 int i;
36391 rtx x;
36393 for (i = 0; i < n_elts; ++i)
36395 x = XVECEXP (vals, 0, i);
36396 if (!(CONST_INT_P (x)
36397 || GET_CODE (x) == CONST_DOUBLE
36398 || GET_CODE (x) == CONST_FIXED))
36399 n_var++, one_var = i;
36400 else if (x != CONST0_RTX (inner_mode))
36401 all_const_zero = false;
36402 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36403 all_same = false;
36406 /* Constants are best loaded from the constant pool. */
36407 if (n_var == 0)
36409 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36410 return;
36413 /* If all values are identical, broadcast the value. */
36414 if (all_same
36415 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36416 XVECEXP (vals, 0, 0)))
36417 return;
36419 /* Values where only one field is non-constant are best loaded from
36420 the pool and overwritten via move later. */
36421 if (n_var == 1)
36423 if (all_const_zero
36424 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36425 XVECEXP (vals, 0, one_var),
36426 one_var))
36427 return;
36429 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36430 return;
36433 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36436 void
36437 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36439 enum machine_mode mode = GET_MODE (target);
36440 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36441 enum machine_mode half_mode;
36442 bool use_vec_merge = false;
36443 rtx tmp;
36444 static rtx (*gen_extract[6][2]) (rtx, rtx)
36446 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36447 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36448 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36449 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36450 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36451 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36453 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36455 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36456 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36457 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36458 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36459 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36460 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36462 int i, j, n;
36464 switch (mode)
36466 case V2SFmode:
36467 case V2SImode:
36468 if (mmx_ok)
36470 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36471 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36472 if (elt == 0)
36473 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36474 else
36475 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36476 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36477 return;
36479 break;
36481 case V2DImode:
36482 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36483 if (use_vec_merge)
36484 break;
36486 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36487 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36488 if (elt == 0)
36489 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36490 else
36491 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36492 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36493 return;
36495 case V2DFmode:
36497 rtx op0, op1;
36499 /* For the two element vectors, we implement a VEC_CONCAT with
36500 the extraction of the other element. */
36502 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36503 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36505 if (elt == 0)
36506 op0 = val, op1 = tmp;
36507 else
36508 op0 = tmp, op1 = val;
36510 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36511 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36513 return;
36515 case V4SFmode:
36516 use_vec_merge = TARGET_SSE4_1;
36517 if (use_vec_merge)
36518 break;
36520 switch (elt)
36522 case 0:
36523 use_vec_merge = true;
36524 break;
36526 case 1:
36527 /* tmp = target = A B C D */
36528 tmp = copy_to_reg (target);
36529 /* target = A A B B */
36530 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36531 /* target = X A B B */
36532 ix86_expand_vector_set (false, target, val, 0);
36533 /* target = A X C D */
36534 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36535 const1_rtx, const0_rtx,
36536 GEN_INT (2+4), GEN_INT (3+4)));
36537 return;
36539 case 2:
36540 /* tmp = target = A B C D */
36541 tmp = copy_to_reg (target);
36542 /* tmp = X B C D */
36543 ix86_expand_vector_set (false, tmp, val, 0);
36544 /* target = A B X D */
36545 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36546 const0_rtx, const1_rtx,
36547 GEN_INT (0+4), GEN_INT (3+4)));
36548 return;
36550 case 3:
36551 /* tmp = target = A B C D */
36552 tmp = copy_to_reg (target);
36553 /* tmp = X B C D */
36554 ix86_expand_vector_set (false, tmp, val, 0);
36555 /* target = A B C X */
36556 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36557 const0_rtx, const1_rtx,
36558 GEN_INT (2+4), GEN_INT (0+4)));
36559 return;
36561 default:
36562 gcc_unreachable ();
36564 break;
36566 case V4SImode:
36567 use_vec_merge = TARGET_SSE4_1;
36568 if (use_vec_merge)
36569 break;
36571 /* Element 0 handled by vec_merge below. */
36572 if (elt == 0)
36574 use_vec_merge = true;
36575 break;
36578 if (TARGET_SSE2)
36580 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36581 store into element 0, then shuffle them back. */
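/* E.g. (illustrative) for ELT == 2 the order below is {2, 1, 0, 3}: the
   first pshufd swaps lanes 0 and 2, the scalar is then inserted into lane
   0, and applying the same pshufd again restores the original lane order
   with the new value now in lane 2.  */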
36583 rtx order[4];
36585 order[0] = GEN_INT (elt);
36586 order[1] = const1_rtx;
36587 order[2] = const2_rtx;
36588 order[3] = GEN_INT (3);
36589 order[elt] = const0_rtx;
36591 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36592 order[1], order[2], order[3]));
36594 ix86_expand_vector_set (false, target, val, 0);
36596 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36597 order[1], order[2], order[3]));
36599 else
36601 /* For SSE1, we have to reuse the V4SF code. */
36602 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36603 gen_lowpart (SFmode, val), elt);
36605 return;
36607 case V8HImode:
36608 use_vec_merge = TARGET_SSE2;
36609 break;
36610 case V4HImode:
36611 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36612 break;
36614 case V16QImode:
36615 use_vec_merge = TARGET_SSE4_1;
36616 break;
36618 case V8QImode:
36619 break;
36621 case V32QImode:
36622 half_mode = V16QImode;
36623 j = 0;
36624 n = 16;
36625 goto half;
36627 case V16HImode:
36628 half_mode = V8HImode;
36629 j = 1;
36630 n = 8;
36631 goto half;
36633 case V8SImode:
36634 half_mode = V4SImode;
36635 j = 2;
36636 n = 4;
36637 goto half;
36639 case V4DImode:
36640 half_mode = V2DImode;
36641 j = 3;
36642 n = 2;
36643 goto half;
36645 case V8SFmode:
36646 half_mode = V4SFmode;
36647 j = 4;
36648 n = 4;
36649 goto half;
36651 case V4DFmode:
36652 half_mode = V2DFmode;
36653 j = 5;
36654 n = 2;
36655 goto half;
36657 half:
36658 /* Compute offset. */
36659 i = elt / n;
36660 elt %= n;
36662 gcc_assert (i <= 1);
36664 /* Extract the half. */
36665 tmp = gen_reg_rtx (half_mode);
36666 emit_insn (gen_extract[j][i] (tmp, target));
36668 /* Put val in tmp at elt. */
36669 ix86_expand_vector_set (false, tmp, val, elt);
36671 /* Put it back. */
36672 emit_insn (gen_insert[j][i] (target, target, tmp));
36673 return;
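/* For example, inserting VAL at ELT == 5 of a V8SImode vector: i = 5/4
   = 1 and elt becomes 1, so the upper V4SImode half is extracted, VAL
   is placed at element 1 of that half, and the half is inserted back
   into the upper lane of TARGET.  */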
36675 default:
36676 break;
36679 if (use_vec_merge)
36681 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36682 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36683 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36685 else
36687 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36689 emit_move_insn (mem, target);
36691 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36692 emit_move_insn (tmp, val);
36694 emit_move_insn (target, mem);
36698 void
36699 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36701 enum machine_mode mode = GET_MODE (vec);
36702 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36703 bool use_vec_extr = false;
36704 rtx tmp;
36706 switch (mode)
36708 case V2SImode:
36709 case V2SFmode:
36710 if (!mmx_ok)
36711 break;
36712 /* FALLTHRU */
36714 case V2DFmode:
36715 case V2DImode:
36716 use_vec_extr = true;
36717 break;
36719 case V4SFmode:
36720 use_vec_extr = TARGET_SSE4_1;
36721 if (use_vec_extr)
36722 break;
36724 switch (elt)
36726 case 0:
36727 tmp = vec;
36728 break;
36730 case 1:
36731 case 3:
36732 tmp = gen_reg_rtx (mode);
36733 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36734 GEN_INT (elt), GEN_INT (elt),
36735 GEN_INT (elt+4), GEN_INT (elt+4)));
36736 break;
36738 case 2:
36739 tmp = gen_reg_rtx (mode);
36740 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36741 break;
36743 default:
36744 gcc_unreachable ();
36746 vec = tmp;
36747 use_vec_extr = true;
36748 elt = 0;
36749 break;
36751 case V4SImode:
36752 use_vec_extr = TARGET_SSE4_1;
36753 if (use_vec_extr)
36754 break;
36756 if (TARGET_SSE2)
36758 switch (elt)
36760 case 0:
36761 tmp = vec;
36762 break;
36764 case 1:
36765 case 3:
36766 tmp = gen_reg_rtx (mode);
36767 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36768 GEN_INT (elt), GEN_INT (elt),
36769 GEN_INT (elt), GEN_INT (elt)));
36770 break;
36772 case 2:
36773 tmp = gen_reg_rtx (mode);
36774 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36775 break;
36777 default:
36778 gcc_unreachable ();
36780 vec = tmp;
36781 use_vec_extr = true;
36782 elt = 0;
36784 else
36786 /* For SSE1, we have to reuse the V4SF code. */
36787 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36788 gen_lowpart (V4SFmode, vec), elt);
36789 return;
36791 break;
36793 case V8HImode:
36794 use_vec_extr = TARGET_SSE2;
36795 break;
36796 case V4HImode:
36797 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36798 break;
36800 case V16QImode:
36801 use_vec_extr = TARGET_SSE4_1;
36802 break;
36804 case V8SFmode:
36805 if (TARGET_AVX)
36807 tmp = gen_reg_rtx (V4SFmode);
36808 if (elt < 4)
36809 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36810 else
36811 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36812 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36813 return;
36815 break;
36817 case V4DFmode:
36818 if (TARGET_AVX)
36820 tmp = gen_reg_rtx (V2DFmode);
36821 if (elt < 2)
36822 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36823 else
36824 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36825 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36826 return;
36828 break;
36830 case V32QImode:
36831 if (TARGET_AVX)
36833 tmp = gen_reg_rtx (V16QImode);
36834 if (elt < 16)
36835 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36836 else
36837 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36838 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36839 return;
36841 break;
36843 case V16HImode:
36844 if (TARGET_AVX)
36846 tmp = gen_reg_rtx (V8HImode);
36847 if (elt < 8)
36848 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36849 else
36850 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36851 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36852 return;
36854 break;
36856 case V8SImode:
36857 if (TARGET_AVX)
36859 tmp = gen_reg_rtx (V4SImode);
36860 if (elt < 4)
36861 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36862 else
36863 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36864 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36865 return;
36867 break;
36869 case V4DImode:
36870 if (TARGET_AVX)
36872 tmp = gen_reg_rtx (V2DImode);
36873 if (elt < 2)
36874 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36875 else
36876 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36877 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36878 return;
36880 break;
36882 case V8QImode:
36883 /* ??? Could extract the appropriate HImode element and shift. */
36884 default:
36885 break;
36888 if (use_vec_extr)
36890 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36891 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36893 /* Let the rtl optimizers know about the zero extension performed. */
36894 if (inner_mode == QImode || inner_mode == HImode)
36896 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36897 target = gen_lowpart (SImode, target);
36900 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36902 else
36904 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36906 emit_move_insn (mem, vec);
36908 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36909 emit_move_insn (target, tmp);
36913 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36914 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36915 The upper bits of DEST are undefined, though they shouldn't cause
36916 exceptions (some bits from src or all zeros are ok). */
36918 static void
36919 emit_reduc_half (rtx dest, rtx src, int i)
36921 rtx tem;
36922 switch (GET_MODE (src))
36924 case V4SFmode:
36925 if (i == 128)
36926 tem = gen_sse_movhlps (dest, src, src);
36927 else
36928 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36929 GEN_INT (1 + 4), GEN_INT (1 + 4));
36930 break;
36931 case V2DFmode:
36932 tem = gen_vec_interleave_highv2df (dest, src, src);
36933 break;
36934 case V16QImode:
36935 case V8HImode:
36936 case V4SImode:
36937 case V2DImode:
36938 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36939 gen_lowpart (V1TImode, src),
36940 GEN_INT (i / 2));
36941 break;
36942 case V8SFmode:
36943 if (i == 256)
36944 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36945 else
36946 tem = gen_avx_shufps256 (dest, src, src,
36947 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36948 break;
36949 case V4DFmode:
36950 if (i == 256)
36951 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36952 else
36953 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36954 break;
36955 case V32QImode:
36956 case V16HImode:
36957 case V8SImode:
36958 case V4DImode:
36959 if (i == 256)
36960 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36961 gen_lowpart (V4DImode, src),
36962 gen_lowpart (V4DImode, src),
36963 const1_rtx);
36964 else
36965 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36966 gen_lowpart (V2TImode, src),
36967 GEN_INT (i / 2));
36968 break;
36969 default:
36970 gcc_unreachable ();
36972 emit_insn (tem);
36975 /* Expand a vector reduction. FN is the binary pattern to reduce;
36976 DEST is the destination; IN is the input vector. */
36978 void
36979 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36981 rtx half, dst, vec = in;
36982 enum machine_mode mode = GET_MODE (in);
36983 int i;
36985 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
36986 if (TARGET_SSE4_1
36987 && mode == V8HImode
36988 && fn == gen_uminv8hi3)
36990 emit_insn (gen_sse4_1_phminposuw (dest, in));
36991 return;
36994 for (i = GET_MODE_BITSIZE (mode);
36995 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36996 i >>= 1)
36998 half = gen_reg_rtx (mode);
36999 emit_reduc_half (half, vec, i);
37000 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37001 dst = dest;
37002 else
37003 dst = gen_reg_rtx (mode);
37004 emit_insn (fn (dst, half, vec));
37005 vec = dst;
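/* For example, reducing a V8SFmode vector {a0,...,a7} with a PLUS
   pattern: i runs over 256, 128 and 64.  Each emit_reduc_half moves the
   upper i/2 bits of the current partial result down, and FN combines
   them with the lower half, halving the number of live partial sums at
   every step; after the final step element 0 of DEST holds
   a0 + ... + a7 (the remaining elements are not meaningful).  */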
37009 /* Target hook for scalar_mode_supported_p. */
37010 static bool
37011 ix86_scalar_mode_supported_p (enum machine_mode mode)
37013 if (DECIMAL_FLOAT_MODE_P (mode))
37014 return default_decimal_float_supported_p ();
37015 else if (mode == TFmode)
37016 return true;
37017 else
37018 return default_scalar_mode_supported_p (mode);
37021 /* Implements target hook vector_mode_supported_p. */
37022 static bool
37023 ix86_vector_mode_supported_p (enum machine_mode mode)
37025 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37026 return true;
37027 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37028 return true;
37029 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37030 return true;
37031 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37032 return true;
37033 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37034 return true;
37035 return false;
37038 /* Target hook for c_mode_for_suffix. */
37039 static enum machine_mode
37040 ix86_c_mode_for_suffix (char suffix)
37042 if (suffix == 'q')
37043 return TFmode;
37044 if (suffix == 'w')
37045 return XFmode;
37047 return VOIDmode;
37050 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37052 We do this in the new i386 backend to maintain source compatibility
37053 with the old cc0-based compiler. */
37055 static tree
37056 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37057 tree inputs ATTRIBUTE_UNUSED,
37058 tree clobbers)
37060 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37061 clobbers);
37062 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37063 clobbers);
37064 return clobbers;
37067 /* Implements target vector targetm.asm.encode_section_info. */
37069 static void ATTRIBUTE_UNUSED
37070 ix86_encode_section_info (tree decl, rtx rtl, int first)
37072 default_encode_section_info (decl, rtl, first);
37074 if (TREE_CODE (decl) == VAR_DECL
37075 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37076 && ix86_in_large_data_p (decl))
37077 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37080 /* Worker function for REVERSE_CONDITION. */
37082 enum rtx_code
37083 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37085 return (mode != CCFPmode && mode != CCFPUmode
37086 ? reverse_condition (code)
37087 : reverse_condition_maybe_unordered (code));
37090 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37091 to OPERANDS[0]. */
37093 const char *
37094 output_387_reg_move (rtx insn, rtx *operands)
37096 if (REG_P (operands[0]))
37098 if (REG_P (operands[1])
37099 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37101 if (REGNO (operands[0]) == FIRST_STACK_REG)
37102 return output_387_ffreep (operands, 0);
37103 return "fstp\t%y0";
37105 if (STACK_TOP_P (operands[0]))
37106 return "fld%Z1\t%y1";
37107 return "fst\t%y0";
37109 else if (MEM_P (operands[0]))
37111 gcc_assert (REG_P (operands[1]));
37112 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37113 return "fstp%Z0\t%y0";
37114 else
37116 /* There is no non-popping store to memory for XFmode.
37117 So if we need one, follow the store with a load. */
37118 if (GET_MODE (operands[0]) == XFmode)
37119 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37120 else
37121 return "fst%Z0\t%y0";
37124 else
37125 gcc_unreachable ();
37128 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37129 FP status register is set. */
37131 void
37132 ix86_emit_fp_unordered_jump (rtx label)
37134 rtx reg = gen_reg_rtx (HImode);
37135 rtx temp;
37137 emit_insn (gen_x86_fnstsw_1 (reg));
37139 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37141 emit_insn (gen_x86_sahf_1 (reg));
37143 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37144 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37146 else
37148 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37150 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37151 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37154 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37155 gen_rtx_LABEL_REF (VOIDmode, label),
37156 pc_rtx);
37157 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37159 emit_jump_insn (temp);
37160 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37163 /* Output code to perform a log1p XFmode calculation. */
37165 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37167 rtx label1 = gen_label_rtx ();
37168 rtx label2 = gen_label_rtx ();
37170 rtx tmp = gen_reg_rtx (XFmode);
37171 rtx tmp2 = gen_reg_rtx (XFmode);
37172 rtx test;
37174 emit_insn (gen_absxf2 (tmp, op1));
37175 test = gen_rtx_GE (VOIDmode, tmp,
37176 CONST_DOUBLE_FROM_REAL_VALUE (
37177 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37178 XFmode));
37179 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
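  /* The threshold 0.29289... is 1 - sqrt(2)/2.  fyl2xp1 computes
     y * log2 (x + 1) and is only specified for |x| up to that bound,
     so when |op1| is at least 1 - sqrt(2)/2 the code branches to
     label1 below and computes fyl2x on 1.0 + op1 instead.  */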
37181 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37182 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37183 emit_jump (label2);
37185 emit_label (label1);
37186 emit_move_insn (tmp, CONST1_RTX (XFmode));
37187 emit_insn (gen_addxf3 (tmp, op1, tmp));
37188 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37189 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37191 emit_label (label2);
37194 /* Output x87 code to compute round (OP1), i.e. round to nearest with
      halfway cases rounded away from zero, storing the result in OP0. */
37195 void ix86_emit_i387_round (rtx op0, rtx op1)
37197 enum machine_mode inmode = GET_MODE (op1);
37198 enum machine_mode outmode = GET_MODE (op0);
37199 rtx e1, e2, res, tmp, tmp1, half;
37200 rtx scratch = gen_reg_rtx (HImode);
37201 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37202 rtx jump_label = gen_label_rtx ();
37203 rtx insn;
37204 rtx (*gen_abs) (rtx, rtx);
37205 rtx (*gen_neg) (rtx, rtx);
37207 switch (inmode)
37209 case SFmode:
37210 gen_abs = gen_abssf2;
37211 break;
37212 case DFmode:
37213 gen_abs = gen_absdf2;
37214 break;
37215 case XFmode:
37216 gen_abs = gen_absxf2;
37217 break;
37218 default:
37219 gcc_unreachable ();
37222 switch (outmode)
37224 case SFmode:
37225 gen_neg = gen_negsf2;
37226 break;
37227 case DFmode:
37228 gen_neg = gen_negdf2;
37229 break;
37230 case XFmode:
37231 gen_neg = gen_negxf2;
37232 break;
37233 case HImode:
37234 gen_neg = gen_neghi2;
37235 break;
37236 case SImode:
37237 gen_neg = gen_negsi2;
37238 break;
37239 case DImode:
37240 gen_neg = gen_negdi2;
37241 break;
37242 default:
37243 gcc_unreachable ();
37246 e1 = gen_reg_rtx (inmode);
37247 e2 = gen_reg_rtx (inmode);
37248 res = gen_reg_rtx (outmode);
37250 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37252 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37254 /* scratch = fxam(op1) */
37255 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37256 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37257 UNSPEC_FXAM)));
37258 /* e1 = fabs(op1) */
37259 emit_insn (gen_abs (e1, op1));
37261 /* e2 = e1 + 0.5 */
37262 half = force_reg (inmode, half);
37263 emit_insn (gen_rtx_SET (VOIDmode, e2,
37264 gen_rtx_PLUS (inmode, e1, half)));
37266 /* res = floor(e2) */
37267 if (inmode != XFmode)
37269 tmp1 = gen_reg_rtx (XFmode);
37271 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37272 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37274 else
37275 tmp1 = e2;
37277 switch (outmode)
37279 case SFmode:
37280 case DFmode:
37282 rtx tmp0 = gen_reg_rtx (XFmode);
37284 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37286 emit_insn (gen_rtx_SET (VOIDmode, res,
37287 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37288 UNSPEC_TRUNC_NOOP)));
37290 break;
37291 case XFmode:
37292 emit_insn (gen_frndintxf2_floor (res, tmp1));
37293 break;
37294 case HImode:
37295 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37296 break;
37297 case SImode:
37298 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37299 break;
37300 case DImode:
37301 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37302 break;
37303 default:
37304 gcc_unreachable ();
37307 /* flags = signbit(a) */
37308 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37310 /* if (flags) then res = -res */
37311 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37312 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37313 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37314 pc_rtx);
37315 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37316 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37317 JUMP_LABEL (insn) = jump_label;
37319 emit_insn (gen_neg (res, res));
37321 emit_label (jump_label);
37322 LABEL_NUSES (jump_label) = 1;
37324 emit_move_insn (op0, res);
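/* Example: op1 == -2.5.  fxam records the sign of op1 in SCRATCH, fabs
   gives 2.5, adding 0.5 gives 3.0 and floor gives 3; because the sign
   bit was set, the result is negated to -3, so halfway cases are
   rounded away from zero as round () requires.  */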
37327 /* Output code to perform a Newton-Raphson approximation of a single precision
37328 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37330 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37332 rtx x0, x1, e0, e1;
37334 x0 = gen_reg_rtx (mode);
37335 e0 = gen_reg_rtx (mode);
37336 e1 = gen_reg_rtx (mode);
37337 x1 = gen_reg_rtx (mode);
37339 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37341 b = force_reg (mode, b);
37343 /* x0 = rcp(b) estimate */
37344 emit_insn (gen_rtx_SET (VOIDmode, x0,
37345 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37346 UNSPEC_RCP)));
37347 /* e0 = x0 * b */
37348 emit_insn (gen_rtx_SET (VOIDmode, e0,
37349 gen_rtx_MULT (mode, x0, b)));
37351 /* e0 = x0 * e0 */
37352 emit_insn (gen_rtx_SET (VOIDmode, e0,
37353 gen_rtx_MULT (mode, x0, e0)));
37355 /* e1 = x0 + x0 */
37356 emit_insn (gen_rtx_SET (VOIDmode, e1,
37357 gen_rtx_PLUS (mode, x0, x0)));
37359 /* x1 = e1 - e0 */
37360 emit_insn (gen_rtx_SET (VOIDmode, x1,
37361 gen_rtx_MINUS (mode, e1, e0)));
37363 /* res = a * x1 */
37364 emit_insn (gen_rtx_SET (VOIDmode, res,
37365 gen_rtx_MULT (mode, a, x1)));
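/* This is one Newton-Raphson step for 1/b: with x0 = rcp(b) and relative
   error e = 1 - b*x0, the refined value x1 = 2*x0 - b*x0*x0 = x0*(1 + e)
   satisfies 1 - b*x1 = e*e, so the roughly 12-bit rcp estimate is
   sharpened to close to full single precision before the final multiply
   by a.  */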
37368 /* Output code to perform a Newton-Raphson approximation of a
37369 single precision floating point [reciprocal] square root. */
37371 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37372 bool recip)
37374 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37375 REAL_VALUE_TYPE r;
37377 x0 = gen_reg_rtx (mode);
37378 e0 = gen_reg_rtx (mode);
37379 e1 = gen_reg_rtx (mode);
37380 e2 = gen_reg_rtx (mode);
37381 e3 = gen_reg_rtx (mode);
37383 real_from_integer (&r, VOIDmode, -3, -1, 0);
37384 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37386 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37387 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37389 if (VECTOR_MODE_P (mode))
37391 mthree = ix86_build_const_vector (mode, true, mthree);
37392 mhalf = ix86_build_const_vector (mode, true, mhalf);
37395 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37396 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37398 a = force_reg (mode, a);
37400 /* x0 = rsqrt(a) estimate */
37401 emit_insn (gen_rtx_SET (VOIDmode, x0,
37402 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37403 UNSPEC_RSQRT)));
37405 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
37406 if (!recip)
37408 rtx zero, mask;
37410 zero = gen_reg_rtx (mode);
37411 mask = gen_reg_rtx (mode);
37413 zero = force_reg (mode, CONST0_RTX(mode));
37414 emit_insn (gen_rtx_SET (VOIDmode, mask,
37415 gen_rtx_NE (mode, zero, a)));
37417 emit_insn (gen_rtx_SET (VOIDmode, x0,
37418 gen_rtx_AND (mode, x0, mask)));
37421 /* e0 = x0 * a */
37422 emit_insn (gen_rtx_SET (VOIDmode, e0,
37423 gen_rtx_MULT (mode, x0, a)));
37424 /* e1 = e0 * x0 */
37425 emit_insn (gen_rtx_SET (VOIDmode, e1,
37426 gen_rtx_MULT (mode, e0, x0)));
37428 /* e2 = e1 - 3. */
37429 mthree = force_reg (mode, mthree);
37430 emit_insn (gen_rtx_SET (VOIDmode, e2,
37431 gen_rtx_PLUS (mode, e1, mthree)));
37433 mhalf = force_reg (mode, mhalf);
37434 if (recip)
37435 /* e3 = -.5 * x0 */
37436 emit_insn (gen_rtx_SET (VOIDmode, e3,
37437 gen_rtx_MULT (mode, x0, mhalf)));
37438 else
37439 /* e3 = -.5 * e0 */
37440 emit_insn (gen_rtx_SET (VOIDmode, e3,
37441 gen_rtx_MULT (mode, e0, mhalf)));
37442 /* ret = e2 * e3 */
37443 emit_insn (gen_rtx_SET (VOIDmode, res,
37444 gen_rtx_MULT (mode, e2, e3)));
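/* Likewise one Newton-Raphson step for rsqrt: the code forms
   e2 = a*x0*x0 - 3 and multiplies it by -0.5*x0 (or by -0.5*a*x0 for
   the sqrt variant), i.e. x1 = 0.5*x0*(3 - a*x0*x0), which roughly
   doubles the number of correct bits of the rsqrt estimate.  The mask
   built above zeroes x0 when a == 0.0, so that the sqrt variant does
   not compute 0 * Inf and produce a NaN for sqrt (0.0).  */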
37447 #ifdef TARGET_SOLARIS
37448 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37450 static void
37451 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37452 tree decl)
37454 /* With Binutils 2.15, the "@unwind" marker must be specified on
37455 every occurrence of the ".eh_frame" section, not just the first
37456 one. */
37457 if (TARGET_64BIT
37458 && strcmp (name, ".eh_frame") == 0)
37460 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37461 flags & SECTION_WRITE ? "aw" : "a");
37462 return;
37465 #ifndef USE_GAS
37466 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37468 solaris_elf_asm_comdat_section (name, flags, decl);
37469 return;
37471 #endif
37473 default_elf_asm_named_section (name, flags, decl);
37475 #endif /* TARGET_SOLARIS */
37477 /* Return the mangling of TYPE if it is an extended fundamental type. */
37479 static const char *
37480 ix86_mangle_type (const_tree type)
37482 type = TYPE_MAIN_VARIANT (type);
37484 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37485 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37486 return NULL;
37488 switch (TYPE_MODE (type))
37490 case TFmode:
37491 /* __float128 is "g". */
37492 return "g";
37493 case XFmode:
37494 /* "long double" or __float80 is "e". */
37495 return "e";
37496 default:
37497 return NULL;
37501 /* For 32-bit code we can save PIC register setup by using
37502 __stack_chk_fail_local hidden function instead of calling
37503 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
37504 register, so it is better to call __stack_chk_fail directly. */
37506 static tree ATTRIBUTE_UNUSED
37507 ix86_stack_protect_fail (void)
37509 return TARGET_64BIT
37510 ? default_external_stack_protect_fail ()
37511 : default_hidden_stack_protect_fail ();
37514 /* Select a format to encode pointers in exception handling data. CODE
37515 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37516 true if the symbol may be affected by dynamic relocations.
37518 ??? All x86 object file formats are capable of representing this.
37519 After all, the relocation needed is the same as for the call insn.
37520 Whether or not a particular assembler allows us to enter such, I
37521 guess we'll have to see. */
37522 int
37523 asm_preferred_eh_data_format (int code, int global)
37525 if (flag_pic)
37527 int type = DW_EH_PE_sdata8;
37528 if (!TARGET_64BIT
37529 || ix86_cmodel == CM_SMALL_PIC
37530 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37531 type = DW_EH_PE_sdata4;
37532 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37534 if (ix86_cmodel == CM_SMALL
37535 || (ix86_cmodel == CM_MEDIUM && code))
37536 return DW_EH_PE_udata4;
37537 return DW_EH_PE_absptr;
37540 /* Expand copysign from SIGN to the positive value ABS_VALUE
37541 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37542 the sign-bit. */
37543 static void
37544 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37546 enum machine_mode mode = GET_MODE (sign);
37547 rtx sgn = gen_reg_rtx (mode);
37548 if (mask == NULL_RTX)
37550 enum machine_mode vmode;
37552 if (mode == SFmode)
37553 vmode = V4SFmode;
37554 else if (mode == DFmode)
37555 vmode = V2DFmode;
37556 else
37557 vmode = mode;
37559 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37560 if (!VECTOR_MODE_P (mode))
37562 /* We need to generate a scalar mode mask in this case. */
37563 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37564 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37565 mask = gen_reg_rtx (mode);
37566 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37569 else
37570 mask = gen_rtx_NOT (mode, mask);
37571 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37572 gen_rtx_AND (mode, mask, sign)));
37573 emit_insn (gen_rtx_SET (VOIDmode, result,
37574 gen_rtx_IOR (mode, abs_value, sgn)));
37577 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37578 mask for masking out the sign-bit is stored in *SMASK, if that is
37579 non-null. */
37580 static rtx
37581 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37583 enum machine_mode vmode, mode = GET_MODE (op0);
37584 rtx xa, mask;
37586 xa = gen_reg_rtx (mode);
37587 if (mode == SFmode)
37588 vmode = V4SFmode;
37589 else if (mode == DFmode)
37590 vmode = V2DFmode;
37591 else
37592 vmode = mode;
37593 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37594 if (!VECTOR_MODE_P (mode))
37596 /* We need to generate a scalar mode mask in this case. */
37597 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37598 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37599 mask = gen_reg_rtx (mode);
37600 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37602 emit_insn (gen_rtx_SET (VOIDmode, xa,
37603 gen_rtx_AND (mode, op0, mask)));
37605 if (smask)
37606 *smask = mask;
37608 return xa;
37611 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37612 swapping the operands if SWAP_OPERANDS is true. The expanded
37613 code is a forward jump to a newly created label in case the
37614 comparison is true. The generated label rtx is returned. */
37615 static rtx
37616 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37617 bool swap_operands)
37619 rtx label, tmp;
37621 if (swap_operands)
37623 tmp = op0;
37624 op0 = op1;
37625 op1 = tmp;
37628 label = gen_label_rtx ();
37629 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37630 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37631 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37632 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37633 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37634 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37635 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37636 JUMP_LABEL (tmp) = label;
37638 return label;
37641 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37642 using comparison code CODE. Operands are swapped for the comparison if
37643 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37644 static rtx
37645 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37646 bool swap_operands)
37648 rtx (*insn)(rtx, rtx, rtx, rtx);
37649 enum machine_mode mode = GET_MODE (op0);
37650 rtx mask = gen_reg_rtx (mode);
37652 if (swap_operands)
37654 rtx tmp = op0;
37655 op0 = op1;
37656 op1 = tmp;
37659 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37661 emit_insn (insn (mask, op0, op1,
37662 gen_rtx_fmt_ee (code, mode, op0, op1)));
37663 return mask;
37666 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37667 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37668 static rtx
37669 ix86_gen_TWO52 (enum machine_mode mode)
37671 REAL_VALUE_TYPE TWO52r;
37672 rtx TWO52;
37674 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37675 TWO52 = const_double_from_real_value (TWO52r, mode);
37676 TWO52 = force_reg (mode, TWO52);
37678 return TWO52;
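/* For DFmode this is 2**52 == 4503599627370496.0, for SFmode 2**23 ==
   8388608.0: the smallest power of two at or above which every
   representable value is already an integer, which is what makes the
   "x + TWO52 - TWO52" rounding trick used by the expanders below work.  */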
37681 /* Expand SSE sequence for computing lround from OP1 storing
37682 into OP0. */
37683 void
37684 ix86_expand_lround (rtx op0, rtx op1)
37686 /* C code for the stuff we're doing below:
37687 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37688 return (long)tmp;
37690 enum machine_mode mode = GET_MODE (op1);
37691 const struct real_format *fmt;
37692 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37693 rtx adj;
37695 /* load nextafter (0.5, 0.0) */
37696 fmt = REAL_MODE_FORMAT (mode);
37697 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37698 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37700 /* adj = copysign (0.5, op1) */
37701 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37702 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37704 /* adj = op1 + adj */
37705 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37707 /* op0 = (imode)adj */
37708 expand_fix (op0, adj, 0);
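/* nextafter (0.5, 0.0) rather than an exact 0.5 is used so that values
   just below 0.5, whose sum with 0.5 would round up to 1.0, still end
   up below 1.0; the final expand_fix conversion truncates toward zero,
   so the overall effect is round-half-away-from-zero.  */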
37711 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37712 into OPERAND0. */
37713 void
37714 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37716 /* C code for the stuff we're doing below (for do_floor):
37717 xi = (long)op1;
37718 xi -= (double)xi > op1 ? 1 : 0;
37719 return xi;
37721 enum machine_mode fmode = GET_MODE (op1);
37722 enum machine_mode imode = GET_MODE (op0);
37723 rtx ireg, freg, label, tmp;
37725 /* reg = (long)op1 */
37726 ireg = gen_reg_rtx (imode);
37727 expand_fix (ireg, op1, 0);
37729 /* freg = (double)reg */
37730 freg = gen_reg_rtx (fmode);
37731 expand_float (freg, ireg, 0);
37733 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37734 label = ix86_expand_sse_compare_and_jump (UNLE,
37735 freg, op1, !do_floor);
37736 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37737 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37738 emit_move_insn (ireg, tmp);
37740 emit_label (label);
37741 LABEL_NUSES (label) = 1;
37743 emit_move_insn (op0, ireg);
37746 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37747 result in OPERAND0. */
37748 void
37749 ix86_expand_rint (rtx operand0, rtx operand1)
37751 /* C code for the stuff we're doing below:
37752 xa = fabs (operand1);
37753 if (!isless (xa, 2**52))
37754 return operand1;
37755 xa = xa + 2**52 - 2**52;
37756 return copysign (xa, operand1);
37758 enum machine_mode mode = GET_MODE (operand0);
37759 rtx res, xa, label, TWO52, mask;
37761 res = gen_reg_rtx (mode);
37762 emit_move_insn (res, operand1);
37764 /* xa = abs (operand1) */
37765 xa = ix86_expand_sse_fabs (res, &mask);
37767 /* if (!isless (xa, TWO52)) goto label; */
37768 TWO52 = ix86_gen_TWO52 (mode);
37769 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37771 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37772 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37774 ix86_sse_copysign_to_positive (res, xa, res, mask);
37776 emit_label (label);
37777 LABEL_NUSES (label) = 1;
37779 emit_move_insn (operand0, res);
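/* The "xa + TWO52 - TWO52" sequence rounds xa to an integer in the
   current rounding mode: xa + 2**52 has no significand bits left for a
   fraction, so the addition itself rounds, and subtracting 2**52
   recovers the rounded value.  The comparison above skips the whole
   sequence when |operand1| >= 2**52 (or is a NaN), since such values
   are already integral.  */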
37782 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37783 into OPERAND0. */
37784 void
37785 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37787 /* C code for the stuff we expand below.
37788 double xa = fabs (x), x2;
37789 if (!isless (xa, TWO52))
37790 return x;
37791 xa = xa + TWO52 - TWO52;
37792 x2 = copysign (xa, x);
37793 Compensate. Floor:
37794 if (x2 > x)
37795 x2 -= 1;
37796 Compensate. Ceil:
37797 if (x2 < x)
37798 x2 -= -1;
37799 return x2;
37801 enum machine_mode mode = GET_MODE (operand0);
37802 rtx xa, TWO52, tmp, label, one, res, mask;
37804 TWO52 = ix86_gen_TWO52 (mode);
37806 /* Temporary for holding the result, initialized to the input
37807 operand to ease control flow. */
37808 res = gen_reg_rtx (mode);
37809 emit_move_insn (res, operand1);
37811 /* xa = abs (operand1) */
37812 xa = ix86_expand_sse_fabs (res, &mask);
37814 /* if (!isless (xa, TWO52)) goto label; */
37815 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37817 /* xa = xa + TWO52 - TWO52; */
37818 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37819 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37821 /* xa = copysign (xa, operand1) */
37822 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37824 /* generate 1.0 or -1.0 */
37825 one = force_reg (mode,
37826 const_double_from_real_value (do_floor
37827 ? dconst1 : dconstm1, mode));
37829 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37830 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37831 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37832 gen_rtx_AND (mode, one, tmp)));
37833 /* We always need to subtract here to preserve signed zero. */
37834 tmp = expand_simple_binop (mode, MINUS,
37835 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37836 emit_move_insn (res, tmp);
37838 emit_label (label);
37839 LABEL_NUSES (label) = 1;
37841 emit_move_insn (operand0, res);
37844 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37845 into OPERAND0. */
37846 void
37847 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37849 /* C code for the stuff we expand below.
37850 double xa = fabs (x), x2;
37851 if (!isless (xa, TWO52))
37852 return x;
37853 x2 = (double)(long)x;
37854 Compensate. Floor:
37855 if (x2 > x)
37856 x2 -= 1;
37857 Compensate. Ceil:
37858 if (x2 < x)
37859 x2 += 1;
37860 if (HONOR_SIGNED_ZEROS (mode))
37861 return copysign (x2, x);
37862 return x2;
37864 enum machine_mode mode = GET_MODE (operand0);
37865 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37867 TWO52 = ix86_gen_TWO52 (mode);
37869 /* Temporary for holding the result, initialized to the input
37870 operand to ease control flow. */
37871 res = gen_reg_rtx (mode);
37872 emit_move_insn (res, operand1);
37874 /* xa = abs (operand1) */
37875 xa = ix86_expand_sse_fabs (res, &mask);
37877 /* if (!isless (xa, TWO52)) goto label; */
37878 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37880 /* xa = (double)(long)x */
37881 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37882 expand_fix (xi, res, 0);
37883 expand_float (xa, xi, 0);
37885 /* generate 1.0 */
37886 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37888 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37889 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37890 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37891 gen_rtx_AND (mode, one, tmp)));
37892 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37893 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37894 emit_move_insn (res, tmp);
37896 if (HONOR_SIGNED_ZEROS (mode))
37897 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37899 emit_label (label);
37900 LABEL_NUSES (label) = 1;
37902 emit_move_insn (operand0, res);
37905 /* Expand SSE sequence for computing round from OPERAND1 storing
37906 into OPERAND0. Sequence that works without relying on DImode truncation
37907 via cvttsd2siq that is only available on 64bit targets. */
37908 void
37909 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37911 /* C code for the stuff we expand below.
37912 double xa = fabs (x), xa2, x2;
37913 if (!isless (xa, TWO52))
37914 return x;
37915 Using the absolute value and copying back sign makes
37916 -0.0 -> -0.0 correct.
37917 xa2 = xa + TWO52 - TWO52;
37918 Compensate.
37919 dxa = xa2 - xa;
37920 if (dxa <= -0.5)
37921 xa2 += 1;
37922 else if (dxa > 0.5)
37923 xa2 -= 1;
37924 x2 = copysign (xa2, x);
37925 return x2;
37927 enum machine_mode mode = GET_MODE (operand0);
37928 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37930 TWO52 = ix86_gen_TWO52 (mode);
37932 /* Temporary for holding the result, initialized to the input
37933 operand to ease control flow. */
37934 res = gen_reg_rtx (mode);
37935 emit_move_insn (res, operand1);
37937 /* xa = abs (operand1) */
37938 xa = ix86_expand_sse_fabs (res, &mask);
37940 /* if (!isless (xa, TWO52)) goto label; */
37941 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37943 /* xa2 = xa + TWO52 - TWO52; */
37944 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37945 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37947 /* dxa = xa2 - xa; */
37948 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37950 /* generate 0.5, 1.0 and -0.5 */
37951 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37952 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37953 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37954 0, OPTAB_DIRECT);
37956 /* Compensate. */
37957 tmp = gen_reg_rtx (mode);
37958 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37959 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37960 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37961 gen_rtx_AND (mode, one, tmp)));
37962 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37963 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37964 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37965 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37966 gen_rtx_AND (mode, one, tmp)));
37967 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37969 /* res = copysign (xa2, operand1) */
37970 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37972 emit_label (label);
37973 LABEL_NUSES (label) = 1;
37975 emit_move_insn (operand0, res);
37978 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37979 into OPERAND0. */
37980 void
37981 ix86_expand_trunc (rtx operand0, rtx operand1)
37983 /* C code for SSE variant we expand below.
37984 double xa = fabs (x), x2;
37985 if (!isless (xa, TWO52))
37986 return x;
37987 x2 = (double)(long)x;
37988 if (HONOR_SIGNED_ZEROS (mode))
37989 return copysign (x2, x);
37990 return x2;
37992 enum machine_mode mode = GET_MODE (operand0);
37993 rtx xa, xi, TWO52, label, res, mask;
37995 TWO52 = ix86_gen_TWO52 (mode);
37997 /* Temporary for holding the result, initialized to the input
37998 operand to ease control flow. */
37999 res = gen_reg_rtx (mode);
38000 emit_move_insn (res, operand1);
38002 /* xa = abs (operand1) */
38003 xa = ix86_expand_sse_fabs (res, &mask);
38005 /* if (!isless (xa, TWO52)) goto label; */
38006 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38008 /* x = (double)(long)x */
38009 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38010 expand_fix (xi, res, 0);
38011 expand_float (res, xi, 0);
38013 if (HONOR_SIGNED_ZEROS (mode))
38014 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38016 emit_label (label);
38017 LABEL_NUSES (label) = 1;
38019 emit_move_insn (operand0, res);
38022 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
38023 OPERAND0, without relying on the DImode truncation that needs a 64bit target. */
38024 void
38025 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38027 enum machine_mode mode = GET_MODE (operand0);
38028 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38030 /* C code for SSE variant we expand below.
38031 double xa = fabs (x), x2;
38032 if (!isless (xa, TWO52))
38033 return x;
38034 xa2 = xa + TWO52 - TWO52;
38035 Compensate:
38036 if (xa2 > xa)
38037 xa2 -= 1.0;
38038 x2 = copysign (xa2, x);
38039 return x2;
38042 TWO52 = ix86_gen_TWO52 (mode);
38044 /* Temporary for holding the result, initialized to the input
38045 operand to ease control flow. */
38046 res = gen_reg_rtx (mode);
38047 emit_move_insn (res, operand1);
38049 /* xa = abs (operand1) */
38050 xa = ix86_expand_sse_fabs (res, &smask);
38052 /* if (!isless (xa, TWO52)) goto label; */
38053 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38055 /* res = xa + TWO52 - TWO52; */
38056 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38057 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38058 emit_move_insn (res, tmp);
38060 /* generate 1.0 */
38061 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38063 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38064 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38065 emit_insn (gen_rtx_SET (VOIDmode, mask,
38066 gen_rtx_AND (mode, mask, one)));
38067 tmp = expand_simple_binop (mode, MINUS,
38068 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38069 emit_move_insn (res, tmp);
38071 /* res = copysign (res, operand1) */
38072 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38074 emit_label (label);
38075 LABEL_NUSES (label) = 1;
38077 emit_move_insn (operand0, res);
38080 /* Expand SSE sequence for computing round from OPERAND1 storing
38081 into OPERAND0. */
38082 void
38083 ix86_expand_round (rtx operand0, rtx operand1)
38085 /* C code for the stuff we're doing below:
38086 double xa = fabs (x);
38087 if (!isless (xa, TWO52))
38088 return x;
38089 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38090 return copysign (xa, x);
38092 enum machine_mode mode = GET_MODE (operand0);
38093 rtx res, TWO52, xa, label, xi, half, mask;
38094 const struct real_format *fmt;
38095 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38097 /* Temporary for holding the result, initialized to the input
38098 operand to ease control flow. */
38099 res = gen_reg_rtx (mode);
38100 emit_move_insn (res, operand1);
38102 TWO52 = ix86_gen_TWO52 (mode);
38103 xa = ix86_expand_sse_fabs (res, &mask);
38104 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38106 /* load nextafter (0.5, 0.0) */
38107 fmt = REAL_MODE_FORMAT (mode);
38108 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38109 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38111 /* xa = xa + 0.5 */
38112 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38113 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38115 /* xa = (double)(int64_t)xa */
38116 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38117 expand_fix (xi, xa, 0);
38118 expand_float (xa, xi, 0);
38120 /* res = copysign (xa, operand1) */
38121 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38123 emit_label (label);
38124 LABEL_NUSES (label) = 1;
38126 emit_move_insn (operand0, res);
38129 /* Expand SSE sequence for computing round
38130 from OP1 storing into OP0 using sse4 round insn. */
38131 void
38132 ix86_expand_round_sse4 (rtx op0, rtx op1)
38134 enum machine_mode mode = GET_MODE (op0);
38135 rtx e1, e2, res, half;
38136 const struct real_format *fmt;
38137 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38138 rtx (*gen_copysign) (rtx, rtx, rtx);
38139 rtx (*gen_round) (rtx, rtx, rtx);
38141 switch (mode)
38143 case SFmode:
38144 gen_copysign = gen_copysignsf3;
38145 gen_round = gen_sse4_1_roundsf2;
38146 break;
38147 case DFmode:
38148 gen_copysign = gen_copysigndf3;
38149 gen_round = gen_sse4_1_rounddf2;
38150 break;
38151 default:
38152 gcc_unreachable ();
38155 /* round (a) = trunc (a + copysign (0.5, a)) */
38157 /* load nextafter (0.5, 0.0) */
38158 fmt = REAL_MODE_FORMAT (mode);
38159 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38160 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38161 half = const_double_from_real_value (pred_half, mode);
38163 /* e1 = copysign (0.5, op1) */
38164 e1 = gen_reg_rtx (mode);
38165 emit_insn (gen_copysign (e1, half, op1));
38167 /* e2 = op1 + e1 */
38168 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38170 /* res = trunc (e2) */
38171 res = gen_reg_rtx (mode);
38172 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38174 emit_move_insn (op0, res);
38178 /* Table of valid machine attributes. */
38179 static const struct attribute_spec ix86_attribute_table[] =
38181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38182 affects_type_identity } */
38183 /* Stdcall attribute says callee is responsible for popping arguments
38184 if they are not variable. */
38185 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38186 true },
38187 /* Fastcall attribute says callee is responsible for popping arguments
38188 if they are not variable. */
38189 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38190 true },
38191 /* Thiscall attribute says callee is responsible for popping arguments
38192 if they are not variable. */
38193 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38194 true },
38195 /* Cdecl attribute says the callee is a normal C declaration */
38196 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38197 true },
38198 /* Regparm attribute specifies how many integer arguments are to be
38199 passed in registers. */
38200 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38201 true },
38202 /* Sseregparm attribute says we are using x86_64 calling conventions
38203 for FP arguments. */
38204 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38205 true },
38206 /* The transactional memory builtins are implicitly regparm or fastcall
38207 depending on the ABI. Override the generic do-nothing attribute that
38208 these builtins were declared with. */
38209 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38210 true },
38211 /* force_align_arg_pointer says this function realigns the stack at entry. */
38212 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38213 false, true, true, ix86_handle_cconv_attribute, false },
38214 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38215 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38216 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38217 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38218 false },
38219 #endif
38220 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38221 false },
38222 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38223 false },
38224 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38225 SUBTARGET_ATTRIBUTE_TABLE,
38226 #endif
38227 /* ms_abi and sysv_abi calling convention function attributes. */
38228 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38229 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38230 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38231 false },
38232 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38233 ix86_handle_callee_pop_aggregate_return, true },
38234 /* End element. */
38235 { NULL, 0, 0, false, false, false, NULL, false }
38238 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38239 static int
38240 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38241 tree vectype,
38242 int misalign ATTRIBUTE_UNUSED)
38244 unsigned elements;
38246 switch (type_of_cost)
38248 case scalar_stmt:
38249 return ix86_cost->scalar_stmt_cost;
38251 case scalar_load:
38252 return ix86_cost->scalar_load_cost;
38254 case scalar_store:
38255 return ix86_cost->scalar_store_cost;
38257 case vector_stmt:
38258 return ix86_cost->vec_stmt_cost;
38260 case vector_load:
38261 return ix86_cost->vec_align_load_cost;
38263 case vector_store:
38264 return ix86_cost->vec_store_cost;
38266 case vec_to_scalar:
38267 return ix86_cost->vec_to_scalar_cost;
38269 case scalar_to_vec:
38270 return ix86_cost->scalar_to_vec_cost;
38272 case unaligned_load:
38273 case unaligned_store:
38274 return ix86_cost->vec_unalign_load_cost;
38276 case cond_branch_taken:
38277 return ix86_cost->cond_taken_branch_cost;
38279 case cond_branch_not_taken:
38280 return ix86_cost->cond_not_taken_branch_cost;
38282 case vec_perm:
38283 case vec_promote_demote:
38284 return ix86_cost->vec_stmt_cost;
38286 case vec_construct:
38287 elements = TYPE_VECTOR_SUBPARTS (vectype);
38288 return elements / 2 + 1;
38290 default:
38291 gcc_unreachable ();
38295 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38296 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38297 insn every time. */
38299 static GTY(()) rtx vselect_insn;
38301 /* Initialize vselect_insn. */
38303 static void
38304 init_vselect_insn (void)
38306 unsigned i;
38307 rtx x;
38309 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38310 for (i = 0; i < MAX_VECT_LEN; ++i)
38311 XVECEXP (x, 0, i) = const0_rtx;
38312 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38313 const0_rtx), x);
38314 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38315 start_sequence ();
38316 vselect_insn = emit_insn (x);
38317 end_sequence ();
38320 /* Construct (set target (vec_select op0 (parallel perm))) and
38321 return true if that's a valid instruction in the active ISA. */
38323 static bool
38324 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38325 unsigned nelt, bool testing_p)
38327 unsigned int i;
38328 rtx x, save_vconcat;
38329 int icode;
38331 if (vselect_insn == NULL_RTX)
38332 init_vselect_insn ();
38334 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38335 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38336 for (i = 0; i < nelt; ++i)
38337 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38338 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38339 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38340 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38341 SET_DEST (PATTERN (vselect_insn)) = target;
38342 icode = recog_memoized (vselect_insn);
38344 if (icode >= 0 && !testing_p)
38345 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38347 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38348 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38349 INSN_CODE (vselect_insn) = -1;
38351 return icode >= 0;
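/* Note that expand_vselect works by mutating the single cached
   vselect_insn in place: the PARALLEL is resized to NELT entries, the
   source operand and destination are plugged in, and recog_memoized is
   asked whether any enabled pattern matches.  Only when a match is found
   and !TESTING_P is a copy of the pattern actually emitted; the cached
   insn is then reset so it can be reused for the next query.  */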
38354 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38356 static bool
38357 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38358 const unsigned char *perm, unsigned nelt,
38359 bool testing_p)
38361 enum machine_mode v2mode;
38362 rtx x;
38363 bool ok;
38365 if (vselect_insn == NULL_RTX)
38366 init_vselect_insn ();
38368 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38369 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38370 PUT_MODE (x, v2mode);
38371 XEXP (x, 0) = op0;
38372 XEXP (x, 1) = op1;
38373 ok = expand_vselect (target, x, perm, nelt, testing_p);
38374 XEXP (x, 0) = const0_rtx;
38375 XEXP (x, 1) = const0_rtx;
38376 return ok;
38379 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38380 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38382 static bool
38383 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38385 enum machine_mode vmode = d->vmode;
38386 unsigned i, mask, nelt = d->nelt;
38387 rtx target, op0, op1, x;
38388 rtx rperm[32], vperm;
38390 if (d->one_operand_p)
38391 return false;
38392 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38394 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38396 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38398 else
38399 return false;
38401 /* This is a blend, not a permute. Elements must stay in their
38402 respective lanes. */
38403 for (i = 0; i < nelt; ++i)
38405 unsigned e = d->perm[i];
38406 if (!(e == i || e == i + nelt))
38407 return false;
38410 if (d->testing_p)
38411 return true;
38413 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38414 decision should be extracted elsewhere, so that we only try that
38415 sequence once all budget==3 options have been tried. */
38416 target = d->target;
38417 op0 = d->op0;
38418 op1 = d->op1;
38419 mask = 0;
38421 switch (vmode)
38423 case V4DFmode:
38424 case V8SFmode:
38425 case V2DFmode:
38426 case V4SFmode:
38427 case V8HImode:
38428 case V8SImode:
38429 for (i = 0; i < nelt; ++i)
38430 mask |= (d->perm[i] >= nelt) << i;
38431 break;
38433 case V2DImode:
38434 for (i = 0; i < 2; ++i)
38435 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38436 vmode = V8HImode;
38437 goto do_subreg;
38439 case V4SImode:
38440 for (i = 0; i < 4; ++i)
38441 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38442 vmode = V8HImode;
38443 goto do_subreg;
38445 case V16QImode:
38446 /* See if bytes move in pairs so we can use pblendw with
38447 an immediate argument, rather than pblendvb with a vector
38448 argument. */
38449 for (i = 0; i < 16; i += 2)
38450 if (d->perm[i] + 1 != d->perm[i + 1])
38452 use_pblendvb:
38453 for (i = 0; i < nelt; ++i)
38454 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38456 finish_pblendvb:
38457 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38458 vperm = force_reg (vmode, vperm);
38460 if (GET_MODE_SIZE (vmode) == 16)
38461 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38462 else
38463 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38464 return true;
38467 for (i = 0; i < 8; ++i)
38468 mask |= (d->perm[i * 2] >= 16) << i;
38469 vmode = V8HImode;
38470 /* FALLTHRU */
38472 do_subreg:
38473 target = gen_lowpart (vmode, target);
38474 op0 = gen_lowpart (vmode, op0);
38475 op1 = gen_lowpart (vmode, op1);
38476 break;
38478 case V32QImode:
38479 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38480 for (i = 0; i < 32; i += 2)
38481 if (d->perm[i] + 1 != d->perm[i + 1])
38482 goto use_pblendvb;
38483 /* See if bytes move in quadruplets. If yes, vpblendd
38484 with immediate can be used. */
38485 for (i = 0; i < 32; i += 4)
38486 if (d->perm[i] + 2 != d->perm[i + 2])
38487 break;
38488 if (i < 32)
38490 /* See if bytes move the same in both lanes. If yes,
38491 vpblendw with immediate can be used. */
38492 for (i = 0; i < 16; i += 2)
38493 if (d->perm[i] + 16 != d->perm[i + 16])
38494 goto use_pblendvb;
38496 /* Use vpblendw. */
38497 for (i = 0; i < 16; ++i)
38498 mask |= (d->perm[i * 2] >= 32) << i;
38499 vmode = V16HImode;
38500 goto do_subreg;
38503 /* Use vpblendd. */
38504 for (i = 0; i < 8; ++i)
38505 mask |= (d->perm[i * 4] >= 32) << i;
38506 vmode = V8SImode;
38507 goto do_subreg;
38509 case V16HImode:
38510 /* See if words move in pairs. If yes, vpblendd can be used. */
38511 for (i = 0; i < 16; i += 2)
38512 if (d->perm[i] + 1 != d->perm[i + 1])
38513 break;
38514 if (i < 16)
38516 /* See if words move the same in both lanes. If not,
38517 vpblendvb must be used. */
38518 for (i = 0; i < 8; i++)
38519 if (d->perm[i] + 8 != d->perm[i + 8])
38521 /* Use vpblendvb. */
38522 for (i = 0; i < 32; ++i)
38523 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38525 vmode = V32QImode;
38526 nelt = 32;
38527 target = gen_lowpart (vmode, target);
38528 op0 = gen_lowpart (vmode, op0);
38529 op1 = gen_lowpart (vmode, op1);
38530 goto finish_pblendvb;
38533 /* Use vpblendw. */
38534 for (i = 0; i < 16; ++i)
38535 mask |= (d->perm[i] >= 16) << i;
38536 break;
38539 /* Use vpblendd. */
38540 for (i = 0; i < 8; ++i)
38541 mask |= (d->perm[i * 2] >= 16) << i;
38542 vmode = V8SImode;
38543 goto do_subreg;
38545 case V4DImode:
38546 /* Use vpblendd. */
38547 for (i = 0; i < 4; ++i)
38548 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38549 vmode = V8SImode;
38550 goto do_subreg;
38552 default:
38553 gcc_unreachable ();
38556 /* This matches five different patterns with the different modes. */
38557 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38558 x = gen_rtx_SET (VOIDmode, target, x);
38559 emit_insn (x);
38561 return true;
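/* For instance, a V4SFmode permutation {0, 5, 2, 7} keeps elements 0 and
   2 of op0 and takes elements 1 and 3 from op1; every index stays in its
   own lane position, so it is a blend with mask 0b1010, and the
   vec_merge emitted above corresponds to a blendps with that
   immediate.  */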
38564 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38565 in terms of the variable form of vpermilps.
38567 Note that we will have already failed the immediate input vpermilps,
38568 which requires that the high and low part shuffle be identical; the
38569 variable form doesn't require that. */
38571 static bool
38572 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38574 rtx rperm[8], vperm;
38575 unsigned i;
38577 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38578 return false;
38580 /* We can only permute within the 128-bit lane. */
38581 for (i = 0; i < 8; ++i)
38583 unsigned e = d->perm[i];
38584 if (i < 4 ? e >= 4 : e < 4)
38585 return false;
38588 if (d->testing_p)
38589 return true;
38591 for (i = 0; i < 8; ++i)
38593 unsigned e = d->perm[i];
38595 /* Within each 128-bit lane, the elements of op0 are numbered
38596 from 0 and the elements of op1 are numbered from 4. */
38597 if (e >= 8 + 4)
38598 e -= 8;
38599 else if (e >= 4)
38600 e -= 4;
38602 rperm[i] = GEN_INT (e);
38605 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38606 vperm = force_reg (V8SImode, vperm);
38607 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38609 return true;
38612 /* Return true if permutation D can be performed as VMODE permutation
38613 instead. */
38615 static bool
38616 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38618 unsigned int i, j, chunk;
38620 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38621 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38622 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38623 return false;
38625 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38626 return true;
38628 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38629 for (i = 0; i < d->nelt; i += chunk)
38630 if (d->perm[i] & (chunk - 1))
38631 return false;
38632 else
38633 for (j = 1; j < chunk; ++j)
38634 if (d->perm[i] + j != d->perm[i + j])
38635 return false;
38637 return true;
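/* E.g. a V16QImode permutation is valid as a V4SImode permutation when
   the bytes move in aligned groups of four: each group must start at an
   index that is a multiple of four and the following three indices must
   be consecutive, so the shuffle can be re-expressed on the wider
   elements.  */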
38640 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38641 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38643 static bool
38644 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38646 unsigned i, nelt, eltsz, mask;
38647 unsigned char perm[32];
38648 enum machine_mode vmode = V16QImode;
38649 rtx rperm[32], vperm, target, op0, op1;
38651 nelt = d->nelt;
38653 if (!d->one_operand_p)
38655 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38657 if (TARGET_AVX2
38658 && valid_perm_using_mode_p (V2TImode, d))
38660 if (d->testing_p)
38661 return true;
38663 /* Use vperm2i128 insn. The pattern uses
38664 V4DImode instead of V2TImode. */
38665 target = gen_lowpart (V4DImode, d->target);
38666 op0 = gen_lowpart (V4DImode, d->op0);
38667 op1 = gen_lowpart (V4DImode, d->op1);
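/* Build the vperm2i128 immediate: bits 0-1 pick the 128-bit source lane
   for the low half of the result and bits 4-5 the lane for the high half,
   where lanes 0-1 come from op0 and lanes 2-3 from op1. */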
38668 rperm[0]
38669 = GEN_INT ((d->perm[0] / (nelt / 2))
38670 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38671 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38672 return true;
38674 return false;
38677 else
38679 if (GET_MODE_SIZE (d->vmode) == 16)
38681 if (!TARGET_SSSE3)
38682 return false;
38684 else if (GET_MODE_SIZE (d->vmode) == 32)
38686 if (!TARGET_AVX2)
38687 return false;
38689 /* V4DImode should already have been handled through
38690 expand_vselect by the vpermq instruction. */
38691 gcc_assert (d->vmode != V4DImode);
38693 vmode = V32QImode;
38694 if (d->vmode == V8SImode
38695 || d->vmode == V16HImode
38696 || d->vmode == V32QImode)
38698 /* First see if vpermq can be used for
38699 V8SImode/V16HImode/V32QImode. */
38700 if (valid_perm_using_mode_p (V4DImode, d))
38702 for (i = 0; i < 4; i++)
38703 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38704 if (d->testing_p)
38705 return true;
38706 return expand_vselect (gen_lowpart (V4DImode, d->target),
38707 gen_lowpart (V4DImode, d->op0),
38708 perm, 4, false);
38711 /* Next see if vpermd can be used. */
38712 if (valid_perm_using_mode_p (V8SImode, d))
38713 vmode = V8SImode;
38715 /* Or if vpermps can be used. */
38716 else if (d->vmode == V8SFmode)
38717 vmode = V8SImode;
38719 if (vmode == V32QImode)
38721 /* vpshufb only works within 128-bit lanes; it is not
38722 possible to shuffle bytes across lane boundaries. */
38723 for (i = 0; i < nelt; ++i)
38724 if ((d->perm[i] ^ i) & (nelt / 2))
38725 return false;
38728 else
38729 return false;
38732 if (d->testing_p)
38733 return true;
38735 if (vmode == V8SImode)
38736 for (i = 0; i < 8; ++i)
38737 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38738 else
38740 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38741 if (!d->one_operand_p)
38742 mask = 2 * nelt - 1;
38743 else if (vmode == V16QImode)
38744 mask = nelt - 1;
38745 else
38746 mask = nelt / 2 - 1;
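/* The mask reflects how far each selector can reach: XOP vpperm indexes
   both operands (0 .. 2*nelt-1), pshufb indexes a whole 16-byte vector,
   and AVX2 vpshufb only indexes within each 16-byte lane. */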
38748 for (i = 0; i < nelt; ++i)
38750 unsigned j, e = d->perm[i] & mask;
38751 for (j = 0; j < eltsz; ++j)
38752 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38756 vperm = gen_rtx_CONST_VECTOR (vmode,
38757 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38758 vperm = force_reg (vmode, vperm);
38760 target = gen_lowpart (vmode, d->target);
38761 op0 = gen_lowpart (vmode, d->op0);
38762 if (d->one_operand_p)
38764 if (vmode == V16QImode)
38765 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38766 else if (vmode == V32QImode)
38767 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38768 else if (vmode == V8SFmode)
38769 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38770 else
38771 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38773 else
38775 op1 = gen_lowpart (vmode, d->op1);
38776 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38779 return true;
38782 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38783 in a single instruction. */
38785 static bool
38786 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38788 unsigned i, nelt = d->nelt;
38789 unsigned char perm2[MAX_VECT_LEN];
38791 /* Check plain VEC_SELECT first, because AVX has instructions that could
38792 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38793 input where SEL+CONCAT may not. */
38794 if (d->one_operand_p)
38796 int mask = nelt - 1;
38797 bool identity_perm = true;
38798 bool broadcast_perm = true;
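/* Classify the permutation: an identity leaves every element in place,
   a broadcast replicates element 0 into every position. */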
38800 for (i = 0; i < nelt; i++)
38802 perm2[i] = d->perm[i] & mask;
38803 if (perm2[i] != i)
38804 identity_perm = false;
38805 if (perm2[i])
38806 broadcast_perm = false;
38809 if (identity_perm)
38811 if (!d->testing_p)
38812 emit_move_insn (d->target, d->op0);
38813 return true;
38815 else if (broadcast_perm && TARGET_AVX2)
38817 /* Use vpbroadcast{b,w,d}. */
38818 rtx (*gen) (rtx, rtx) = NULL;
38819 switch (d->vmode)
38821 case V32QImode:
38822 gen = gen_avx2_pbroadcastv32qi_1;
38823 break;
38824 case V16HImode:
38825 gen = gen_avx2_pbroadcastv16hi_1;
38826 break;
38827 case V8SImode:
38828 gen = gen_avx2_pbroadcastv8si_1;
38829 break;
38830 case V16QImode:
38831 gen = gen_avx2_pbroadcastv16qi;
38832 break;
38833 case V8HImode:
38834 gen = gen_avx2_pbroadcastv8hi;
38835 break;
38836 case V8SFmode:
38837 gen = gen_avx2_vec_dupv8sf_1;
38838 break;
38839 /* For other modes prefer other shuffles this function creates. */
38840 default: break;
38842 if (gen != NULL)
38844 if (!d->testing_p)
38845 emit_insn (gen (d->target, d->op0));
38846 return true;
38850 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38851 return true;
38853 /* There are plenty of patterns in sse.md that are written for
38854 SEL+CONCAT and are not replicated for a single op. Perhaps
38855 that should be changed, to avoid the nastiness here. */
38857 /* Recognize interleave style patterns, which means incrementing
38858 every other permutation operand. */
38859 for (i = 0; i < nelt; i += 2)
38861 perm2[i] = d->perm[i] & mask;
38862 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38864 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38865 d->testing_p))
38866 return true;
38868 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38869 if (nelt >= 4)
38871 for (i = 0; i < nelt; i += 4)
38873 perm2[i + 0] = d->perm[i + 0] & mask;
38874 perm2[i + 1] = d->perm[i + 1] & mask;
38875 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38876 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38879 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38880 d->testing_p))
38881 return true;
38885 /* Finally, try the fully general two operand permute. */
38886 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38887 d->testing_p))
38888 return true;
38890 /* Recognize interleave style patterns with reversed operands. */
38891 if (!d->one_operand_p)
38893 for (i = 0; i < nelt; ++i)
38895 unsigned e = d->perm[i];
38896 if (e >= nelt)
38897 e -= nelt;
38898 else
38899 e += nelt;
38900 perm2[i] = e;
38903 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38904 d->testing_p))
38905 return true;
38908 /* Try the SSE4.1 blend variable merge instructions. */
38909 if (expand_vec_perm_blend (d))
38910 return true;
38912 /* Try one of the AVX vpermil variable permutations. */
38913 if (expand_vec_perm_vpermil (d))
38914 return true;
38916 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38917 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38918 if (expand_vec_perm_pshufb (d))
38919 return true;
38921 return false;
38924 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38925 in terms of a pair of pshuflw + pshufhw instructions. */
38927 static bool
38928 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38930 unsigned char perm2[MAX_VECT_LEN];
38931 unsigned i;
38932 bool ok;
38934 if (d->vmode != V8HImode || !d->one_operand_p)
38935 return false;
38937 /* The two permutations only operate in 64-bit lanes. */
38938 for (i = 0; i < 4; ++i)
38939 if (d->perm[i] >= 4)
38940 return false;
38941 for (i = 4; i < 8; ++i)
38942 if (d->perm[i] < 4)
38943 return false;
38945 if (d->testing_p)
38946 return true;
38948 /* Emit the pshuflw. */
38949 memcpy (perm2, d->perm, 4);
38950 for (i = 4; i < 8; ++i)
38951 perm2[i] = i;
38952 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38953 gcc_assert (ok);
38955 /* Emit the pshufhw. */
38956 memcpy (perm2 + 4, d->perm + 4, 4);
38957 for (i = 0; i < 4; ++i)
38958 perm2[i] = i;
38959 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38960 gcc_assert (ok);
38962 return true;
38965 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38966 the permutation using the SSSE3 palignr instruction. This succeeds
38967 when all of the elements in PERM fit within one vector and we merely
38968 need to shift them down so that a single vector permutation has a
38969 chance to succeed. */
38971 static bool
38972 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38974 unsigned i, nelt = d->nelt;
38975 unsigned min, max;
38976 bool in_order, ok;
38977 rtx shift;
38979 /* Even with AVX, palignr only operates on 128-bit vectors. */
38980 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38981 return false;
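/* Find the span [MIN, MAX] of input elements referenced by the
   permutation; palignr can only help when the whole span fits in one
   vector (MAX - MIN < NELT) and actually needs shifting (MIN != 0). */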
38983 min = nelt, max = 0;
38984 for (i = 0; i < nelt; ++i)
38986 unsigned e = d->perm[i];
38987 if (e < min)
38988 min = e;
38989 if (e > max)
38990 max = e;
38992 if (min == 0 || max - min >= nelt)
38993 return false;
38995 /* Given that we have SSSE3, we know we'll be able to implement the
38996 single operand permutation after the palignr with pshufb. */
38997 if (d->testing_p)
38998 return true;
39000 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39001 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39002 gen_lowpart (TImode, d->op1),
39003 gen_lowpart (TImode, d->op0), shift));
39005 d->op0 = d->op1 = d->target;
39006 d->one_operand_p = true;
39008 in_order = true;
39009 for (i = 0; i < nelt; ++i)
39011 unsigned e = d->perm[i] - min;
39012 if (e != i)
39013 in_order = false;
39014 d->perm[i] = e;
39017 /* Test for the degenerate case where the alignment by itself
39018 produces the desired permutation. */
39019 if (in_order)
39020 return true;
39022 ok = expand_vec_perm_1 (d);
39023 gcc_assert (ok);
39025 return ok;
39028 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39030 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39031 a two vector permutation into a single vector permutation by using
39032 an interleave operation to merge the vectors. */
39034 static bool
39035 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39037 struct expand_vec_perm_d dremap, dfinal;
39038 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39039 unsigned HOST_WIDE_INT contents;
39040 unsigned char remap[2 * MAX_VECT_LEN];
39041 rtx seq;
39042 bool ok, same_halves = false;
39044 if (GET_MODE_SIZE (d->vmode) == 16)
39046 if (d->one_operand_p)
39047 return false;
39049 else if (GET_MODE_SIZE (d->vmode) == 32)
39051 if (!TARGET_AVX)
39052 return false;
39053 /* For 32-byte modes allow even d->one_operand_p.
39054 The lack of cross-lane shuffling in some instructions
39055 might prevent a single insn shuffle. */
39056 dfinal = *d;
39057 dfinal.testing_p = true;
39058 /* If expand_vec_perm_interleave3 can expand this into
39059 a 3 insn sequence, give up and let it be expanded as
39060 3 insn sequence. While that is one insn longer,
39061 it doesn't need a memory operand, and in the common
39062 case where the interleave low and high permutations
39063 with the same operands are adjacent, the pair needs
39064 only 4 insns in total after CSE. */
39065 if (expand_vec_perm_interleave3 (&dfinal))
39066 return false;
39068 else
39069 return false;
39071 /* Examine from whence the elements come. */
39072 contents = 0;
39073 for (i = 0; i < nelt; ++i)
39074 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39076 memset (remap, 0xff, sizeof (remap));
39077 dremap = *d;
39079 if (GET_MODE_SIZE (d->vmode) == 16)
39081 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39083 /* Split the two input vectors into 4 halves. */
39084 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39085 h2 = h1 << nelt2;
39086 h3 = h2 << nelt2;
39087 h4 = h3 << nelt2;
39089 /* If all elements come from the low halves, use interleave low; similarly,
39090 if all come from the high halves, use interleave high. If the elements are
39091 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39092 if ((contents & (h1 | h3)) == contents)
39094 /* punpckl* */
39095 for (i = 0; i < nelt2; ++i)
39097 remap[i] = i * 2;
39098 remap[i + nelt] = i * 2 + 1;
39099 dremap.perm[i * 2] = i;
39100 dremap.perm[i * 2 + 1] = i + nelt;
39102 if (!TARGET_SSE2 && d->vmode == V4SImode)
39103 dremap.vmode = V4SFmode;
39105 else if ((contents & (h2 | h4)) == contents)
39107 /* punpckh* */
39108 for (i = 0; i < nelt2; ++i)
39110 remap[i + nelt2] = i * 2;
39111 remap[i + nelt + nelt2] = i * 2 + 1;
39112 dremap.perm[i * 2] = i + nelt2;
39113 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39115 if (!TARGET_SSE2 && d->vmode == V4SImode)
39116 dremap.vmode = V4SFmode;
39118 else if ((contents & (h1 | h4)) == contents)
39120 /* shufps */
39121 for (i = 0; i < nelt2; ++i)
39123 remap[i] = i;
39124 remap[i + nelt + nelt2] = i + nelt2;
39125 dremap.perm[i] = i;
39126 dremap.perm[i + nelt2] = i + nelt + nelt2;
39128 if (nelt != 4)
39130 /* shufpd */
39131 dremap.vmode = V2DImode;
39132 dremap.nelt = 2;
39133 dremap.perm[0] = 0;
39134 dremap.perm[1] = 3;
39137 else if ((contents & (h2 | h3)) == contents)
39139 /* shufps */
39140 for (i = 0; i < nelt2; ++i)
39142 remap[i + nelt2] = i;
39143 remap[i + nelt] = i + nelt2;
39144 dremap.perm[i] = i + nelt2;
39145 dremap.perm[i + nelt2] = i + nelt;
39147 if (nelt != 4)
39149 /* shufpd */
39150 dremap.vmode = V2DImode;
39151 dremap.nelt = 2;
39152 dremap.perm[0] = 1;
39153 dremap.perm[1] = 2;
39156 else
39157 return false;
39159 else
39161 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39162 unsigned HOST_WIDE_INT q[8];
39163 unsigned int nonzero_halves[4];
39165 /* Split the two input vectors into 8 quarters. */
39166 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39167 for (i = 1; i < 8; ++i)
39168 q[i] = q[0] << (nelt4 * i);
39169 for (i = 0; i < 4; ++i)
39170 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39172 nonzero_halves[nzcnt] = i;
39173 ++nzcnt;
39176 if (nzcnt == 1)
39178 gcc_assert (d->one_operand_p);
39179 nonzero_halves[1] = nonzero_halves[0];
39180 same_halves = true;
39182 else if (d->one_operand_p)
39184 gcc_assert (nonzero_halves[0] == 0);
39185 gcc_assert (nonzero_halves[1] == 1);
39188 if (nzcnt <= 2)
39190 if (d->perm[0] / nelt2 == nonzero_halves[1])
39192 /* Attempt to increase the likelihood that dfinal
39193 shuffle will be intra-lane. */
39194 char tmph = nonzero_halves[0];
39195 nonzero_halves[0] = nonzero_halves[1];
39196 nonzero_halves[1] = tmph;
39199 /* vperm2f128 or vperm2i128. */
39200 for (i = 0; i < nelt2; ++i)
39202 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39203 remap[i + nonzero_halves[0] * nelt2] = i;
39204 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39205 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39208 if (d->vmode != V8SFmode
39209 && d->vmode != V4DFmode
39210 && d->vmode != V8SImode)
39212 dremap.vmode = V8SImode;
39213 dremap.nelt = 8;
39214 for (i = 0; i < 4; ++i)
39216 dremap.perm[i] = i + nonzero_halves[0] * 4;
39217 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39221 else if (d->one_operand_p)
39222 return false;
39223 else if (TARGET_AVX2
39224 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39226 /* vpunpckl* */
39227 for (i = 0; i < nelt4; ++i)
39229 remap[i] = i * 2;
39230 remap[i + nelt] = i * 2 + 1;
39231 remap[i + nelt2] = i * 2 + nelt2;
39232 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39233 dremap.perm[i * 2] = i;
39234 dremap.perm[i * 2 + 1] = i + nelt;
39235 dremap.perm[i * 2 + nelt2] = i + nelt2;
39236 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39239 else if (TARGET_AVX2
39240 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39242 /* vpunpckh* */
39243 for (i = 0; i < nelt4; ++i)
39245 remap[i + nelt4] = i * 2;
39246 remap[i + nelt + nelt4] = i * 2 + 1;
39247 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39248 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39249 dremap.perm[i * 2] = i + nelt4;
39250 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39251 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39252 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39255 else
39256 return false;
39259 /* Use the remapping array set up above to move the elements from their
39260 swizzled locations into their final destinations. */
39261 dfinal = *d;
39262 for (i = 0; i < nelt; ++i)
39264 unsigned e = remap[d->perm[i]];
39265 gcc_assert (e < nelt);
39266 /* If same_halves is true, both halves of the remapped vector are the
39267 same. Avoid cross-lane accesses if possible. */
39268 if (same_halves && i >= nelt2)
39270 gcc_assert (e < nelt2);
39271 dfinal.perm[i] = e + nelt2;
39273 else
39274 dfinal.perm[i] = e;
39276 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39277 dfinal.op1 = dfinal.op0;
39278 dfinal.one_operand_p = true;
39279 dremap.target = dfinal.op0;
39281 /* Test if the final remap can be done with a single insn. For V4SFmode or
39282 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39283 start_sequence ();
39284 ok = expand_vec_perm_1 (&dfinal);
39285 seq = get_insns ();
39286 end_sequence ();
39288 if (!ok)
39289 return false;
39291 if (d->testing_p)
39292 return true;
39294 if (dremap.vmode != dfinal.vmode)
39296 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39297 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39298 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39301 ok = expand_vec_perm_1 (&dremap);
39302 gcc_assert (ok);
39304 emit_insn (seq);
39305 return true;
39308 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39309 a single vector cross-lane permutation into vpermq followed
39310 by any of the single insn permutations. */
39312 static bool
39313 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39315 struct expand_vec_perm_d dremap, dfinal;
39316 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39317 unsigned contents[2];
39318 bool ok;
39320 if (!(TARGET_AVX2
39321 && (d->vmode == V32QImode || d->vmode == V16HImode)
39322 && d->one_operand_p))
39323 return false;
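/* Record which 64-bit quarters of the input feed the low and the high
   half of the result; each half may draw from at most two quarters for
   a single vpermq to bring them into place. */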
39325 contents[0] = 0;
39326 contents[1] = 0;
39327 for (i = 0; i < nelt2; ++i)
39329 contents[0] |= 1u << (d->perm[i] / nelt4);
39330 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39333 for (i = 0; i < 2; ++i)
39335 unsigned int cnt = 0;
39336 for (j = 0; j < 4; ++j)
39337 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39338 return false;
39341 if (d->testing_p)
39342 return true;
39344 dremap = *d;
39345 dremap.vmode = V4DImode;
39346 dremap.nelt = 4;
39347 dremap.target = gen_reg_rtx (V4DImode);
39348 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39349 dremap.op1 = dremap.op0;
39350 dremap.one_operand_p = true;
39351 for (i = 0; i < 2; ++i)
39353 unsigned int cnt = 0;
39354 for (j = 0; j < 4; ++j)
39355 if ((contents[i] & (1u << j)) != 0)
39356 dremap.perm[2 * i + cnt++] = j;
39357 for (; cnt < 2; ++cnt)
39358 dremap.perm[2 * i + cnt] = 0;
39361 dfinal = *d;
39362 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39363 dfinal.op1 = dfinal.op0;
39364 dfinal.one_operand_p = true;
39365 for (i = 0, j = 0; i < nelt; ++i)
39367 if (i == nelt2)
39368 j = 2;
39369 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39370 if ((d->perm[i] / nelt4) == dremap.perm[j])
39372 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39373 dfinal.perm[i] |= nelt4;
39374 else
39375 gcc_unreachable ();
39378 ok = expand_vec_perm_1 (&dremap);
39379 gcc_assert (ok);
39381 ok = expand_vec_perm_1 (&dfinal);
39382 gcc_assert (ok);
39384 return true;
39387 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39388 a vector permutation using two instructions, vperm2f128 resp.
39389 vperm2i128 followed by any single in-lane permutation. */
39391 static bool
39392 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39394 struct expand_vec_perm_d dfirst, dsecond;
39395 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39396 bool ok;
39398 if (!TARGET_AVX
39399 || GET_MODE_SIZE (d->vmode) != 32
39400 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39401 return false;
39403 dsecond = *d;
39404 dsecond.one_operand_p = false;
39405 dsecond.testing_p = true;
39407 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39408 immediate. For perm < 16 the second permutation uses
39409 d->op0 as first operand, for perm >= 16 it uses d->op1
39410 as first operand. The second operand is the result of
39411 vperm2[fi]128. */
39412 for (perm = 0; perm < 32; perm++)
39414 /* Ignore permutations which do not move anything cross-lane. */
39415 if (perm < 16)
39417 /* The second shuffle for e.g. V4DFmode has
39418 0123 and ABCD operands.
39419 Ignore AB23, as 23 is already in the second lane
39420 of the first operand. */
39421 if ((perm & 0xc) == (1 << 2)) continue;
39422 /* And 01CD, as 01 is in the first lane of the first
39423 operand. */
39424 if ((perm & 3) == 0) continue;
39425 /* And 4567, as then the vperm2[fi]128 doesn't change
39426 anything on the original 4567 second operand. */
39427 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39429 else
39431 /* The second shuffle for e.g. V4DFmode has
39432 4567 and ABCD operands.
39433 Ignore AB67, as 67 is already in the second lane
39434 of the first operand. */
39435 if ((perm & 0xc) == (3 << 2)) continue;
39436 /* And 45CD, as 45 is in the first lane of the first
39437 operand. */
39438 if ((perm & 3) == 2) continue;
39439 /* And 0123, as then the vperm2[fi]128 doesn't change
39440 anything on the original 0123 first operand. */
39441 if ((perm & 0xf) == (1 << 2)) continue;
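/* Check whether, after the candidate vperm2[fi]128 selected by PERM,
   every result element can be produced by a single in-lane shuffle of
   the unchanged operand and the vperm2[fi]128 result. */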
39444 for (i = 0; i < nelt; i++)
39446 j = d->perm[i] / nelt2;
39447 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39448 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39449 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39450 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39451 else
39452 break;
39455 if (i == nelt)
39457 start_sequence ();
39458 ok = expand_vec_perm_1 (&dsecond);
39459 end_sequence ();
39461 else
39462 ok = false;
39464 if (ok)
39466 if (d->testing_p)
39467 return true;
39469 /* Found a usable second shuffle. dfirst will be
39470 vperm2f128 on d->op0 and d->op1. */
39471 dsecond.testing_p = false;
39472 dfirst = *d;
39473 dfirst.target = gen_reg_rtx (d->vmode);
39474 for (i = 0; i < nelt; i++)
39475 dfirst.perm[i] = (i & (nelt2 - 1))
39476 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39478 ok = expand_vec_perm_1 (&dfirst);
39479 gcc_assert (ok);
39481 /* And dsecond is some single insn shuffle, taking
39482 d->op0 and result of vperm2f128 (if perm < 16) or
39483 d->op1 and result of vperm2f128 (otherwise). */
39484 dsecond.op1 = dfirst.target;
39485 if (perm >= 16)
39486 dsecond.op0 = dfirst.op1;
39488 ok = expand_vec_perm_1 (&dsecond);
39489 gcc_assert (ok);
39491 return true;
39494 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39495 if (d->one_operand_p)
39496 return false;
39499 return false;
39502 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39503 a two vector permutation using 2 intra-lane interleave insns
39504 and cross-lane shuffle for 32-byte vectors. */
39506 static bool
39507 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39509 unsigned i, nelt;
39510 rtx (*gen) (rtx, rtx, rtx);
39512 if (d->one_operand_p)
39513 return false;
39514 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39516 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39518 else
39519 return false;
39521 nelt = d->nelt;
39522 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39523 return false;
39524 for (i = 0; i < nelt; i += 2)
39525 if (d->perm[i] != d->perm[0] + i / 2
39526 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39527 return false;
39529 if (d->testing_p)
39530 return true;
39532 switch (d->vmode)
39534 case V32QImode:
39535 if (d->perm[0])
39536 gen = gen_vec_interleave_highv32qi;
39537 else
39538 gen = gen_vec_interleave_lowv32qi;
39539 break;
39540 case V16HImode:
39541 if (d->perm[0])
39542 gen = gen_vec_interleave_highv16hi;
39543 else
39544 gen = gen_vec_interleave_lowv16hi;
39545 break;
39546 case V8SImode:
39547 if (d->perm[0])
39548 gen = gen_vec_interleave_highv8si;
39549 else
39550 gen = gen_vec_interleave_lowv8si;
39551 break;
39552 case V4DImode:
39553 if (d->perm[0])
39554 gen = gen_vec_interleave_highv4di;
39555 else
39556 gen = gen_vec_interleave_lowv4di;
39557 break;
39558 case V8SFmode:
39559 if (d->perm[0])
39560 gen = gen_vec_interleave_highv8sf;
39561 else
39562 gen = gen_vec_interleave_lowv8sf;
39563 break;
39564 case V4DFmode:
39565 if (d->perm[0])
39566 gen = gen_vec_interleave_highv4df;
39567 else
39568 gen = gen_vec_interleave_lowv4df;
39569 break;
39570 default:
39571 gcc_unreachable ();
39574 emit_insn (gen (d->target, d->op0, d->op1));
39575 return true;
39578 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39579 a single vector permutation using a single intra-lane vector
39580 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39581 the non-swapped and swapped vectors together. */
39583 static bool
39584 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39586 struct expand_vec_perm_d dfirst, dsecond;
39587 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39588 rtx seq;
39589 bool ok;
39590 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39592 if (!TARGET_AVX
39593 || TARGET_AVX2
39594 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39595 || !d->one_operand_p)
39596 return false;
39598 dfirst = *d;
39599 for (i = 0; i < nelt; i++)
39600 dfirst.perm[i] = 0xff;
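/* Build DFIRST as an intra-lane permutation that places each requested
   element, within the lane it already occupies, at the offset it must
   finally occupy; bit I of MSK is set when position I must instead be
   taken from the lane-swapped copy. */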
39601 for (i = 0, msk = 0; i < nelt; i++)
39603 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39604 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39605 return false;
39606 dfirst.perm[j] = d->perm[i];
39607 if (j != i)
39608 msk |= (1 << i);
39610 for (i = 0; i < nelt; i++)
39611 if (dfirst.perm[i] == 0xff)
39612 dfirst.perm[i] = i;
39614 if (!d->testing_p)
39615 dfirst.target = gen_reg_rtx (dfirst.vmode);
39617 start_sequence ();
39618 ok = expand_vec_perm_1 (&dfirst);
39619 seq = get_insns ();
39620 end_sequence ();
39622 if (!ok)
39623 return false;
39625 if (d->testing_p)
39626 return true;
39628 emit_insn (seq);
39630 dsecond = *d;
39631 dsecond.op0 = dfirst.target;
39632 dsecond.op1 = dfirst.target;
39633 dsecond.one_operand_p = true;
39634 dsecond.target = gen_reg_rtx (dsecond.vmode);
39635 for (i = 0; i < nelt; i++)
39636 dsecond.perm[i] = i ^ nelt2;
39638 ok = expand_vec_perm_1 (&dsecond);
39639 gcc_assert (ok);
39641 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39642 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39643 return true;
39646 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39647 permutation using two vperm2f128, followed by a vshufpd insn blending
39648 the two vectors together. */
39650 static bool
39651 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39653 struct expand_vec_perm_d dfirst, dsecond, dthird;
39654 bool ok;
39656 if (!TARGET_AVX || (d->vmode != V4DFmode))
39657 return false;
39659 if (d->testing_p)
39660 return true;
39662 dfirst = *d;
39663 dsecond = *d;
39664 dthird = *d;
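/* DFIRST gathers, for each destination pair, the aligned source pair
   holding the element wanted in the even slot; DSECOND does the same for
   the odd slots; DTHIRD is the final vshufpd that picks the low or high
   double from each pair of the two intermediate vectors. */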
39666 dfirst.perm[0] = (d->perm[0] & ~1);
39667 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39668 dfirst.perm[2] = (d->perm[2] & ~1);
39669 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39670 dsecond.perm[0] = (d->perm[1] & ~1);
39671 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39672 dsecond.perm[2] = (d->perm[3] & ~1);
39673 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39674 dthird.perm[0] = (d->perm[0] % 2);
39675 dthird.perm[1] = (d->perm[1] % 2) + 4;
39676 dthird.perm[2] = (d->perm[2] % 2) + 2;
39677 dthird.perm[3] = (d->perm[3] % 2) + 6;
39679 dfirst.target = gen_reg_rtx (dfirst.vmode);
39680 dsecond.target = gen_reg_rtx (dsecond.vmode);
39681 dthird.op0 = dfirst.target;
39682 dthird.op1 = dsecond.target;
39683 dthird.one_operand_p = false;
39685 canonicalize_perm (&dfirst);
39686 canonicalize_perm (&dsecond);
39688 ok = expand_vec_perm_1 (&dfirst)
39689 && expand_vec_perm_1 (&dsecond)
39690 && expand_vec_perm_1 (&dthird);
39692 gcc_assert (ok);
39694 return true;
39697 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39698 permutation with two pshufb insns and an ior. We should have already
39699 failed all two instruction sequences. */
39701 static bool
39702 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39704 rtx rperm[2][16], vperm, l, h, op, m128;
39705 unsigned int i, nelt, eltsz;
39707 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39708 return false;
39709 gcc_assert (!d->one_operand_p);
39711 nelt = d->nelt;
39712 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39714 /* Generate two permutation masks. If the required element is within
39715 the given vector it is shuffled into the proper lane. If the required
39716 element is in the other vector, force a zero into the lane by setting
39717 bit 7 in the permutation mask. */
39718 m128 = GEN_INT (-128);
39719 for (i = 0; i < nelt; ++i)
39721 unsigned j, e = d->perm[i];
39722 unsigned which = (e >= nelt);
39723 if (e >= nelt)
39724 e -= nelt;
39726 for (j = 0; j < eltsz; ++j)
39728 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39729 rperm[1-which][i*eltsz + j] = m128;
39733 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39734 vperm = force_reg (V16QImode, vperm);
39736 l = gen_reg_rtx (V16QImode);
39737 op = gen_lowpart (V16QImode, d->op0);
39738 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39740 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39741 vperm = force_reg (V16QImode, vperm);
39743 h = gen_reg_rtx (V16QImode);
39744 op = gen_lowpart (V16QImode, d->op1);
39745 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39747 op = gen_lowpart (V16QImode, d->target);
39748 emit_insn (gen_iorv16qi3 (op, l, h));
39750 return true;
39753 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39754 with two vpshufb insns, vpermq and vpor. We should have already failed
39755 all two or three instruction sequences. */
39757 static bool
39758 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39760 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39761 unsigned int i, nelt, eltsz;
39763 if (!TARGET_AVX2
39764 || !d->one_operand_p
39765 || (d->vmode != V32QImode && d->vmode != V16HImode))
39766 return false;
39768 if (d->testing_p)
39769 return true;
39771 nelt = d->nelt;
39772 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39774 /* Generate two permutation masks. In the first mask, if the required
39775 element is within the same lane, it is shuffled in; if it comes from
39776 the other lane, a zero is forced by setting bit 7 in the mask entry.
39777 The second mask has non-negative entries for elements requested from
39778 the other lane, but each such entry is also placed into the other
39779 lane, so that the result of vpshufb can have its two V2TImode halves
39780 swapped afterwards. */
39781 m128 = GEN_INT (-128);
39782 for (i = 0; i < nelt; ++i)
39784 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39785 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39787 for (j = 0; j < eltsz; ++j)
39789 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39790 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39794 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39795 vperm = force_reg (V32QImode, vperm);
39797 h = gen_reg_rtx (V32QImode);
39798 op = gen_lowpart (V32QImode, d->op0);
39799 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39801 /* Swap the 128-bit lanes of h into hp. */
39802 hp = gen_reg_rtx (V4DImode);
39803 op = gen_lowpart (V4DImode, h);
39804 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39805 const1_rtx));
39807 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39808 vperm = force_reg (V32QImode, vperm);
39810 l = gen_reg_rtx (V32QImode);
39811 op = gen_lowpart (V32QImode, d->op0);
39812 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39814 op = gen_lowpart (V32QImode, d->target);
39815 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39817 return true;
39820 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39821 and extract-odd permutations of two V32QImode or V16HImode operands
39822 with two vpshufb insns, vpor and vpermq. We should have already
39823 failed all two or three instruction sequences. */
39825 static bool
39826 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39828 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39829 unsigned int i, nelt, eltsz;
39831 if (!TARGET_AVX2
39832 || d->one_operand_p
39833 || (d->vmode != V32QImode && d->vmode != V16HImode))
39834 return false;
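/* Each result element may only come from the quarter of the concatenated
   operands (i.e. the same operand and the same 128-bit lane) that holds
   index 2*I; that is what the extract-even/odd layouts satisfy and what
   the fixed { 0, 2, 1, 3 } vpermq at the end can repair. */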
39836 for (i = 0; i < d->nelt; ++i)
39837 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39838 return false;
39840 if (d->testing_p)
39841 return true;
39843 nelt = d->nelt;
39844 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39846 /* Generate two permutation masks. In the first permutation mask
39847 the first quarter will contain indexes for the first half
39848 of the op0, the second quarter will contain bit 7 set, third quarter
39849 will contain indexes for the second half of the op0 and the
39850 last quarter bit 7 set. In the second permutation mask
39851 the first quarter will contain bit 7 set, the second quarter
39852 indexes for the first half of the op1, the third quarter bit 7 set
39853 and last quarter indexes for the second half of the op1.
39854 I.e. the first mask e.g. for V32QImode extract even will be:
39855 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39856 (all values masked with 0xf except for -128) and second mask
39857 for extract even will be
39858 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39859 m128 = GEN_INT (-128);
39860 for (i = 0; i < nelt; ++i)
39862 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39863 unsigned which = d->perm[i] >= nelt;
39864 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39866 for (j = 0; j < eltsz; ++j)
39868 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39869 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39873 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39874 vperm = force_reg (V32QImode, vperm);
39876 l = gen_reg_rtx (V32QImode);
39877 op = gen_lowpart (V32QImode, d->op0);
39878 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39880 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39881 vperm = force_reg (V32QImode, vperm);
39883 h = gen_reg_rtx (V32QImode);
39884 op = gen_lowpart (V32QImode, d->op1);
39885 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39887 ior = gen_reg_rtx (V32QImode);
39888 emit_insn (gen_iorv32qi3 (ior, l, h));
39890 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39891 op = gen_lowpart (V4DImode, d->target);
39892 ior = gen_lowpart (V4DImode, ior);
39893 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39894 const1_rtx, GEN_INT (3)));
39896 return true;
39899 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
39900 and extract-odd permutations. */
39902 static bool
39903 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39905 rtx t1, t2, t3;
39907 switch (d->vmode)
39909 case V4DFmode:
39910 t1 = gen_reg_rtx (V4DFmode);
39911 t2 = gen_reg_rtx (V4DFmode);
39913 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39914 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39915 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39917 /* Now an unpck[lh]pd will produce the result required. */
39918 if (odd)
39919 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39920 else
39921 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39922 emit_insn (t3);
39923 break;
39925 case V8SFmode:
39927 int mask = odd ? 0xdd : 0x88;
39929 t1 = gen_reg_rtx (V8SFmode);
39930 t2 = gen_reg_rtx (V8SFmode);
39931 t3 = gen_reg_rtx (V8SFmode);
39933 /* Shuffle within the 128-bit lanes to produce:
39934 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39935 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39936 GEN_INT (mask)));
39938 /* Shuffle the lanes around to produce:
39939 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39940 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39941 GEN_INT (0x3)));
39943 /* Shuffle within the 128-bit lanes to produce:
39944 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39945 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39947 /* Shuffle within the 128-bit lanes to produce:
39948 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39949 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39951 /* Shuffle the lanes around to produce:
39952 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39953 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39954 GEN_INT (0x20)));
39956 break;
39958 case V2DFmode:
39959 case V4SFmode:
39960 case V2DImode:
39961 case V4SImode:
39962 /* These are always directly implementable by expand_vec_perm_1. */
39963 gcc_unreachable ();
39965 case V8HImode:
39966 if (TARGET_SSSE3)
39967 return expand_vec_perm_pshufb2 (d);
39968 else
39970 /* We need 2*log2(N)-1 operations to achieve odd/even
39971 with interleave. */
39972 t1 = gen_reg_rtx (V8HImode);
39973 t2 = gen_reg_rtx (V8HImode);
39974 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39975 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39976 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39977 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39978 if (odd)
39979 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39980 else
39981 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39982 emit_insn (t3);
39984 break;
39986 case V16QImode:
39987 if (TARGET_SSSE3)
39988 return expand_vec_perm_pshufb2 (d);
39989 else
39991 t1 = gen_reg_rtx (V16QImode);
39992 t2 = gen_reg_rtx (V16QImode);
39993 t3 = gen_reg_rtx (V16QImode);
39994 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39995 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39996 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39997 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39998 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39999 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40000 if (odd)
40001 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40002 else
40003 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40004 emit_insn (t3);
40006 break;
40008 case V16HImode:
40009 case V32QImode:
40010 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40012 case V4DImode:
40013 if (!TARGET_AVX2)
40015 struct expand_vec_perm_d d_copy = *d;
40016 d_copy.vmode = V4DFmode;
40017 d_copy.target = gen_lowpart (V4DFmode, d->target);
40018 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40019 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40020 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40023 t1 = gen_reg_rtx (V4DImode);
40024 t2 = gen_reg_rtx (V4DImode);
40026 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40027 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40028 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40030 /* Now a vpunpck[lh]qdq will produce the result required. */
40031 if (odd)
40032 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40033 else
40034 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40035 emit_insn (t3);
40036 break;
40038 case V8SImode:
40039 if (!TARGET_AVX2)
40041 struct expand_vec_perm_d d_copy = *d;
40042 d_copy.vmode = V8SFmode;
40043 d_copy.target = gen_lowpart (V8SFmode, d->target);
40044 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40045 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40046 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40049 t1 = gen_reg_rtx (V8SImode);
40050 t2 = gen_reg_rtx (V8SImode);
40052 /* Shuffle the lanes around into
40053 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40054 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40055 gen_lowpart (V4DImode, d->op0),
40056 gen_lowpart (V4DImode, d->op1),
40057 GEN_INT (0x20)));
40058 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40059 gen_lowpart (V4DImode, d->op0),
40060 gen_lowpart (V4DImode, d->op1),
40061 GEN_INT (0x31)));
40063 /* Swap the 2nd and 3rd position in each lane into
40064 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40065 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40066 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40067 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40068 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40070 /* Now a vpunpck[lh]qdq will produce
40071 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40072 if (odd)
40073 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40074 gen_lowpart (V4DImode, t1),
40075 gen_lowpart (V4DImode, t2));
40076 else
40077 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40078 gen_lowpart (V4DImode, t1),
40079 gen_lowpart (V4DImode, t2));
40080 emit_insn (t3);
40081 break;
40083 default:
40084 gcc_unreachable ();
40087 return true;
40090 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40091 extract-even and extract-odd permutations. */
40093 static bool
40094 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40096 unsigned i, odd, nelt = d->nelt;
40098 odd = d->perm[0];
40099 if (odd != 0 && odd != 1)
40100 return false;
40102 for (i = 1; i < nelt; ++i)
40103 if (d->perm[i] != 2 * i + odd)
40104 return false;
40106 return expand_vec_perm_even_odd_1 (d, odd);
40109 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40110 permutations. We assume that expand_vec_perm_1 has already failed. */
40112 static bool
40113 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40115 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40116 enum machine_mode vmode = d->vmode;
40117 unsigned char perm2[4];
40118 rtx op0 = d->op0;
40119 bool ok;
40121 switch (vmode)
40123 case V4DFmode:
40124 case V8SFmode:
40125 /* These are special-cased in sse.md so that we can optionally
40126 use the vbroadcast instruction. They expand to two insns
40127 if the input happens to be in a register. */
40128 gcc_unreachable ();
40130 case V2DFmode:
40131 case V2DImode:
40132 case V4SFmode:
40133 case V4SImode:
40134 /* These are always implementable using standard shuffle patterns. */
40135 gcc_unreachable ();
40137 case V8HImode:
40138 case V16QImode:
40139 /* These can be implemented via interleave. We save one insn by
40140 stopping once we have promoted to V4SImode and then use pshufd. */
40143 rtx dest;
40144 rtx (*gen) (rtx, rtx, rtx)
40145 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40146 : gen_vec_interleave_lowv8hi;
40148 if (elt >= nelt2)
40150 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40151 : gen_vec_interleave_highv8hi;
40152 elt -= nelt2;
40154 nelt2 /= 2;
40156 dest = gen_reg_rtx (vmode);
40157 emit_insn (gen (dest, op0, op0));
40158 vmode = get_mode_wider_vector (vmode);
40159 op0 = gen_lowpart (vmode, dest);
40161 while (vmode != V4SImode);
40163 memset (perm2, elt, 4);
40164 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40165 d->testing_p);
40166 gcc_assert (ok);
40167 return true;
40169 case V32QImode:
40170 case V16HImode:
40171 case V8SImode:
40172 case V4DImode:
40173 /* For AVX2 broadcasts of the first element vpbroadcast* or
40174 vpermq should be used by expand_vec_perm_1. */
40175 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40176 return false;
40178 default:
40179 gcc_unreachable ();
40183 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40184 broadcast permutations. */
40186 static bool
40187 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40189 unsigned i, elt, nelt = d->nelt;
40191 if (!d->one_operand_p)
40192 return false;
40194 elt = d->perm[0];
40195 for (i = 1; i < nelt; ++i)
40196 if (d->perm[i] != elt)
40197 return false;
40199 return expand_vec_perm_broadcast_1 (d);
40202 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40203 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40204 all the shorter instruction sequences. */
40206 static bool
40207 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40209 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40210 unsigned int i, nelt, eltsz;
40211 bool used[4];
40213 if (!TARGET_AVX2
40214 || d->one_operand_p
40215 || (d->vmode != V32QImode && d->vmode != V16HImode))
40216 return false;
40218 if (d->testing_p)
40219 return true;
40221 nelt = d->nelt;
40222 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40224 /* Generate 4 permutation masks. If the required element is within
40225 the same lane, it is shuffled in place; if it comes from the other
40226 lane, a zero is forced by setting bit 7 in the mask entry. The
40227 cross-lane masks have non-negative entries for elements requested
40228 from the other lane, but each such entry is also placed into the
40229 other lane, so that the result of vpshufb can have its two V2TImode
40230 halves swapped afterwards. */
40231 m128 = GEN_INT (-128);
40232 for (i = 0; i < 32; ++i)
40234 rperm[0][i] = m128;
40235 rperm[1][i] = m128;
40236 rperm[2][i] = m128;
40237 rperm[3][i] = m128;
40239 used[0] = false;
40240 used[1] = false;
40241 used[2] = false;
40242 used[3] = false;
40243 for (i = 0; i < nelt; ++i)
40245 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40246 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40247 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40249 for (j = 0; j < eltsz; ++j)
40250 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40251 used[which] = true;
40254 for (i = 0; i < 2; ++i)
40256 if (!used[2 * i + 1])
40258 h[i] = NULL_RTX;
40259 continue;
40261 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40262 gen_rtvec_v (32, rperm[2 * i + 1]));
40263 vperm = force_reg (V32QImode, vperm);
40264 h[i] = gen_reg_rtx (V32QImode);
40265 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40266 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40269 /* Swap the 128-bit lanes of h[X]. */
40270 for (i = 0; i < 2; ++i)
40272 if (h[i] == NULL_RTX)
40273 continue;
40274 op = gen_reg_rtx (V4DImode);
40275 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40276 const2_rtx, GEN_INT (3), const0_rtx,
40277 const1_rtx));
40278 h[i] = gen_lowpart (V32QImode, op);
40281 for (i = 0; i < 2; ++i)
40283 if (!used[2 * i])
40285 l[i] = NULL_RTX;
40286 continue;
40288 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40289 vperm = force_reg (V32QImode, vperm);
40290 l[i] = gen_reg_rtx (V32QImode);
40291 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40292 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40295 for (i = 0; i < 2; ++i)
40297 if (h[i] && l[i])
40299 op = gen_reg_rtx (V32QImode);
40300 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40301 l[i] = op;
40303 else if (h[i])
40304 l[i] = h[i];
40307 gcc_assert (l[0] && l[1]);
40308 op = gen_lowpart (V32QImode, d->target);
40309 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40310 return true;
40313 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40314 With all of the interface bits taken care of, perform the expansion
40315 in D and return true on success. */
40317 static bool
40318 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40320 /* Try a single instruction expansion. */
40321 if (expand_vec_perm_1 (d))
40322 return true;
40324 /* Try sequences of two instructions. */
40326 if (expand_vec_perm_pshuflw_pshufhw (d))
40327 return true;
40329 if (expand_vec_perm_palignr (d))
40330 return true;
40332 if (expand_vec_perm_interleave2 (d))
40333 return true;
40335 if (expand_vec_perm_broadcast (d))
40336 return true;
40338 if (expand_vec_perm_vpermq_perm_1 (d))
40339 return true;
40341 if (expand_vec_perm_vperm2f128 (d))
40342 return true;
40344 /* Try sequences of three instructions. */
40346 if (expand_vec_perm_2vperm2f128_vshuf (d))
40347 return true;
40349 if (expand_vec_perm_pshufb2 (d))
40350 return true;
40352 if (expand_vec_perm_interleave3 (d))
40353 return true;
40355 if (expand_vec_perm_vperm2f128_vblend (d))
40356 return true;
40358 /* Try sequences of four instructions. */
40360 if (expand_vec_perm_vpshufb2_vpermq (d))
40361 return true;
40363 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40364 return true;
40366 /* ??? Look for narrow permutations whose element orderings would
40367 allow the promotion to a wider mode. */
40369 /* ??? Look for sequences of interleave or a wider permute that place
40370 the data into the correct lanes for a half-vector shuffle like
40371 pshuf[lh]w or vpermilps. */
40373 /* ??? Look for sequences of interleave that produce the desired results.
40374 The combinatorics of punpck[lh] get pretty ugly... */
40376 if (expand_vec_perm_even_odd (d))
40377 return true;
40379 /* Even longer sequences. */
40380 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40381 return true;
40383 return false;
40386 /* If a permutation only uses one operand, make it clear. Returns true
40387 if the permutation references both operands. */
40389 static bool
40390 canonicalize_perm (struct expand_vec_perm_d *d)
40392 int i, which, nelt = d->nelt;
40394 for (i = which = 0; i < nelt; ++i)
40395 which |= (d->perm[i] < nelt ? 1 : 2);
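/* WHICH now has bit 0 set if any element is taken from the first operand
   and bit 1 set if any element is taken from the second. */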
40397 d->one_operand_p = true;
40398 switch (which)
40400 default:
40401 gcc_unreachable();
40403 case 3:
40404 if (!rtx_equal_p (d->op0, d->op1))
40406 d->one_operand_p = false;
40407 break;
40409 /* The elements of PERM do not suggest that only the first operand
40410 is used, but both operands are identical. Allow easier matching
40411 of the permutation by folding the permutation into the single
40412 input vector. */
40413 /* FALLTHRU */
40415 case 2:
40416 for (i = 0; i < nelt; ++i)
40417 d->perm[i] &= nelt - 1;
40418 d->op0 = d->op1;
40419 break;
40421 case 1:
40422 d->op1 = d->op0;
40423 break;
40426 return (which == 3);
40429 bool
40430 ix86_expand_vec_perm_const (rtx operands[4])
40432 struct expand_vec_perm_d d;
40433 unsigned char perm[MAX_VECT_LEN];
40434 int i, nelt;
40435 bool two_args;
40436 rtx sel;
40438 d.target = operands[0];
40439 d.op0 = operands[1];
40440 d.op1 = operands[2];
40441 sel = operands[3];
40443 d.vmode = GET_MODE (d.target);
40444 gcc_assert (VECTOR_MODE_P (d.vmode));
40445 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40446 d.testing_p = false;
40448 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40449 gcc_assert (XVECLEN (sel, 0) == nelt);
40450 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
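/* Read the selector, reducing each index modulo 2*NELT; keep an
   unmodified copy in PERM so the expansion can be retried below without
   the one-operand canonicalization. */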
40452 for (i = 0; i < nelt; ++i)
40454 rtx e = XVECEXP (sel, 0, i);
40455 int ei = INTVAL (e) & (2 * nelt - 1);
40456 d.perm[i] = ei;
40457 perm[i] = ei;
40460 two_args = canonicalize_perm (&d);
40462 if (ix86_expand_vec_perm_const_1 (&d))
40463 return true;
40465 /* If the selector says both arguments are needed, but the operands are the
40466 same, the above tried to expand with one_operand_p and flattened selector.
40467 If that didn't work, retry without one_operand_p; we succeeded with that
40468 during testing. */
40469 if (two_args && d.one_operand_p)
40471 d.one_operand_p = false;
40472 memcpy (d.perm, perm, sizeof (perm));
40473 return ix86_expand_vec_perm_const_1 (&d);
40476 return false;
40479 /* Implement targetm.vectorize.vec_perm_const_ok. */
40481 static bool
40482 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40483 const unsigned char *sel)
40485 struct expand_vec_perm_d d;
40486 unsigned int i, nelt, which;
40487 bool ret;
40489 d.vmode = vmode;
40490 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40491 d.testing_p = true;
40493 /* Given sufficient ISA support we can just return true here
40494 for selected vector modes. */
40495 if (GET_MODE_SIZE (d.vmode) == 16)
40497 /* All implementable with a single vpperm insn. */
40498 if (TARGET_XOP)
40499 return true;
40500 /* All implementable with 2 pshufb + 1 ior. */
40501 if (TARGET_SSSE3)
40502 return true;
40503 /* All implementable with shufpd or unpck[lh]pd. */
40504 if (d.nelt == 2)
40505 return true;
40508 /* Extract the values from the vector CST into the permutation
40509 array in D. */
40510 memcpy (d.perm, sel, nelt);
40511 for (i = which = 0; i < nelt; ++i)
40513 unsigned char e = d.perm[i];
40514 gcc_assert (e < 2 * nelt);
40515 which |= (e < nelt ? 1 : 2);
40518 /* If all elements come from the second vector, fold them onto the first. */
40519 if (which == 2)
40520 for (i = 0; i < nelt; ++i)
40521 d.perm[i] -= nelt;
40523 /* Check whether the mask can be applied to the vector type. */
40524 d.one_operand_p = (which != 3);
40526 /* Implementable with shufps or pshufd. */
40527 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40528 return true;
40530 /* Otherwise we have to go through the motions and see if we can
40531 figure out how to generate the requested permutation. */
40532 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40533 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40534 if (!d.one_operand_p)
40535 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40537 start_sequence ();
40538 ret = ix86_expand_vec_perm_const_1 (&d);
40539 end_sequence ();
40541 return ret;
40544 void
40545 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40547 struct expand_vec_perm_d d;
40548 unsigned i, nelt;
40550 d.target = targ;
40551 d.op0 = op0;
40552 d.op1 = op1;
40553 d.vmode = GET_MODE (targ);
40554 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40555 d.one_operand_p = false;
40556 d.testing_p = false;
40558 for (i = 0; i < nelt; ++i)
40559 d.perm[i] = i * 2 + odd;
40561 /* We'll either be able to implement the permutation directly... */
40562 if (expand_vec_perm_1 (&d))
40563 return;
40565 /* ... or we use the special-case patterns. */
40566 expand_vec_perm_even_odd_1 (&d, odd);
40569 static void
40570 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40572 struct expand_vec_perm_d d;
40573 unsigned i, nelt, base;
40574 bool ok;
40576 d.target = targ;
40577 d.op0 = op0;
40578 d.op1 = op1;
40579 d.vmode = GET_MODE (targ);
40580 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40581 d.one_operand_p = false;
40582 d.testing_p = false;
40584 base = high_p ? nelt / 2 : 0;
40585 for (i = 0; i < nelt / 2; ++i)
40587 d.perm[i * 2] = i + base;
40588 d.perm[i * 2 + 1] = i + base + nelt;
40591 /* Note that for AVX this isn't one instruction. */
40592 ok = ix86_expand_vec_perm_const_1 (&d);
40593 gcc_assert (ok);
40597 /* Expand a vector operation CODE for a V*QImode in terms of the
40598 same operation on V*HImode. */
40600 void
40601 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40603 enum machine_mode qimode = GET_MODE (dest);
40604 enum machine_mode himode;
40605 rtx (*gen_il) (rtx, rtx, rtx);
40606 rtx (*gen_ih) (rtx, rtx, rtx);
40607 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40608 struct expand_vec_perm_d d;
40609 bool ok, full_interleave;
40610 bool uns_p = false;
40611 int i;
40613 switch (qimode)
40615 case V16QImode:
40616 himode = V8HImode;
40617 gen_il = gen_vec_interleave_lowv16qi;
40618 gen_ih = gen_vec_interleave_highv16qi;
40619 break;
40620 case V32QImode:
40621 himode = V16HImode;
40622 gen_il = gen_avx2_interleave_lowv32qi;
40623 gen_ih = gen_avx2_interleave_highv32qi;
40624 break;
40625 default:
40626 gcc_unreachable ();
40629 op2_l = op2_h = op2;
40630 switch (code)
40632 case MULT:
40633 /* Unpack data such that we've got a source byte in each low byte of
40634 each word. We don't care what goes into the high byte of each word.
40635 Rather than trying to get zero in there, it is most convenient to let
40636 it be a copy of the low byte. */
40637 op2_l = gen_reg_rtx (qimode);
40638 op2_h = gen_reg_rtx (qimode);
40639 emit_insn (gen_il (op2_l, op2, op2));
40640 emit_insn (gen_ih (op2_h, op2, op2));
40641 /* FALLTHRU */
40643 op1_l = gen_reg_rtx (qimode);
40644 op1_h = gen_reg_rtx (qimode);
40645 emit_insn (gen_il (op1_l, op1, op1));
40646 emit_insn (gen_ih (op1_h, op1, op1));
40647 full_interleave = qimode == V16QImode;
40648 break;
40650 case ASHIFT:
40651 case LSHIFTRT:
40652 uns_p = true;
40653 /* FALLTHRU */
40654 case ASHIFTRT:
40655 op1_l = gen_reg_rtx (himode);
40656 op1_h = gen_reg_rtx (himode);
40657 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40658 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40659 full_interleave = true;
40660 break;
40661 default:
40662 gcc_unreachable ();
40665 /* Perform the operation. */
40666 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40667 1, OPTAB_DIRECT);
40668 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40669 1, OPTAB_DIRECT);
40670 gcc_assert (res_l && res_h);
40672 /* Merge the data back into the right place. */
40673 d.target = dest;
40674 d.op0 = gen_lowpart (qimode, res_l);
40675 d.op1 = gen_lowpart (qimode, res_h);
40676 d.vmode = qimode;
40677 d.nelt = GET_MODE_NUNITS (qimode);
40678 d.one_operand_p = false;
40679 d.testing_p = false;
40681 if (full_interleave)
 40683 /* For SSE2, we used a full interleave, so the desired
 40684 results are in the even elements. */
40685 for (i = 0; i < 32; ++i)
40686 d.perm[i] = i * 2;
40688 else
 40690 /* For AVX, the interleave used above was not cross-lane, so the
 40691 extraction is of the even elements but with the second and third quarters swapped.
 40692 Happily, that is even one insn shorter than plain even extraction. */
40693 for (i = 0; i < 32; ++i)
40694 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
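 /* Worked example (illustrative): for V32QImode the formula above yields
    perm = { 0, 2, ..., 14 } for i = 0..7, { 32, 34, ..., 46 } for
    i = 8..15, { 16, 18, ..., 30 } for i = 16..23 and { 48, 50, ..., 62 }
    for i = 24..31, i.e. the even elements with the second and third
    quarters of the selection exchanged, as described above.  */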
40697 ok = ix86_expand_vec_perm_const_1 (&d);
40698 gcc_assert (ok);
40700 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40701 gen_rtx_fmt_ee (code, qimode, op1, op2));
40704 void
40705 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40706 bool uns_p, bool odd_p)
40708 enum machine_mode mode = GET_MODE (op1);
40709 enum machine_mode wmode = GET_MODE (dest);
40710 rtx x;
40712 /* We only play even/odd games with vectors of SImode. */
40713 gcc_assert (mode == V4SImode || mode == V8SImode);
40715 /* If we're looking for the odd results, shift those members down to
40716 the even slots. For some cpus this is faster than a PSHUFD. */
40717 if (odd_p)
40719 if (TARGET_XOP && mode == V4SImode)
40721 x = force_reg (wmode, CONST0_RTX (wmode));
40722 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40723 return;
40726 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40727 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40728 x, NULL, 1, OPTAB_DIRECT);
40729 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40730 x, NULL, 1, OPTAB_DIRECT);
40731 op1 = gen_lowpart (mode, op1);
40732 op2 = gen_lowpart (mode, op2);
40735 if (mode == V8SImode)
40737 if (uns_p)
40738 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40739 else
40740 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40742 else if (uns_p)
40743 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40744 else if (TARGET_SSE4_1)
40745 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40746 else
40748 rtx s1, s2, t0, t1, t2;
 40750 /* The easiest way to implement this without PMULDQ is to go through
 40751 the motions as if we are performing a full 64-bit multiply, except
 40752 that we need to do less shuffling of the elements. */
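 /* Sketch of the identity used here (illustrative): if U1, U2 are the
    operands viewed as unsigned and S1, S2 are their all-ones sign masks,
    then sign_extend (op) == U + 2^32 * S (mod 2^64), so
    op1 * op2 == U1*U2 + 2^32 * (S1*U2 + S2*U1) (mod 2^64).  The shift by
    32 below discards everything above bit 63, so only the low halves of
    the two mask products matter.  */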
40754 /* Compute the sign-extension, aka highparts, of the two operands. */
40755 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40756 op1, pc_rtx, pc_rtx);
40757 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40758 op2, pc_rtx, pc_rtx);
40760 /* Multiply LO(A) * HI(B), and vice-versa. */
40761 t1 = gen_reg_rtx (wmode);
40762 t2 = gen_reg_rtx (wmode);
40763 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40764 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40766 /* Multiply LO(A) * LO(B). */
40767 t0 = gen_reg_rtx (wmode);
40768 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40770 /* Combine and shift the highparts into place. */
40771 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40772 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40773 1, OPTAB_DIRECT);
40775 /* Combine high and low parts. */
40776 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40777 return;
40779 emit_insn (x);
40782 void
40783 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40784 bool uns_p, bool high_p)
40786 enum machine_mode wmode = GET_MODE (dest);
40787 enum machine_mode mode = GET_MODE (op1);
40788 rtx t1, t2, t3, t4, mask;
40790 switch (mode)
40792 case V4SImode:
40793 t1 = gen_reg_rtx (mode);
40794 t2 = gen_reg_rtx (mode);
40795 if (TARGET_XOP && !uns_p)
40797 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40798 shuffle the elements once so that all elements are in the right
40799 place for immediate use: { A C B D }. */
40800 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40801 const1_rtx, GEN_INT (3)));
40802 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40803 const1_rtx, GEN_INT (3)));
40805 else
40807 /* Put the elements into place for the multiply. */
40808 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40809 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40810 high_p = false;
40812 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40813 break;
40815 case V8SImode:
40816 /* Shuffle the elements between the lanes. After this we
40817 have { A B E F | C D G H } for each operand. */
40818 t1 = gen_reg_rtx (V4DImode);
40819 t2 = gen_reg_rtx (V4DImode);
40820 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40821 const0_rtx, const2_rtx,
40822 const1_rtx, GEN_INT (3)));
40823 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40824 const0_rtx, const2_rtx,
40825 const1_rtx, GEN_INT (3)));
40827 /* Shuffle the elements within the lanes. After this we
40828 have { A A B B | C C D D } or { E E F F | G G H H }. */
40829 t3 = gen_reg_rtx (V8SImode);
40830 t4 = gen_reg_rtx (V8SImode);
40831 mask = GEN_INT (high_p
40832 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40833 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
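 /* Illustration: a pshufd immediate encodes one 2-bit source index per
    destination element within each 128-bit lane, so the HIGH_P mask is
    2 + (2<<2) + (3<<4) + (3<<6) == 0xfa, selecting { 2, 2, 3, 3 }, and
    the low mask is 0x50, selecting { 0, 0, 1, 1 }.  */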
40834 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40835 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40837 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40838 break;
40840 case V8HImode:
40841 case V16HImode:
40842 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40843 uns_p, OPTAB_DIRECT);
40844 t2 = expand_binop (mode,
40845 uns_p ? umul_highpart_optab : smul_highpart_optab,
40846 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40847 gcc_assert (t1 && t2);
40849 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40850 break;
40852 case V16QImode:
40853 case V32QImode:
40854 t1 = gen_reg_rtx (wmode);
40855 t2 = gen_reg_rtx (wmode);
40856 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40857 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40859 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40860 break;
40862 default:
40863 gcc_unreachable ();
40867 void
40868 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40870 rtx res_1, res_2;
40872 res_1 = gen_reg_rtx (V4SImode);
40873 res_2 = gen_reg_rtx (V4SImode);
40874 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40875 op1, op2, true, false);
40876 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40877 op1, op2, true, true);
40879 /* Move the results in element 2 down to element 1; we don't care
40880 what goes in elements 2 and 3. Then we can merge the parts
40881 back together with an interleave.
40883 Note that two other sequences were tried:
40884 (1) Use interleaves at the start instead of psrldq, which allows
40885 us to use a single shufps to merge things back at the end.
40886 (2) Use shufps here to combine the two vectors, then pshufd to
40887 put the elements in the correct order.
40888 In both cases the cost of the reformatting stall was too high
40889 and the overall sequence slower. */
40891 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40892 const0_rtx, const0_rtx));
40893 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40894 const0_rtx, const0_rtx));
40895 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40897 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
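 /* Worked data flow (illustrative), with op1 = { a0, a1, a2, a3 } and
    op2 = { b0, b1, b2, b3 }: res_1 holds the 64-bit products a0*b0 and
    a2*b2, i.e. { lo(a0b0), hi(a0b0), lo(a2b2), hi(a2b2) } as V4SI, and
    res_2 similarly holds the odd products.  The pshufd insns above move
    lo(a2b2) (resp. lo(a3b3)) down to element 1, and the final interleave
    produces { lo(a0b0), lo(a1b1), lo(a2b2), lo(a3b3) }, which is the
    desired V4SI product since the low 32 bits of each widened product
    equal the 32-bit product.  */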
40900 void
40901 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40903 enum machine_mode mode = GET_MODE (op0);
40904 rtx t1, t2, t3, t4, t5, t6;
40906 if (TARGET_XOP && mode == V2DImode)
40908 /* op1: A,B,C,D, op2: E,F,G,H */
40909 op1 = gen_lowpart (V4SImode, op1);
40910 op2 = gen_lowpart (V4SImode, op2);
40912 t1 = gen_reg_rtx (V4SImode);
40913 t2 = gen_reg_rtx (V4SImode);
40914 t3 = gen_reg_rtx (V2DImode);
40915 t4 = gen_reg_rtx (V2DImode);
40917 /* t1: B,A,D,C */
40918 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40919 GEN_INT (1),
40920 GEN_INT (0),
40921 GEN_INT (3),
40922 GEN_INT (2)));
40924 /* t2: (B*E),(A*F),(D*G),(C*H) */
40925 emit_insn (gen_mulv4si3 (t2, t1, op2));
40927 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40928 emit_insn (gen_xop_phadddq (t3, t2));
40930 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40931 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40933 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40934 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40936 else
40938 enum machine_mode nmode;
40939 rtx (*umul) (rtx, rtx, rtx);
40941 if (mode == V2DImode)
40943 umul = gen_vec_widen_umult_even_v4si;
40944 nmode = V4SImode;
40946 else if (mode == V4DImode)
40948 umul = gen_vec_widen_umult_even_v8si;
40949 nmode = V8SImode;
40951 else
40952 gcc_unreachable ();
40955 /* Multiply low parts. */
40956 t1 = gen_reg_rtx (mode);
40957 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40959 /* Shift input vectors right 32 bits so we can multiply high parts. */
40960 t6 = GEN_INT (32);
40961 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40962 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40964 /* Multiply high parts by low parts. */
40965 t4 = gen_reg_rtx (mode);
40966 t5 = gen_reg_rtx (mode);
40967 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40968 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40970 /* Combine and shift the highparts back. */
40971 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40972 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40974 /* Combine high and low parts. */
40975 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
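 /* Sketch of the identity behind this generic path (illustrative):
    writing each 64-bit element as 2^32*hi + lo, the product modulo 2^64
    is lo1*lo2 + 2^32*(hi1*lo2 + hi2*lo1); t1 supplies the first term and
    t4 the second after the combine-and-shift above.  */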
40978 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40979 gen_rtx_MULT (mode, op1, op2));
40982 /* Expand an insert into a vector register through pinsr insn.
40983 Return true if successful. */
40985 bool
40986 ix86_expand_pinsr (rtx *operands)
40988 rtx dst = operands[0];
40989 rtx src = operands[3];
40991 unsigned int size = INTVAL (operands[1]);
40992 unsigned int pos = INTVAL (operands[2]);
40994 if (GET_CODE (dst) == SUBREG)
40996 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40997 dst = SUBREG_REG (dst);
41000 if (GET_CODE (src) == SUBREG)
41001 src = SUBREG_REG (src);
41003 switch (GET_MODE (dst))
41005 case V16QImode:
41006 case V8HImode:
41007 case V4SImode:
41008 case V2DImode:
41010 enum machine_mode srcmode, dstmode;
41011 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41013 srcmode = mode_for_size (size, MODE_INT, 0);
41015 switch (srcmode)
41017 case QImode:
41018 if (!TARGET_SSE4_1)
41019 return false;
41020 dstmode = V16QImode;
41021 pinsr = gen_sse4_1_pinsrb;
41022 break;
41024 case HImode:
41025 if (!TARGET_SSE2)
41026 return false;
41027 dstmode = V8HImode;
41028 pinsr = gen_sse2_pinsrw;
41029 break;
41031 case SImode:
41032 if (!TARGET_SSE4_1)
41033 return false;
41034 dstmode = V4SImode;
41035 pinsr = gen_sse4_1_pinsrd;
41036 break;
41038 case DImode:
41039 gcc_assert (TARGET_64BIT);
41040 if (!TARGET_SSE4_1)
41041 return false;
41042 dstmode = V2DImode;
41043 pinsr = gen_sse4_1_pinsrq;
41044 break;
41046 default:
41047 return false;
41050 dst = gen_lowpart (dstmode, dst);
41051 src = gen_lowpart (srcmode, src);
41053 pos /= size;
41055 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41056 return true;
41059 default:
41060 return false;
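 /* Worked example (illustrative): inserting a 16-bit value at bit
    position 32 of a V2DImode destination gives SIZE == 16 and POS == 32;
    SRCMODE becomes HImode, the destination is viewed as V8HImode,
    POS /= SIZE selects word element 2, and the emitted insn is
    gen_sse2_pinsrw with selector GEN_INT (1 << 2).  */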
41064 /* This function returns the calling abi specific va_list type node.
41065 It returns the FNDECL specific va_list type. */
41067 static tree
41068 ix86_fn_abi_va_list (tree fndecl)
41070 if (!TARGET_64BIT)
41071 return va_list_type_node;
41072 gcc_assert (fndecl != NULL_TREE);
41074 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41075 return ms_va_list_type_node;
41076 else
41077 return sysv_va_list_type_node;
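 /* Illustrative usage note: on x86-64 a function declared with
    __attribute__((ms_abi)) resolves to ms_va_list_type_node here, so its
    varargs use __builtin_ms_va_list rather than the SysV va_list.  */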
 41080 /* Returns the canonical va_list type specified by TYPE. If no valid
 41081 TYPE is provided, it returns NULL_TREE. */
41083 static tree
41084 ix86_canonical_va_list_type (tree type)
41086 tree wtype, htype;
41088 /* Resolve references and pointers to va_list type. */
41089 if (TREE_CODE (type) == MEM_REF)
41090 type = TREE_TYPE (type);
41091 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41092 type = TREE_TYPE (type);
41093 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41094 type = TREE_TYPE (type);
41096 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41098 wtype = va_list_type_node;
41099 gcc_assert (wtype != NULL_TREE);
41100 htype = type;
41101 if (TREE_CODE (wtype) == ARRAY_TYPE)
41103 /* If va_list is an array type, the argument may have decayed
41104 to a pointer type, e.g. by being passed to another function.
41105 In that case, unwrap both types so that we can compare the
41106 underlying records. */
41107 if (TREE_CODE (htype) == ARRAY_TYPE
41108 || POINTER_TYPE_P (htype))
41110 wtype = TREE_TYPE (wtype);
41111 htype = TREE_TYPE (htype);
41114 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41115 return va_list_type_node;
41116 wtype = sysv_va_list_type_node;
41117 gcc_assert (wtype != NULL_TREE);
41118 htype = type;
41119 if (TREE_CODE (wtype) == ARRAY_TYPE)
41121 /* If va_list is an array type, the argument may have decayed
41122 to a pointer type, e.g. by being passed to another function.
41123 In that case, unwrap both types so that we can compare the
41124 underlying records. */
41125 if (TREE_CODE (htype) == ARRAY_TYPE
41126 || POINTER_TYPE_P (htype))
41128 wtype = TREE_TYPE (wtype);
41129 htype = TREE_TYPE (htype);
41132 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41133 return sysv_va_list_type_node;
41134 wtype = ms_va_list_type_node;
41135 gcc_assert (wtype != NULL_TREE);
41136 htype = type;
41137 if (TREE_CODE (wtype) == ARRAY_TYPE)
41139 /* If va_list is an array type, the argument may have decayed
41140 to a pointer type, e.g. by being passed to another function.
41141 In that case, unwrap both types so that we can compare the
41142 underlying records. */
41143 if (TREE_CODE (htype) == ARRAY_TYPE
41144 || POINTER_TYPE_P (htype))
41146 wtype = TREE_TYPE (wtype);
41147 htype = TREE_TYPE (htype);
41150 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41151 return ms_va_list_type_node;
41152 return NULL_TREE;
41154 return std_canonical_va_list_type (type);
41157 /* Iterate through the target-specific builtin types for va_list.
41158 IDX denotes the iterator, *PTREE is set to the result type of
41159 the va_list builtin, and *PNAME to its internal type.
41160 Returns zero if there is no element for this index, otherwise
41161 IDX should be increased upon the next call.
41162 Note, do not iterate a base builtin's name like __builtin_va_list.
41163 Used from c_common_nodes_and_builtins. */
41165 static int
41166 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41168 if (TARGET_64BIT)
41170 switch (idx)
41172 default:
41173 break;
41175 case 0:
41176 *ptree = ms_va_list_type_node;
41177 *pname = "__builtin_ms_va_list";
41178 return 1;
41180 case 1:
41181 *ptree = sysv_va_list_type_node;
41182 *pname = "__builtin_sysv_va_list";
41183 return 1;
41187 return 0;
41190 #undef TARGET_SCHED_DISPATCH
41191 #define TARGET_SCHED_DISPATCH has_dispatch
41192 #undef TARGET_SCHED_DISPATCH_DO
41193 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41194 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41195 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41196 #undef TARGET_SCHED_REORDER
41197 #define TARGET_SCHED_REORDER ix86_sched_reorder
41198 #undef TARGET_SCHED_ADJUST_PRIORITY
41199 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41200 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41201 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41203 /* The size of the dispatch window is the total number of bytes of
41204 object code allowed in a window. */
41205 #define DISPATCH_WINDOW_SIZE 16
41207 /* Number of dispatch windows considered for scheduling. */
41208 #define MAX_DISPATCH_WINDOWS 3
41210 /* Maximum number of instructions in a window. */
41211 #define MAX_INSN 4
41213 /* Maximum number of immediate operands in a window. */
41214 #define MAX_IMM 4
41216 /* Maximum number of immediate bits allowed in a window. */
41217 #define MAX_IMM_SIZE 128
41219 /* Maximum number of 32 bit immediates allowed in a window. */
41220 #define MAX_IMM_32 4
41222 /* Maximum number of 64 bit immediates allowed in a window. */
41223 #define MAX_IMM_64 2
41225 /* Maximum total of loads or prefetches allowed in a window. */
41226 #define MAX_LOAD 2
41228 /* Maximum total of stores allowed in a window. */
41229 #define MAX_STORE 1
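 /* Taken together (illustrative summary): a 16-byte window may hold up
    to 4 insns and 4 immediate operands totalling at most 128 bits, i.e.
    four 32-bit or two 64-bit immediates, plus at most 2 loads or
    prefetches and 1 store.  */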
41231 #undef BIG
41232 #define BIG 100
 41235 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41236 enum dispatch_group {
41237 disp_no_group = 0,
41238 disp_load,
41239 disp_store,
41240 disp_load_store,
41241 disp_prefetch,
41242 disp_imm,
41243 disp_imm_32,
41244 disp_imm_64,
41245 disp_branch,
41246 disp_cmp,
41247 disp_jcc,
41248 disp_last
 41251 /* Number of allowable groups in a dispatch window. It is an array
 41252 indexed by the dispatch_group enum. 100 is used as a big number
 41253 because the number of these kinds of operations has no effect
 41254 on the dispatch window, but we need them for other reasons in
 41255 the table. */
41256 static unsigned int num_allowable_groups[disp_last] = {
41257 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41260 char group_name[disp_last + 1][16] = {
41261 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41262 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41263 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41266 /* Instruction path. */
41267 enum insn_path {
41268 no_path = 0,
41269 path_single, /* Single micro op. */
41270 path_double, /* Double micro op. */
 41271 path_multi, /* Instructions with more than 2 micro ops. */
41272 last_path
41275 /* sched_insn_info defines a window to the instructions scheduled in
41276 the basic block. It contains a pointer to the insn_info table and
41277 the instruction scheduled.
41279 Windows are allocated for each basic block and are linked
41280 together. */
41281 typedef struct sched_insn_info_s {
41282 rtx insn;
41283 enum dispatch_group group;
41284 enum insn_path path;
41285 int byte_len;
41286 int imm_bytes;
41287 } sched_insn_info;
 41289 /* Linked list of dispatch windows. This is a two-way list of
41290 dispatch windows of a basic block. It contains information about
41291 the number of uops in the window and the total number of
41292 instructions and of bytes in the object code for this dispatch
41293 window. */
41294 typedef struct dispatch_windows_s {
41295 int num_insn; /* Number of insn in the window. */
41296 int num_uops; /* Number of uops in the window. */
41297 int window_size; /* Number of bytes in the window. */
 41298 int window_num; /* Window number, either 0 or 1. */
41299 int num_imm; /* Number of immediates in an insn. */
41300 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41301 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41302 int imm_size; /* Total immediates in the window. */
41303 int num_loads; /* Total memory loads in the window. */
41304 int num_stores; /* Total memory stores in the window. */
41305 int violation; /* Violation exists in window. */
41306 sched_insn_info *window; /* Pointer to the window. */
41307 struct dispatch_windows_s *next;
41308 struct dispatch_windows_s *prev;
41309 } dispatch_windows;
 41311 /* Immediate values used in an insn. */
41312 typedef struct imm_info_s
41314 int imm;
41315 int imm32;
41316 int imm64;
41317 } imm_info;
41319 static dispatch_windows *dispatch_window_list;
41320 static dispatch_windows *dispatch_window_list1;
41322 /* Get dispatch group of insn. */
41324 static enum dispatch_group
41325 get_mem_group (rtx insn)
41327 enum attr_memory memory;
41329 if (INSN_CODE (insn) < 0)
41330 return disp_no_group;
41331 memory = get_attr_memory (insn);
41332 if (memory == MEMORY_STORE)
41333 return disp_store;
41335 if (memory == MEMORY_LOAD)
41336 return disp_load;
41338 if (memory == MEMORY_BOTH)
41339 return disp_load_store;
41341 return disp_no_group;
41344 /* Return true if insn is a compare instruction. */
41346 static bool
41347 is_cmp (rtx insn)
41349 enum attr_type type;
41351 type = get_attr_type (insn);
41352 return (type == TYPE_TEST
41353 || type == TYPE_ICMP
41354 || type == TYPE_FCMP
41355 || GET_CODE (PATTERN (insn)) == COMPARE);
 41358 /* Return true if a dispatch violation was encountered. */
41360 static bool
41361 dispatch_violation (void)
41363 if (dispatch_window_list->next)
41364 return dispatch_window_list->next->violation;
41365 return dispatch_window_list->violation;
41368 /* Return true if insn is a branch instruction. */
41370 static bool
41371 is_branch (rtx insn)
41373 return (CALL_P (insn) || JUMP_P (insn));
41376 /* Return true if insn is a prefetch instruction. */
41378 static bool
41379 is_prefetch (rtx insn)
41381 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41384 /* This function initializes a dispatch window and the list container holding a
41385 pointer to the window. */
41387 static void
41388 init_window (int window_num)
41390 int i;
41391 dispatch_windows *new_list;
41393 if (window_num == 0)
41394 new_list = dispatch_window_list;
41395 else
41396 new_list = dispatch_window_list1;
41398 new_list->num_insn = 0;
41399 new_list->num_uops = 0;
41400 new_list->window_size = 0;
41401 new_list->next = NULL;
41402 new_list->prev = NULL;
41403 new_list->window_num = window_num;
41404 new_list->num_imm = 0;
41405 new_list->num_imm_32 = 0;
41406 new_list->num_imm_64 = 0;
41407 new_list->imm_size = 0;
41408 new_list->num_loads = 0;
41409 new_list->num_stores = 0;
41410 new_list->violation = false;
41412 for (i = 0; i < MAX_INSN; i++)
41414 new_list->window[i].insn = NULL;
41415 new_list->window[i].group = disp_no_group;
41416 new_list->window[i].path = no_path;
41417 new_list->window[i].byte_len = 0;
41418 new_list->window[i].imm_bytes = 0;
41420 return;
41423 /* This function allocates and initializes a dispatch window and the
41424 list container holding a pointer to the window. */
41426 static dispatch_windows *
41427 allocate_window (void)
41429 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41430 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41432 return new_list;
41435 /* This routine initializes the dispatch scheduling information. It
41436 initiates building dispatch scheduler tables and constructs the
41437 first dispatch window. */
41439 static void
41440 init_dispatch_sched (void)
41442 /* Allocate a dispatch list and a window. */
41443 dispatch_window_list = allocate_window ();
41444 dispatch_window_list1 = allocate_window ();
41445 init_window (0);
41446 init_window (1);
41449 /* This function returns true if a branch is detected. End of a basic block
41450 does not have to be a branch, but here we assume only branches end a
41451 window. */
41453 static bool
41454 is_end_basic_block (enum dispatch_group group)
41456 return group == disp_branch;
41459 /* This function is called when the end of a window processing is reached. */
41461 static void
41462 process_end_window (void)
41464 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41465 if (dispatch_window_list->next)
41467 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41468 gcc_assert (dispatch_window_list->window_size
41469 + dispatch_window_list1->window_size <= 48);
41470 init_window (1);
41472 init_window (0);
41475 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41476 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
 41477 for 48 bytes of instructions. Note that these windows are not dispatch
 41478 windows whose sizes are DISPATCH_WINDOW_SIZE. */
41480 static dispatch_windows *
41481 allocate_next_window (int window_num)
41483 if (window_num == 0)
41485 if (dispatch_window_list->next)
41486 init_window (1);
41487 init_window (0);
41488 return dispatch_window_list;
41491 dispatch_window_list->next = dispatch_window_list1;
41492 dispatch_window_list1->prev = dispatch_window_list;
41494 return dispatch_window_list1;
41497 /* Increment the number of immediate operands of an instruction. */
41499 static int
41500 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41502 if (*in_rtx == 0)
41503 return 0;
41505 switch ( GET_CODE (*in_rtx))
41507 case CONST:
41508 case SYMBOL_REF:
41509 case CONST_INT:
41510 (imm_values->imm)++;
41511 if (x86_64_immediate_operand (*in_rtx, SImode))
41512 (imm_values->imm32)++;
41513 else
41514 (imm_values->imm64)++;
41515 break;
41517 case CONST_DOUBLE:
41518 (imm_values->imm)++;
41519 (imm_values->imm64)++;
41520 break;
41522 case CODE_LABEL:
41523 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41525 (imm_values->imm)++;
41526 (imm_values->imm32)++;
41528 break;
41530 default:
41531 break;
41534 return 0;
41537 /* Compute number of immediate operands of an instruction. */
41539 static void
41540 find_constant (rtx in_rtx, imm_info *imm_values)
41542 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41543 (rtx_function) find_constant_1, (void *) imm_values);
 41546 /* Return the total size of the immediate operands of an instruction along
 41547 with the number of corresponding immediate operands. It initializes its
 41548 parameters to zero before calling FIND_CONSTANT.
41549 INSN is the input instruction. IMM is the total of immediates.
41550 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41551 bit immediates. */
41553 static int
41554 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41556 imm_info imm_values = {0, 0, 0};
41558 find_constant (insn, &imm_values);
41559 *imm = imm_values.imm;
41560 *imm32 = imm_values.imm32;
41561 *imm64 = imm_values.imm64;
41562 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
 41565 /* This function indicates whether an instruction has any immediate
 41566 operands. */
41568 static bool
41569 has_immediate (rtx insn)
41571 int num_imm_operand;
41572 int num_imm32_operand;
41573 int num_imm64_operand;
41575 if (insn)
41576 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41577 &num_imm64_operand);
41578 return false;
41581 /* Return single or double path for instructions. */
41583 static enum insn_path
41584 get_insn_path (rtx insn)
41586 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41588 if ((int)path == 0)
41589 return path_single;
41591 if ((int)path == 1)
41592 return path_double;
41594 return path_multi;
41597 /* Return insn dispatch group. */
41599 static enum dispatch_group
41600 get_insn_group (rtx insn)
41602 enum dispatch_group group = get_mem_group (insn);
41603 if (group)
41604 return group;
41606 if (is_branch (insn))
41607 return disp_branch;
41609 if (is_cmp (insn))
41610 return disp_cmp;
41612 if (has_immediate (insn))
41613 return disp_imm;
41615 if (is_prefetch (insn))
41616 return disp_prefetch;
41618 return disp_no_group;
41621 /* Count number of GROUP restricted instructions in a dispatch
41622 window WINDOW_LIST. */
41624 static int
41625 count_num_restricted (rtx insn, dispatch_windows *window_list)
41627 enum dispatch_group group = get_insn_group (insn);
41628 int imm_size;
41629 int num_imm_operand;
41630 int num_imm32_operand;
41631 int num_imm64_operand;
41633 if (group == disp_no_group)
41634 return 0;
41636 if (group == disp_imm)
41638 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41639 &num_imm64_operand);
41640 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41641 || num_imm_operand + window_list->num_imm > MAX_IMM
41642 || (num_imm32_operand > 0
41643 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41644 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41645 || (num_imm64_operand > 0
41646 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41647 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41648 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41649 && num_imm64_operand > 0
41650 && ((window_list->num_imm_64 > 0
41651 && window_list->num_insn >= 2)
41652 || window_list->num_insn >= 3)))
41653 return BIG;
41655 return 1;
41658 if ((group == disp_load_store
41659 && (window_list->num_loads >= MAX_LOAD
41660 || window_list->num_stores >= MAX_STORE))
41661 || ((group == disp_load
41662 || group == disp_prefetch)
41663 && window_list->num_loads >= MAX_LOAD)
41664 || (group == disp_store
41665 && window_list->num_stores >= MAX_STORE))
41666 return BIG;
41668 return 1;
41671 /* This function returns true if insn satisfies dispatch rules on the
41672 last window scheduled. */
41674 static bool
41675 fits_dispatch_window (rtx insn)
41677 dispatch_windows *window_list = dispatch_window_list;
41678 dispatch_windows *window_list_next = dispatch_window_list->next;
41679 unsigned int num_restrict;
41680 enum dispatch_group group = get_insn_group (insn);
41681 enum insn_path path = get_insn_path (insn);
41682 int sum;
 41684 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
 41685 instructions should be given the lowest priority in the
 41686 Haifa scheduler's scheduling process to make sure they are
 41687 scheduled in the same dispatch window as the insns that reference them. */
41688 if (group == disp_jcc || group == disp_cmp)
41689 return false;
41691 /* Check nonrestricted. */
41692 if (group == disp_no_group || group == disp_branch)
41693 return true;
41695 /* Get last dispatch window. */
41696 if (window_list_next)
41697 window_list = window_list_next;
41699 if (window_list->window_num == 1)
41701 sum = window_list->prev->window_size + window_list->window_size;
41703 if (sum == 32
41704 || (min_insn_size (insn) + sum) >= 48)
41705 /* Window 1 is full. Go for next window. */
41706 return true;
41709 num_restrict = count_num_restricted (insn, window_list);
41711 if (num_restrict > num_allowable_groups[group])
41712 return false;
41714 /* See if it fits in the first window. */
41715 if (window_list->window_num == 0)
 41717 /* The first window should have only single- and double-path
 41718 uops. */
41719 if (path == path_double
41720 && (window_list->num_uops + 2) > MAX_INSN)
41721 return false;
41722 else if (path != path_single)
41723 return false;
41725 return true;
41728 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41729 dispatch window WINDOW_LIST. */
41731 static void
41732 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41734 int byte_len = min_insn_size (insn);
41735 int num_insn = window_list->num_insn;
41736 int imm_size;
41737 sched_insn_info *window = window_list->window;
41738 enum dispatch_group group = get_insn_group (insn);
41739 enum insn_path path = get_insn_path (insn);
41740 int num_imm_operand;
41741 int num_imm32_operand;
41742 int num_imm64_operand;
41744 if (!window_list->violation && group != disp_cmp
41745 && !fits_dispatch_window (insn))
41746 window_list->violation = true;
41748 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41749 &num_imm64_operand);
41751 /* Initialize window with new instruction. */
41752 window[num_insn].insn = insn;
41753 window[num_insn].byte_len = byte_len;
41754 window[num_insn].group = group;
41755 window[num_insn].path = path;
41756 window[num_insn].imm_bytes = imm_size;
41758 window_list->window_size += byte_len;
41759 window_list->num_insn = num_insn + 1;
41760 window_list->num_uops = window_list->num_uops + num_uops;
41761 window_list->imm_size += imm_size;
41762 window_list->num_imm += num_imm_operand;
41763 window_list->num_imm_32 += num_imm32_operand;
41764 window_list->num_imm_64 += num_imm64_operand;
41766 if (group == disp_store)
41767 window_list->num_stores += 1;
41768 else if (group == disp_load
41769 || group == disp_prefetch)
41770 window_list->num_loads += 1;
41771 else if (group == disp_load_store)
41773 window_list->num_stores += 1;
41774 window_list->num_loads += 1;
41778 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41779 If the total bytes of instructions or the number of instructions in
 41780 the window exceeds the allowable limit, it allocates a new window. */
41782 static void
41783 add_to_dispatch_window (rtx insn)
41785 int byte_len;
41786 dispatch_windows *window_list;
41787 dispatch_windows *next_list;
41788 dispatch_windows *window0_list;
41789 enum insn_path path;
41790 enum dispatch_group insn_group;
41791 bool insn_fits;
41792 int num_insn;
41793 int num_uops;
41794 int window_num;
41795 int insn_num_uops;
41796 int sum;
41798 if (INSN_CODE (insn) < 0)
41799 return;
41801 byte_len = min_insn_size (insn);
41802 window_list = dispatch_window_list;
41803 next_list = window_list->next;
41804 path = get_insn_path (insn);
41805 insn_group = get_insn_group (insn);
41807 /* Get the last dispatch window. */
41808 if (next_list)
41809 window_list = dispatch_window_list->next;
41811 if (path == path_single)
41812 insn_num_uops = 1;
41813 else if (path == path_double)
41814 insn_num_uops = 2;
41815 else
41816 insn_num_uops = (int) path;
 41818 /* If the current window is full, get a new window.
 41819 Window number zero is full if MAX_INSN uops are scheduled in it.
 41820 Window number one is full if window zero's bytes plus window
 41821 one's bytes reach 32, or if adding the bytes of the new instruction
 41822 to the total makes it greater than 48, or if it already has MAX_INSN
 41823 instructions in it. */
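 /* Numeric example (illustrative): if window zero already holds 20 bytes
    and window one holds 12, the sum is 32 and window one is treated as
    full; likewise a 10-byte insn arriving when the sum is 40 would push
    the total past 48.  */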
41824 num_insn = window_list->num_insn;
41825 num_uops = window_list->num_uops;
41826 window_num = window_list->window_num;
41827 insn_fits = fits_dispatch_window (insn);
41829 if (num_insn >= MAX_INSN
41830 || num_uops + insn_num_uops > MAX_INSN
41831 || !(insn_fits))
41833 window_num = ~window_num & 1;
41834 window_list = allocate_next_window (window_num);
41837 if (window_num == 0)
41839 add_insn_window (insn, window_list, insn_num_uops);
41840 if (window_list->num_insn >= MAX_INSN
41841 && insn_group == disp_branch)
41843 process_end_window ();
41844 return;
41847 else if (window_num == 1)
41849 window0_list = window_list->prev;
41850 sum = window0_list->window_size + window_list->window_size;
41851 if (sum == 32
41852 || (byte_len + sum) >= 48)
41854 process_end_window ();
41855 window_list = dispatch_window_list;
41858 add_insn_window (insn, window_list, insn_num_uops);
41860 else
41861 gcc_unreachable ();
41863 if (is_end_basic_block (insn_group))
 41866 /* The end of the basic block is reached; do end-of-basic-block processing. */
41866 process_end_window ();
41867 return;
41871 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41873 DEBUG_FUNCTION static void
41874 debug_dispatch_window_file (FILE *file, int window_num)
41876 dispatch_windows *list;
41877 int i;
41879 if (window_num == 0)
41880 list = dispatch_window_list;
41881 else
41882 list = dispatch_window_list1;
41884 fprintf (file, "Window #%d:\n", list->window_num);
41885 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41886 list->num_insn, list->num_uops, list->window_size);
41887 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41888 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41890 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41891 list->num_stores);
41892 fprintf (file, " insn info:\n");
41894 for (i = 0; i < MAX_INSN; i++)
41896 if (!list->window[i].insn)
41897 break;
41898 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41899 i, group_name[list->window[i].group],
41900 i, (void *)list->window[i].insn,
41901 i, list->window[i].path,
41902 i, list->window[i].byte_len,
41903 i, list->window[i].imm_bytes);
41907 /* Print to stdout a dispatch window. */
41909 DEBUG_FUNCTION void
41910 debug_dispatch_window (int window_num)
41912 debug_dispatch_window_file (stdout, window_num);
41915 /* Print INSN dispatch information to FILE. */
41917 DEBUG_FUNCTION static void
41918 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41920 int byte_len;
41921 enum insn_path path;
41922 enum dispatch_group group;
41923 int imm_size;
41924 int num_imm_operand;
41925 int num_imm32_operand;
41926 int num_imm64_operand;
41928 if (INSN_CODE (insn) < 0)
41929 return;
41931 byte_len = min_insn_size (insn);
41932 path = get_insn_path (insn);
41933 group = get_insn_group (insn);
41934 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41935 &num_imm64_operand);
41937 fprintf (file, " insn info:\n");
41938 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41939 group_name[group], path, byte_len);
41940 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41941 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41944 /* Print to STDERR the status of the ready list with respect to
41945 dispatch windows. */
41947 DEBUG_FUNCTION void
41948 debug_ready_dispatch (void)
41950 int i;
41951 int no_ready = number_in_ready ();
41953 fprintf (stdout, "Number of ready: %d\n", no_ready);
41955 for (i = 0; i < no_ready; i++)
41956 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41959 /* This routine is the driver of the dispatch scheduler. */
41961 static void
41962 do_dispatch (rtx insn, int mode)
41964 if (mode == DISPATCH_INIT)
41965 init_dispatch_sched ();
41966 else if (mode == ADD_TO_DISPATCH_WINDOW)
41967 add_to_dispatch_window (insn);
41970 /* Return TRUE if Dispatch Scheduling is supported. */
41972 static bool
41973 has_dispatch (rtx insn, int action)
41975 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41976 && flag_dispatch_scheduler)
41977 switch (action)
41979 default:
41980 return false;
41982 case IS_DISPATCH_ON:
41983 return true;
41984 break;
41986 case IS_CMP:
41987 return is_cmp (insn);
41989 case DISPATCH_VIOLATION:
41990 return dispatch_violation ();
41992 case FITS_DISPATCH_WINDOW:
41993 return fits_dispatch_window (insn);
41996 return false;
41999 /* Implementation of reassociation_width target hook used by
42000 reassoc phase to identify parallelism level in reassociated
 42001 tree. The statement's tree_code is passed in OPC. The arguments'
 42002 type is passed in MODE.
42004 Currently parallel reassociation is enabled for Atom
42005 processors only and we set reassociation width to be 2
42006 because Atom may issue up to 2 instructions per cycle.
42008 Return value should be fixed if parallel reassociation is
42009 enabled for other processors. */
42011 static int
42012 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42013 enum machine_mode mode)
42015 int res = 1;
42017 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42018 res = 2;
42019 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42020 res = 2;
42022 return res;
42025 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42026 place emms and femms instructions. */
42028 static enum machine_mode
42029 ix86_preferred_simd_mode (enum machine_mode mode)
42031 if (!TARGET_SSE)
42032 return word_mode;
42034 switch (mode)
42036 case QImode:
42037 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42038 case HImode:
42039 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42040 case SImode:
42041 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42042 case DImode:
42043 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42045 case SFmode:
42046 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42047 return V8SFmode;
42048 else
42049 return V4SFmode;
42051 case DFmode:
42052 if (!TARGET_VECTORIZE_DOUBLE)
42053 return word_mode;
42054 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42055 return V4DFmode;
42056 else if (TARGET_SSE2)
42057 return V2DFmode;
42058 /* FALLTHRU */
42060 default:
42061 return word_mode;
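 /* For example (illustrative): with AVX enabled and -mprefer-avx128 not
    given, SImode data is vectorized as V8SImode; with only SSE2 it is
    vectorized as V4SImode, and DFmode as V2DFmode.  */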
42065 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
42066 vectors. */
42068 static unsigned int
42069 ix86_autovectorize_vector_sizes (void)
42071 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 42076 /* Return the class of registers which could be used for a pseudo of MODE
 42077 and of class RCLASS for spilling instead of memory. Return NO_REGS
 42078 if it is not possible or not profitable. */
42079 static reg_class_t
42080 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42082 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42083 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42084 && INTEGER_CLASS_P (rclass))
42085 return SSE_REGS;
42086 return NO_REGS;
42089 /* Implement targetm.vectorize.init_cost. */
42091 static void *
42092 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42094 unsigned *cost = XNEWVEC (unsigned, 3);
42095 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42096 return cost;
42099 /* Implement targetm.vectorize.add_stmt_cost. */
42101 static unsigned
42102 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42103 struct _stmt_vec_info *stmt_info, int misalign,
42104 enum vect_cost_model_location where)
42106 unsigned *cost = (unsigned *) data;
42107 unsigned retval = 0;
42109 if (flag_vect_cost_model)
42111 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42112 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42114 /* Statements in an inner loop relative to the loop being
42115 vectorized are weighted more heavily. The value here is
42116 arbitrary and could potentially be improved with analysis. */
42117 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42118 count *= 50; /* FIXME. */
42120 retval = (unsigned) (count * stmt_cost);
42121 cost[where] += retval;
42124 return retval;
42127 /* Implement targetm.vectorize.finish_cost. */
42129 static void
42130 ix86_finish_cost (void *data, unsigned *prologue_cost,
42131 unsigned *body_cost, unsigned *epilogue_cost)
42133 unsigned *cost = (unsigned *) data;
42134 *prologue_cost = cost[vect_prologue];
42135 *body_cost = cost[vect_body];
42136 *epilogue_cost = cost[vect_epilogue];
42139 /* Implement targetm.vectorize.destroy_cost_data. */
42141 static void
42142 ix86_destroy_cost_data (void *data)
42144 free (data);
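 /* Sketch of how these four hooks fit together (illustrative): the
    vectorizer calls ix86_init_cost to get the three running counters,
    ix86_add_stmt_cost once per statement considered, ix86_finish_cost to
    read back the prologue/body/epilogue totals, and finally
    ix86_destroy_cost_data to release the array allocated above.  */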
42147 /* Validate target specific memory model bits in VAL. */
42149 static unsigned HOST_WIDE_INT
42150 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42152 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42153 bool strong;
42155 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42156 |MEMMODEL_MASK)
42157 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42159 warning (OPT_Winvalid_memory_model,
42160 "Unknown architecture specific memory model");
42161 return MEMMODEL_SEQ_CST;
42163 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42164 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42166 warning (OPT_Winvalid_memory_model,
42167 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42168 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42170 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42172 warning (OPT_Winvalid_memory_model,
42173 "HLE_RELEASE not used with RELEASE or stronger memory model");
42174 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42176 return val;
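 /* Illustrative (hypothetical) use of the HLE bits validated above,
    assuming the __ATOMIC_HLE_ACQUIRE macro exposed by the x86 back end:

      while (__atomic_exchange_n (&lock, 1,
                                  __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
        ;

    Passing HLE_ACQUIRE with a model weaker than ACQUIRE triggers the
    warning above and forces the model to MEMMODEL_SEQ_CST while keeping
    the HLE bit.  */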
42179 /* Initialize the GCC target structure. */
42180 #undef TARGET_RETURN_IN_MEMORY
42181 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42183 #undef TARGET_LEGITIMIZE_ADDRESS
42184 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42186 #undef TARGET_ATTRIBUTE_TABLE
42187 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42188 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42189 # undef TARGET_MERGE_DECL_ATTRIBUTES
42190 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42191 #endif
42193 #undef TARGET_COMP_TYPE_ATTRIBUTES
42194 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42196 #undef TARGET_INIT_BUILTINS
42197 #define TARGET_INIT_BUILTINS ix86_init_builtins
42198 #undef TARGET_BUILTIN_DECL
42199 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42200 #undef TARGET_EXPAND_BUILTIN
42201 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42203 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42204 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42205 ix86_builtin_vectorized_function
42207 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42208 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42210 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42211 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42213 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42214 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42216 #undef TARGET_BUILTIN_RECIPROCAL
42217 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42219 #undef TARGET_ASM_FUNCTION_EPILOGUE
42220 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42222 #undef TARGET_ENCODE_SECTION_INFO
42223 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42224 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42225 #else
42226 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42227 #endif
42229 #undef TARGET_ASM_OPEN_PAREN
42230 #define TARGET_ASM_OPEN_PAREN ""
42231 #undef TARGET_ASM_CLOSE_PAREN
42232 #define TARGET_ASM_CLOSE_PAREN ""
42234 #undef TARGET_ASM_BYTE_OP
42235 #define TARGET_ASM_BYTE_OP ASM_BYTE
42237 #undef TARGET_ASM_ALIGNED_HI_OP
42238 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42239 #undef TARGET_ASM_ALIGNED_SI_OP
42240 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42241 #ifdef ASM_QUAD
42242 #undef TARGET_ASM_ALIGNED_DI_OP
42243 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42244 #endif
42246 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42247 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42249 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42250 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42252 #undef TARGET_ASM_UNALIGNED_HI_OP
42253 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42254 #undef TARGET_ASM_UNALIGNED_SI_OP
42255 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42256 #undef TARGET_ASM_UNALIGNED_DI_OP
42257 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42259 #undef TARGET_PRINT_OPERAND
42260 #define TARGET_PRINT_OPERAND ix86_print_operand
42261 #undef TARGET_PRINT_OPERAND_ADDRESS
42262 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42263 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42264 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42265 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42266 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42268 #undef TARGET_SCHED_INIT_GLOBAL
42269 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42270 #undef TARGET_SCHED_ADJUST_COST
42271 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42272 #undef TARGET_SCHED_ISSUE_RATE
42273 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42274 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42275 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42276 ia32_multipass_dfa_lookahead
42278 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42279 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42281 #undef TARGET_MEMMODEL_CHECK
42282 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42284 #ifdef HAVE_AS_TLS
42285 #undef TARGET_HAVE_TLS
42286 #define TARGET_HAVE_TLS true
42287 #endif
42288 #undef TARGET_CANNOT_FORCE_CONST_MEM
42289 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42290 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42291 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42293 #undef TARGET_DELEGITIMIZE_ADDRESS
42294 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42296 #undef TARGET_MS_BITFIELD_LAYOUT_P
42297 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42299 #if TARGET_MACHO
42300 #undef TARGET_BINDS_LOCAL_P
42301 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42302 #endif
42303 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42304 #undef TARGET_BINDS_LOCAL_P
42305 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42306 #endif
42308 #undef TARGET_ASM_OUTPUT_MI_THUNK
42309 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42310 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42311 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42313 #undef TARGET_ASM_FILE_START
42314 #define TARGET_ASM_FILE_START x86_file_start
42316 #undef TARGET_OPTION_OVERRIDE
42317 #define TARGET_OPTION_OVERRIDE ix86_option_override
42319 #undef TARGET_REGISTER_MOVE_COST
42320 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42321 #undef TARGET_MEMORY_MOVE_COST
42322 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42323 #undef TARGET_RTX_COSTS
42324 #define TARGET_RTX_COSTS ix86_rtx_costs
42325 #undef TARGET_ADDRESS_COST
42326 #define TARGET_ADDRESS_COST ix86_address_cost
42328 #undef TARGET_FIXED_CONDITION_CODE_REGS
42329 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42330 #undef TARGET_CC_MODES_COMPATIBLE
42331 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42333 #undef TARGET_MACHINE_DEPENDENT_REORG
42334 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42336 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42337 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42339 #undef TARGET_BUILD_BUILTIN_VA_LIST
42340 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42342 #undef TARGET_FOLD_BUILTIN
42343 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42345 #undef TARGET_COMPARE_VERSION_PRIORITY
42346 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42348 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42349 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42350 ix86_generate_version_dispatcher_body
42352 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42353 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42354 ix86_get_function_versions_dispatcher
42356 #undef TARGET_ENUM_VA_LIST_P
42357 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42359 #undef TARGET_FN_ABI_VA_LIST
42360 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42362 #undef TARGET_CANONICAL_VA_LIST_TYPE
42363 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42365 #undef TARGET_EXPAND_BUILTIN_VA_START
42366 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42368 #undef TARGET_MD_ASM_CLOBBERS
42369 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42371 #undef TARGET_PROMOTE_PROTOTYPES
42372 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42373 #undef TARGET_STRUCT_VALUE_RTX
42374 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42375 #undef TARGET_SETUP_INCOMING_VARARGS
42376 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42377 #undef TARGET_MUST_PASS_IN_STACK
42378 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42379 #undef TARGET_FUNCTION_ARG_ADVANCE
42380 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42381 #undef TARGET_FUNCTION_ARG
42382 #define TARGET_FUNCTION_ARG ix86_function_arg
42383 #undef TARGET_FUNCTION_ARG_BOUNDARY
42384 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42385 #undef TARGET_PASS_BY_REFERENCE
42386 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42387 #undef TARGET_INTERNAL_ARG_POINTER
42388 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42389 #undef TARGET_UPDATE_STACK_BOUNDARY
42390 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42391 #undef TARGET_GET_DRAP_RTX
42392 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42393 #undef TARGET_STRICT_ARGUMENT_NAMING
42394 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42395 #undef TARGET_STATIC_CHAIN
42396 #define TARGET_STATIC_CHAIN ix86_static_chain
42397 #undef TARGET_TRAMPOLINE_INIT
42398 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42399 #undef TARGET_RETURN_POPS_ARGS
42400 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42402 #undef TARGET_LEGITIMATE_COMBINED_INSN
42403 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42405 #undef TARGET_ASAN_SHADOW_OFFSET
42406 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42408 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42409 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42411 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42412 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42414 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42415 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42417 #undef TARGET_C_MODE_FOR_SUFFIX
42418 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42420 #ifdef HAVE_AS_TLS
42421 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42422 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42423 #endif
42425 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42426 #undef TARGET_INSERT_ATTRIBUTES
42427 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42428 #endif
42430 #undef TARGET_MANGLE_TYPE
42431 #define TARGET_MANGLE_TYPE ix86_mangle_type
42433 #if !TARGET_MACHO
42434 #undef TARGET_STACK_PROTECT_FAIL
42435 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42436 #endif
42438 #undef TARGET_FUNCTION_VALUE
42439 #define TARGET_FUNCTION_VALUE ix86_function_value
42441 #undef TARGET_FUNCTION_VALUE_REGNO_P
42442 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42444 #undef TARGET_PROMOTE_FUNCTION_MODE
42445 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42447 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42448 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42450 #undef TARGET_INSTANTIATE_DECLS
42451 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42453 #undef TARGET_SECONDARY_RELOAD
42454 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42456 #undef TARGET_CLASS_MAX_NREGS
42457 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42459 #undef TARGET_PREFERRED_RELOAD_CLASS
42460 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42461 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42462 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42463 #undef TARGET_CLASS_LIKELY_SPILLED_P
42464 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42466 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42467 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42468 ix86_builtin_vectorization_cost
42469 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42470 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42471 ix86_vectorize_vec_perm_const_ok
42472 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42473 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42474 ix86_preferred_simd_mode
42475 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42476 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42477 ix86_autovectorize_vector_sizes
42478 #undef TARGET_VECTORIZE_INIT_COST
42479 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42480 #undef TARGET_VECTORIZE_ADD_STMT_COST
42481 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42482 #undef TARGET_VECTORIZE_FINISH_COST
42483 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42484 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42485 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42487 #undef TARGET_SET_CURRENT_FUNCTION
42488 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42490 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42491 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42493 #undef TARGET_OPTION_SAVE
42494 #define TARGET_OPTION_SAVE ix86_function_specific_save
42496 #undef TARGET_OPTION_RESTORE
42497 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42499 #undef TARGET_OPTION_PRINT
42500 #define TARGET_OPTION_PRINT ix86_function_specific_print
42502 #undef TARGET_OPTION_FUNCTION_VERSIONS
42503 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42505 #undef TARGET_CAN_INLINE_P
42506 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42508 #undef TARGET_EXPAND_TO_RTL_HOOK
42509 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42511 #undef TARGET_LEGITIMATE_ADDRESS_P
42512 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42514 #undef TARGET_LRA_P
42515 #define TARGET_LRA_P hook_bool_void_true
42517 #undef TARGET_REGISTER_PRIORITY
42518 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42520 #undef TARGET_LEGITIMATE_CONSTANT_P
42521 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42523 #undef TARGET_FRAME_POINTER_REQUIRED
42524 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42526 #undef TARGET_CAN_ELIMINATE
42527 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42529 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42530 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42532 #undef TARGET_ASM_CODE_END
42533 #define TARGET_ASM_CODE_END ix86_code_end
42535 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42536 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42538 #if TARGET_MACHO
42539 #undef TARGET_INIT_LIBFUNCS
42540 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42541 #endif
42543 #undef TARGET_SPILL_CLASS
42544 #define TARGET_SPILL_CLASS ix86_spill_class
42546 struct gcc_target targetm = TARGET_INITIALIZER;
42548 #include "gt-i386.h"