[official-gcc.git] / gcc-4_8-branch / gcc / config / i386 / i386.c
blob: c56ce3e98627a964dc891133a9591907fe7d8aee
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
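/* For example, MODE_INDEX (SImode) is 2: SImode selects the third entry of
   the five-element multiply and divide cost arrays (QI, HI, SI, DI, other)
   in the struct processor_costs tables below, and any mode outside
   QImode..DImode falls through to the final "other" slot.  */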
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
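/* So, when tuning for size, COSTS_N_BYTES (3) evaluates to 6: the size
   table below counts cost in bytes (three 2-byte additions here), while the
   speed tables use COSTS_N_INSNS, assumed above to be (N) * 4.  */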
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
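/* Reading aid for the memcpy/memset entries in the cost tables below
   (assuming the stringop_algs layout declared in i386.h): the leading
   algorithm is the one used when the block size is unknown at compile time,
   and each following {max, alg, noalign} triple selects ALG for blocks of
   at most MAX bytes, with MAX == -1 marking the unbounded, final entry.
   DUMMY_STRINGOP_ALGS fills the slot that is never consulted, presumably
   the 64-bit variant for processors that only run 32-bit code.  */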
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win; for bigger blocks either rep movsl or rep movsb is the
425 way to go. Rep movsb apparently has a more expensive startup time in the
426 CPU, but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with REP prefix (relative to loops)
646 compared to K8. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set number of simultaneous prefetches
712 to a large constant to reflect this (it probably is not a good idea not
713 to limit number of prefetches at all, as their execution also takes some
714 time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat the inline expansion considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set number of simultaneous prefetches
801 to a large constant to reflect this (it probably is not a good idea not
802 to limit number of prefetches at all, as their execution also takes some
803 time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
814 very small blocks it is better to use a loop. For large blocks, a libcall can
815 do nontemporal accesses and beat the inline expansion considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set number of simultaneous prefetches
891 to a large constant to reflect this (it probably is not a good idea not
892 to limit number of prefetches at all, as their execution also takes some
893 time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
904 very small blocks it is better to use a loop. For large blocks, a libcall
905 can do nontemporal accesses and beat the inline expansion considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set number of simultaneous prefetches
981 to a large constant to reflect this (it probably is not a good idea not
982 to limit number of prefetches at all, as their execution also takes some
983 time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
994 very small blocks it is better to use a loop. For large blocks, a libcall
995 can do nontemporal accesses and beat the inline expansion considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set number of simultaneous prefetches
1063 to a large constant to reflect this (it probably is not a good idea not
1064 to limit number of prefetches at all, as their execution also takes some
1065 time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do nontemporal accesses and beat the inline expansion considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1161 very small blocks it is better to use a loop. For large blocks, a libcall can
1162 do nontemporal accesses and beat the inline expansion considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 64, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration, lea is 2 cycles or more.  With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing regressions on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration, lea is 2 cycles or more.  With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing regressions on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME: perhaps a more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
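/* Editor's note: illustrative sketch, not part of the original file.  The
   cost tables above are consumed through the two pointers just defined; for
   example, the rtx-cost hooks read individual fields such as the relative
   cost of an integer add.  The helper name below is hypothetical and only
   demonstrates the access pattern.  */

static inline int
example_add_cost (void)
{
  /* Relative cost (in COSTS_N_INSNS units) of an add on the current
     tuning target.  */
  return ix86_cost->add;
}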
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be a common subset of the supported
1757 CPUs (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767 negatively, so enabling it for Generic64 seems like a good code-size
1768 tradeoff. We can't enable it for 32bit generic because it does not
1769 work well with PPro-based chips. */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
1782 on simulation results. But after the P4 was made, no performance benefit
1783 was observed with branch hints. They also increase the code size.
1784 As a result, icc never generates branch hints. */
1785 0,
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798 register stalls on Generic32 compilation setting as well. However
1799 in current implementation the partial register stalls are not eliminated
1800 very well - they can be introduced via subregs synthesized by combine
1801 and can happen in caller/callee saving sequences. Because this option
1802 pays back little on PPro based chips and is in conflict with partial reg
1803 dependencies used by Athlon/P4 based chips, it is better to leave it off
1804 for generic32 for now. */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811 * on 16-bit immediate moves into memory on Core2 and Corei7. */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1848 ~0,
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1857 0,
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886 conflict here between PPro/Pentium4 based chips that treat 128bit
1887 SSE registers as single units and K8 based chips that divide SSE
1888 registers into two 64bit halves. This knob promotes all store destinations
1889 to be 128bit to allow register renaming on 128bit SSE units, but usually
1890 results in one extra microop on 64bit SSE units. Experimental results
1891 show that disabling this option on P4 brings over a 20% SPECfp regression,
1892 while enabling it on K8 brings roughly 2.4% regression that can be partly
1893 masked by careful scheduling of moves. */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_HASWELL | m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903 m_BDVER,
1905 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906 are resolved on SSE register parts instead of whole registers, so we may
1907 maintain just lower part of scalar values in proper format leaving the
1908 upper part undefined. */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936 ~(m_AMDFAM10 | m_BDVER),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1964 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1968 vector path on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1972 machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1976 than a MOV. */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1984 operand that cannot be represented using a modRM byte. The XOR
1985 replacement is long decoded, so this split helps here as well. */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010 at -O3. For the moment, the prefetching seems badly tuned for Intel
2011 chips. */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM | m_HASWELL,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2033 };
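/* Editor's note: illustrative sketch, not part of the original file.  Each
   entry above is a bitmask over processor numbers, and ix86_tune_features is
   filled in by testing the bit of the processor selected with -mtune, roughly
   as follows (the option-override code later in this file does the
   equivalent for every X86_TUNE_* index).  The helper name is hypothetical. */

static inline void
example_fill_tune_features (enum processor_type tune)
{
  unsigned int tune_mask = 1U << tune;
  int i;

  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] = (initial_ix86_tune_features[i] & tune_mask) != 0;
}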
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
2069 /* In case the average insn count for single function invocation is
2070 lower than this constant, emit fast (but longer) prologue and
2071 epilogue code. */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
2074 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2188 */
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2190 {
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2198 };
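/* Editor's note: illustrative sketch, not part of the original file.  Debug
   output indexes these maps with a gcc hard register number; on SVR4 targets,
   for instance, %ebp (gcc regno 6) is emitted as DWARF register 5.  The
   helper name below is hypothetical.  */

static inline int
example_svr4_dwarf_regno (unsigned int regno)
{
  /* Registers with no DWARF number in this map yield -1.  */
  return regno < FIRST_PSEUDO_REGISTER ? svr4_dbx_register_map[regno] : -1;
}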
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2203 {
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2205 };
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2208 {
2209 CX_REG, DX_REG, R8_REG, R9_REG
2210 };
2212 static int const x86_64_int_return_registers[4] =
2213 {
2214 AX_REG, DX_REG, DI_REG, SI_REG
2215 };
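/* Editor's note: illustrative sketch, not part of the original file.  Under
   the SysV AMD64 convention the first six integer arguments go, in order, in
   the registers listed in x86_64_int_parameter_registers (rdi, rsi, rdx, rcx,
   r8, r9); later integer arguments are passed on the stack.  The helper name
   below is hypothetical.  */

static inline int
example_int_arg_regno (int argno)
{
  /* Returns the hard register number for argument slot ARGNO, or -1 when
     the argument is passed on the stack.  */
  return (argno >= 0 && argno < 6) ? x86_64_int_parameter_registers[argno] : -1;
}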
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2251 */
2252 struct ix86_frame
2253 {
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2271 };
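/* Editor's note: illustrative sketch, not part of the original file.  All of
   the offsets above are relative to the incoming ARG_POINTER, so per the
   layout diagram the space the prologue still has to allocate below the
   register save areas is just a difference of two offsets.  The helper name
   is hypothetical and assumes that reading of the diagram.  */

static inline HOST_WIDE_INT
example_frame_to_allocate (const struct ix86_frame *frame)
{
  /* Distance from the SSE register save area down to the final stack
     pointer position (padding1 + va_arg area + frame + padding2).  */
  return frame->stack_pointer_offset - frame->sse_reg_save_offset;
}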
2273 /* Which cpu are we scheduling for. */
2274 enum attr_cpu ix86_schedule;
2276 /* Which cpu are we optimizing for. */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits specified at
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling abi specific va_list type nodes. */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register class used for passing a given 64bit part of the argument.
2327 These represent classes as documented by the psABI, with the exception
2328 of the SSESF and SSEDF classes, which are basically the SSE class, except
2329 that gcc will use SF or DFmode moves instead of DImode to avoid reformatting penalties.
2331 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332 whenever possible (upper half does contain padding). */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
2350 /* Table of constants used by fldpi, fldln2, etc.... */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2411 {
2412 const char *const name; /* processor name */
2413 const struct processor_costs *cost; /* Processor costs */
2414 const int align_loop; /* Default alignments. */
2415 const int align_loop_max_skip;
2416 const int align_jump;
2417 const int align_jump_max_skip;
2418 const int align_func;
2419 };
2421 /* This table must be in sync with enum processor_type in i386.h. */
2422 static const struct ptt processor_target_table[PROCESSOR_max] =
2423 {
2424 {"generic", &generic32_cost, 16, 7, 16, 7, 16},
2425 {"generic", &generic64_cost, 16, 10, 16, 10, 16},
2426 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2427 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2428 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2429 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2430 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2431 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2432 {"core2", &core_cost, 16, 10, 16, 10, 16},
2433 {"corei7", &core_cost, 16, 10, 16, 10, 16},
2434 {"core-avx2", &core_cost, 16, 10, 16, 10, 16},
2435 {"atom", &atom_cost, 16, 15, 16, 7, 16},
2436 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2437 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2438 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2439 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2440 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2441 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2442 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2443 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2444 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2445 {"btver2", &btver2_cost, 16, 10, 16, 7, 11}
2448 static bool
2449 gate_insert_vzeroupper (void)
2451 return TARGET_VZEROUPPER;
2454 static unsigned int
2455 rest_of_handle_insert_vzeroupper (void)
2457 int i;
2459 /* vzeroupper instructions are inserted immediately after reload to
2460 account for possible spills from 256bit registers. The pass
2461 reuses mode switching infrastructure by re-running mode insertion
2462 pass, so disable entities that have already been processed. */
2463 for (i = 0; i < MAX_386_ENTITIES; i++)
2464 ix86_optimize_mode_switching[i] = 0;
2466 ix86_optimize_mode_switching[AVX_U128] = 1;
2468 /* Call optimize_mode_switching. */
2469 pass_mode_switching.pass.execute ();
2470 return 0;
2473 struct rtl_opt_pass pass_insert_vzeroupper =
2476 RTL_PASS,
2477 "vzeroupper", /* name */
2478 OPTGROUP_NONE, /* optinfo_flags */
2479 gate_insert_vzeroupper, /* gate */
2480 rest_of_handle_insert_vzeroupper, /* execute */
2481 NULL, /* sub */
2482 NULL, /* next */
2483 0, /* static_pass_number */
2484 TV_NONE, /* tv_id */
2485 0, /* properties_required */
2486 0, /* properties_provided */
2487 0, /* properties_destroyed */
2488 0, /* todo_flags_start */
2489 TODO_df_finish | TODO_verify_rtl_sharing |
2490 0, /* todo_flags_finish */
2494 /* Return true if a red-zone is in use. */
2496 static inline bool
2497 ix86_using_red_zone (void)
2499 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2502 /* Return a string that documents the current -m options. The caller is
2503 responsible for freeing the string. */
2505 static char *
2506 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2507 const char *tune, enum fpmath_unit fpmath,
2508 bool add_nl_p)
2510 struct ix86_target_opts
2512 const char *option; /* option string */
2513 HOST_WIDE_INT mask; /* isa mask options */
2516 /* This table is ordered so that options like -msse4.2, which imply
2517 the preceding options, are matched first. */
2518 static struct ix86_target_opts isa_opts[] =
2520 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2521 { "-mfma", OPTION_MASK_ISA_FMA },
2522 { "-mxop", OPTION_MASK_ISA_XOP },
2523 { "-mlwp", OPTION_MASK_ISA_LWP },
2524 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2525 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2526 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2527 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2528 { "-msse3", OPTION_MASK_ISA_SSE3 },
2529 { "-msse2", OPTION_MASK_ISA_SSE2 },
2530 { "-msse", OPTION_MASK_ISA_SSE },
2531 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2532 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2533 { "-mmmx", OPTION_MASK_ISA_MMX },
2534 { "-mabm", OPTION_MASK_ISA_ABM },
2535 { "-mbmi", OPTION_MASK_ISA_BMI },
2536 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2537 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2538 { "-mhle", OPTION_MASK_ISA_HLE },
2539 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2540 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2541 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2542 { "-madx", OPTION_MASK_ISA_ADX },
2543 { "-mtbm", OPTION_MASK_ISA_TBM },
2544 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2545 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2546 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2547 { "-maes", OPTION_MASK_ISA_AES },
2548 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2549 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2550 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2551 { "-mf16c", OPTION_MASK_ISA_F16C },
2552 { "-mrtm", OPTION_MASK_ISA_RTM },
2553 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2554 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2557 /* Flag options. */
2558 static struct ix86_target_opts flag_opts[] =
2560 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2561 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2562 { "-m80387", MASK_80387 },
2563 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2564 { "-malign-double", MASK_ALIGN_DOUBLE },
2565 { "-mcld", MASK_CLD },
2566 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2567 { "-mieee-fp", MASK_IEEE_FP },
2568 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2569 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2570 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2571 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2572 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2573 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2574 { "-mno-red-zone", MASK_NO_RED_ZONE },
2575 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2576 { "-mrecip", MASK_RECIP },
2577 { "-mrtd", MASK_RTD },
2578 { "-msseregparm", MASK_SSEREGPARM },
2579 { "-mstack-arg-probe", MASK_STACK_PROBE },
2580 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2581 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2582 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2583 { "-mvzeroupper", MASK_VZEROUPPER },
2584 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2585 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2586 { "-mprefer-avx128", MASK_PREFER_AVX128},
2589 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2591 char isa_other[40];
2592 char target_other[40];
2593 unsigned num = 0;
2594 unsigned i, j;
2595 char *ret;
2596 char *ptr;
2597 size_t len;
2598 size_t line_len;
2599 size_t sep_len;
2600 const char *abi;
2602 memset (opts, '\0', sizeof (opts));
2604 /* Add -march= option. */
2605 if (arch)
2607 opts[num][0] = "-march=";
2608 opts[num++][1] = arch;
2611 /* Add -mtune= option. */
2612 if (tune)
2614 opts[num][0] = "-mtune=";
2615 opts[num++][1] = tune;
2618 /* Add -m32/-m64/-mx32. */
2619 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2621 if ((isa & OPTION_MASK_ABI_64) != 0)
2622 abi = "-m64";
2623 else
2624 abi = "-mx32";
2625 isa &= ~ (OPTION_MASK_ISA_64BIT
2626 | OPTION_MASK_ABI_64
2627 | OPTION_MASK_ABI_X32);
2629 else
2630 abi = "-m32";
2631 opts[num++][0] = abi;
2633 /* Pick out the options in isa options. */
2634 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2636 if ((isa & isa_opts[i].mask) != 0)
2638 opts[num++][0] = isa_opts[i].option;
2639 isa &= ~ isa_opts[i].mask;
2643 if (isa && add_nl_p)
2645 opts[num++][0] = isa_other;
2646 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2647 isa);
2650 /* Add flag options. */
2651 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2653 if ((flags & flag_opts[i].mask) != 0)
2655 opts[num++][0] = flag_opts[i].option;
2656 flags &= ~ flag_opts[i].mask;
2660 if (flags && add_nl_p)
2662 opts[num++][0] = target_other;
2663 sprintf (target_other, "(other flags: %#x)", flags);
2666 /* Add -fpmath= option. */
2667 if (fpmath)
2669 opts[num][0] = "-mfpmath=";
2670 switch ((int) fpmath)
2672 case FPMATH_387:
2673 opts[num++][1] = "387";
2674 break;
2676 case FPMATH_SSE:
2677 opts[num++][1] = "sse";
2678 break;
2680 case FPMATH_387 | FPMATH_SSE:
2681 opts[num++][1] = "sse+387";
2682 break;
2684 default:
2685 gcc_unreachable ();
2689 /* Any options? */
2690 if (num == 0)
2691 return NULL;
2693 gcc_assert (num < ARRAY_SIZE (opts));
2695 /* Size the string. */
2696 len = 0;
2697 sep_len = (add_nl_p) ? 3 : 1;
2698 for (i = 0; i < num; i++)
2700 len += sep_len;
2701 for (j = 0; j < 2; j++)
2702 if (opts[i][j])
2703 len += strlen (opts[i][j]);
2706 /* Build the string. */
2707 ret = ptr = (char *) xmalloc (len);
2708 line_len = 0;
2710 for (i = 0; i < num; i++)
2712 size_t len2[2];
2714 for (j = 0; j < 2; j++)
2715 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2717 if (i != 0)
2719 *ptr++ = ' ';
2720 line_len++;
2722 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2724 *ptr++ = '\\';
2725 *ptr++ = '\n';
2726 line_len = 0;
2730 for (j = 0; j < 2; j++)
2731 if (opts[i][j])
2733 memcpy (ptr, opts[i][j], len2[j]);
2734 ptr += len2[j];
2735 line_len += len2[j];
2739 *ptr = '\0';
2740 gcc_assert (ret + len >= ptr);
2742 return ret;
2745 /* Return true if profiling code should be emitted before the
2746 prologue, otherwise false.
2747 Note: for x86 the "hotfix" (-mfentry) style is emitted before the prologue. */
2748 static bool
2749 ix86_profile_before_prologue (void)
2751 return flag_fentry != 0;
2754 /* Function that is callable from the debugger to print the current
2755 options. */
2756 void
2757 ix86_debug_options (void)
2759 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2760 ix86_arch_string, ix86_tune_string,
2761 ix86_fpmath, true);
2763 if (opts)
2765 fprintf (stderr, "%s\n\n", opts);
2766 free (opts);
2768 else
2769 fputs ("<no options>\n\n", stderr);
2771 return;
2774 /* Override various settings based on options. If MAIN_ARGS_P, the
2775 options are from the command line, otherwise they are from
2776 attributes. */
2778 static void
2779 ix86_option_override_internal (bool main_args_p)
2781 int i;
2782 unsigned int ix86_arch_mask, ix86_tune_mask;
2783 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2784 const char *prefix;
2785 const char *suffix;
2786 const char *sw;
2788 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2789 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2790 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2791 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2792 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2793 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2794 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2795 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2796 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2797 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2798 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2799 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2800 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2801 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2802 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2803 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2804 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2805 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2806 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2807 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2808 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2809 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2810 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2811 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2812 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2813 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2814 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2815 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2816 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2817 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2818 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2819 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2820 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2821 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2822 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2823 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2824 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2825 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2826 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2827 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2829 /* If this reaches 64, the struct pta flags field below needs to be widened. */
2831 static struct pta
2833 const char *const name; /* processor name or nickname. */
2834 const enum processor_type processor;
2835 const enum attr_cpu schedule;
2836 const unsigned HOST_WIDE_INT flags;
2838 const processor_alias_table[] =
2840 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2841 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2842 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2843 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2844 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2845 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2846 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2847 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2848 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2849 PTA_MMX | PTA_SSE | PTA_FXSR},
2850 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2851 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2852 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2853 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2854 PTA_MMX | PTA_SSE | PTA_FXSR},
2855 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2856 PTA_MMX | PTA_SSE | PTA_FXSR},
2857 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2858 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2859 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2860 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2861 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2862 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2863 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2864 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2865 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2866 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2867 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2868 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2869 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2870 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2871 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2872 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
2873 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
2874 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2875 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2876 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2877 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2878 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2879 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2880 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2881 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2882 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2883 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2884 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2885 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2886 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2887 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2888 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2889 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2890 | PTA_XSAVEOPT},
2891 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2892 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2893 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2894 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2895 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2896 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2897 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2898 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2899 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2900 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2901 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2902 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2903 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2904 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2905 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2906 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2907 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2908 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2909 {"x86-64", PROCESSOR_K8, CPU_K8,
2910 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
2911 {"k8", PROCESSOR_K8, CPU_K8,
2912 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2913 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2914 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2915 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2916 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2917 {"opteron", PROCESSOR_K8, CPU_K8,
2918 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2919 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2920 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2921 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2922 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2923 {"athlon64", PROCESSOR_K8, CPU_K8,
2924 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2925 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2926 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2927 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2928 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2929 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2930 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2931 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2932 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2933 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
2934 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
2935 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2936 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
2937 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
2938 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2939 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2940 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2941 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2942 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2943 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2944 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2945 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2946 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2947 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2948 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2949 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2950 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2951 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2952 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2953 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2954 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2955 | PTA_XSAVEOPT},
2956 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2957 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2958 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2959 | PTA_FXSR | PTA_XSAVE},
2960 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2961 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2962 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2963 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2964 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2965 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2967 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2968 PTA_HLE /* flags are only used for -march switch. */ },
2969 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2970 PTA_64BIT
2971 | PTA_HLE /* flags are only used for -march switch. */ },
2974 /* -mrecip options. */
2975 static struct
2977 const char *string; /* option name */
2978 unsigned int mask; /* mask bits to set */
2980 const recip_options[] =
2982 { "all", RECIP_MASK_ALL },
2983 { "none", RECIP_MASK_NONE },
2984 { "div", RECIP_MASK_DIV },
2985 { "sqrt", RECIP_MASK_SQRT },
2986 { "vec-div", RECIP_MASK_VEC_DIV },
2987 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
2990 int const pta_size = ARRAY_SIZE (processor_alias_table);
2992 /* Set up prefix/suffix so the error messages refer to either the command
2993 line argument, or the attribute(target). */
2994 if (main_args_p)
2996 prefix = "-m";
2997 suffix = "";
2998 sw = "switch";
3000 else
3002 prefix = "option(\"";
3003 suffix = "\")";
3004 sw = "attribute";
3007 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3008 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3009 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3010 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3011 #ifdef TARGET_BI_ARCH
3012 else
3014 #if TARGET_BI_ARCH == 1
3015 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3016 is on and OPTION_MASK_ABI_X32 is off. We turn off
3017 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3018 -mx32. */
3019 if (TARGET_X32)
3020 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3021 #else
3022 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3023 on and OPTION_MASK_ABI_64 is off. We turn off
3024 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3025 -m64. */
3026 if (TARGET_LP64)
3027 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3028 #endif
3030 #endif
3032 if (TARGET_X32)
3034 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3035 OPTION_MASK_ABI_64 for TARGET_X32. */
3036 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3037 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3039 else if (TARGET_LP64)
3041 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3042 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3043 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3044 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3047 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3048 SUBTARGET_OVERRIDE_OPTIONS;
3049 #endif
3051 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3052 SUBSUBTARGET_OVERRIDE_OPTIONS;
3053 #endif
3055 /* -fPIC is the default for x86_64. */
3056 if (TARGET_MACHO && TARGET_64BIT)
3057 flag_pic = 2;
3059 /* Need to check -mtune=generic first. */
3060 if (ix86_tune_string)
3062 if (!strcmp (ix86_tune_string, "generic")
3063 || !strcmp (ix86_tune_string, "i686")
3064 /* As special support for cross compilers we read -mtune=native
3065 as -mtune=generic. With native compilers we won't see
3066 -mtune=native, as it will have been rewritten by the driver. */
3067 || !strcmp (ix86_tune_string, "native"))
3069 if (TARGET_64BIT)
3070 ix86_tune_string = "generic64";
3071 else
3072 ix86_tune_string = "generic32";
3074 /* If this call is for setting the option attribute, allow the
3075 generic32/generic64 that was previously set. */
3076 else if (!main_args_p
3077 && (!strcmp (ix86_tune_string, "generic32")
3078 || !strcmp (ix86_tune_string, "generic64")))
3080 else if (!strncmp (ix86_tune_string, "generic", 7))
3081 error ("bad value (%s) for %stune=%s %s",
3082 ix86_tune_string, prefix, suffix, sw);
3083 else if (!strcmp (ix86_tune_string, "x86-64"))
3084 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3085 "%stune=k8%s or %stune=generic%s instead as appropriate",
3086 prefix, suffix, prefix, suffix, prefix, suffix);
3088 else
3090 if (ix86_arch_string)
3091 ix86_tune_string = ix86_arch_string;
3092 if (!ix86_tune_string)
3094 ix86_tune_string
3095 = processor_target_table[TARGET_CPU_DEFAULT].name;
3096 ix86_tune_defaulted = 1;
3099 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3100 need to use a sensible tune option. */
3101 if (!strcmp (ix86_tune_string, "generic")
3102 || !strcmp (ix86_tune_string, "x86-64")
3103 || !strcmp (ix86_tune_string, "i686"))
3105 if (TARGET_64BIT)
3106 ix86_tune_string = "generic64";
3107 else
3108 ix86_tune_string = "generic32";
3112 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3114 /* rep; movq isn't available in 32-bit code. */
3115 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3116 ix86_stringop_alg = no_stringop;
3119 if (!ix86_arch_string)
3120 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3121 else
3122 ix86_arch_specified = 1;
3124 if (global_options_set.x_ix86_pmode)
3126 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3127 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3128 error ("address mode %qs not supported in the %s bit mode",
3129 TARGET_64BIT ? "short" : "long",
3130 TARGET_64BIT ? "64" : "32");
3132 else
3133 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3135 if (!global_options_set.x_ix86_abi)
3136 ix86_abi = DEFAULT_ABI;
3138 if (global_options_set.x_ix86_cmodel)
3140 switch (ix86_cmodel)
3142 case CM_SMALL:
3143 case CM_SMALL_PIC:
3144 if (flag_pic)
3145 ix86_cmodel = CM_SMALL_PIC;
3146 if (!TARGET_64BIT)
3147 error ("code model %qs not supported in the %s bit mode",
3148 "small", "32");
3149 break;
3151 case CM_MEDIUM:
3152 case CM_MEDIUM_PIC:
3153 if (flag_pic)
3154 ix86_cmodel = CM_MEDIUM_PIC;
3155 if (!TARGET_64BIT)
3156 error ("code model %qs not supported in the %s bit mode",
3157 "medium", "32");
3158 else if (TARGET_X32)
3159 error ("code model %qs not supported in x32 mode",
3160 "medium");
3161 break;
3163 case CM_LARGE:
3164 case CM_LARGE_PIC:
3165 if (flag_pic)
3166 ix86_cmodel = CM_LARGE_PIC;
3167 if (!TARGET_64BIT)
3168 error ("code model %qs not supported in the %s bit mode",
3169 "large", "32");
3170 else if (TARGET_X32)
3171 error ("code model %qs not supported in x32 mode",
3172 "large");
3173 break;
3175 case CM_32:
3176 if (flag_pic)
3177 error ("code model %s does not support PIC mode", "32");
3178 if (TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "32", "64");
3181 break;
3183 case CM_KERNEL:
3184 if (flag_pic)
3186 error ("code model %s does not support PIC mode", "kernel");
3187 ix86_cmodel = CM_32;
3189 if (!TARGET_64BIT)
3190 error ("code model %qs not supported in the %s bit mode",
3191 "kernel", "32");
3192 break;
3194 default:
3195 gcc_unreachable ();
3198 else
3200 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3201 use of rip-relative addressing. This eliminates fixups that
3202 would otherwise be needed if this object is to be placed in a
3203 DLL, and is essentially just as efficient as direct addressing. */
3204 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3205 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3206 else if (TARGET_64BIT && TARGET_RDOS)
3207 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3208 else if (TARGET_64BIT)
3209 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3210 else
3211 ix86_cmodel = CM_32;
3213 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3215 error ("-masm=intel not supported in this configuration");
3216 ix86_asm_dialect = ASM_ATT;
3218 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3219 sorry ("%i-bit mode not compiled in",
3220 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3222 for (i = 0; i < pta_size; i++)
3223 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3225 ix86_schedule = processor_alias_table[i].schedule;
3226 ix86_arch = processor_alias_table[i].processor;
3227 /* Default cpu tuning to the architecture. */
3228 ix86_tune = ix86_arch;
3230 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3231 error ("CPU you selected does not support x86-64 "
3232 "instruction set");
3234 if (processor_alias_table[i].flags & PTA_MMX
3235 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3236 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3237 if (processor_alias_table[i].flags & PTA_3DNOW
3238 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3239 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3240 if (processor_alias_table[i].flags & PTA_3DNOW_A
3241 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3242 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3243 if (processor_alias_table[i].flags & PTA_SSE
3244 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3245 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3246 if (processor_alias_table[i].flags & PTA_SSE2
3247 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3248 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3249 if (processor_alias_table[i].flags & PTA_SSE3
3250 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3251 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3252 if (processor_alias_table[i].flags & PTA_SSSE3
3253 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3254 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3255 if (processor_alias_table[i].flags & PTA_SSE4_1
3256 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3257 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3258 if (processor_alias_table[i].flags & PTA_SSE4_2
3259 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3260 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3261 if (processor_alias_table[i].flags & PTA_AVX
3262 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3263 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3264 if (processor_alias_table[i].flags & PTA_AVX2
3265 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3266 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3267 if (processor_alias_table[i].flags & PTA_FMA
3268 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3269 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3270 if (processor_alias_table[i].flags & PTA_SSE4A
3271 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3272 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3273 if (processor_alias_table[i].flags & PTA_FMA4
3274 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3275 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3276 if (processor_alias_table[i].flags & PTA_XOP
3277 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3278 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3279 if (processor_alias_table[i].flags & PTA_LWP
3280 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3281 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3282 if (processor_alias_table[i].flags & PTA_ABM
3283 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3284 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3285 if (processor_alias_table[i].flags & PTA_BMI
3286 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3287 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3288 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3289 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3290 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3291 if (processor_alias_table[i].flags & PTA_TBM
3292 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3293 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3294 if (processor_alias_table[i].flags & PTA_BMI2
3295 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3296 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3297 if (processor_alias_table[i].flags & PTA_CX16
3298 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3299 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3300 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3301 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3302 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
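/* SAHF/LAHF need special handling: the earliest x86-64 processors do not
   support them in 64-bit mode, which the alias table marks with
   PTA_NO_SAHF; in 32-bit mode they are always available.  */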
3303 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3304 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3305 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3306 if (processor_alias_table[i].flags & PTA_MOVBE
3307 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3308 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3309 if (processor_alias_table[i].flags & PTA_AES
3310 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3311 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3312 if (processor_alias_table[i].flags & PTA_PCLMUL
3313 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3314 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3315 if (processor_alias_table[i].flags & PTA_FSGSBASE
3316 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3317 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3318 if (processor_alias_table[i].flags & PTA_RDRND
3319 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3320 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3321 if (processor_alias_table[i].flags & PTA_F16C
3322 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3323 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3324 if (processor_alias_table[i].flags & PTA_RTM
3325 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3326 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3327 if (processor_alias_table[i].flags & PTA_HLE
3328 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3329 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3330 if (processor_alias_table[i].flags & PTA_PRFCHW
3331 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3332 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3333 if (processor_alias_table[i].flags & PTA_RDSEED
3334 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3335 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3336 if (processor_alias_table[i].flags & PTA_ADX
3337 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3338 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3339 if (processor_alias_table[i].flags & PTA_FXSR
3340 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3341 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3342 if (processor_alias_table[i].flags & PTA_XSAVE
3343 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3344 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3345 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3346 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3347 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3348 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3349 x86_prefetch_sse = true;
3351 break;
3354 if (!strcmp (ix86_arch_string, "generic"))
3355 error ("generic CPU can be used only for %stune=%s %s",
3356 prefix, suffix, sw);
3357 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3358 error ("bad value (%s) for %sarch=%s %s",
3359 ix86_arch_string, prefix, suffix, sw);
3361 ix86_arch_mask = 1u << ix86_arch;
3362 for (i = 0; i < X86_ARCH_LAST; ++i)
3363 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
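/* Each entry of initial_ix86_arch_features is a bitmask over architectures;
   testing the bit of the selected -march turns it into a boolean feature
   flag.  */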
3365 for (i = 0; i < pta_size; i++)
3366 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3368 ix86_schedule = processor_alias_table[i].schedule;
3369 ix86_tune = processor_alias_table[i].processor;
3370 if (TARGET_64BIT)
3372 if (!(processor_alias_table[i].flags & PTA_64BIT))
3374 if (ix86_tune_defaulted)
3376 ix86_tune_string = "x86-64";
3377 for (i = 0; i < pta_size; i++)
3378 if (! strcmp (ix86_tune_string,
3379 processor_alias_table[i].name))
3380 break;
3381 ix86_schedule = processor_alias_table[i].schedule;
3382 ix86_tune = processor_alias_table[i].processor;
3384 else
3385 error ("CPU you selected does not support x86-64 "
3386 "instruction set");
3389 else
3391 /* Adjust tuning when compiling for 32-bit ABI. */
3392 switch (ix86_tune)
3394 case PROCESSOR_GENERIC64:
3395 ix86_tune = PROCESSOR_GENERIC32;
3396 ix86_schedule = CPU_PENTIUMPRO;
3397 break;
3399 default:
3400 break;
3403 /* Intel CPUs have always interpreted SSE prefetch instructions as
3404 NOPs; so, we can enable SSE prefetch instructions even when
3405 -mtune (rather than -march) points us to a processor that has them.
3406 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3407 higher processors. */
3408 if (TARGET_CMOV
3409 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3410 x86_prefetch_sse = true;
3411 break;
3414 if (ix86_tune_specified && i == pta_size)
3415 error ("bad value (%s) for %stune=%s %s",
3416 ix86_tune_string, prefix, suffix, sw);
3418 ix86_tune_mask = 1u << ix86_tune;
3419 for (i = 0; i < X86_TUNE_LAST; ++i)
3420 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3422 #ifndef USE_IX86_FRAME_POINTER
3423 #define USE_IX86_FRAME_POINTER 0
3424 #endif
3426 #ifndef USE_X86_64_FRAME_POINTER
3427 #define USE_X86_64_FRAME_POINTER 0
3428 #endif
3430 /* Set the default values for switches whose default depends on TARGET_64BIT
3431 in case they weren't overwritten by command line options. */
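/* (For flag_asynchronous_unwind_tables and flag_pcc_struct_return a value
   of 2 is the "not set by the user" marker checked below.)  */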
3432 if (TARGET_64BIT)
3434 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3435 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3436 if (flag_asynchronous_unwind_tables == 2)
3437 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3438 if (flag_pcc_struct_return == 2)
3439 flag_pcc_struct_return = 0;
3441 else
3443 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3444 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3445 if (flag_asynchronous_unwind_tables == 2)
3446 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3447 if (flag_pcc_struct_return == 2)
3448 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3451 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3452 if (optimize_size)
3453 ix86_cost = &ix86_size_cost;
3454 else
3455 ix86_cost = ix86_tune_cost;
3457 /* Arrange to set up i386_stack_locals for all functions. */
3458 init_machine_status = ix86_init_machine_status;
3460 /* Validate -mregparm= value. */
3461 if (global_options_set.x_ix86_regparm)
3463 if (TARGET_64BIT)
3464 warning (0, "-mregparm is ignored in 64-bit mode");
3465 if (ix86_regparm > REGPARM_MAX)
3467 error ("-mregparm=%d is not between 0 and %d",
3468 ix86_regparm, REGPARM_MAX);
3469 ix86_regparm = 0;
3472 if (TARGET_64BIT)
3473 ix86_regparm = REGPARM_MAX;
3475 /* Default align_* from the processor table. */
3476 if (align_loops == 0)
3478 align_loops = processor_target_table[ix86_tune].align_loop;
3479 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3481 if (align_jumps == 0)
3483 align_jumps = processor_target_table[ix86_tune].align_jump;
3484 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3486 if (align_functions == 0)
3488 align_functions = processor_target_table[ix86_tune].align_func;
3491 /* Provide default for -mbranch-cost= value. */
3492 if (!global_options_set.x_ix86_branch_cost)
3493 ix86_branch_cost = ix86_cost->branch_cost;
3495 if (TARGET_64BIT)
3497 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3499 /* Enable by default the SSE and MMX builtins. Do allow the user to
3500 explicitly disable any of these. In particular, disabling SSE and
3501 MMX for kernel code is extremely useful. */
3502 if (!ix86_arch_specified)
3503 ix86_isa_flags
3504 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3505 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3507 if (TARGET_RTD)
3508 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3510 else
3512 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3514 if (!ix86_arch_specified)
3515 ix86_isa_flags
3516 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3518 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3519 when the programmer takes care to keep the stack from being destroyed. */
3520 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3521 target_flags |= MASK_NO_RED_ZONE;
3524 /* Keep nonleaf frame pointers. */
3525 if (flag_omit_frame_pointer)
3526 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3527 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3528 flag_omit_frame_pointer = 1;
3530 /* If we're doing fast math, we don't care about comparison order
3531 wrt NaNs. This lets us use a shorter comparison sequence. */
3532 if (flag_finite_math_only)
3533 target_flags &= ~MASK_IEEE_FP;
3535 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3536 since the insns won't need emulation. */
3537 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3538 target_flags &= ~MASK_NO_FANCY_MATH_387;
3540 /* Likewise, if the target doesn't have a 387, or we've specified
3541 software floating point, don't use 387 inline intrinsics. */
3542 if (!TARGET_80387)
3543 target_flags |= MASK_NO_FANCY_MATH_387;
3545 /* Turn on MMX builtins for -msse. */
3546 if (TARGET_SSE)
3547 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3549 /* Enable SSE prefetch. */
3550 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3551 x86_prefetch_sse = true;
3553 /* Enable prefetch{,w} instructions for -m3dnow. */
3554 if (TARGET_3DNOW)
3555 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3557 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3558 if (TARGET_SSE4_2 || TARGET_ABM)
3559 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3561 /* Enable lzcnt instruction for -mabm. */
3562 if (TARGET_ABM)
3563 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3565 /* Validate -mpreferred-stack-boundary= value or default it to
3566 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3567 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3568 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3570 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3571 int max = (TARGET_SEH ? 4 : 12);
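/* The argument is the log2 of the boundary in bytes: at least 4 bytes for
   32-bit code, at least 8 (or 16 when SSE is enabled) for 64-bit code, and
   at most 4096 bytes, capped at 16 bytes under SEH.  */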
3573 if (ix86_preferred_stack_boundary_arg < min
3574 || ix86_preferred_stack_boundary_arg > max)
3576 if (min == max)
3577 error ("-mpreferred-stack-boundary is not supported "
3578 "for this target");
3579 else
3580 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3581 ix86_preferred_stack_boundary_arg, min, max);
3583 else
3584 ix86_preferred_stack_boundary
3585 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3588 /* Set the default value for -mstackrealign. */
3589 if (ix86_force_align_arg_pointer == -1)
3590 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3592 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3594 /* Validate -mincoming-stack-boundary= value or default it to
3595 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3596 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3597 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3599 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3600 || ix86_incoming_stack_boundary_arg > 12)
3601 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3602 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3603 else
3605 ix86_user_incoming_stack_boundary
3606 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3607 ix86_incoming_stack_boundary
3608 = ix86_user_incoming_stack_boundary;
3612 /* Accept -msseregparm only if at least SSE support is enabled. */
3613 if (TARGET_SSEREGPARM
3614 && ! TARGET_SSE)
3615 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3617 if (global_options_set.x_ix86_fpmath)
3619 if (ix86_fpmath & FPMATH_SSE)
3621 if (!TARGET_SSE)
3623 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3624 ix86_fpmath = FPMATH_387;
3626 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3628 warning (0, "387 instruction set disabled, using SSE arithmetics");
3629 ix86_fpmath = FPMATH_SSE;
3633 else
3634 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3636 /* If the i387 is disabled, then do not return values in it. */
3637 if (!TARGET_80387)
3638 target_flags &= ~MASK_FLOAT_RETURNS;
3640 /* Use an external vectorized library for vectorizing intrinsics. */
3641 if (global_options_set.x_ix86_veclibabi_type)
3642 switch (ix86_veclibabi_type)
3644 case ix86_veclibabi_type_svml:
3645 ix86_veclib_handler = ix86_veclibabi_svml;
3646 break;
3648 case ix86_veclibabi_type_acml:
3649 ix86_veclib_handler = ix86_veclibabi_acml;
3650 break;
3652 default:
3653 gcc_unreachable ();
3656 if ((!USE_IX86_FRAME_POINTER
3657 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3658 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3659 && !optimize_size)
3660 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3662 /* ??? Unwind info is not correct around the CFG unless either a frame
3663 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3664 unwind info generation to be aware of the CFG and propagating states
3665 around edges. */
3666 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3667 || flag_exceptions || flag_non_call_exceptions)
3668 && flag_omit_frame_pointer
3669 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3671 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3672 warning (0, "unwind tables currently require either a frame pointer "
3673 "or %saccumulate-outgoing-args%s for correctness",
3674 prefix, suffix);
3675 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3678 /* If stack probes are required, the space used for large function
3679 arguments on the stack must also be probed, so enable
3680 -maccumulate-outgoing-args so this happens in the prologue. */
3681 if (TARGET_STACK_PROBE
3682 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3685 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3686 "for correctness", prefix, suffix);
3687 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3692 char *p;
3693 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3694 p = strchr (internal_label_prefix, 'X');
3695 internal_label_prefix_len = p - internal_label_prefix;
3696 *p = '\0';
3699 /* When no scheduling description is available, disable the scheduler pass
3700 so it won't slow down the compilation or make x87 code slower. */
3701 if (!TARGET_SCHEDULE)
3702 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3704 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3705 ix86_tune_cost->simultaneous_prefetches,
3706 global_options.x_param_values,
3707 global_options_set.x_param_values);
3708 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3709 ix86_tune_cost->prefetch_block,
3710 global_options.x_param_values,
3711 global_options_set.x_param_values);
3712 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3713 ix86_tune_cost->l1_cache_size,
3714 global_options.x_param_values,
3715 global_options_set.x_param_values);
3716 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3717 ix86_tune_cost->l2_cache_size,
3718 global_options.x_param_values,
3719 global_options_set.x_param_values);
3721 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3722 if (flag_prefetch_loop_arrays < 0
3723 && HAVE_prefetch
3724 && (optimize >= 3 || flag_profile_use)
3725 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3726 flag_prefetch_loop_arrays = 1;
3728 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3729 can be optimized to ap = __builtin_next_arg (0). */
3730 if (!TARGET_64BIT && !flag_split_stack)
3731 targetm.expand_builtin_va_start = NULL;
3733 if (TARGET_64BIT)
3735 ix86_gen_leave = gen_leave_rex64;
3736 if (Pmode == DImode)
3738 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3739 ix86_gen_tls_local_dynamic_base_64
3740 = gen_tls_local_dynamic_base_64_di;
3742 else
3744 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3745 ix86_gen_tls_local_dynamic_base_64
3746 = gen_tls_local_dynamic_base_64_si;
3749 else
3750 ix86_gen_leave = gen_leave;
3752 if (Pmode == DImode)
3754 ix86_gen_add3 = gen_adddi3;
3755 ix86_gen_sub3 = gen_subdi3;
3756 ix86_gen_sub3_carry = gen_subdi3_carry;
3757 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 ix86_gen_monitor = gen_sse3_monitor_di;
3764 else
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_andsp = gen_andsi3;
3771 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3772 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3773 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3774 ix86_gen_monitor = gen_sse3_monitor_si;
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3783 if (!TARGET_64BIT && flag_pic)
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3790 else if (TARGET_SEH)
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3796 else if (flag_fentry < 0)
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3805 if (TARGET_AVX)
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation
3822 for the auto-vectorizer. */
3823 if (TARGET_AVX128_OPTIMAL
3824 && !(target_flags_explicit & MASK_PREFER_AVX128))
3825 target_flags |= MASK_PREFER_AVX128;
3828 else
3830 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3831 target_flags &= ~MASK_VZEROUPPER;
3834 if (ix86_recip_name)
3836 char *p = ASTRDUP (ix86_recip_name);
3837 char *q;
3838 unsigned int mask, i;
3839 bool invert;
3841 while ((q = strtok (p, ",")) != NULL)
3843 p = NULL;
3844 if (*q == '!')
3846 invert = true;
3847 q++;
3849 else
3850 invert = false;
3852 if (!strcmp (q, "default"))
3853 mask = RECIP_MASK_ALL;
3854 else
3856 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3857 if (!strcmp (q, recip_options[i].string))
3859 mask = recip_options[i].mask;
3860 break;
3863 if (i == ARRAY_SIZE (recip_options))
3865 error ("unknown option for -mrecip=%s", q);
3866 invert = false;
3867 mask = RECIP_MASK_NONE;
3871 recip_mask_explicit |= mask;
3872 if (invert)
3873 recip_mask &= ~mask;
3874 else
3875 recip_mask |= mask;
3879 if (TARGET_RECIP)
3880 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3881 else if (target_flags_explicit & MASK_RECIP)
3882 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
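/* That is, -mrecip enables every approximation the user did not configure
   through -mrecip=, while -mno-recip disables only those the user did not
   explicitly request.  */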
3884 /* Default long double to 64-bit for Bionic. */
3885 if (TARGET_HAS_BIONIC
3886 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3887 target_flags |= MASK_LONG_DOUBLE_64;
3889 /* Save the initial options in case the user uses function-specific
3890 options. */
3891 if (main_args_p)
3892 target_option_default_node = target_option_current_node
3893 = build_target_option_node ();
3896 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3898 static void
3899 ix86_option_override (void)
3901 static struct register_pass_info insert_vzeroupper_info
3902 = { &pass_insert_vzeroupper.pass, "reload",
3903 1, PASS_POS_INSERT_AFTER
3906 ix86_option_override_internal (true);
3909 /* This needs to be done at start up. It's convenient to do it here. */
3910 register_pass (&insert_vzeroupper_info);
3913 /* Update register usage after having seen the compiler flags. */
3915 static void
3916 ix86_conditional_register_usage (void)
3918 int i, c_mask;
3919 unsigned int j;
3921 /* The PIC register, if it exists, is fixed. */
3922 j = PIC_OFFSET_TABLE_REGNUM;
3923 if (j != INVALID_REGNUM)
3924 fixed_regs[j] = call_used_regs[j] = 1;
3926 /* For 32-bit targets, squash the REX registers. */
3927 if (! TARGET_64BIT)
3929 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3930 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3931 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3932 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3935 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3936 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3937 : TARGET_64BIT ? (1 << 2)
3938 : (1 << 1));
3940 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3942 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3944 /* Set/reset conditionally defined registers from
3945 CALL_USED_REGISTERS initializer. */
3946 if (call_used_regs[i] > 1)
3947 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3949 /* Build the CLOBBERED_REGS register set as the call-used
3950 registers of the GENERAL_REGS register set. */
3951 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3952 && call_used_regs[i])
3953 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3956 /* If MMX is disabled, squash the registers. */
3957 if (! TARGET_MMX)
3958 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3959 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3960 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 /* If SSE is disabled, squash the registers. */
3963 if (! TARGET_SSE)
3964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3966 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3968 /* If the FPU is disabled, squash the registers. */
3969 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3970 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3971 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3972 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3976 /* Save the current options */
3978 static void
3979 ix86_function_specific_save (struct cl_target_option *ptr)
3981 ptr->arch = ix86_arch;
3982 ptr->schedule = ix86_schedule;
3983 ptr->tune = ix86_tune;
3984 ptr->branch_cost = ix86_branch_cost;
3985 ptr->tune_defaulted = ix86_tune_defaulted;
3986 ptr->arch_specified = ix86_arch_specified;
3987 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
3988 ptr->ix86_target_flags_explicit = target_flags_explicit;
3989 ptr->x_recip_mask_explicit = recip_mask_explicit;
3991 /* The fields are char but the variables are not; make sure the
3992 values fit in the fields. */
3993 gcc_assert (ptr->arch == ix86_arch);
3994 gcc_assert (ptr->schedule == ix86_schedule);
3995 gcc_assert (ptr->tune == ix86_tune);
3996 gcc_assert (ptr->branch_cost == ix86_branch_cost);
3999 /* Restore the current options */
4001 static void
4002 ix86_function_specific_restore (struct cl_target_option *ptr)
4004 enum processor_type old_tune = ix86_tune;
4005 enum processor_type old_arch = ix86_arch;
4006 unsigned int ix86_arch_mask, ix86_tune_mask;
4007 int i;
4009 ix86_arch = (enum processor_type) ptr->arch;
4010 ix86_schedule = (enum attr_cpu) ptr->schedule;
4011 ix86_tune = (enum processor_type) ptr->tune;
4012 ix86_branch_cost = ptr->branch_cost;
4013 ix86_tune_defaulted = ptr->tune_defaulted;
4014 ix86_arch_specified = ptr->arch_specified;
4015 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4016 target_flags_explicit = ptr->ix86_target_flags_explicit;
4017 recip_mask_explicit = ptr->x_recip_mask_explicit;
4019 /* Recreate the arch feature tests if the arch changed */
4020 if (old_arch != ix86_arch)
4022 ix86_arch_mask = 1u << ix86_arch;
4023 for (i = 0; i < X86_ARCH_LAST; ++i)
4024 ix86_arch_features[i]
4025 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4028 /* Recreate the tune optimization tests */
4029 if (old_tune != ix86_tune)
4031 ix86_tune_mask = 1u << ix86_tune;
4032 for (i = 0; i < X86_TUNE_LAST; ++i)
4033 ix86_tune_features[i]
4034 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4038 /* Print the current options */
4040 static void
4041 ix86_function_specific_print (FILE *file, int indent,
4042 struct cl_target_option *ptr)
4044 char *target_string
4045 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4046 NULL, NULL, ptr->x_ix86_fpmath, false);
4048 gcc_assert (ptr->arch < PROCESSOR_max);
4049 fprintf (file, "%*sarch = %d (%s)\n",
4050 indent, "",
4051 ptr->arch, processor_target_table[ptr->arch].name);
4053 gcc_assert (ptr->tune < PROCESSOR_max);
4054 fprintf (file, "%*stune = %d (%s)\n",
4055 indent, "",
4056 ptr->tune, processor_target_table[ptr->tune].name);
4058 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4060 if (target_string)
4062 fprintf (file, "%*s%s\n", indent, "", target_string);
4063 free (target_string);
4068 /* Inner function to process the attribute((target(...))): take an argument and
4069 set the current options from it. If we have a list, recursively go
4070 over the list. */
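/* For example, attribute((target("sse4.2,no-avx,arch=core2"))) reaches this
   function as one string that is split into three comma-separated entries:
   two ISA toggles and one arch= string option.  */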
4072 static bool
4073 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4074 struct gcc_options *enum_opts_set)
4076 char *next_optstr;
4077 bool ret = true;
4079 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4080 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4081 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4082 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4083 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
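/* For instance, IX86_ATTR_ISA ("avx", OPT_mavx) expands to
   { "avx", 3, ix86_opt_isa, OPT_mavx, 0 }.  */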
4085 enum ix86_opt_type
4087 ix86_opt_unknown,
4088 ix86_opt_yes,
4089 ix86_opt_no,
4090 ix86_opt_str,
4091 ix86_opt_enum,
4092 ix86_opt_isa
4095 static const struct
4097 const char *string;
4098 size_t len;
4099 enum ix86_opt_type type;
4100 int opt;
4101 int mask;
4102 } attrs[] = {
4103 /* isa options */
4104 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4105 IX86_ATTR_ISA ("abm", OPT_mabm),
4106 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4107 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4108 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4109 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4110 IX86_ATTR_ISA ("aes", OPT_maes),
4111 IX86_ATTR_ISA ("avx", OPT_mavx),
4112 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4113 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4114 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4115 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4116 IX86_ATTR_ISA ("sse", OPT_msse),
4117 IX86_ATTR_ISA ("sse2", OPT_msse2),
4118 IX86_ATTR_ISA ("sse3", OPT_msse3),
4119 IX86_ATTR_ISA ("sse4", OPT_msse4),
4120 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4121 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4122 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4123 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4124 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4125 IX86_ATTR_ISA ("fma", OPT_mfma),
4126 IX86_ATTR_ISA ("xop", OPT_mxop),
4127 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4128 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4129 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4130 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4131 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4132 IX86_ATTR_ISA ("hle", OPT_mhle),
4133 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4134 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4135 IX86_ATTR_ISA ("adx", OPT_madx),
4136 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4137 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4138 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4140 /* enum options */
4141 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4143 /* string options */
4144 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4145 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4147 /* flag options */
4148 IX86_ATTR_YES ("cld",
4149 OPT_mcld,
4150 MASK_CLD),
4152 IX86_ATTR_NO ("fancy-math-387",
4153 OPT_mfancy_math_387,
4154 MASK_NO_FANCY_MATH_387),
4156 IX86_ATTR_YES ("ieee-fp",
4157 OPT_mieee_fp,
4158 MASK_IEEE_FP),
4160 IX86_ATTR_YES ("inline-all-stringops",
4161 OPT_minline_all_stringops,
4162 MASK_INLINE_ALL_STRINGOPS),
4164 IX86_ATTR_YES ("inline-stringops-dynamically",
4165 OPT_minline_stringops_dynamically,
4166 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4168 IX86_ATTR_NO ("align-stringops",
4169 OPT_mno_align_stringops,
4170 MASK_NO_ALIGN_STRINGOPS),
4172 IX86_ATTR_YES ("recip",
4173 OPT_mrecip,
4174 MASK_RECIP),
4178 /* If this is a list, recurse to get the options. */
4179 if (TREE_CODE (args) == TREE_LIST)
4181 bool ret = true;
4183 for (; args; args = TREE_CHAIN (args))
4184 if (TREE_VALUE (args)
4185 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4186 p_strings, enum_opts_set))
4187 ret = false;
4189 return ret;
4192 else if (TREE_CODE (args) != STRING_CST)
4194 error ("attribute %<target%> argument not a string");
4195 return false;
4198 /* Handle multiple arguments separated by commas. */
4199 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4201 while (next_optstr && *next_optstr != '\0')
4203 char *p = next_optstr;
4204 char *orig_p = p;
4205 char *comma = strchr (next_optstr, ',');
4206 const char *opt_string;
4207 size_t len, opt_len;
4208 int opt;
4209 bool opt_set_p;
4210 char ch;
4211 unsigned i;
4212 enum ix86_opt_type type = ix86_opt_unknown;
4213 int mask = 0;
4215 if (comma)
4217 *comma = '\0';
4218 len = comma - next_optstr;
4219 next_optstr = comma + 1;
4221 else
4223 len = strlen (p);
4224 next_optstr = NULL;
4227 /* Recognize no-xxx. */
4228 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4230 opt_set_p = false;
4231 p += 3;
4232 len -= 3;
4234 else
4235 opt_set_p = true;
4237 /* Find the option. */
4238 ch = *p;
4239 opt = N_OPTS;
4240 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4242 type = attrs[i].type;
4243 opt_len = attrs[i].len;
4244 if (ch == attrs[i].string[0]
4245 && ((type != ix86_opt_str && type != ix86_opt_enum)
4246 ? len == opt_len
4247 : len > opt_len)
4248 && memcmp (p, attrs[i].string, opt_len) == 0)
4250 opt = attrs[i].opt;
4251 mask = attrs[i].mask;
4252 opt_string = attrs[i].string;
4253 break;
4257 /* Process the option. */
4258 if (opt == N_OPTS)
4260 error ("attribute(target(\"%s\")) is unknown", orig_p);
4261 ret = false;
4264 else if (type == ix86_opt_isa)
4266 struct cl_decoded_option decoded;
4268 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4269 ix86_handle_option (&global_options, &global_options_set,
4270 &decoded, input_location);
4273 else if (type == ix86_opt_yes || type == ix86_opt_no)
4275 if (type == ix86_opt_no)
4276 opt_set_p = !opt_set_p;
4278 if (opt_set_p)
4279 target_flags |= mask;
4280 else
4281 target_flags &= ~mask;
4284 else if (type == ix86_opt_str)
4286 if (p_strings[opt])
4288 error ("option(\"%s\") was already specified", opt_string);
4289 ret = false;
4291 else
4292 p_strings[opt] = xstrdup (p + opt_len);
4295 else if (type == ix86_opt_enum)
4297 bool arg_ok;
4298 int value;
4300 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4301 if (arg_ok)
4302 set_option (&global_options, enum_opts_set, opt, value,
4303 p + opt_len, DK_UNSPECIFIED, input_location,
4304 global_dc);
4305 else
4307 error ("attribute(target(\"%s\")) is unknown", orig_p);
4308 ret = false;
4312 else
4313 gcc_unreachable ();
4316 return ret;
4319 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4321 tree
4322 ix86_valid_target_attribute_tree (tree args)
4324 const char *orig_arch_string = ix86_arch_string;
4325 const char *orig_tune_string = ix86_tune_string;
4326 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4327 int orig_tune_defaulted = ix86_tune_defaulted;
4328 int orig_arch_specified = ix86_arch_specified;
4329 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4330 tree t = NULL_TREE;
4331 int i;
4332 struct cl_target_option *def
4333 = TREE_TARGET_OPTION (target_option_default_node);
4334 struct gcc_options enum_opts_set;
4336 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4338 /* Process each of the options on the chain. */
4339 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4340 &enum_opts_set))
4341 return error_mark_node;
4343 /* If the changed options are different from the default, rerun
4344 ix86_option_override_internal, and then save the options away.
4345 The string options are attribute options, and will be undone
4346 when we copy the save structure. */
4347 if (ix86_isa_flags != def->x_ix86_isa_flags
4348 || target_flags != def->x_target_flags
4349 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4350 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4351 || enum_opts_set.x_ix86_fpmath)
4353 /* If we are using the default tune= or arch=, undo the string assigned,
4354 and use the default. */
4355 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4356 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4357 else if (!orig_arch_specified)
4358 ix86_arch_string = NULL;
4360 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4361 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4362 else if (orig_tune_defaulted)
4363 ix86_tune_string = NULL;
4365 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4366 if (enum_opts_set.x_ix86_fpmath)
4367 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4368 else if (!TARGET_64BIT && TARGET_SSE)
4370 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4371 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4374 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4375 ix86_option_override_internal (false);
4377 /* Add any builtin functions with the new isa if any. */
4378 ix86_add_new_builtins (ix86_isa_flags);
4380 /* Save the current options unless we are validating options for
4381 #pragma. */
4382 t = build_target_option_node ();
4384 ix86_arch_string = orig_arch_string;
4385 ix86_tune_string = orig_tune_string;
4386 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4388 /* Free up memory allocated to hold the strings */
4389 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4390 free (option_strings[i]);
4393 return t;
4396 /* Hook to validate attribute((target("string"))). */
4398 static bool
4399 ix86_valid_target_attribute_p (tree fndecl,
4400 tree ARG_UNUSED (name),
4401 tree args,
4402 int ARG_UNUSED (flags))
4404 struct cl_target_option cur_target;
4405 bool ret = true;
4407 /* attribute((target("default"))) does nothing, beyond
4408 affecting multi-versioning. */
4409 if (TREE_VALUE (args)
4410 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4411 && TREE_CHAIN (args) == NULL_TREE
4412 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4413 return true;
4415 tree old_optimize = build_optimization_node ();
4416 tree new_target, new_optimize;
4417 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4419 /* If the function changed the optimization levels as well as setting target
4420 options, start with the optimizations specified. */
4421 if (func_optimize && func_optimize != old_optimize)
4422 cl_optimization_restore (&global_options,
4423 TREE_OPTIMIZATION (func_optimize));
4425 /* The target attributes may also change some optimization flags, so update
4426 the optimization options if necessary. */
4427 cl_target_option_save (&cur_target, &global_options);
4428 new_target = ix86_valid_target_attribute_tree (args);
4429 new_optimize = build_optimization_node ();
4431 if (new_target == error_mark_node)
4432 ret = false;
4434 else if (fndecl && new_target)
4436 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4438 if (old_optimize != new_optimize)
4439 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4442 cl_target_option_restore (&global_options, &cur_target);
4444 if (old_optimize != new_optimize)
4445 cl_optimization_restore (&global_options,
4446 TREE_OPTIMIZATION (old_optimize));
4448 return ret;
4452 /* Hook to determine if one function can safely inline another. */
4454 static bool
4455 ix86_can_inline_p (tree caller, tree callee)
4457 bool ret = false;
4458 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4459 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4461 /* If callee has no option attributes, then it is ok to inline. */
4462 if (!callee_tree)
4463 ret = true;
4465 /* If caller has no option attributes, but callee does, then it is not ok to
4466 inline. */
4467 else if (!caller_tree)
4468 ret = false;
4470 else
4472 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4473 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4475 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4476 function can inline an SSE2 function but an SSE2 function can't inline
4477 an SSE4 function. */
4478 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4479 != callee_opts->x_ix86_isa_flags)
4480 ret = false;
4482 /* See if we have the same non-isa options. */
4483 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4484 ret = false;
4486 /* See if arch, tune, etc. are the same. */
4487 else if (caller_opts->arch != callee_opts->arch)
4488 ret = false;
4490 else if (caller_opts->tune != callee_opts->tune)
4491 ret = false;
4493 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4494 ret = false;
4496 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4497 ret = false;
4499 else
4500 ret = true;
4503 return ret;
4507 /* Remember the last target of ix86_set_current_function. */
4508 static GTY(()) tree ix86_previous_fndecl;
4510 /* Establish appropriate back-end context for processing the function
4511 FNDECL. The argument might be NULL to indicate processing at top
4512 level, outside of any function scope. */
4513 static void
4514 ix86_set_current_function (tree fndecl)
4516 /* Only change the context if the function changes. This hook is called
4517 several times in the course of compiling a function, and we don't want to
4518 slow things down too much or call target_reinit when it isn't safe. */
4519 if (fndecl && fndecl != ix86_previous_fndecl)
4521 tree old_tree = (ix86_previous_fndecl
4522 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4523 : NULL_TREE);
4525 tree new_tree = (fndecl
4526 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4527 : NULL_TREE);
4529 ix86_previous_fndecl = fndecl;
4530 if (old_tree == new_tree)
4533 else if (new_tree)
4535 cl_target_option_restore (&global_options,
4536 TREE_TARGET_OPTION (new_tree));
4537 target_reinit ();
4540 else if (old_tree)
4542 struct cl_target_option *def
4543 = TREE_TARGET_OPTION (target_option_current_node);
4545 cl_target_option_restore (&global_options, def);
4546 target_reinit ();
4552 /* Return true if this goes in large data/bss. */
4554 static bool
4555 ix86_in_large_data_p (tree exp)
4557 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4558 return false;
4560 /* Functions are never large data. */
4561 if (TREE_CODE (exp) == FUNCTION_DECL)
4562 return false;
4564 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4566 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4567 if (strcmp (section, ".ldata") == 0
4568 || strcmp (section, ".lbss") == 0)
4569 return true;
4570 return false;
4572 else
4574 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4576 /* If this is an incomplete type with size 0, then we can't put it
4577 in data because it might be too big when completed. */
4578 if (!size || size > ix86_section_threshold)
4579 return true;
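/* ix86_section_threshold is set by the -mlarge-data-threshold= option.  */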
4582 return false;
4585 /* Switch to the appropriate section for output of DECL.
4586 DECL is either a `VAR_DECL' node or a constant of some sort.
4587 RELOC indicates whether forming the initial value of DECL requires
4588 link-time relocations. */
4590 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4591 ATTRIBUTE_UNUSED;
4593 static section *
4594 x86_64_elf_select_section (tree decl, int reloc,
4595 unsigned HOST_WIDE_INT align)
4597 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4598 && ix86_in_large_data_p (decl))
4600 const char *sname = NULL;
4601 unsigned int flags = SECTION_WRITE;
4602 switch (categorize_decl_for_section (decl, reloc))
4604 case SECCAT_DATA:
4605 sname = ".ldata";
4606 break;
4607 case SECCAT_DATA_REL:
4608 sname = ".ldata.rel";
4609 break;
4610 case SECCAT_DATA_REL_LOCAL:
4611 sname = ".ldata.rel.local";
4612 break;
4613 case SECCAT_DATA_REL_RO:
4614 sname = ".ldata.rel.ro";
4615 break;
4616 case SECCAT_DATA_REL_RO_LOCAL:
4617 sname = ".ldata.rel.ro.local";
4618 break;
4619 case SECCAT_BSS:
4620 sname = ".lbss";
4621 flags |= SECTION_BSS;
4622 break;
4623 case SECCAT_RODATA:
4624 case SECCAT_RODATA_MERGE_STR:
4625 case SECCAT_RODATA_MERGE_STR_INIT:
4626 case SECCAT_RODATA_MERGE_CONST:
4627 sname = ".lrodata";
4628 flags = 0;
4629 break;
4630 case SECCAT_SRODATA:
4631 case SECCAT_SDATA:
4632 case SECCAT_SBSS:
4633 gcc_unreachable ();
4634 case SECCAT_TEXT:
4635 case SECCAT_TDATA:
4636 case SECCAT_TBSS:
4637 /* We don't split these for the medium model. Place them into
4638 default sections and hope for the best. */
4639 break;
4641 if (sname)
4643 /* We might get called with string constants, but get_named_section
4644 doesn't like them as they are not DECLs. Also, we need to set
4645 flags in that case. */
4646 if (!DECL_P (decl))
4647 return get_section (sname, flags, NULL);
4648 return get_named_section (decl, sname, reloc);
4651 return default_elf_select_section (decl, reloc, align);
4654 /* Select a set of attributes for section NAME based on the properties
4655 of DECL and whether or not RELOC indicates that DECL's initializer
4656 might contain runtime relocations. */
4658 static unsigned int ATTRIBUTE_UNUSED
4659 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4661 unsigned int flags = default_section_type_flags (decl, name, reloc);
4663 if (decl == NULL_TREE
4664 && (strcmp (name, ".ldata.rel.ro") == 0
4665 || strcmp (name, ".ldata.rel.ro.local") == 0))
4666 flags |= SECTION_RELRO;
4668 if (strcmp (name, ".lbss") == 0
4669 || strncmp (name, ".lbss.", 6) == 0
4670 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4671 flags |= SECTION_BSS;
4673 return flags;
4676 /* Build up a unique section name, expressed as a
4677 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4678 RELOC indicates whether the initial value of DECL requires
4679 link-time relocations. */
4681 static void ATTRIBUTE_UNUSED
4682 x86_64_elf_unique_section (tree decl, int reloc)
4684 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4685 && ix86_in_large_data_p (decl))
4687 const char *prefix = NULL;
4688 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4689 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4691 switch (categorize_decl_for_section (decl, reloc))
4693 case SECCAT_DATA:
4694 case SECCAT_DATA_REL:
4695 case SECCAT_DATA_REL_LOCAL:
4696 case SECCAT_DATA_REL_RO:
4697 case SECCAT_DATA_REL_RO_LOCAL:
4698 prefix = one_only ? ".ld" : ".ldata";
4699 break;
4700 case SECCAT_BSS:
4701 prefix = one_only ? ".lb" : ".lbss";
4702 break;
4703 case SECCAT_RODATA:
4704 case SECCAT_RODATA_MERGE_STR:
4705 case SECCAT_RODATA_MERGE_STR_INIT:
4706 case SECCAT_RODATA_MERGE_CONST:
4707 prefix = one_only ? ".lr" : ".lrodata";
4708 break;
4709 case SECCAT_SRODATA:
4710 case SECCAT_SDATA:
4711 case SECCAT_SBSS:
4712 gcc_unreachable ();
4713 case SECCAT_TEXT:
4714 case SECCAT_TDATA:
4715 case SECCAT_TBSS:
4716 /* We don't split these for the medium model. Place them into
4717 default sections and hope for the best. */
4718 break;
4720 if (prefix)
4722 const char *name, *linkonce;
4723 char *string;
4725 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4726 name = targetm.strip_name_encoding (name);
4728 /* If we're using one_only, then there needs to be a .gnu.linkonce
4729 prefix to the section name. */
4730 linkonce = one_only ? ".gnu.linkonce" : "";
4732 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4734 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4735 return;
4738 default_unique_section (decl, reloc);
4741 #ifdef COMMON_ASM_OP
4742 /* This says how to output assembler code to declare an
4743 uninitialized external linkage data object.
4745 For medium model x86-64 we need to use the .largecomm directive for
4746 large objects. */
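/* A sketch of the resulting output (symbol name and sizes are hypothetical):
   a medium-model common object larger than ix86_section_threshold is emitted
   as

       .largecomm	big_buf,65536,32

   while smaller objects go through the usual COMMON_ASM_OP (".comm") path
   with the same name,size,alignment operands.  */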
4747 void
4748 x86_elf_aligned_common (FILE *file,
4749 const char *name, unsigned HOST_WIDE_INT size,
4750 int align)
4752 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4753 && size > (unsigned int)ix86_section_threshold)
4754 fputs (".largecomm\t", file);
4755 else
4756 fputs (COMMON_ASM_OP, file);
4757 assemble_name (file, name);
4758 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4759 size, align / BITS_PER_UNIT);
4761 #endif
4763 /* Utility function for targets to use in implementing
4764 ASM_OUTPUT_ALIGNED_BSS. */
4766 void
4767 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4768 const char *name, unsigned HOST_WIDE_INT size,
4769 int align)
4771 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4772 && size > (unsigned int)ix86_section_threshold)
4773 switch_to_section (get_named_section (decl, ".lbss", 0));
4774 else
4775 switch_to_section (bss_section);
4776 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4777 #ifdef ASM_DECLARE_OBJECT_NAME
4778 last_assemble_variable_decl = decl;
4779 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4780 #else
4781 /* Standard thing is just output label for the object. */
4782 ASM_OUTPUT_LABEL (file, name);
4783 #endif /* ASM_DECLARE_OBJECT_NAME */
4784 ASM_OUTPUT_SKIP (file, size ? size : 1);
4787 /* Decide whether we must probe the stack before any space allocation
4788 on this target. It's essentially TARGET_STACK_PROBE except when
4789 -fstack-check causes the stack to be already probed differently. */
4791 bool
4792 ix86_target_stack_probe (void)
4794 /* Do not probe the stack twice if static stack checking is enabled. */
4795 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4796 return false;
4798 return TARGET_STACK_PROBE;
4801 /* Decide whether we can make a sibling call to a function. DECL is the
4802 declaration of the function being targeted by the call and EXP is the
4803 CALL_EXPR representing the call. */
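/* For illustration: in 32-bit PIC code a direct call to a non-local function
   goes through the PLT, which needs %ebx to hold the GOT pointer, so such a
   call is rejected by the first check below, while a call to a function that
   binds locally can still be turned into a sibcall.  */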
4805 static bool
4806 ix86_function_ok_for_sibcall (tree decl, tree exp)
4808 tree type, decl_or_type;
4809 rtx a, b;
4811 /* If we are generating position-independent code, we cannot sibcall
4812 optimize any indirect call, or a direct call to a global function,
4813 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4814 if (!TARGET_MACHO
4815 && !TARGET_64BIT
4816 && flag_pic
4817 && (!decl || !targetm.binds_local_p (decl)))
4818 return false;
4820 /* If we need to align the outgoing stack, then sibcalling would
4821 unalign the stack, which may break the called function. */
4822 if (ix86_minimum_incoming_stack_boundary (true)
4823 < PREFERRED_STACK_BOUNDARY)
4824 return false;
4826 if (decl)
4828 decl_or_type = decl;
4829 type = TREE_TYPE (decl);
4831 else
4833 /* We're looking at the CALL_EXPR, we need the type of the function. */
4834 type = CALL_EXPR_FN (exp); /* pointer expression */
4835 type = TREE_TYPE (type); /* pointer type */
4836 type = TREE_TYPE (type); /* function type */
4837 decl_or_type = type;
4840 /* Check that the return value locations are the same. For example,
4841 if we are returning floats on the 80387 register stack, we cannot
4842 make a sibcall from a function that doesn't return a float to a
4843 function that does or, conversely, from a function that does return
4844 a float to a function that doesn't; the necessary stack adjustment
4845 would not be executed. This is also the place we notice
4846 differences in the return value ABI. Note that it is ok for one
4847 of the functions to have void return type as long as the return
4848 value of the other is passed in a register. */
4849 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4850 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4851 cfun->decl, false);
4852 if (STACK_REG_P (a) || STACK_REG_P (b))
4854 if (!rtx_equal_p (a, b))
4855 return false;
4857 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4859 else if (!rtx_equal_p (a, b))
4860 return false;
4862 if (TARGET_64BIT)
4864 /* The SYSV ABI has more call-clobbered registers;
4865 disallow sibcalls from MS to SYSV. */
4866 if (cfun->machine->call_abi == MS_ABI
4867 && ix86_function_type_abi (type) == SYSV_ABI)
4868 return false;
4870 else
4872 /* If this call is indirect, we'll need to be able to use a
4873 call-clobbered register for the address of the target function.
4874 Make sure that all such registers are not used for passing
4875 parameters. Note that DLLIMPORT functions are indirect. */
4876 if (!decl
4877 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4879 if (ix86_function_regparm (type, NULL) >= 3)
4881 /* ??? Need to count the actual number of registers to be used,
4882 not the possible number of registers. Fix later. */
4883 return false;
4888 /* Otherwise okay. That also includes certain types of indirect calls. */
4889 return true;
4892 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4893 and "sseregparm" calling convention attributes;
4894 arguments as in struct attribute_spec.handler. */
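/* Illustrative user-level declarations that reach this handler (the function
   names are hypothetical):

       void __attribute__((fastcall)) f (int a, int b);        // a in %ecx, b in %edx
       void __attribute__((regparm(3))) g (int a, int b, int c); // %eax, %edx, %ecx
       void __attribute__((stdcall)) h (int n);                 // callee pops stack args

   The checks below reject conflicting combinations such as fastcall together
   with regparm or cdecl.  */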
4896 static tree
4897 ix86_handle_cconv_attribute (tree *node, tree name,
4898 tree args,
4899 int flags ATTRIBUTE_UNUSED,
4900 bool *no_add_attrs)
4902 if (TREE_CODE (*node) != FUNCTION_TYPE
4903 && TREE_CODE (*node) != METHOD_TYPE
4904 && TREE_CODE (*node) != FIELD_DECL
4905 && TREE_CODE (*node) != TYPE_DECL)
4907 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4908 name);
4909 *no_add_attrs = true;
4910 return NULL_TREE;
4913 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4914 if (is_attribute_p ("regparm", name))
4916 tree cst;
4918 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4920 error ("fastcall and regparm attributes are not compatible");
4923 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4925 error ("regparm and thiscall attributes are not compatible");
4928 cst = TREE_VALUE (args);
4929 if (TREE_CODE (cst) != INTEGER_CST)
4931 warning (OPT_Wattributes,
4932 "%qE attribute requires an integer constant argument",
4933 name);
4934 *no_add_attrs = true;
4936 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4938 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4939 name, REGPARM_MAX);
4940 *no_add_attrs = true;
4943 return NULL_TREE;
4946 if (TARGET_64BIT)
4948 /* Do not warn when emulating the MS ABI. */
4949 if ((TREE_CODE (*node) != FUNCTION_TYPE
4950 && TREE_CODE (*node) != METHOD_TYPE)
4951 || ix86_function_type_abi (*node) != MS_ABI)
4952 warning (OPT_Wattributes, "%qE attribute ignored",
4953 name);
4954 *no_add_attrs = true;
4955 return NULL_TREE;
4958 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4959 if (is_attribute_p ("fastcall", name))
4961 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4963 error ("fastcall and cdecl attributes are not compatible");
4965 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4967 error ("fastcall and stdcall attributes are not compatible");
4969 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4971 error ("fastcall and regparm attributes are not compatible");
4973 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4975 error ("fastcall and thiscall attributes are not compatible");
4979 /* Can combine stdcall with fastcall (redundant), regparm and
4980 sseregparm. */
4981 else if (is_attribute_p ("stdcall", name))
4983 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4985 error ("stdcall and cdecl attributes are not compatible");
4987 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4989 error ("stdcall and fastcall attributes are not compatible");
4991 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4993 error ("stdcall and thiscall attributes are not compatible");
4997 /* Can combine cdecl with regparm and sseregparm. */
4998 else if (is_attribute_p ("cdecl", name))
5000 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and cdecl attributes are not compatible");
5004 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5006 error ("fastcall and cdecl attributes are not compatible");
5008 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5010 error ("cdecl and thiscall attributes are not compatible");
5013 else if (is_attribute_p ("thiscall", name))
5015 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5016 warning (OPT_Wattributes, "%qE attribute is used for a non-class method",
5017 name);
5018 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5020 error ("stdcall and thiscall attributes are not compatible");
5022 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5024 error ("fastcall and thiscall attributes are not compatible");
5026 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5028 error ("cdecl and thiscall attributes are not compatible");
5032 /* Can combine sseregparm with all attributes. */
5034 return NULL_TREE;
5037 /* The transactional memory builtins are implicitly regparm or fastcall
5038 depending on the ABI. Override the generic do-nothing attribute that
5039 these builtins were declared with, and replace it with one of the two
5040 attributes that we expect elsewhere. */
5042 static tree
5043 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5044 tree args ATTRIBUTE_UNUSED,
5045 int flags ATTRIBUTE_UNUSED,
5046 bool *no_add_attrs)
5048 tree alt;
5050 /* In no case do we want to add the placeholder attribute. */
5051 *no_add_attrs = true;
5053 /* The 64-bit ABI is unchanged for transactional memory. */
5054 if (TARGET_64BIT)
5055 return NULL_TREE;
5057 /* ??? Is there a better way to validate 32-bit windows? We have
5058 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5059 if (CHECK_STACK_LIMIT > 0)
5060 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5061 else
5063 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5064 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5066 decl_attributes (node, alt, flags);
5068 return NULL_TREE;
5071 /* This function determines from TYPE the calling-convention. */
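/* A sketch of typical results (not exhaustive): a 32-bit prototype carrying
   __attribute__((fastcall)) yields IX86_CALLCVT_FASTCALL; a plain non-variadic
   function compiled with -mrtd yields IX86_CALLCVT_STDCALL; on 64-bit targets
   the function always returns IX86_CALLCVT_CDECL.  */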
5073 unsigned int
5074 ix86_get_callcvt (const_tree type)
5076 unsigned int ret = 0;
5077 bool is_stdarg;
5078 tree attrs;
5080 if (TARGET_64BIT)
5081 return IX86_CALLCVT_CDECL;
5083 attrs = TYPE_ATTRIBUTES (type);
5084 if (attrs != NULL_TREE)
5086 if (lookup_attribute ("cdecl", attrs))
5087 ret |= IX86_CALLCVT_CDECL;
5088 else if (lookup_attribute ("stdcall", attrs))
5089 ret |= IX86_CALLCVT_STDCALL;
5090 else if (lookup_attribute ("fastcall", attrs))
5091 ret |= IX86_CALLCVT_FASTCALL;
5092 else if (lookup_attribute ("thiscall", attrs))
5093 ret |= IX86_CALLCVT_THISCALL;
5095 /* Regparm isn't allowed for thiscall and fastcall. */
5096 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5098 if (lookup_attribute ("regparm", attrs))
5099 ret |= IX86_CALLCVT_REGPARM;
5100 if (lookup_attribute ("sseregparm", attrs))
5101 ret |= IX86_CALLCVT_SSEREGPARM;
5104 if (IX86_BASE_CALLCVT(ret) != 0)
5105 return ret;
5108 is_stdarg = stdarg_p (type);
5109 if (TARGET_RTD && !is_stdarg)
5110 return IX86_CALLCVT_STDCALL | ret;
5112 if (ret != 0
5113 || is_stdarg
5114 || TREE_CODE (type) != METHOD_TYPE
5115 || ix86_function_type_abi (type) != MS_ABI)
5116 return IX86_CALLCVT_CDECL | ret;
5118 return IX86_CALLCVT_THISCALL;
5121 /* Return 0 if the attributes for two types are incompatible, 1 if they
5122 are compatible, and 2 if they are nearly compatible (which causes a
5123 warning to be generated). */
5125 static int
5126 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5128 unsigned int ccvt1, ccvt2;
5130 if (TREE_CODE (type1) != FUNCTION_TYPE
5131 && TREE_CODE (type1) != METHOD_TYPE)
5132 return 1;
5134 ccvt1 = ix86_get_callcvt (type1);
5135 ccvt2 = ix86_get_callcvt (type2);
5136 if (ccvt1 != ccvt2)
5137 return 0;
5138 if (ix86_function_regparm (type1, NULL)
5139 != ix86_function_regparm (type2, NULL))
5140 return 0;
5142 return 1;
5145 /* Return the regparm value for a function with the indicated TYPE and DECL.
5146 DECL may be NULL when calling function indirectly
5147 or considering a libcall. */
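/* For example, __attribute__((regparm(3))) makes the first three integer
   arguments travel in registers, in the usual %eax, %edx, %ecx order;
   fastcall yields 2 and thiscall 1, as the code below reflects.  */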
5149 static int
5150 ix86_function_regparm (const_tree type, const_tree decl)
5152 tree attr;
5153 int regparm;
5154 unsigned int ccvt;
5156 if (TARGET_64BIT)
5157 return (ix86_function_type_abi (type) == SYSV_ABI
5158 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5159 ccvt = ix86_get_callcvt (type);
5160 regparm = ix86_regparm;
5162 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5164 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5165 if (attr)
5167 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5168 return regparm;
5171 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5172 return 2;
5173 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5174 return 1;
5176 /* Use register calling convention for local functions when possible. */
5177 if (decl
5178 && TREE_CODE (decl) == FUNCTION_DECL
5179 && optimize
5180 && !(profile_flag && !flag_fentry))
5182 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5183 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5184 if (i && i->local && i->can_change_signature)
5186 int local_regparm, globals = 0, regno;
5188 /* Make sure no regparm register is taken by a
5189 fixed register variable. */
5190 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5191 if (fixed_regs[local_regparm])
5192 break;
5194 /* We don't want to use regparm(3) for nested functions as
5195 these use a static chain pointer in the third argument. */
5196 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5197 local_regparm = 2;
5199 /* In 32-bit mode save a register for the split stack. */
5200 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5201 local_regparm = 2;
5203 /* Each fixed register usage increases register pressure,
5204 so fewer registers should be used for argument passing.
5205 This functionality can be overridden by an explicit
5206 regparm value. */
5207 for (regno = AX_REG; regno <= DI_REG; regno++)
5208 if (fixed_regs[regno])
5209 globals++;
5211 local_regparm
5212 = globals < local_regparm ? local_regparm - globals : 0;
5214 if (local_regparm > regparm)
5215 regparm = local_regparm;
5219 return regparm;
5222 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5223 DFmode (2) arguments in SSE registers for a function with the
5224 indicated TYPE and DECL. DECL may be NULL when calling function
5225 indirectly or considering a libcall. Otherwise return 0. */
5227 static int
5228 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5230 gcc_assert (!TARGET_64BIT);
5232 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5233 by the sseregparm attribute. */
5234 if (TARGET_SSEREGPARM
5235 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5237 if (!TARGET_SSE)
5239 if (warn)
5241 if (decl)
5242 error ("calling %qD with attribute sseregparm without "
5243 "SSE/SSE2 enabled", decl);
5244 else
5245 error ("calling %qT with attribute sseregparm without "
5246 "SSE/SSE2 enabled", type);
5248 return 0;
5251 return 2;
5254 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5255 (and DFmode for SSE2) arguments in SSE registers. */
5256 if (decl && TARGET_SSE_MATH && optimize
5257 && !(profile_flag && !flag_fentry))
5259 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5260 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5261 if (i && i->local && i->can_change_signature)
5262 return TARGET_SSE2 ? 2 : 1;
5265 return 0;
5268 /* Return true if EAX is live at the start of the function. Used by
5269 ix86_expand_prologue to determine if we need special help before
5270 calling allocate_stack_worker. */
5272 static bool
5273 ix86_eax_live_at_start_p (void)
5275 /* Cheat. Don't bother working forward from ix86_function_regparm
5276 to the function type to whether an actual argument is located in
5277 eax. Instead just look at cfg info, which is still close enough
5278 to correct at this point. This gives false positives for broken
5279 functions that might use uninitialized data that happens to be
5280 allocated in eax, but who cares? */
5281 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5284 static bool
5285 ix86_keep_aggregate_return_pointer (tree fntype)
5287 tree attr;
5289 if (!TARGET_64BIT)
5291 attr = lookup_attribute ("callee_pop_aggregate_return",
5292 TYPE_ATTRIBUTES (fntype));
5293 if (attr)
5294 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5296 /* For 32-bit MS-ABI the default is to keep aggregate
5297 return pointer. */
5298 if (ix86_function_type_abi (fntype) == MS_ABI)
5299 return true;
5301 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5304 /* Value is the number of bytes of arguments automatically
5305 popped when returning from a subroutine call.
5306 FUNDECL is the declaration node of the function (as a tree),
5307 FUNTYPE is the data type of the function (as a tree),
5308 or for a library call it is an identifier node for the subroutine name.
5309 SIZE is the number of bytes of arguments passed on the stack.
5311 On the 80386, the RTD insn may be used to pop them if the number
5312 of args is fixed, but if the number is variable then the caller
5313 must pop them all. RTD can't be used for library calls now
5314 because the library is compiled with the Unix compiler.
5315 Use of RTD is a selectable option, since it is incompatible with
5316 standard Unix calling sequences. If the option is not selected,
5317 the caller must always pop the args.
5319 The attribute stdcall is equivalent to RTD on a per module basis. */
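/* A worked example (hypothetical prototype): a 32-bit stdcall function taking
   three ints has 12 bytes of stack arguments, so this function returns 12 and
   the epilogue uses "ret $12"; a variadic or plain cdecl function returns 0
   and the caller pops the arguments.  */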
5321 static int
5322 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5324 unsigned int ccvt;
5326 /* None of the 64-bit ABIs pop arguments. */
5327 if (TARGET_64BIT)
5328 return 0;
5330 ccvt = ix86_get_callcvt (funtype);
5332 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5333 | IX86_CALLCVT_THISCALL)) != 0
5334 && ! stdarg_p (funtype))
5335 return size;
5337 /* Lose any fake structure return argument if it is passed on the stack. */
5338 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5339 && !ix86_keep_aggregate_return_pointer (funtype))
5341 int nregs = ix86_function_regparm (funtype, fundecl);
5342 if (nregs == 0)
5343 return GET_MODE_SIZE (Pmode);
5346 return 0;
5349 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5351 static bool
5352 ix86_legitimate_combined_insn (rtx insn)
5354 /* Check operand constraints in case hard registers were propagated
5355 into insn pattern. This check prevents combine pass from
5356 generating insn patterns with invalid hard register operands.
5357 These invalid insns can eventually confuse reload to error out
5358 with a spill failure. See also PRs 46829 and 46843. */
5359 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5361 int i;
5363 extract_insn (insn);
5364 preprocess_constraints ();
5366 for (i = 0; i < recog_data.n_operands; i++)
5368 rtx op = recog_data.operand[i];
5369 enum machine_mode mode = GET_MODE (op);
5370 struct operand_alternative *op_alt;
5371 int offset = 0;
5372 bool win;
5373 int j;
5375 /* For pre-AVX disallow unaligned loads/stores where the
5376 instructions don't support it. */
5377 if (!TARGET_AVX
5378 && VECTOR_MODE_P (GET_MODE (op))
5379 && misaligned_operand (op, GET_MODE (op)))
5381 int min_align = get_attr_ssememalign (insn);
5382 if (min_align == 0)
5383 return false;
5386 /* A unary operator may be accepted by the predicate, but it
5387 is irrelevant for matching constraints. */
5388 if (UNARY_P (op))
5389 op = XEXP (op, 0);
5391 if (GET_CODE (op) == SUBREG)
5393 if (REG_P (SUBREG_REG (op))
5394 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5395 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5396 GET_MODE (SUBREG_REG (op)),
5397 SUBREG_BYTE (op),
5398 GET_MODE (op));
5399 op = SUBREG_REG (op);
5402 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5403 continue;
5405 op_alt = recog_op_alt[i];
5407 /* Operand has no constraints, anything is OK. */
5408 win = !recog_data.n_alternatives;
5410 for (j = 0; j < recog_data.n_alternatives; j++)
5412 if (op_alt[j].anything_ok
5413 || (op_alt[j].matches != -1
5414 && operands_match_p
5415 (recog_data.operand[i],
5416 recog_data.operand[op_alt[j].matches]))
5417 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5419 win = true;
5420 break;
5424 if (!win)
5425 return false;
5429 return true;
5432 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5434 static unsigned HOST_WIDE_INT
5435 ix86_asan_shadow_offset (void)
5437 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5438 : HOST_WIDE_INT_C (0x7fff8000))
5439 : (HOST_WIDE_INT_1 << 29);
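/* For reference, AddressSanitizer computes the shadow address roughly as
   (addr >> 3) + offset, so with the LP64 Linux value above a byte at ADDR is
   tracked at (ADDR >> 3) + 0x7fff8000.  */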
5442 /* Argument support functions. */
5444 /* Return true when register may be used to pass function parameters. */
5445 bool
5446 ix86_function_arg_regno_p (int regno)
5448 int i;
5449 const int *parm_regs;
5451 if (!TARGET_64BIT)
5453 if (TARGET_MACHO)
5454 return (regno < REGPARM_MAX
5455 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5456 else
5457 return (regno < REGPARM_MAX
5458 || (TARGET_MMX && MMX_REGNO_P (regno)
5459 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5460 || (TARGET_SSE && SSE_REGNO_P (regno)
5461 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5464 if (TARGET_MACHO)
5466 if (SSE_REGNO_P (regno) && TARGET_SSE)
5467 return true;
5469 else
5471 if (TARGET_SSE && SSE_REGNO_P (regno)
5472 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5473 return true;
5476 /* TODO: The function should depend on current function ABI but
5477 builtins.c would need updating then. Therefore we use the
5478 default ABI. */
5480 /* RAX is used as hidden argument to va_arg functions. */
5481 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5482 return true;
5484 if (ix86_abi == MS_ABI)
5485 parm_regs = x86_64_ms_abi_int_parameter_registers;
5486 else
5487 parm_regs = x86_64_int_parameter_registers;
5488 for (i = 0; i < (ix86_abi == MS_ABI
5489 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5490 if (regno == parm_regs[i])
5491 return true;
5492 return false;
5495 /* Return true if we do not know how to pass TYPE solely in registers. */
5497 static bool
5498 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5500 if (must_pass_in_stack_var_size_or_pad (mode, type))
5501 return true;
5503 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5504 The layout_type routine is crafty and tries to trick us into passing
5505 currently unsupported vector types on the stack by using TImode. */
5506 return (!TARGET_64BIT && mode == TImode
5507 && type && TREE_CODE (type) != VECTOR_TYPE);
5510 /* Return the size, in bytes, of the area reserved for arguments passed
5511 in registers for the function represented by FNDECL, depending on the
5512 ABI used. */
5513 int
5514 ix86_reg_parm_stack_space (const_tree fndecl)
5516 enum calling_abi call_abi = SYSV_ABI;
5517 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5518 call_abi = ix86_function_abi (fndecl);
5519 else
5520 call_abi = ix86_function_type_abi (fndecl);
5521 if (TARGET_64BIT && call_abi == MS_ABI)
5522 return 32;
5523 return 0;
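/* The 32 bytes returned for the 64-bit MS ABI correspond to the "home area"
   the caller reserves for the four register parameters (RCX, RDX, R8, R9),
   even when fewer arguments are actually passed.  */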
5526 /* Return SYSV_ABI or MS_ABI depending on FNTYPE, specifying the
5527 call ABI used. */
5528 enum calling_abi
5529 ix86_function_type_abi (const_tree fntype)
5531 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5533 enum calling_abi abi = ix86_abi;
5534 if (abi == SYSV_ABI)
5536 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5537 abi = MS_ABI;
5539 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5540 abi = SYSV_ABI;
5541 return abi;
5543 return ix86_abi;
5546 static bool
5547 ix86_function_ms_hook_prologue (const_tree fn)
5549 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5551 if (decl_function_context (fn) != NULL_TREE)
5552 error_at (DECL_SOURCE_LOCATION (fn),
5553 "ms_hook_prologue is not compatible with nested function");
5554 else
5555 return true;
5557 return false;
5560 static enum calling_abi
5561 ix86_function_abi (const_tree fndecl)
5563 if (! fndecl)
5564 return ix86_abi;
5565 return ix86_function_type_abi (TREE_TYPE (fndecl));
5568 /* Return SYSV_ABI or MS_ABI depending on cfun, specifying the
5569 call ABI used. */
5570 enum calling_abi
5571 ix86_cfun_abi (void)
5573 if (! cfun)
5574 return ix86_abi;
5575 return cfun->machine->call_abi;
5578 /* Write the extra assembler code needed to declare a function properly. */
5580 void
5581 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5582 tree decl)
5584 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5586 if (is_ms_hook)
5588 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5589 unsigned int filler_cc = 0xcccccccc;
5591 for (i = 0; i < filler_count; i += 4)
5592 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5595 #ifdef SUBTARGET_ASM_UNWIND_INIT
5596 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5597 #endif
5599 ASM_OUTPUT_LABEL (asm_out_file, fname);
5601 /* Output magic byte marker, if hot-patch attribute is set. */
5602 if (is_ms_hook)
5604 if (TARGET_64BIT)
5606 /* leaq [%rsp + 0], %rsp */
5607 asm_fprintf (asm_out_file, ASM_BYTE
5608 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5610 else
5612 /* movl.s %edi, %edi
5613 push %ebp
5614 movl.s %esp, %ebp */
5615 asm_fprintf (asm_out_file, ASM_BYTE
5616 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5621 /* regclass.c */
5622 extern void init_regs (void);
5624 /* Implementation of the call ABI switching target hook. Set the call
5625 register sets specific to FNDECL. See also
5626 ix86_conditional_register_usage for more details. */
5627 void
5628 ix86_call_abi_override (const_tree fndecl)
5630 if (fndecl == NULL_TREE)
5631 cfun->machine->call_abi = ix86_abi;
5632 else
5633 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5636 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5637 expensive re-initialization of init_regs each time we switch function context,
5638 since this is needed only during RTL expansion. */
5639 static void
5640 ix86_maybe_switch_abi (void)
5642 if (TARGET_64BIT &&
5643 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5644 reinit_regs ();
5647 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5648 for a call to a function whose data type is FNTYPE.
5649 For a library call, FNTYPE is 0. */
5651 void
5652 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5653 tree fntype, /* tree ptr for function decl */
5654 rtx libname, /* SYMBOL_REF of library name or 0 */
5655 tree fndecl,
5656 int caller)
5658 struct cgraph_local_info *i;
5660 memset (cum, 0, sizeof (*cum));
5662 if (fndecl)
5664 i = cgraph_local_info (fndecl);
5665 cum->call_abi = ix86_function_abi (fndecl);
5667 else
5669 i = NULL;
5670 cum->call_abi = ix86_function_type_abi (fntype);
5673 cum->caller = caller;
5675 /* Set up the number of registers to use for passing arguments. */
5677 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5678 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5679 "or subtarget optimization implying it");
5680 cum->nregs = ix86_regparm;
5681 if (TARGET_64BIT)
5683 cum->nregs = (cum->call_abi == SYSV_ABI
5684 ? X86_64_REGPARM_MAX
5685 : X86_64_MS_REGPARM_MAX);
5687 if (TARGET_SSE)
5689 cum->sse_nregs = SSE_REGPARM_MAX;
5690 if (TARGET_64BIT)
5692 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5693 ? X86_64_SSE_REGPARM_MAX
5694 : X86_64_MS_SSE_REGPARM_MAX);
5697 if (TARGET_MMX)
5698 cum->mmx_nregs = MMX_REGPARM_MAX;
5699 cum->warn_avx = true;
5700 cum->warn_sse = true;
5701 cum->warn_mmx = true;
5703 /* Because the type might mismatch between caller and callee, we need to
5704 use the actual type of the function for local calls.
5705 FIXME: cgraph_analyze can be told to actually record whether a function uses
5706 va_start, so for local functions maybe_vaarg can be made more aggressive,
5707 helping K&R code.
5708 FIXME: once the type system is fixed, we won't need this code anymore. */
5709 if (i && i->local && i->can_change_signature)
5710 fntype = TREE_TYPE (fndecl);
5711 cum->maybe_vaarg = (fntype
5712 ? (!prototype_p (fntype) || stdarg_p (fntype))
5713 : !libname);
5715 if (!TARGET_64BIT)
5717 /* If there are variable arguments, then we won't pass anything
5718 in registers in 32-bit mode. */
5719 if (stdarg_p (fntype))
5721 cum->nregs = 0;
5722 cum->sse_nregs = 0;
5723 cum->mmx_nregs = 0;
5724 cum->warn_avx = 0;
5725 cum->warn_sse = 0;
5726 cum->warn_mmx = 0;
5727 return;
5730 /* Use ecx and edx registers if function has fastcall attribute,
5731 else look for regparm information. */
5732 if (fntype)
5734 unsigned int ccvt = ix86_get_callcvt (fntype);
5735 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5737 cum->nregs = 1;
5738 cum->fastcall = 1; /* Same first register as in fastcall. */
5740 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5742 cum->nregs = 2;
5743 cum->fastcall = 1;
5745 else
5746 cum->nregs = ix86_function_regparm (fntype, fndecl);
5749 /* Set up the number of SSE registers used for passing SFmode
5750 and DFmode arguments. Warn for mismatching ABI. */
5751 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5755 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5756 But in the case of vector types, it is some vector mode.
5758 When we have only some of our vector isa extensions enabled, then there
5759 are some modes for which vector_mode_supported_p is false. For these
5760 modes, the generic vector support in gcc will choose some non-vector mode
5761 in order to implement the type. By computing the natural mode, we'll
5762 select the proper ABI location for the operand and not depend on whatever
5763 the middle-end decides to do with these vector types.
5765 The middle-end can't deal with vector types larger than 16 bytes. In this
5766 case, we return the original mode and warn about the ABI change if CUM
5767 isn't NULL.
5769 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
5770 available for the function return value. */
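/* A sketch of the effect, assuming -mno-sse: a 16-byte GCC vector such as
   "int __attribute__((vector_size (16)))" has a non-vector TYPE_MODE, and the
   loop below recovers V4SImode for ABI purposes while warning that the ABI
   changes; a 32-byte vector without AVX instead falls back to TYPE_MODE.  */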
5772 static enum machine_mode
5773 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
5774 bool in_return)
5776 enum machine_mode mode = TYPE_MODE (type);
5778 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5780 HOST_WIDE_INT size = int_size_in_bytes (type);
5781 if ((size == 8 || size == 16 || size == 32)
5782 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5783 && TYPE_VECTOR_SUBPARTS (type) > 1)
5785 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5787 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5788 mode = MIN_MODE_VECTOR_FLOAT;
5789 else
5790 mode = MIN_MODE_VECTOR_INT;
5792 /* Get the mode which has this inner mode and number of units. */
5793 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5794 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5795 && GET_MODE_INNER (mode) == innermode)
5797 if (size == 32 && !TARGET_AVX)
5799 static bool warnedavx;
5800 static bool warnedavx_ret;
5802 if (cum
5803 && !warnedavx
5804 && cum->warn_avx)
5806 warnedavx = true;
5807 warning (0, "AVX vector argument without AVX "
5808 "enabled changes the ABI");
5810 else if (in_return && !warnedavx_ret)
5812 warnedavx_ret = true;
5813 warning (0, "AVX vector return without AVX "
5814 "enabled changes the ABI");
5817 return TYPE_MODE (type);
5819 else if (((size == 8 && TARGET_64BIT) || size == 16)
5820 && !TARGET_SSE)
5822 static bool warnedsse;
5823 static bool warnedsse_ret;
5825 if (cum
5826 && !warnedsse
5827 && cum->warn_sse)
5829 warnedsse = true;
5830 warning (0, "SSE vector argument without SSE "
5831 "enabled changes the ABI");
5833 else if (!TARGET_64BIT
5834 && in_return
5835 && !warnedsse_ret)
5837 warnedsse_ret = true;
5838 warning (0, "SSE vector return without SSE "
5839 "enabled changes the ABI");
5842 else if ((size == 8 && !TARGET_64BIT) && !TARGET_MMX)
5844 static bool warnedmmx;
5845 static bool warnedmmx_ret;
5847 if (cum
5848 && !warnedmmx
5849 && cum->warn_mmx)
5851 warnedmmx = true;
5852 warning (0, "MMX vector argument without MMX "
5853 "enabled changes the ABI");
5855 else if (in_return && !warnedmmx_ret)
5857 warnedmmx_ret = true;
5858 warning (0, "MMX vector return without MMX "
5859 "enabled changes the ABI");
5862 return mode;
5865 gcc_unreachable ();
5869 return mode;
5872 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5873 this may not agree with the mode that the type system has chosen for the
5874 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5875 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5877 static rtx
5878 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5879 unsigned int regno)
5881 rtx tmp;
5883 if (orig_mode != BLKmode)
5884 tmp = gen_rtx_REG (orig_mode, regno);
5885 else
5887 tmp = gen_rtx_REG (mode, regno);
5888 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5889 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5892 return tmp;
5895 /* x86-64 register passing implementation. See the x86-64 ABI for details. The
5896 goal of this code is to classify each eightbyte of an incoming argument by
5897 register class and assign registers accordingly. */
5899 /* Return the union class of CLASS1 and CLASS2.
5900 See the x86-64 PS ABI for details. */
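/* Small worked examples of the rules below: merging NO_CLASS with SSE gives
   SSE (rule #2), merging INTEGER with SSE gives INTEGER (rule #4), and merging
   anything with MEMORY gives MEMORY (rule #3).  */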
5902 static enum x86_64_reg_class
5903 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5905 /* Rule #1: If both classes are equal, this is the resulting class. */
5906 if (class1 == class2)
5907 return class1;
5909 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5910 the other class. */
5911 if (class1 == X86_64_NO_CLASS)
5912 return class2;
5913 if (class2 == X86_64_NO_CLASS)
5914 return class1;
5916 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5917 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5918 return X86_64_MEMORY_CLASS;
5920 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5921 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5922 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5923 return X86_64_INTEGERSI_CLASS;
5924 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5925 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5926 return X86_64_INTEGER_CLASS;
5928 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5929 MEMORY is used. */
5930 if (class1 == X86_64_X87_CLASS
5931 || class1 == X86_64_X87UP_CLASS
5932 || class1 == X86_64_COMPLEX_X87_CLASS
5933 || class2 == X86_64_X87_CLASS
5934 || class2 == X86_64_X87UP_CLASS
5935 || class2 == X86_64_COMPLEX_X87_CLASS)
5936 return X86_64_MEMORY_CLASS;
5938 /* Rule #6: Otherwise class SSE is used. */
5939 return X86_64_SSE_CLASS;
5942 /* Classify the argument of type TYPE and mode MODE.
5943 CLASSES will be filled by the register class used to pass each word
5944 of the operand. The number of words is returned. In case the parameter
5945 should be passed in memory, 0 is returned. As a special case for zero
5946 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5948 BIT_OFFSET is used internally for handling records; it is the offset in
5949 bits, taken modulo 256 to avoid overflow cases.
5951 See the x86-64 PS ABI for details. */
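/* A hedged walk-through: for "struct { double d; int i; }" the first eightbyte
   is classified X86_64_SSEDF_CLASS and the second X86_64_INTEGERSI_CLASS, so
   classify_argument returns 2 and the struct travels in one SSE register and
   one integer register.  */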
5954 static int
5955 classify_argument (enum machine_mode mode, const_tree type,
5956 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5958 HOST_WIDE_INT bytes =
5959 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5960 int words
5961 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5963 /* Variable sized entities are always passed/returned in memory. */
5964 if (bytes < 0)
5965 return 0;
5967 if (mode != VOIDmode
5968 && targetm.calls.must_pass_in_stack (mode, type))
5969 return 0;
5971 if (type && AGGREGATE_TYPE_P (type))
5973 int i;
5974 tree field;
5975 enum x86_64_reg_class subclasses[MAX_CLASSES];
5977 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5978 if (bytes > 32)
5979 return 0;
5981 for (i = 0; i < words; i++)
5982 classes[i] = X86_64_NO_CLASS;
5984 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5985 signal the memory class, so handle this as a special case. */
5986 if (!words)
5988 classes[0] = X86_64_NO_CLASS;
5989 return 1;
5992 /* Classify each field of record and merge classes. */
5993 switch (TREE_CODE (type))
5995 case RECORD_TYPE:
5996 /* And now merge the fields of structure. */
5997 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5999 if (TREE_CODE (field) == FIELD_DECL)
6001 int num;
6003 if (TREE_TYPE (field) == error_mark_node)
6004 continue;
6006 /* Bitfields are always classified as integer. Handle them
6007 early, since later code would consider them to be
6008 misaligned integers. */
6009 if (DECL_BIT_FIELD (field))
6011 for (i = (int_bit_position (field)
6012 + (bit_offset % 64)) / 8 / 8;
6013 i < ((int_bit_position (field) + (bit_offset % 64))
6014 + tree_low_cst (DECL_SIZE (field), 0)
6015 + 63) / 8 / 8; i++)
6016 classes[i] =
6017 merge_classes (X86_64_INTEGER_CLASS,
6018 classes[i]);
6020 else
6022 int pos;
6024 type = TREE_TYPE (field);
6026 /* Flexible array member is ignored. */
6027 if (TYPE_MODE (type) == BLKmode
6028 && TREE_CODE (type) == ARRAY_TYPE
6029 && TYPE_SIZE (type) == NULL_TREE
6030 && TYPE_DOMAIN (type) != NULL_TREE
6031 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6032 == NULL_TREE))
6034 static bool warned;
6036 if (!warned && warn_psabi)
6038 warned = true;
6039 inform (input_location,
6040 "the ABI of passing struct with"
6041 " a flexible array member has"
6042 " changed in GCC 4.4");
6044 continue;
6046 num = classify_argument (TYPE_MODE (type), type,
6047 subclasses,
6048 (int_bit_position (field)
6049 + bit_offset) % 256);
6050 if (!num)
6051 return 0;
6052 pos = (int_bit_position (field)
6053 + (bit_offset % 64)) / 8 / 8;
6054 for (i = 0; i < num && (i + pos) < words; i++)
6055 classes[i + pos] =
6056 merge_classes (subclasses[i], classes[i + pos]);
6060 break;
6062 case ARRAY_TYPE:
6063 /* Arrays are handled as small records. */
6065 int num;
6066 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6067 TREE_TYPE (type), subclasses, bit_offset);
6068 if (!num)
6069 return 0;
6071 /* The partial classes are now full classes. */
6072 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6073 subclasses[0] = X86_64_SSE_CLASS;
6074 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6075 && !((bit_offset % 64) == 0 && bytes == 4))
6076 subclasses[0] = X86_64_INTEGER_CLASS;
6078 for (i = 0; i < words; i++)
6079 classes[i] = subclasses[i % num];
6081 break;
6083 case UNION_TYPE:
6084 case QUAL_UNION_TYPE:
6085 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6087 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6089 if (TREE_CODE (field) == FIELD_DECL)
6091 int num;
6093 if (TREE_TYPE (field) == error_mark_node)
6094 continue;
6096 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6097 TREE_TYPE (field), subclasses,
6098 bit_offset);
6099 if (!num)
6100 return 0;
6101 for (i = 0; i < num; i++)
6102 classes[i] = merge_classes (subclasses[i], classes[i]);
6105 break;
6107 default:
6108 gcc_unreachable ();
6111 if (words > 2)
6113 /* When size > 16 bytes, if the first one isn't
6114 X86_64_SSE_CLASS or any other ones aren't
6115 X86_64_SSEUP_CLASS, everything should be passed in
6116 memory. */
6117 if (classes[0] != X86_64_SSE_CLASS)
6118 return 0;
6120 for (i = 1; i < words; i++)
6121 if (classes[i] != X86_64_SSEUP_CLASS)
6122 return 0;
6125 /* Final merger cleanup. */
6126 for (i = 0; i < words; i++)
6128 /* If one class is MEMORY, everything should be passed in
6129 memory. */
6130 if (classes[i] == X86_64_MEMORY_CLASS)
6131 return 0;
6133 /* The X86_64_SSEUP_CLASS should be always preceded by
6134 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6135 if (classes[i] == X86_64_SSEUP_CLASS
6136 && classes[i - 1] != X86_64_SSE_CLASS
6137 && classes[i - 1] != X86_64_SSEUP_CLASS)
6139 /* The first one should never be X86_64_SSEUP_CLASS. */
6140 gcc_assert (i != 0);
6141 classes[i] = X86_64_SSE_CLASS;
6144 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6145 everything should be passed in memory. */
6146 if (classes[i] == X86_64_X87UP_CLASS
6147 && (classes[i - 1] != X86_64_X87_CLASS))
6149 static bool warned;
6151 /* The first one should never be X86_64_X87UP_CLASS. */
6152 gcc_assert (i != 0);
6153 if (!warned && warn_psabi)
6155 warned = true;
6156 inform (input_location,
6157 "the ABI of passing union with long double"
6158 " has changed in GCC 4.4");
6160 return 0;
6163 return words;
6166 /* Compute alignment needed. We align all types to natural boundaries with
6167 exception of XFmode that is aligned to 64bits. */
6168 if (mode != VOIDmode && mode != BLKmode)
6170 int mode_alignment = GET_MODE_BITSIZE (mode);
6172 if (mode == XFmode)
6173 mode_alignment = 128;
6174 else if (mode == XCmode)
6175 mode_alignment = 256;
6176 if (COMPLEX_MODE_P (mode))
6177 mode_alignment /= 2;
6178 /* Misaligned fields are always returned in memory. */
6179 if (bit_offset % mode_alignment)
6180 return 0;
6183 /* for V1xx modes, just use the base mode */
6184 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6185 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6186 mode = GET_MODE_INNER (mode);
6188 /* Classification of atomic types. */
6189 switch (mode)
6191 case SDmode:
6192 case DDmode:
6193 classes[0] = X86_64_SSE_CLASS;
6194 return 1;
6195 case TDmode:
6196 classes[0] = X86_64_SSE_CLASS;
6197 classes[1] = X86_64_SSEUP_CLASS;
6198 return 2;
6199 case DImode:
6200 case SImode:
6201 case HImode:
6202 case QImode:
6203 case CSImode:
6204 case CHImode:
6205 case CQImode:
6207 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6209 if (size <= 32)
6211 classes[0] = X86_64_INTEGERSI_CLASS;
6212 return 1;
6214 else if (size <= 64)
6216 classes[0] = X86_64_INTEGER_CLASS;
6217 return 1;
6219 else if (size <= 64+32)
6221 classes[0] = X86_64_INTEGER_CLASS;
6222 classes[1] = X86_64_INTEGERSI_CLASS;
6223 return 2;
6225 else if (size <= 64+64)
6227 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6228 return 2;
6230 else
6231 gcc_unreachable ();
6233 case CDImode:
6234 case TImode:
6235 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6236 return 2;
6237 case COImode:
6238 case OImode:
6239 /* OImode shouldn't be used directly. */
6240 gcc_unreachable ();
6241 case CTImode:
6242 return 0;
6243 case SFmode:
6244 if (!(bit_offset % 64))
6245 classes[0] = X86_64_SSESF_CLASS;
6246 else
6247 classes[0] = X86_64_SSE_CLASS;
6248 return 1;
6249 case DFmode:
6250 classes[0] = X86_64_SSEDF_CLASS;
6251 return 1;
6252 case XFmode:
6253 classes[0] = X86_64_X87_CLASS;
6254 classes[1] = X86_64_X87UP_CLASS;
6255 return 2;
6256 case TFmode:
6257 classes[0] = X86_64_SSE_CLASS;
6258 classes[1] = X86_64_SSEUP_CLASS;
6259 return 2;
6260 case SCmode:
6261 classes[0] = X86_64_SSE_CLASS;
6262 if (!(bit_offset % 64))
6263 return 1;
6264 else
6266 static bool warned;
6268 if (!warned && warn_psabi)
6270 warned = true;
6271 inform (input_location,
6272 "the ABI of passing structure with complex float"
6273 " member has changed in GCC 4.4");
6275 classes[1] = X86_64_SSESF_CLASS;
6276 return 2;
6278 case DCmode:
6279 classes[0] = X86_64_SSEDF_CLASS;
6280 classes[1] = X86_64_SSEDF_CLASS;
6281 return 2;
6282 case XCmode:
6283 classes[0] = X86_64_COMPLEX_X87_CLASS;
6284 return 1;
6285 case TCmode:
6286 /* This mode is larger than 16 bytes. */
6287 return 0;
6288 case V8SFmode:
6289 case V8SImode:
6290 case V32QImode:
6291 case V16HImode:
6292 case V4DFmode:
6293 case V4DImode:
6294 classes[0] = X86_64_SSE_CLASS;
6295 classes[1] = X86_64_SSEUP_CLASS;
6296 classes[2] = X86_64_SSEUP_CLASS;
6297 classes[3] = X86_64_SSEUP_CLASS;
6298 return 4;
6299 case V4SFmode:
6300 case V4SImode:
6301 case V16QImode:
6302 case V8HImode:
6303 case V2DFmode:
6304 case V2DImode:
6305 classes[0] = X86_64_SSE_CLASS;
6306 classes[1] = X86_64_SSEUP_CLASS;
6307 return 2;
6308 case V1TImode:
6309 case V1DImode:
6310 case V2SFmode:
6311 case V2SImode:
6312 case V4HImode:
6313 case V8QImode:
6314 classes[0] = X86_64_SSE_CLASS;
6315 return 1;
6316 case BLKmode:
6317 case VOIDmode:
6318 return 0;
6319 default:
6320 gcc_assert (VECTOR_MODE_P (mode));
6322 if (bytes > 16)
6323 return 0;
6325 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6327 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6328 classes[0] = X86_64_INTEGERSI_CLASS;
6329 else
6330 classes[0] = X86_64_INTEGER_CLASS;
6331 classes[1] = X86_64_INTEGER_CLASS;
6332 return 1 + (bytes > 8);
6336 /* Examine the argument and set the number of registers required in each
6337 class. Return 0 iff the parameter should be passed in memory. */
6338 static int
6339 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6340 int *int_nregs, int *sse_nregs)
6342 enum x86_64_reg_class regclass[MAX_CLASSES];
6343 int n = classify_argument (mode, type, regclass, 0);
6345 *int_nregs = 0;
6346 *sse_nregs = 0;
6347 if (!n)
6348 return 0;
6349 for (n--; n >= 0; n--)
6350 switch (regclass[n])
6352 case X86_64_INTEGER_CLASS:
6353 case X86_64_INTEGERSI_CLASS:
6354 (*int_nregs)++;
6355 break;
6356 case X86_64_SSE_CLASS:
6357 case X86_64_SSESF_CLASS:
6358 case X86_64_SSEDF_CLASS:
6359 (*sse_nregs)++;
6360 break;
6361 case X86_64_NO_CLASS:
6362 case X86_64_SSEUP_CLASS:
6363 break;
6364 case X86_64_X87_CLASS:
6365 case X86_64_X87UP_CLASS:
6366 if (!in_return)
6367 return 0;
6368 break;
6369 case X86_64_COMPLEX_X87_CLASS:
6370 return in_return ? 2 : 0;
6371 case X86_64_MEMORY_CLASS:
6372 gcc_unreachable ();
6374 return 1;
6377 /* Construct container for the argument used by GCC interface. See
6378 FUNCTION_ARG for the detailed description. */
6380 static rtx
6381 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6382 const_tree type, int in_return, int nintregs, int nsseregs,
6383 const int *intreg, int sse_regno)
6385 /* The following variables hold the static issued_error state. */
6386 static bool issued_sse_arg_error;
6387 static bool issued_sse_ret_error;
6388 static bool issued_x87_ret_error;
6390 enum machine_mode tmpmode;
6391 int bytes =
6392 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6393 enum x86_64_reg_class regclass[MAX_CLASSES];
6394 int n;
6395 int i;
6396 int nexps = 0;
6397 int needed_sseregs, needed_intregs;
6398 rtx exp[MAX_CLASSES];
6399 rtx ret;
6401 n = classify_argument (mode, type, regclass, 0);
6402 if (!n)
6403 return NULL;
6404 if (!examine_argument (mode, type, in_return, &needed_intregs,
6405 &needed_sseregs))
6406 return NULL;
6407 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6408 return NULL;
6410 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6411 some less clueful developer tries to use floating-point anyway. */
6412 if (needed_sseregs && !TARGET_SSE)
6414 if (in_return)
6416 if (!issued_sse_ret_error)
6418 error ("SSE register return with SSE disabled");
6419 issued_sse_ret_error = true;
6422 else if (!issued_sse_arg_error)
6424 error ("SSE register argument with SSE disabled");
6425 issued_sse_arg_error = true;
6427 return NULL;
6430 /* Likewise, error if the ABI requires us to return values in the
6431 x87 registers and the user specified -mno-80387. */
6432 if (in_return && !TARGET_FLOAT_RETURNS_IN_80387)
6433 for (i = 0; i < n; i++)
6434 if (regclass[i] == X86_64_X87_CLASS
6435 || regclass[i] == X86_64_X87UP_CLASS
6436 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6438 if (!issued_x87_ret_error)
6440 error ("x87 register return with x87 disabled");
6441 issued_x87_ret_error = true;
6443 return NULL;
6446 /* First construct simple cases. Avoid SCmode, since we want to use
6447 single register to pass this type. */
6448 if (n == 1 && mode != SCmode)
6449 switch (regclass[0])
6451 case X86_64_INTEGER_CLASS:
6452 case X86_64_INTEGERSI_CLASS:
6453 return gen_rtx_REG (mode, intreg[0]);
6454 case X86_64_SSE_CLASS:
6455 case X86_64_SSESF_CLASS:
6456 case X86_64_SSEDF_CLASS:
6457 if (mode != BLKmode)
6458 return gen_reg_or_parallel (mode, orig_mode,
6459 SSE_REGNO (sse_regno));
6460 break;
6461 case X86_64_X87_CLASS:
6462 case X86_64_COMPLEX_X87_CLASS:
6463 return gen_rtx_REG (mode, FIRST_STACK_REG);
6464 case X86_64_NO_CLASS:
6465 /* Zero sized array, struct or class. */
6466 return NULL;
6467 default:
6468 gcc_unreachable ();
6470 if (n == 2
6471 && regclass[0] == X86_64_SSE_CLASS
6472 && regclass[1] == X86_64_SSEUP_CLASS
6473 && mode != BLKmode)
6474 return gen_reg_or_parallel (mode, orig_mode,
6475 SSE_REGNO (sse_regno));
6476 if (n == 4
6477 && regclass[0] == X86_64_SSE_CLASS
6478 && regclass[1] == X86_64_SSEUP_CLASS
6479 && regclass[2] == X86_64_SSEUP_CLASS
6480 && regclass[3] == X86_64_SSEUP_CLASS
6481 && mode != BLKmode)
6482 return gen_reg_or_parallel (mode, orig_mode,
6483 SSE_REGNO (sse_regno));
6484 if (n == 2
6485 && regclass[0] == X86_64_X87_CLASS
6486 && regclass[1] == X86_64_X87UP_CLASS)
6487 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6489 if (n == 2
6490 && regclass[0] == X86_64_INTEGER_CLASS
6491 && regclass[1] == X86_64_INTEGER_CLASS
6492 && (mode == CDImode || mode == TImode || mode == TFmode)
6493 && intreg[0] + 1 == intreg[1])
6494 return gen_rtx_REG (mode, intreg[0]);
6496 /* Otherwise figure out the entries of the PARALLEL. */
6497 for (i = 0; i < n; i++)
6499 int pos;
6501 switch (regclass[i])
6503 case X86_64_NO_CLASS:
6504 break;
6505 case X86_64_INTEGER_CLASS:
6506 case X86_64_INTEGERSI_CLASS:
6507 /* Merge TImodes on aligned occasions here too. */
6508 if (i * 8 + 8 > bytes)
6509 tmpmode
6510 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6511 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6512 tmpmode = SImode;
6513 else
6514 tmpmode = DImode;
6515 /* We've requested 24 bytes for which we
6516 don't have a mode. Use DImode. */
6517 if (tmpmode == BLKmode)
6518 tmpmode = DImode;
6519 exp [nexps++]
6520 = gen_rtx_EXPR_LIST (VOIDmode,
6521 gen_rtx_REG (tmpmode, *intreg),
6522 GEN_INT (i*8));
6523 intreg++;
6524 break;
6525 case X86_64_SSESF_CLASS:
6526 exp [nexps++]
6527 = gen_rtx_EXPR_LIST (VOIDmode,
6528 gen_rtx_REG (SFmode,
6529 SSE_REGNO (sse_regno)),
6530 GEN_INT (i*8));
6531 sse_regno++;
6532 break;
6533 case X86_64_SSEDF_CLASS:
6534 exp [nexps++]
6535 = gen_rtx_EXPR_LIST (VOIDmode,
6536 gen_rtx_REG (DFmode,
6537 SSE_REGNO (sse_regno)),
6538 GEN_INT (i*8));
6539 sse_regno++;
6540 break;
6541 case X86_64_SSE_CLASS:
6542 pos = i;
6543 switch (n)
6545 case 1:
6546 tmpmode = DImode;
6547 break;
6548 case 2:
6549 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6551 tmpmode = TImode;
6552 i++;
6554 else
6555 tmpmode = DImode;
6556 break;
6557 case 4:
6558 gcc_assert (i == 0
6559 && regclass[1] == X86_64_SSEUP_CLASS
6560 && regclass[2] == X86_64_SSEUP_CLASS
6561 && regclass[3] == X86_64_SSEUP_CLASS);
6562 tmpmode = OImode;
6563 i += 3;
6564 break;
6565 default:
6566 gcc_unreachable ();
6568 exp [nexps++]
6569 = gen_rtx_EXPR_LIST (VOIDmode,
6570 gen_rtx_REG (tmpmode,
6571 SSE_REGNO (sse_regno)),
6572 GEN_INT (pos*8));
6573 sse_regno++;
6574 break;
6575 default:
6576 gcc_unreachable ();
6580 /* Empty aligned struct, union or class. */
6581 if (nexps == 0)
6582 return NULL;
6584 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6585 for (i = 0; i < nexps; i++)
6586 XVECEXP (ret, 0, i) = exp [i];
6587 return ret;
6590 /* Update the data in CUM to advance over an argument of mode MODE
6591 and data type TYPE. (TYPE is null for libcalls where that information
6592 may not be available.) */
6594 static void
6595 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6596 const_tree type, HOST_WIDE_INT bytes,
6597 HOST_WIDE_INT words)
6599 switch (mode)
6601 default:
6602 break;
6604 case BLKmode:
6605 if (bytes < 0)
6606 break;
6607 /* FALLTHRU */
6609 case DImode:
6610 case SImode:
6611 case HImode:
6612 case QImode:
6613 cum->words += words;
6614 cum->nregs -= words;
6615 cum->regno += words;
6617 if (cum->nregs <= 0)
6619 cum->nregs = 0;
6620 cum->regno = 0;
6622 break;
6624 case OImode:
6625 /* OImode shouldn't be used directly. */
6626 gcc_unreachable ();
6628 case DFmode:
6629 if (cum->float_in_sse < 2)
6630 break;
6631 case SFmode:
6632 if (cum->float_in_sse < 1)
6633 break;
6634 /* FALLTHRU */
6636 case V8SFmode:
6637 case V8SImode:
6638 case V32QImode:
6639 case V16HImode:
6640 case V4DFmode:
6641 case V4DImode:
6642 case TImode:
6643 case V16QImode:
6644 case V8HImode:
6645 case V4SImode:
6646 case V2DImode:
6647 case V4SFmode:
6648 case V2DFmode:
6649 if (!type || !AGGREGATE_TYPE_P (type))
6651 cum->sse_words += words;
6652 cum->sse_nregs -= 1;
6653 cum->sse_regno += 1;
6654 if (cum->sse_nregs <= 0)
6656 cum->sse_nregs = 0;
6657 cum->sse_regno = 0;
6660 break;
6662 case V8QImode:
6663 case V4HImode:
6664 case V2SImode:
6665 case V2SFmode:
6666 case V1TImode:
6667 case V1DImode:
6668 if (!type || !AGGREGATE_TYPE_P (type))
6670 cum->mmx_words += words;
6671 cum->mmx_nregs -= 1;
6672 cum->mmx_regno += 1;
6673 if (cum->mmx_nregs <= 0)
6675 cum->mmx_nregs = 0;
6676 cum->mmx_regno = 0;
6679 break;
6683 static void
6684 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6685 const_tree type, HOST_WIDE_INT words, bool named)
6687 int int_nregs, sse_nregs;
6689 /* Unnamed 256bit vector mode parameters are passed on stack. */
6690 if (!named && VALID_AVX256_REG_MODE (mode))
6691 return;
6693 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6694 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6696 cum->nregs -= int_nregs;
6697 cum->sse_nregs -= sse_nregs;
6698 cum->regno += int_nregs;
6699 cum->sse_regno += sse_nregs;
6701 else
6703 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6704 cum->words = (cum->words + align - 1) & ~(align - 1);
6705 cum->words += words;
6709 static void
6710 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6711 HOST_WIDE_INT words)
6713 /* Otherwise, this should be passed indirect. */
6714 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6716 cum->words += words;
6717 if (cum->nregs > 0)
6719 cum->nregs -= 1;
6720 cum->regno += 1;
6724 /* Update the data in CUM to advance over an argument of mode MODE and
6725 data type TYPE. (TYPE is null for libcalls where that information
6726 may not be available.) */
6728 static void
6729 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6730 const_tree type, bool named)
6732 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6733 HOST_WIDE_INT bytes, words;
6735 if (mode == BLKmode)
6736 bytes = int_size_in_bytes (type);
6737 else
6738 bytes = GET_MODE_SIZE (mode);
6739 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6741 if (type)
6742 mode = type_natural_mode (type, NULL, false);
6744 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6745 function_arg_advance_ms_64 (cum, bytes, words);
6746 else if (TARGET_64BIT)
6747 function_arg_advance_64 (cum, mode, type, words, named);
6748 else
6749 function_arg_advance_32 (cum, mode, type, bytes, words);
6752 /* Define where to put the arguments to a function.
6753 Value is zero to push the argument on the stack,
6754 or a hard register in which to store the argument.
6756 MODE is the argument's machine mode.
6757 TYPE is the data type of the argument (as a tree).
6758 This is null for libcalls where that information may
6759 not be available.
6760 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6761 the preceding args and about the function being called.
6762 NAMED is nonzero if this argument is a named parameter
6763 (otherwise it is an extra parameter matching an ellipsis). */
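/* Illustrative 32-bit placements (hypothetical prototypes): with fastcall,
   "f (int a, int b, int c)" puts a in %ecx, b in %edx and c on the stack;
   with regparm(3) the same arguments go in %eax, %edx and %ecx; an aggregate
   argument always goes to the stack in the fastcall case, as checked below.  */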
6765 static rtx
6766 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6767 enum machine_mode orig_mode, const_tree type,
6768 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6770 /* Avoid the AL settings for the Unix64 ABI. */
6771 if (mode == VOIDmode)
6772 return constm1_rtx;
6774 switch (mode)
6776 default:
6777 break;
6779 case BLKmode:
6780 if (bytes < 0)
6781 break;
6782 /* FALLTHRU */
6783 case DImode:
6784 case SImode:
6785 case HImode:
6786 case QImode:
6787 if (words <= cum->nregs)
6789 int regno = cum->regno;
6791 /* Fastcall allocates the first two DWORD (SImode) or
6792 smaller arguments to ECX and EDX if it isn't an
6793 aggregate type. */
6794 if (cum->fastcall)
6796 if (mode == BLKmode
6797 || mode == DImode
6798 || (type && AGGREGATE_TYPE_P (type)))
6799 break;
6801 /* ECX, not EAX, is the first allocated register. */
6802 if (regno == AX_REG)
6803 regno = CX_REG;
6805 return gen_rtx_REG (mode, regno);
6807 break;
6809 case DFmode:
6810 if (cum->float_in_sse < 2)
6811 break;
6812 case SFmode:
6813 if (cum->float_in_sse < 1)
6814 break;
6815 /* FALLTHRU */
6816 case TImode:
6817 /* In 32bit, we pass TImode in xmm registers. */
6818 case V16QImode:
6819 case V8HImode:
6820 case V4SImode:
6821 case V2DImode:
6822 case V4SFmode:
6823 case V2DFmode:
6824 if (!type || !AGGREGATE_TYPE_P (type))
6826 if (cum->sse_nregs)
6827 return gen_reg_or_parallel (mode, orig_mode,
6828 cum->sse_regno + FIRST_SSE_REG);
6830 break;
6832 case OImode:
6833 /* OImode shouldn't be used directly. */
6834 gcc_unreachable ();
6836 case V8SFmode:
6837 case V8SImode:
6838 case V32QImode:
6839 case V16HImode:
6840 case V4DFmode:
6841 case V4DImode:
6842 if (!type || !AGGREGATE_TYPE_P (type))
6844 if (cum->sse_nregs)
6845 return gen_reg_or_parallel (mode, orig_mode,
6846 cum->sse_regno + FIRST_SSE_REG);
6848 break;
6850 case V8QImode:
6851 case V4HImode:
6852 case V2SImode:
6853 case V2SFmode:
6854 case V1TImode:
6855 case V1DImode:
6856 if (!type || !AGGREGATE_TYPE_P (type))
6858 if (cum->mmx_nregs)
6859 return gen_reg_or_parallel (mode, orig_mode,
6860 cum->mmx_regno + FIRST_MMX_REG);
6862 break;
6865 return NULL_RTX;
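/* Illustrative sketch (not part of the upstream source): for the 32-bit
   fastcall convention handled above, a declaration such as

       void __attribute__((fastcall)) f (int a, int b, int c);

   passes A in %ecx and B in %edx (note the ECX-first remapping of
   cum->regno), while C and any aggregate or DImode arguments fall
   through to the stack.  */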
6868 static rtx
6869 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6870 enum machine_mode orig_mode, const_tree type, bool named)
6872 /* Handle a hidden AL argument containing number of registers
6873 for varargs x86-64 functions. */
6874 if (mode == VOIDmode)
6875 return GEN_INT (cum->maybe_vaarg
6876 ? (cum->sse_nregs < 0
6877 ? X86_64_SSE_REGPARM_MAX
6878 : cum->sse_regno)
6879 : -1);
6881 switch (mode)
6883 default:
6884 break;
6886 case V8SFmode:
6887 case V8SImode:
6888 case V32QImode:
6889 case V16HImode:
6890 case V4DFmode:
6891 case V4DImode:
6892 /* Unnamed 256bit vector mode parameters are passed on stack. */
6893 if (!named)
6894 return NULL;
6895 break;
6898 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6899 cum->sse_nregs,
6900 &x86_64_int_parameter_registers [cum->regno],
6901 cum->sse_regno);
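/* Illustrative note (not part of the upstream source): the VOIDmode case
   above implements the SysV x86-64 hidden %al argument.  For a varargs
   call such as printf ("%f", 1.0), the caller materializes this value
   with something like "movl $1, %eax" so that the callee's prologue knows
   how many SSE argument registers may need to be saved.  */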
6904 static rtx
6905 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6906 enum machine_mode orig_mode, bool named,
6907 HOST_WIDE_INT bytes)
6909 unsigned int regno;
6911 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6912 We use a value of -2 to specify that the current function call uses the MS ABI. */
6913 if (mode == VOIDmode)
6914 return GEN_INT (-2);
6916 /* If we've run out of registers, it goes on the stack. */
6917 if (cum->nregs == 0)
6918 return NULL_RTX;
6920 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6922 /* Only floating point modes are passed in anything but integer regs. */
6923 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6925 if (named)
6926 regno = cum->regno + FIRST_SSE_REG;
6927 else
6929 rtx t1, t2;
6931 /* Unnamed floating parameters are passed in both the
6932 SSE and integer registers. */
6933 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6934 t2 = gen_rtx_REG (mode, regno);
6935 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6936 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6937 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6940 /* Handle aggregated types passed in register. */
6941 if (orig_mode == BLKmode)
6943 if (bytes > 0 && bytes <= 8)
6944 mode = (bytes > 4 ? DImode : SImode);
6945 if (mode == BLKmode)
6946 mode = DImode;
6949 return gen_reg_or_parallel (mode, orig_mode, regno);
6952 /* Return where to put the arguments to a function.
6953 Return zero to push the argument on the stack, or a hard register in
6954 which to store the argument.
6955 MODE is the argument's machine mode. TYPE is the data type of the
6956 argument. It is null for libcalls where that information may not be
6957 available. CUM gives information about the preceding args and about
6958 the function being called. NAMED is nonzero if this argument is a
6959 named parameter (otherwise it is an extra parameter matching an
6960 ellipsis). */
6962 static rtx
6963 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6964 const_tree type, bool named)
6966 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6967 enum machine_mode mode = omode;
6968 HOST_WIDE_INT bytes, words;
6969 rtx arg;
6971 if (mode == BLKmode)
6972 bytes = int_size_in_bytes (type);
6973 else
6974 bytes = GET_MODE_SIZE (mode);
6975 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6977 /* To simplify the code below, represent vector types with a vector mode
6978 even if MMX/SSE are not active. */
6979 if (type && TREE_CODE (type) == VECTOR_TYPE)
6980 mode = type_natural_mode (type, cum, false);
6982 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6983 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6984 else if (TARGET_64BIT)
6985 arg = function_arg_64 (cum, mode, omode, type, named);
6986 else
6987 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6989 return arg;
6992 /* A C expression that indicates when an argument must be passed by
6993 reference. If nonzero for an argument, a copy of that argument is
6994 made in memory and a pointer to the argument is passed instead of
6995 the argument itself. The pointer is passed in whatever way is
6996 appropriate for passing a pointer to that type. */
6998 static bool
6999 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7000 enum machine_mode mode ATTRIBUTE_UNUSED,
7001 const_tree type, bool named ATTRIBUTE_UNUSED)
7003 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7005 /* See Windows x64 Software Convention. */
7006 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7008 int msize = (int) GET_MODE_SIZE (mode);
7009 if (type)
7011 /* Arrays are passed by reference. */
7012 if (TREE_CODE (type) == ARRAY_TYPE)
7013 return true;
7015 if (AGGREGATE_TYPE_P (type))
7017 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7018 are passed by reference. */
7019 msize = int_size_in_bytes (type);
7023 /* __m128 is passed by reference. */
7024 switch (msize) {
7025 case 1: case 2: case 4: case 8:
7026 break;
7027 default:
7028 return true;
7031 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7032 return 1;
7034 return 0;
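/* Illustrative note (not part of the upstream source): under the MS x64
   convention checked above, only arguments of size 1, 2, 4 or 8 bytes are
   passed by value; e.g. an __m128 or a 12-byte struct is copied to memory
   by the caller and a pointer to the copy is passed instead.  */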
7037 /* Return true when TYPE should be 128bit aligned for 32bit argument
7038 passing ABI. XXX: This function is obsolete and is only used for
7039 checking psABI compatibility with previous versions of GCC. */
7041 static bool
7042 ix86_compat_aligned_value_p (const_tree type)
7044 enum machine_mode mode = TYPE_MODE (type);
7045 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7046 || mode == TDmode
7047 || mode == TFmode
7048 || mode == TCmode)
7049 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7050 return true;
7051 if (TYPE_ALIGN (type) < 128)
7052 return false;
7054 if (AGGREGATE_TYPE_P (type))
7056 /* Walk the aggregates recursively. */
7057 switch (TREE_CODE (type))
7059 case RECORD_TYPE:
7060 case UNION_TYPE:
7061 case QUAL_UNION_TYPE:
7063 tree field;
7065 /* Walk all the structure fields. */
7066 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7068 if (TREE_CODE (field) == FIELD_DECL
7069 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7070 return true;
7072 break;
7075 case ARRAY_TYPE:
7076 /* Just for use if some languages pass arrays by value. */
7077 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7078 return true;
7079 break;
7081 default:
7082 gcc_unreachable ();
7085 return false;
7088 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7089 XXX: This function is obsolete and is only used for checking psABI
7090 compatibility with previous versions of GCC. */
7092 static unsigned int
7093 ix86_compat_function_arg_boundary (enum machine_mode mode,
7094 const_tree type, unsigned int align)
7096 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7097 natural boundaries. */
7098 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7100 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7101 make an exception for SSE modes since these require 128bit
7102 alignment.
7104 The handling here differs from field_alignment. ICC aligns MMX
7105 arguments to 4 byte boundaries, while structure fields are aligned
7106 to 8 byte boundaries. */
7107 if (!type)
7109 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7110 align = PARM_BOUNDARY;
7112 else
7114 if (!ix86_compat_aligned_value_p (type))
7115 align = PARM_BOUNDARY;
7118 if (align > BIGGEST_ALIGNMENT)
7119 align = BIGGEST_ALIGNMENT;
7120 return align;
7123 /* Return true when TYPE should be 128bit aligned for 32bit argument
7124 passing ABI. */
7126 static bool
7127 ix86_contains_aligned_value_p (const_tree type)
7129 enum machine_mode mode = TYPE_MODE (type);
7131 if (mode == XFmode || mode == XCmode)
7132 return false;
7134 if (TYPE_ALIGN (type) < 128)
7135 return false;
7137 if (AGGREGATE_TYPE_P (type))
7139 /* Walk the aggregates recursively. */
7140 switch (TREE_CODE (type))
7142 case RECORD_TYPE:
7143 case UNION_TYPE:
7144 case QUAL_UNION_TYPE:
7146 tree field;
7148 /* Walk all the structure fields. */
7149 for (field = TYPE_FIELDS (type);
7150 field;
7151 field = DECL_CHAIN (field))
7153 if (TREE_CODE (field) == FIELD_DECL
7154 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7155 return true;
7157 break;
7160 case ARRAY_TYPE:
7161 /* Just for use if some languages pass arrays by value. */
7162 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7163 return true;
7164 break;
7166 default:
7167 gcc_unreachable ();
7170 else
7171 return TYPE_ALIGN (type) >= 128;
7173 return false;
7176 /* Gives the alignment boundary, in bits, of an argument with the
7177 specified mode and type. */
7179 static unsigned int
7180 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7182 unsigned int align;
7183 if (type)
7185 /* Since the main variant type is used for the call, convert TYPE to
7186 its main variant. */
7187 type = TYPE_MAIN_VARIANT (type);
7188 align = TYPE_ALIGN (type);
7190 else
7191 align = GET_MODE_ALIGNMENT (mode);
7192 if (align < PARM_BOUNDARY)
7193 align = PARM_BOUNDARY;
7194 else
7196 static bool warned;
7197 unsigned int saved_align = align;
7199 if (!TARGET_64BIT)
7201 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7202 if (!type)
7204 if (mode == XFmode || mode == XCmode)
7205 align = PARM_BOUNDARY;
7207 else if (!ix86_contains_aligned_value_p (type))
7208 align = PARM_BOUNDARY;
7210 if (align < 128)
7211 align = PARM_BOUNDARY;
7214 if (warn_psabi
7215 && !warned
7216 && align != ix86_compat_function_arg_boundary (mode, type,
7217 saved_align))
7219 warned = true;
7220 inform (input_location,
7221 "The ABI for passing parameters with %d-byte"
7222 " alignment has changed in GCC 4.6",
7223 align / BITS_PER_UNIT);
7227 return align;
7230 /* Return true if N is a possible register number of function value. */
7232 static bool
7233 ix86_function_value_regno_p (const unsigned int regno)
7235 switch (regno)
7237 case AX_REG:
7238 case DX_REG:
7239 return true;
7240 case DI_REG:
7241 case SI_REG:
7242 return TARGET_64BIT && ix86_abi != MS_ABI;
7244 /* Complex values are returned in %st(0)/%st(1) pair. */
7245 case ST0_REG:
7246 case ST1_REG:
7247 /* TODO: The function should depend on current function ABI but
7248 builtins.c would need updating then. Therefore we use the
7249 default ABI. */
7250 if (TARGET_64BIT && ix86_abi == MS_ABI)
7251 return false;
7252 return TARGET_FLOAT_RETURNS_IN_80387;
7254 /* Complex values are returned in %xmm0/%xmm1 pair. */
7255 case XMM0_REG:
7256 case XMM1_REG:
7257 return TARGET_SSE;
7259 case MM0_REG:
7260 if (TARGET_MACHO || TARGET_64BIT)
7261 return false;
7262 return TARGET_MMX;
7265 return false;
7268 /* Define how to find the value returned by a function.
7269 VALTYPE is the data type of the value (as a tree).
7270 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7271 otherwise, FUNC is 0. */
7273 static rtx
7274 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7275 const_tree fntype, const_tree fn)
7277 unsigned int regno;
7279 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7280 we normally prevent this case when mmx is not available. However
7281 some ABIs may require the result to be returned like DImode. */
7282 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7283 regno = FIRST_MMX_REG;
7285 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7286 we prevent this case when sse is not available. However some ABIs
7287 may require the result to be returned like integer TImode. */
7288 else if (mode == TImode
7289 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7290 regno = FIRST_SSE_REG;
7292 /* 32-byte vector modes in %ymm0. */
7293 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7294 regno = FIRST_SSE_REG;
7296 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7297 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7298 regno = FIRST_FLOAT_REG;
7299 else
7300 /* Most things go in %eax. */
7301 regno = AX_REG;
7303 /* Override FP return register with %xmm0 for local functions when
7304 SSE math is enabled or for functions with sseregparm attribute. */
7305 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7307 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7308 if ((sse_level >= 1 && mode == SFmode)
7309 || (sse_level == 2 && mode == DFmode))
7310 regno = FIRST_SSE_REG;
7313 /* OImode shouldn't be used directly. */
7314 gcc_assert (mode != OImode);
7316 return gen_rtx_REG (orig_mode, regno);
7319 static rtx
7320 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7321 const_tree valtype)
7323 rtx ret;
7325 /* Handle libcalls, which don't provide a type node. */
7326 if (valtype == NULL)
7328 unsigned int regno;
7330 switch (mode)
7332 case SFmode:
7333 case SCmode:
7334 case DFmode:
7335 case DCmode:
7336 case TFmode:
7337 case SDmode:
7338 case DDmode:
7339 case TDmode:
7340 regno = FIRST_SSE_REG;
7341 break;
7342 case XFmode:
7343 case XCmode:
7344 regno = FIRST_FLOAT_REG;
7345 break;
7346 case TCmode:
7347 return NULL;
7348 default:
7349 regno = AX_REG;
7352 return gen_rtx_REG (mode, regno);
7354 else if (POINTER_TYPE_P (valtype))
7356 /* Pointers are always returned in word_mode. */
7357 mode = word_mode;
7360 ret = construct_container (mode, orig_mode, valtype, 1,
7361 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7362 x86_64_int_return_registers, 0);
7364 /* For zero-sized structures, construct_container returns NULL, but we
7365 need to keep the rest of the compiler happy by returning a meaningful value. */
7366 if (!ret)
7367 ret = gen_rtx_REG (orig_mode, AX_REG);
7369 return ret;
7372 static rtx
7373 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7374 const_tree valtype)
7376 unsigned int regno = AX_REG;
7378 if (TARGET_SSE)
7380 switch (GET_MODE_SIZE (mode))
7382 case 16:
7383 if (valtype != NULL_TREE
7384 && !VECTOR_INTEGER_TYPE_P (valtype)
7386 && !INTEGRAL_TYPE_P (valtype)
7387 && !VECTOR_FLOAT_TYPE_P (valtype))
7388 break;
7389 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7390 && !COMPLEX_MODE_P (mode))
7391 regno = FIRST_SSE_REG;
7392 break;
7393 case 8:
7394 case 4:
7395 if (mode == SFmode || mode == DFmode)
7396 regno = FIRST_SSE_REG;
7397 break;
7398 default:
7399 break;
7402 return gen_rtx_REG (orig_mode, regno);
7405 static rtx
7406 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7407 enum machine_mode orig_mode, enum machine_mode mode)
7409 const_tree fn, fntype;
7411 fn = NULL_TREE;
7412 if (fntype_or_decl && DECL_P (fntype_or_decl))
7413 fn = fntype_or_decl;
7414 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7416 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7417 return function_value_ms_64 (orig_mode, mode, valtype);
7418 else if (TARGET_64BIT)
7419 return function_value_64 (orig_mode, mode, valtype);
7420 else
7421 return function_value_32 (orig_mode, mode, fntype, fn);
7424 static rtx
7425 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7426 bool outgoing ATTRIBUTE_UNUSED)
7428 enum machine_mode mode, orig_mode;
7430 orig_mode = TYPE_MODE (valtype);
7431 mode = type_natural_mode (valtype, NULL, true);
7432 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7435 /* Pointer function arguments and return values are promoted to
7436 word_mode. */
7438 static enum machine_mode
7439 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7440 int *punsignedp, const_tree fntype,
7441 int for_return)
7443 if (type != NULL_TREE && POINTER_TYPE_P (type))
7445 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7446 return word_mode;
7448 return default_promote_function_mode (type, mode, punsignedp, fntype,
7449 for_return);
7452 /* Return true if a structure, union or array with MODE containing FIELD
7453 should be accessed using BLKmode. */
7455 static bool
7456 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7458 /* Union with XFmode must be in BLKmode. */
7459 return (mode == XFmode
7460 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7461 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7465 ix86_libcall_value (enum machine_mode mode)
7467 return ix86_function_value_1 (NULL, NULL, mode, mode);
7470 /* Return true iff type is returned in memory. */
7472 static bool ATTRIBUTE_UNUSED
7473 return_in_memory_32 (const_tree type, enum machine_mode mode)
7475 HOST_WIDE_INT size;
7477 if (mode == BLKmode)
7478 return true;
7480 size = int_size_in_bytes (type);
7482 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7483 return false;
7485 if (VECTOR_MODE_P (mode) || mode == TImode)
7487 /* User-created vectors small enough to fit in EAX. */
7488 if (size < 8)
7489 return false;
7491 /* MMX/3dNow values are returned in MM0,
7492 except when it doesn't exist or the ABI prescribes otherwise. */
7493 if (size == 8)
7494 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7496 /* SSE values are returned in XMM0, except when it doesn't exist. */
7497 if (size == 16)
7498 return !TARGET_SSE;
7500 /* AVX values are returned in YMM0, except when it doesn't exist. */
7501 if (size == 32)
7502 return !TARGET_AVX;
7505 if (mode == XFmode)
7506 return false;
7508 if (size > 12)
7509 return true;
7511 /* OImode shouldn't be used directly. */
7512 gcc_assert (mode != OImode);
7514 return false;
7517 static bool ATTRIBUTE_UNUSED
7518 return_in_memory_64 (const_tree type, enum machine_mode mode)
7520 int needed_intregs, needed_sseregs;
7521 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7524 static bool ATTRIBUTE_UNUSED
7525 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7527 HOST_WIDE_INT size = int_size_in_bytes (type);
7529 /* __m128 is returned in xmm0. */
7530 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7531 || VECTOR_FLOAT_TYPE_P (type))
7532 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7533 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7534 return false;
7536 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7537 return size != 1 && size != 2 && size != 4 && size != 8;
7540 static bool
7541 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7543 #ifdef SUBTARGET_RETURN_IN_MEMORY
7544 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7545 #else
7546 const enum machine_mode mode = type_natural_mode (type, NULL, true);
7548 if (TARGET_64BIT)
7550 if (ix86_function_type_abi (fntype) == MS_ABI)
7551 return return_in_memory_ms_64 (type, mode);
7552 else
7553 return return_in_memory_64 (type, mode);
7555 else
7556 return return_in_memory_32 (type, mode);
7557 #endif
7561 /* Create the va_list data type. */
7563 /* Returns the calling convention specific va_list data type.
7564 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7566 static tree
7567 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7569 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7571 /* For i386 we use plain pointer to argument area. */
7572 if (!TARGET_64BIT || abi == MS_ABI)
7573 return build_pointer_type (char_type_node);
7575 record = lang_hooks.types.make_type (RECORD_TYPE);
7576 type_decl = build_decl (BUILTINS_LOCATION,
7577 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7579 f_gpr = build_decl (BUILTINS_LOCATION,
7580 FIELD_DECL, get_identifier ("gp_offset"),
7581 unsigned_type_node);
7582 f_fpr = build_decl (BUILTINS_LOCATION,
7583 FIELD_DECL, get_identifier ("fp_offset"),
7584 unsigned_type_node);
7585 f_ovf = build_decl (BUILTINS_LOCATION,
7586 FIELD_DECL, get_identifier ("overflow_arg_area"),
7587 ptr_type_node);
7588 f_sav = build_decl (BUILTINS_LOCATION,
7589 FIELD_DECL, get_identifier ("reg_save_area"),
7590 ptr_type_node);
7592 va_list_gpr_counter_field = f_gpr;
7593 va_list_fpr_counter_field = f_fpr;
7595 DECL_FIELD_CONTEXT (f_gpr) = record;
7596 DECL_FIELD_CONTEXT (f_fpr) = record;
7597 DECL_FIELD_CONTEXT (f_ovf) = record;
7598 DECL_FIELD_CONTEXT (f_sav) = record;
7600 TYPE_STUB_DECL (record) = type_decl;
7601 TYPE_NAME (record) = type_decl;
7602 TYPE_FIELDS (record) = f_gpr;
7603 DECL_CHAIN (f_gpr) = f_fpr;
7604 DECL_CHAIN (f_fpr) = f_ovf;
7605 DECL_CHAIN (f_ovf) = f_sav;
7607 layout_type (record);
7609 /* The correct type is an array type of one element. */
7610 return build_array_type (record, build_index_type (size_zero_node));
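/* Illustrative sketch (not part of the upstream source): the record laid
   out above corresponds to the SysV x86-64 psABI definition

       typedef struct {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __va_list_tag;
       typedef __va_list_tag va_list[1];

   which is why the function returns a one-element array type.  */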
7613 /* Setup the builtin va_list data type and for 64-bit the additional
7614 calling convention specific va_list data types. */
7616 static tree
7617 ix86_build_builtin_va_list (void)
7619 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7621 /* Initialize abi specific va_list builtin types. */
7622 if (TARGET_64BIT)
7624 tree t;
7625 if (ix86_abi == MS_ABI)
7627 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7628 if (TREE_CODE (t) != RECORD_TYPE)
7629 t = build_variant_type_copy (t);
7630 sysv_va_list_type_node = t;
7632 else
7634 t = ret;
7635 if (TREE_CODE (t) != RECORD_TYPE)
7636 t = build_variant_type_copy (t);
7637 sysv_va_list_type_node = t;
7639 if (ix86_abi != MS_ABI)
7641 t = ix86_build_builtin_va_list_abi (MS_ABI);
7642 if (TREE_CODE (t) != RECORD_TYPE)
7643 t = build_variant_type_copy (t);
7644 ms_va_list_type_node = t;
7646 else
7648 t = ret;
7649 if (TREE_CODE (t) != RECORD_TYPE)
7650 t = build_variant_type_copy (t);
7651 ms_va_list_type_node = t;
7655 return ret;
7658 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7660 static void
7661 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7663 rtx save_area, mem;
7664 alias_set_type set;
7665 int i, max;
7667 /* GPR size of varargs save area. */
7668 if (cfun->va_list_gpr_size)
7669 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7670 else
7671 ix86_varargs_gpr_size = 0;
7673 /* FPR size of varargs save area. We don't need it if we don't pass
7674 anything in SSE registers. */
7675 if (TARGET_SSE && cfun->va_list_fpr_size)
7676 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7677 else
7678 ix86_varargs_fpr_size = 0;
7680 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7681 return;
7683 save_area = frame_pointer_rtx;
7684 set = get_varargs_alias_set ();
7686 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7687 if (max > X86_64_REGPARM_MAX)
7688 max = X86_64_REGPARM_MAX;
7690 for (i = cum->regno; i < max; i++)
7692 mem = gen_rtx_MEM (word_mode,
7693 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7694 MEM_NOTRAP_P (mem) = 1;
7695 set_mem_alias_set (mem, set);
7696 emit_move_insn (mem,
7697 gen_rtx_REG (word_mode,
7698 x86_64_int_parameter_registers[i]));
7701 if (ix86_varargs_fpr_size)
7703 enum machine_mode smode;
7704 rtx label, test;
7706 /* Now emit code to save SSE registers. The AX parameter contains number
7707 of SSE parameter registers used to call this function, though all we
7708 actually check here is the zero/non-zero status. */
7710 label = gen_label_rtx ();
7711 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7712 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7713 label));
7715 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7716 we used movdqa (i.e. TImode) instead? Perhaps even better would
7717 be if we could determine the real mode of the data, via a hook
7718 into pass_stdarg. Ignore all that for now. */
7719 smode = V4SFmode;
7720 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7721 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7723 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7724 if (max > X86_64_SSE_REGPARM_MAX)
7725 max = X86_64_SSE_REGPARM_MAX;
7727 for (i = cum->sse_regno; i < max; ++i)
7729 mem = plus_constant (Pmode, save_area,
7730 i * 16 + ix86_varargs_gpr_size);
7731 mem = gen_rtx_MEM (smode, mem);
7732 MEM_NOTRAP_P (mem) = 1;
7733 set_mem_alias_set (mem, set);
7734 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7736 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7739 emit_label (label);
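/* Illustrative note (not part of the upstream source): the save area built
   above holds X86_64_REGPARM_MAX (6) word-sized GPR slots (48 bytes for
   %rdi, %rsi, %rdx, %rcx, %r8, %r9) followed by up to
   X86_64_SSE_REGPARM_MAX (8) 16-byte SSE slots; the SSE stores are skipped
   at run time when the hidden %al argument is zero.  */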
7743 static void
7744 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7746 alias_set_type set = get_varargs_alias_set ();
7747 int i;
7749 /* Reset to zero, as there might be a sysv vaarg used
7750 before. */
7751 ix86_varargs_gpr_size = 0;
7752 ix86_varargs_fpr_size = 0;
7754 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7756 rtx reg, mem;
7758 mem = gen_rtx_MEM (Pmode,
7759 plus_constant (Pmode, virtual_incoming_args_rtx,
7760 i * UNITS_PER_WORD));
7761 MEM_NOTRAP_P (mem) = 1;
7762 set_mem_alias_set (mem, set);
7764 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7765 emit_move_insn (mem, reg);
7769 static void
7770 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7771 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7772 int no_rtl)
7774 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7775 CUMULATIVE_ARGS next_cum;
7776 tree fntype;
7778 /* This argument doesn't appear to be used anymore, which is good,
7779 because the old code here didn't suppress rtl generation. */
7780 gcc_assert (!no_rtl);
7782 if (!TARGET_64BIT)
7783 return;
7785 fntype = TREE_TYPE (current_function_decl);
7787 /* For varargs, we do not want to skip the dummy va_dcl argument.
7788 For stdargs, we do want to skip the last named argument. */
7789 next_cum = *cum;
7790 if (stdarg_p (fntype))
7791 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7792 true);
7794 if (cum->call_abi == MS_ABI)
7795 setup_incoming_varargs_ms_64 (&next_cum);
7796 else
7797 setup_incoming_varargs_64 (&next_cum);
7800 /* Return true if TYPE is a va_list of the char * kind. */
7802 static bool
7803 is_va_list_char_pointer (tree type)
7805 tree canonic;
7807 /* For 32-bit it is always true. */
7808 if (!TARGET_64BIT)
7809 return true;
7810 canonic = ix86_canonical_va_list_type (type);
7811 return (canonic == ms_va_list_type_node
7812 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7815 /* Implement va_start. */
7817 static void
7818 ix86_va_start (tree valist, rtx nextarg)
7820 HOST_WIDE_INT words, n_gpr, n_fpr;
7821 tree f_gpr, f_fpr, f_ovf, f_sav;
7822 tree gpr, fpr, ovf, sav, t;
7823 tree type;
7824 rtx ovf_rtx;
7826 if (flag_split_stack
7827 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7829 unsigned int scratch_regno;
7831 /* When we are splitting the stack, we can't refer to the stack
7832 arguments using internal_arg_pointer, because they may be on
7833 the old stack. The split stack prologue will arrange to
7834 leave a pointer to the old stack arguments in a scratch
7835 register, which we here copy to a pseudo-register. The split
7836 stack prologue can't set the pseudo-register directly because
7837 it (the prologue) runs before any registers have been saved. */
7839 scratch_regno = split_stack_prologue_scratch_regno ();
7840 if (scratch_regno != INVALID_REGNUM)
7842 rtx reg, seq;
7844 reg = gen_reg_rtx (Pmode);
7845 cfun->machine->split_stack_varargs_pointer = reg;
7847 start_sequence ();
7848 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7849 seq = get_insns ();
7850 end_sequence ();
7852 push_topmost_sequence ();
7853 emit_insn_after (seq, entry_of_function ());
7854 pop_topmost_sequence ();
7858 /* Only 64-bit targets need something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7861 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7862 std_expand_builtin_va_start (valist, nextarg);
7863 else
7865 rtx va_r, next;
7867 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7868 next = expand_binop (ptr_mode, add_optab,
7869 cfun->machine->split_stack_varargs_pointer,
7870 crtl->args.arg_offset_rtx,
7871 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7872 convert_move (va_r, next, 0);
7874 return;
7877 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7878 f_fpr = DECL_CHAIN (f_gpr);
7879 f_ovf = DECL_CHAIN (f_fpr);
7880 f_sav = DECL_CHAIN (f_ovf);
7882 valist = build_simple_mem_ref (valist);
7883 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7884 /* The following should be folded into the MEM_REF offset. */
7885 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7886 f_gpr, NULL_TREE);
7887 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7888 f_fpr, NULL_TREE);
7889 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7890 f_ovf, NULL_TREE);
7891 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7892 f_sav, NULL_TREE);
7894 /* Count number of gp and fp argument registers used. */
7895 words = crtl->args.info.words;
7896 n_gpr = crtl->args.info.regno;
7897 n_fpr = crtl->args.info.sse_regno;
7899 if (cfun->va_list_gpr_size)
7901 type = TREE_TYPE (gpr);
7902 t = build2 (MODIFY_EXPR, type,
7903 gpr, build_int_cst (type, n_gpr * 8));
7904 TREE_SIDE_EFFECTS (t) = 1;
7905 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7908 if (TARGET_SSE && cfun->va_list_fpr_size)
7910 type = TREE_TYPE (fpr);
7911 t = build2 (MODIFY_EXPR, type, fpr,
7912 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7913 TREE_SIDE_EFFECTS (t) = 1;
7914 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7917 /* Find the overflow area. */
7918 type = TREE_TYPE (ovf);
7919 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7920 ovf_rtx = crtl->args.internal_arg_pointer;
7921 else
7922 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7923 t = make_tree (type, ovf_rtx);
7924 if (words != 0)
7925 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7926 t = build2 (MODIFY_EXPR, type, ovf, t);
7927 TREE_SIDE_EFFECTS (t) = 1;
7928 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7930 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7932 /* Find the register save area.
7933 The function prologue saves it right above the stack frame. */
7934 type = TREE_TYPE (sav);
7935 t = make_tree (type, frame_pointer_rtx);
7936 if (!ix86_varargs_gpr_size)
7937 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7938 t = build2 (MODIFY_EXPR, type, sav, t);
7939 TREE_SIDE_EFFECTS (t) = 1;
7940 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
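/* Illustrative summary (not part of the upstream source) of the va_start
   expansion above, in pseudo-C:

       valist->gp_offset = n_gpr * 8;
       valist->fp_offset = 48 + n_fpr * 16;          48 = 8 * X86_64_REGPARM_MAX
       valist->overflow_arg_area = arg_pointer + words * UNITS_PER_WORD;
       valist->reg_save_area = frame_pointer
                               (minus 48 when no GPR save area is needed);  */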
7944 /* Implement va_arg. */
7946 static tree
7947 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7948 gimple_seq *post_p)
7950 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7951 tree f_gpr, f_fpr, f_ovf, f_sav;
7952 tree gpr, fpr, ovf, sav, t;
7953 int size, rsize;
7954 tree lab_false, lab_over = NULL_TREE;
7955 tree addr, t2;
7956 rtx container;
7957 int indirect_p = 0;
7958 tree ptrtype;
7959 enum machine_mode nat_mode;
7960 unsigned int arg_boundary;
7962 /* Only 64-bit targets need something special. */
7963 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7964 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7966 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7967 f_fpr = DECL_CHAIN (f_gpr);
7968 f_ovf = DECL_CHAIN (f_fpr);
7969 f_sav = DECL_CHAIN (f_ovf);
7971 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7972 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7973 valist = build_va_arg_indirect_ref (valist);
7974 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7975 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7976 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7978 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7979 if (indirect_p)
7980 type = build_pointer_type (type);
7981 size = int_size_in_bytes (type);
7982 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7984 nat_mode = type_natural_mode (type, NULL, false);
7985 switch (nat_mode)
7987 case V8SFmode:
7988 case V8SImode:
7989 case V32QImode:
7990 case V16HImode:
7991 case V4DFmode:
7992 case V4DImode:
7993 /* Unnamed 256bit vector mode parameters are passed on stack. */
7994 if (!TARGET_64BIT_MS_ABI)
7996 container = NULL;
7997 break;
8000 default:
8001 container = construct_container (nat_mode, TYPE_MODE (type),
8002 type, 0, X86_64_REGPARM_MAX,
8003 X86_64_SSE_REGPARM_MAX, intreg,
8005 break;
8008 /* Pull the value out of the saved registers. */
8010 addr = create_tmp_var (ptr_type_node, "addr");
8012 if (container)
8014 int needed_intregs, needed_sseregs;
8015 bool need_temp;
8016 tree int_addr, sse_addr;
8018 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8019 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8021 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8023 need_temp = (!REG_P (container)
8024 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8025 || TYPE_ALIGN (type) > 128));
8027 /* If we are passing a structure, verify that it is a consecutive block
8028 in the register save area. If not, we need to do moves. */
8029 if (!need_temp && !REG_P (container))
8031 /* Verify that all registers are strictly consecutive */
8032 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8034 int i;
8036 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8038 rtx slot = XVECEXP (container, 0, i);
8039 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8040 || INTVAL (XEXP (slot, 1)) != i * 16)
8041 need_temp = 1;
8044 else
8046 int i;
8048 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8050 rtx slot = XVECEXP (container, 0, i);
8051 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8052 || INTVAL (XEXP (slot, 1)) != i * 8)
8053 need_temp = 1;
8057 if (!need_temp)
8059 int_addr = addr;
8060 sse_addr = addr;
8062 else
8064 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8065 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8068 /* First ensure that we fit completely in registers. */
8069 if (needed_intregs)
8071 t = build_int_cst (TREE_TYPE (gpr),
8072 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8073 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8074 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8075 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8076 gimplify_and_add (t, pre_p);
8078 if (needed_sseregs)
8080 t = build_int_cst (TREE_TYPE (fpr),
8081 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8082 + X86_64_REGPARM_MAX * 8);
8083 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8084 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8085 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8086 gimplify_and_add (t, pre_p);
8089 /* Compute index to start of area used for integer regs. */
8090 if (needed_intregs)
8092 /* int_addr = gpr + sav; */
8093 t = fold_build_pointer_plus (sav, gpr);
8094 gimplify_assign (int_addr, t, pre_p);
8096 if (needed_sseregs)
8098 /* sse_addr = fpr + sav; */
8099 t = fold_build_pointer_plus (sav, fpr);
8100 gimplify_assign (sse_addr, t, pre_p);
8102 if (need_temp)
8104 int i, prev_size = 0;
8105 tree temp = create_tmp_var (type, "va_arg_tmp");
8107 /* addr = &temp; */
8108 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8109 gimplify_assign (addr, t, pre_p);
8111 for (i = 0; i < XVECLEN (container, 0); i++)
8113 rtx slot = XVECEXP (container, 0, i);
8114 rtx reg = XEXP (slot, 0);
8115 enum machine_mode mode = GET_MODE (reg);
8116 tree piece_type;
8117 tree addr_type;
8118 tree daddr_type;
8119 tree src_addr, src;
8120 int src_offset;
8121 tree dest_addr, dest;
8122 int cur_size = GET_MODE_SIZE (mode);
8124 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8125 prev_size = INTVAL (XEXP (slot, 1));
8126 if (prev_size + cur_size > size)
8128 cur_size = size - prev_size;
8129 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8130 if (mode == BLKmode)
8131 mode = QImode;
8133 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8134 if (mode == GET_MODE (reg))
8135 addr_type = build_pointer_type (piece_type);
8136 else
8137 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8138 true);
8139 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8140 true);
8142 if (SSE_REGNO_P (REGNO (reg)))
8144 src_addr = sse_addr;
8145 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8147 else
8149 src_addr = int_addr;
8150 src_offset = REGNO (reg) * 8;
8152 src_addr = fold_convert (addr_type, src_addr);
8153 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8155 dest_addr = fold_convert (daddr_type, addr);
8156 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8157 if (cur_size == GET_MODE_SIZE (mode))
8159 src = build_va_arg_indirect_ref (src_addr);
8160 dest = build_va_arg_indirect_ref (dest_addr);
8162 gimplify_assign (dest, src, pre_p);
8164 else
8166 tree copy
8167 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8168 3, dest_addr, src_addr,
8169 size_int (cur_size));
8170 gimplify_and_add (copy, pre_p);
8172 prev_size += cur_size;
8176 if (needed_intregs)
8178 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8179 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8180 gimplify_assign (gpr, t, pre_p);
8183 if (needed_sseregs)
8185 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8186 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8187 gimplify_assign (fpr, t, pre_p);
8190 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8192 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8195 /* ... otherwise out of the overflow area. */
8197 /* When the caller aligns a parameter on the stack, any alignment
8198 beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
8199 MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee with the
8200 caller here. */
8201 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8202 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8203 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8205 /* Care for on-stack alignment if needed. */
8206 if (arg_boundary <= 64 || size == 0)
8207 t = ovf;
8208 else
8210 HOST_WIDE_INT align = arg_boundary / 8;
8211 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8212 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8213 build_int_cst (TREE_TYPE (t), -align));
8216 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8217 gimplify_assign (addr, t, pre_p);
8219 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8220 gimplify_assign (unshare_expr (ovf), t, pre_p);
8222 if (container)
8223 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8225 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8226 addr = fold_convert (ptrtype, addr);
8228 if (indirect_p)
8229 addr = build_va_arg_indirect_ref (addr);
8230 return build_va_arg_indirect_ref (addr);
8233 /* Return true if OPNUM's MEM should be matched
8234 in movabs* patterns. */
8236 bool
8237 ix86_check_movabs (rtx insn, int opnum)
8239 rtx set, mem;
8241 set = PATTERN (insn);
8242 if (GET_CODE (set) == PARALLEL)
8243 set = XVECEXP (set, 0, 0);
8244 gcc_assert (GET_CODE (set) == SET);
8245 mem = XEXP (set, opnum);
8246 while (GET_CODE (mem) == SUBREG)
8247 mem = SUBREG_REG (mem);
8248 gcc_assert (MEM_P (mem));
8249 return volatile_ok || !MEM_VOLATILE_P (mem);
8252 /* Initialize the table of extra 80387 mathematical constants. */
8254 static void
8255 init_ext_80387_constants (void)
8257 static const char * cst[5] =
8259 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8260 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8261 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8262 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8263 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8265 int i;
8267 for (i = 0; i < 5; i++)
8269 real_from_string (&ext_80387_constants_table[i], cst[i]);
8270 /* Ensure each constant is rounded to XFmode precision. */
8271 real_convert (&ext_80387_constants_table[i],
8272 XFmode, &ext_80387_constants_table[i]);
8275 ext_80387_constants_init = 1;
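/* Illustrative note (not part of the upstream source): indices 0-4 of
   ext_80387_constants_table line up with the return values 3-7 of
   standard_80387_constant_p below, and hence with the fldlg2, fldln2,
   fldl2e, fldl2t and fldpi opcodes emitted by
   standard_80387_constant_opcode.  */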
8278 /* Return non-zero if the constant is something that
8279 can be loaded with a special instruction. */
8282 standard_80387_constant_p (rtx x)
8284 enum machine_mode mode = GET_MODE (x);
8286 REAL_VALUE_TYPE r;
8288 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8289 return -1;
8291 if (x == CONST0_RTX (mode))
8292 return 1;
8293 if (x == CONST1_RTX (mode))
8294 return 2;
8296 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8298 /* For XFmode constants, try to find a special 80387 instruction when
8299 optimizing for size or on those CPUs that benefit from them. */
8300 if (mode == XFmode
8301 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8303 int i;
8305 if (! ext_80387_constants_init)
8306 init_ext_80387_constants ();
8308 for (i = 0; i < 5; i++)
8309 if (real_identical (&r, &ext_80387_constants_table[i]))
8310 return i + 3;
8313 /* Load of the constant -0.0 or -1.0 will be split as
8314 fldz;fchs or fld1;fchs sequence. */
8315 if (real_isnegzero (&r))
8316 return 8;
8317 if (real_identical (&r, &dconstm1))
8318 return 9;
8320 return 0;
8323 /* Return the opcode of the special instruction to be used to load
8324 the constant X. */
8326 const char *
8327 standard_80387_constant_opcode (rtx x)
8329 switch (standard_80387_constant_p (x))
8331 case 1:
8332 return "fldz";
8333 case 2:
8334 return "fld1";
8335 case 3:
8336 return "fldlg2";
8337 case 4:
8338 return "fldln2";
8339 case 5:
8340 return "fldl2e";
8341 case 6:
8342 return "fldl2t";
8343 case 7:
8344 return "fldpi";
8345 case 8:
8346 case 9:
8347 return "#";
8348 default:
8349 gcc_unreachable ();
8353 /* Return the CONST_DOUBLE representing the 80387 constant that is
8354 loaded by the specified special instruction. The argument IDX
8355 matches the return value from standard_80387_constant_p. */
8358 standard_80387_constant_rtx (int idx)
8360 int i;
8362 if (! ext_80387_constants_init)
8363 init_ext_80387_constants ();
8365 switch (idx)
8367 case 3:
8368 case 4:
8369 case 5:
8370 case 6:
8371 case 7:
8372 i = idx - 3;
8373 break;
8375 default:
8376 gcc_unreachable ();
8379 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8380 XFmode);
8383 /* Return 1 if X is all 0s and 2 if X is all 1s
8384 in a supported SSE/AVX vector mode. */
8387 standard_sse_constant_p (rtx x)
8389 enum machine_mode mode = GET_MODE (x);
8391 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8392 return 1;
8393 if (vector_all_ones_operand (x, mode))
8394 switch (mode)
8396 case V16QImode:
8397 case V8HImode:
8398 case V4SImode:
8399 case V2DImode:
8400 if (TARGET_SSE2)
8401 return 2;
8402 case V32QImode:
8403 case V16HImode:
8404 case V8SImode:
8405 case V4DImode:
8406 if (TARGET_AVX2)
8407 return 2;
8408 default:
8409 break;
8412 return 0;
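/* Illustrative note (not part of the upstream source): the return values
   above feed standard_sse_constant_opcode below -- 1 (all zeros) becomes a
   register-clearing pxor/xorps/xorpd (or their VEX forms), while 2 (all
   ones) becomes a pcmpeqd of the register with itself.  */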
8415 /* Return the opcode of the special instruction to be used to load
8416 the constant X. */
8418 const char *
8419 standard_sse_constant_opcode (rtx insn, rtx x)
8421 switch (standard_sse_constant_p (x))
8423 case 1:
8424 switch (get_attr_mode (insn))
8426 case MODE_TI:
8427 return "%vpxor\t%0, %d0";
8428 case MODE_V2DF:
8429 return "%vxorpd\t%0, %d0";
8430 case MODE_V4SF:
8431 return "%vxorps\t%0, %d0";
8433 case MODE_OI:
8434 return "vpxor\t%x0, %x0, %x0";
8435 case MODE_V4DF:
8436 return "vxorpd\t%x0, %x0, %x0";
8437 case MODE_V8SF:
8438 return "vxorps\t%x0, %x0, %x0";
8440 default:
8441 break;
8444 case 2:
8445 if (TARGET_AVX)
8446 return "vpcmpeqd\t%0, %0, %0";
8447 else
8448 return "pcmpeqd\t%0, %0";
8450 default:
8451 break;
8453 gcc_unreachable ();
8456 /* Returns true if OP contains a symbol reference */
8458 bool
8459 symbolic_reference_mentioned_p (rtx op)
8461 const char *fmt;
8462 int i;
8464 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8465 return true;
8467 fmt = GET_RTX_FORMAT (GET_CODE (op));
8468 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8470 if (fmt[i] == 'E')
8472 int j;
8474 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8475 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8476 return true;
8479 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8480 return true;
8483 return false;
8486 /* Return true if it is appropriate to emit `ret' instructions in the
8487 body of a function. Do this only if the epilogue is simple, needing a
8488 couple of insns. Prior to reloading, we can't tell how many registers
8489 must be saved, so return false then. Return false if there is no frame
8490 marker to de-allocate. */
8492 bool
8493 ix86_can_use_return_insn_p (void)
8495 struct ix86_frame frame;
8497 if (! reload_completed || frame_pointer_needed)
8498 return 0;
8500 /* Don't allow more than 32k pop, since that's all we can do
8501 with one instruction. */
8502 if (crtl->args.pops_args && crtl->args.size >= 32768)
8503 return 0;
8505 ix86_compute_frame_layout (&frame);
8506 return (frame.stack_pointer_offset == UNITS_PER_WORD
8507 && (frame.nregs + frame.nsseregs) == 0);
8510 /* Value should be nonzero if functions must have frame pointers.
8511 Zero means the frame pointer need not be set up (and parms may
8512 be accessed via the stack pointer) in functions that seem suitable. */
8514 static bool
8515 ix86_frame_pointer_required (void)
8517 /* If we accessed previous frames, then the generated code expects
8518 to be able to access the saved ebp value in our frame. */
8519 if (cfun->machine->accesses_prev_frame)
8520 return true;
8522 /* Several x86 OSes need a frame pointer for other reasons,
8523 usually pertaining to setjmp. */
8524 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8525 return true;
8527 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8528 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8529 return true;
8531 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
8532 stack allocation is 4GB. */
8533 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8534 return true;
8536 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8537 turns off the frame pointer by default. Turn it back on now if
8538 we've not got a leaf function. */
8539 if (TARGET_OMIT_LEAF_FRAME_POINTER
8540 && (!crtl->is_leaf
8541 || ix86_current_function_calls_tls_descriptor))
8542 return true;
8544 if (crtl->profile && !flag_fentry)
8545 return true;
8547 return false;
8550 /* Record that the current function accesses previous call frames. */
8552 void
8553 ix86_setup_frame_addresses (void)
8555 cfun->machine->accesses_prev_frame = 1;
8558 #ifndef USE_HIDDEN_LINKONCE
8559 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8560 # define USE_HIDDEN_LINKONCE 1
8561 # else
8562 # define USE_HIDDEN_LINKONCE 0
8563 # endif
8564 #endif
8566 static int pic_labels_used;
8568 /* Fills in the label name that should be used for a pc thunk for
8569 the given register. */
8571 static void
8572 get_pc_thunk_name (char name[32], unsigned int regno)
8574 gcc_assert (!TARGET_64BIT);
8576 if (USE_HIDDEN_LINKONCE)
8577 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8578 else
8579 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8583 /* This function generates code for -fpic that loads %ebx with
8584 the return address of the caller and then returns. */
8586 static void
8587 ix86_code_end (void)
8589 rtx xops[2];
8590 int regno;
8592 for (regno = AX_REG; regno <= SP_REG; regno++)
8594 char name[32];
8595 tree decl;
8597 if (!(pic_labels_used & (1 << regno)))
8598 continue;
8600 get_pc_thunk_name (name, regno);
8602 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8603 get_identifier (name),
8604 build_function_type_list (void_type_node, NULL_TREE));
8605 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8606 NULL_TREE, void_type_node);
8607 TREE_PUBLIC (decl) = 1;
8608 TREE_STATIC (decl) = 1;
8609 DECL_IGNORED_P (decl) = 1;
8611 #if TARGET_MACHO
8612 if (TARGET_MACHO)
8614 switch_to_section (darwin_sections[text_coal_section]);
8615 fputs ("\t.weak_definition\t", asm_out_file);
8616 assemble_name (asm_out_file, name);
8617 fputs ("\n\t.private_extern\t", asm_out_file);
8618 assemble_name (asm_out_file, name);
8619 putc ('\n', asm_out_file);
8620 ASM_OUTPUT_LABEL (asm_out_file, name);
8621 DECL_WEAK (decl) = 1;
8623 else
8624 #endif
8625 if (USE_HIDDEN_LINKONCE)
8627 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8629 targetm.asm_out.unique_section (decl, 0);
8630 switch_to_section (get_named_section (decl, NULL, 0));
8632 targetm.asm_out.globalize_label (asm_out_file, name);
8633 fputs ("\t.hidden\t", asm_out_file);
8634 assemble_name (asm_out_file, name);
8635 putc ('\n', asm_out_file);
8636 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8638 else
8640 switch_to_section (text_section);
8641 ASM_OUTPUT_LABEL (asm_out_file, name);
8644 DECL_INITIAL (decl) = make_node (BLOCK);
8645 current_function_decl = decl;
8646 init_function_start (decl);
8647 first_function_block_is_cold = false;
8648 /* Make sure unwind info is emitted for the thunk if needed. */
8649 final_start_function (emit_barrier (), asm_out_file, 1);
8651 /* Pad stack IP move with 4 instructions (two NOPs count
8652 as one instruction). */
8653 if (TARGET_PAD_SHORT_FUNCTION)
8655 int i = 8;
8657 while (i--)
8658 fputs ("\tnop\n", asm_out_file);
8661 xops[0] = gen_rtx_REG (Pmode, regno);
8662 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8663 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8664 fputs ("\tret\n", asm_out_file);
8665 final_end_function ();
8666 init_insn_lengths ();
8667 free_after_compilation (cfun);
8668 set_cfun (NULL);
8669 current_function_decl = NULL;
8672 if (flag_split_stack)
8673 file_end_indicate_split_stack ();
8676 /* Emit code for the SET_GOT patterns. */
8678 const char *
8679 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8681 rtx xops[3];
8683 xops[0] = dest;
8685 if (TARGET_VXWORKS_RTP && flag_pic)
8687 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8688 xops[2] = gen_rtx_MEM (Pmode,
8689 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8690 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8692 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8693 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8694 an unadorned address. */
8695 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8696 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8697 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8698 return "";
8701 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8703 if (!flag_pic)
8705 if (TARGET_MACHO)
8706 /* We don't need a pic base, we're not producing pic. */
8707 gcc_unreachable ();
8709 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8710 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8711 targetm.asm_out.internal_label (asm_out_file, "L",
8712 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8714 else
8716 char name[32];
8717 get_pc_thunk_name (name, REGNO (dest));
8718 pic_labels_used |= 1 << REGNO (dest);
8720 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8721 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8722 output_asm_insn ("call\t%X2", xops);
8724 #if TARGET_MACHO
8725 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8726 This is what will be referenced by the Mach-O PIC subsystem. */
8727 if (machopic_should_output_picbase_label () || !label)
8728 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8730 /* When we are restoring the pic base at the site of a nonlocal label,
8731 and we decided to emit the pic base above, we will still output a
8732 local label used for calculating the correction offset (even though
8733 the offset will be 0 in that case). */
8734 if (label)
8735 targetm.asm_out.internal_label (asm_out_file, "L",
8736 CODE_LABEL_NUMBER (label));
8737 #endif
8740 if (!TARGET_MACHO)
8741 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8743 return "";
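/* Illustrative sketch (not part of the upstream source): on 32-bit ELF with
   -fpic, the SET_GOT pattern typically expands to

       call  __x86.get_pc_thunk.bx
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk emitted by ix86_code_end above is simply
   "movl (%esp), %ebx; ret".  */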
8746 /* Generate a "push" pattern for input ARG. */
8748 static rtx
8749 gen_push (rtx arg)
8751 struct machine_function *m = cfun->machine;
8753 if (m->fs.cfa_reg == stack_pointer_rtx)
8754 m->fs.cfa_offset += UNITS_PER_WORD;
8755 m->fs.sp_offset += UNITS_PER_WORD;
8757 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8758 arg = gen_rtx_REG (word_mode, REGNO (arg));
8760 return gen_rtx_SET (VOIDmode,
8761 gen_rtx_MEM (word_mode,
8762 gen_rtx_PRE_DEC (Pmode,
8763 stack_pointer_rtx)),
8764 arg);
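/* Illustrative note (not part of the upstream source): on a 64-bit target,
   gen_push (reg) above produces RTL of the shape

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))

   while also advancing the tracked stack-pointer (and, when the CFA is the
   stack pointer, CFA) offsets by UNITS_PER_WORD.  */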
8767 /* Generate a "pop" pattern for input ARG. */
8769 static rtx
8770 gen_pop (rtx arg)
8772 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8773 arg = gen_rtx_REG (word_mode, REGNO (arg));
8775 return gen_rtx_SET (VOIDmode,
8776 arg,
8777 gen_rtx_MEM (word_mode,
8778 gen_rtx_POST_INC (Pmode,
8779 stack_pointer_rtx)));
8782 /* Return >= 0 if there is an unused call-clobbered register available
8783 for the entire function. */
8785 static unsigned int
8786 ix86_select_alt_pic_regnum (void)
8788 if (crtl->is_leaf
8789 && !crtl->profile
8790 && !ix86_current_function_calls_tls_descriptor)
8792 int i, drap;
8793 /* Can't use the same register for both PIC and DRAP. */
8794 if (crtl->drap_reg)
8795 drap = REGNO (crtl->drap_reg);
8796 else
8797 drap = -1;
8798 for (i = 2; i >= 0; --i)
8799 if (i != drap && !df_regs_ever_live_p (i))
8800 return i;
8803 return INVALID_REGNUM;
8806 /* Return TRUE if we need to save REGNO. */
8808 static bool
8809 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8811 if (pic_offset_table_rtx
8812 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8813 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8814 || crtl->profile
8815 || crtl->calls_eh_return
8816 || crtl->uses_const_pool
8817 || cfun->has_nonlocal_label))
8818 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8820 if (crtl->calls_eh_return && maybe_eh_return)
8822 unsigned i;
8823 for (i = 0; ; i++)
8825 unsigned test = EH_RETURN_DATA_REGNO (i);
8826 if (test == INVALID_REGNUM)
8827 break;
8828 if (test == regno)
8829 return true;
8833 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8834 return true;
8836 return (df_regs_ever_live_p (regno)
8837 && !call_used_regs[regno]
8838 && !fixed_regs[regno]
8839 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8842 /* Return the number of saved general purpose registers. */
8844 static int
8845 ix86_nsaved_regs (void)
8847 int nregs = 0;
8848 int regno;
8850 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8851 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8852 nregs ++;
8853 return nregs;
8856 /* Return the number of saved SSE registers. */
8858 static int
8859 ix86_nsaved_sseregs (void)
8861 int nregs = 0;
8862 int regno;
8864 if (!TARGET_64BIT_MS_ABI)
8865 return 0;
8866 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8867 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8868 nregs ++;
8869 return nregs;
8872 /* Given FROM and TO register numbers, say whether this elimination is
8873 allowed. If stack alignment is needed, we can only replace argument
8874 pointer with hard frame pointer, or replace frame pointer with stack
8875 pointer. Otherwise, frame pointer elimination is automatically
8876 handled and all other eliminations are valid. */
8878 static bool
8879 ix86_can_eliminate (const int from, const int to)
8881 if (stack_realign_fp)
8882 return ((from == ARG_POINTER_REGNUM
8883 && to == HARD_FRAME_POINTER_REGNUM)
8884 || (from == FRAME_POINTER_REGNUM
8885 && to == STACK_POINTER_REGNUM));
8886 else
8887 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8890 /* Return the offset between two registers, one to be eliminated, and the other
8891 its replacement, at the start of a routine. */
8893 HOST_WIDE_INT
8894 ix86_initial_elimination_offset (int from, int to)
8896 struct ix86_frame frame;
8897 ix86_compute_frame_layout (&frame);
8899 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8900 return frame.hard_frame_pointer_offset;
8901 else if (from == FRAME_POINTER_REGNUM
8902 && to == HARD_FRAME_POINTER_REGNUM)
8903 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8904 else
8906 gcc_assert (to == STACK_POINTER_REGNUM);
8908 if (from == ARG_POINTER_REGNUM)
8909 return frame.stack_pointer_offset;
8911 gcc_assert (from == FRAME_POINTER_REGNUM);
8912 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8916 /* In a dynamically-aligned function, we can't know the offset from
8917 stack pointer to frame pointer, so we must ensure that setjmp
8918 eliminates fp against the hard fp (%ebp) rather than trying to
8919 index from %esp up to the top of the frame across a gap that is
8920 of unknown (at compile-time) size. */
8921 static rtx
8922 ix86_builtin_setjmp_frame_value (void)
8924 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8927 /* When using -fsplit-stack, the allocation routines set a field in
8928 the TCB to the bottom of the stack plus this much space, measured
8929 in bytes. */
8931 #define SPLIT_STACK_AVAILABLE 256
8933 /* Fill structure ix86_frame about frame of currently computed function. */
8935 static void
8936 ix86_compute_frame_layout (struct ix86_frame *frame)
8938 unsigned HOST_WIDE_INT stack_alignment_needed;
8939 HOST_WIDE_INT offset;
8940 unsigned HOST_WIDE_INT preferred_alignment;
8941 HOST_WIDE_INT size = get_frame_size ();
8942 HOST_WIDE_INT to_allocate;
8944 frame->nregs = ix86_nsaved_regs ();
8945 frame->nsseregs = ix86_nsaved_sseregs ();
8947 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8948 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8950 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8951 except for function prologues and leaf functions. */
8952 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8953 && (!crtl->is_leaf || cfun->calls_alloca != 0
8954 || ix86_current_function_calls_tls_descriptor))
8956 preferred_alignment = 16;
8957 stack_alignment_needed = 16;
8958 crtl->preferred_stack_boundary = 128;
8959 crtl->stack_alignment_needed = 128;
8962 gcc_assert (!size || stack_alignment_needed);
8963 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8964 gcc_assert (preferred_alignment <= stack_alignment_needed);
8966 /* For SEH we have to limit the amount of code movement into the prologue.
8967 At present we do this via a BLOCKAGE, at which point there's very little
8968 scheduling that can be done, which means that there's very little point
8969 in doing anything except PUSHs. */
8970 if (TARGET_SEH)
8971 cfun->machine->use_fast_prologue_epilogue = false;
8973 /* During reload the number of saved registers can change. Recompute
8974 the value as needed. Do not recompute when the number of registers
8975 didn't change, as reload does multiple calls to the function and does not
8976 expect the decision to change within a single iteration. */
8977 else if (!optimize_function_for_size_p (cfun)
8978 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8980 int count = frame->nregs;
8981 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8983 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8985 /* The fast prologue uses move instead of push to save registers. This
8986 is significantly longer, but also executes faster as modern hardware
8987 can execute the moves in parallel, but can't do that for push/pop.
8989 Be careful about choosing what prologue to emit: When function takes
8990 many instructions to execute we may use slow version as well as in
8991 case function is known to be outside hot spot (this is known with
8992 feedback only). Weight the size of function by number of registers
8993 to save as it is cheap to use one or two push instructions but very
8994 slow to use many of them. */
8995 if (count)
8996 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8997 if (node->frequency < NODE_FREQUENCY_NORMAL
8998 || (flag_branch_probabilities
8999 && node->frequency < NODE_FREQUENCY_HOT))
9000 cfun->machine->use_fast_prologue_epilogue = false;
9001 else
9002 cfun->machine->use_fast_prologue_epilogue
9003 = !expensive_function_p (count);
9006 frame->save_regs_using_mov
9007 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9008 /* If static stack checking is enabled and done with probes,
9009 the registers need to be saved before allocating the frame. */
9010 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9012 /* Skip return address. */
9013 offset = UNITS_PER_WORD;
9015 /* Skip pushed static chain. */
9016 if (ix86_static_chain_on_stack)
9017 offset += UNITS_PER_WORD;
9019 /* Skip saved base pointer. */
9020 if (frame_pointer_needed)
9021 offset += UNITS_PER_WORD;
9022 frame->hfp_save_offset = offset;
9024 /* The traditional frame pointer location is at the top of the frame. */
9025 frame->hard_frame_pointer_offset = offset;
9027 /* Register save area */
9028 offset += frame->nregs * UNITS_PER_WORD;
9029 frame->reg_save_offset = offset;
9031 /* On SEH target, registers are pushed just before the frame pointer
9032 location. */
9033 if (TARGET_SEH)
9034 frame->hard_frame_pointer_offset = offset;
9036 /* Align and set SSE register save area. */
9037 if (frame->nsseregs)
9039 /* The only ABI that has saved SSE registers (Win64) also has a
9040 16-byte aligned default stack, and thus we don't need to be
9041 within the re-aligned local stack frame to save them. */
9042 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9043 offset = (offset + 16 - 1) & -16;
9044 offset += frame->nsseregs * 16;
9046 frame->sse_reg_save_offset = offset;
9048 /* The re-aligned stack starts here. Values before this point are not
9049 directly comparable with values below this point. In order to make
9050 sure that no value happens to be the same before and after, force
9051 the alignment computation below to add a non-zero value. */
9052 if (stack_realign_fp)
9053 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9055 /* Va-arg area */
9056 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9057 offset += frame->va_arg_size;
9059 /* Align start of frame for local function. */
9060 if (stack_realign_fp
9061 || offset != frame->sse_reg_save_offset
9062 || size != 0
9063 || !crtl->is_leaf
9064 || cfun->calls_alloca
9065 || ix86_current_function_calls_tls_descriptor)
9066 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
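  /* Worked example of the rounding idiom used above (added for
     illustration): for a power-of-two alignment A, (X + A - 1) & -A
     rounds X up to the next multiple of A; e.g. X = 44, A = 16 gives
     (44 + 15) & -16 = 59 & ~15 = 48.  */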
9068 /* Frame pointer points here. */
9069 frame->frame_pointer_offset = offset;
9071 offset += size;
9073 /* Add outgoing arguments area. Can be skipped if we eliminated
9074 all the function calls as dead code.
9075 Skipping is however impossible when function calls alloca. Alloca
9076 expander assumes that last crtl->outgoing_args_size
9077 of stack frame are unused. */
9078 if (ACCUMULATE_OUTGOING_ARGS
9079 && (!crtl->is_leaf || cfun->calls_alloca
9080 || ix86_current_function_calls_tls_descriptor))
9082 offset += crtl->outgoing_args_size;
9083 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9085 else
9086 frame->outgoing_arguments_size = 0;
9088 /* Align stack boundary. Only needed if we're calling another function
9089 or using alloca. */
9090 if (!crtl->is_leaf || cfun->calls_alloca
9091 || ix86_current_function_calls_tls_descriptor)
9092 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9094 /* We've reached end of stack frame. */
9095 frame->stack_pointer_offset = offset;
9097 /* Size prologue needs to allocate. */
9098 to_allocate = offset - frame->sse_reg_save_offset;
9100 if ((!to_allocate && frame->nregs <= 1)
9101 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9102 frame->save_regs_using_mov = false;
9104 if (ix86_using_red_zone ()
9105 && crtl->sp_is_unchanging
9106 && crtl->is_leaf
9107 && !ix86_current_function_calls_tls_descriptor)
9109 frame->red_zone_size = to_allocate;
9110 if (frame->save_regs_using_mov)
9111 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9112 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9113 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9115 else
9116 frame->red_zone_size = 0;
9117 frame->stack_pointer_offset -= frame->red_zone_size;
9119 /* The SEH frame pointer location is near the bottom of the frame.
9120 This is enforced by the fact that the difference between the
9121 stack pointer and the frame pointer is limited to 240 bytes in
9122 the unwind data structure. */
9123 if (TARGET_SEH)
9125 HOST_WIDE_INT diff;
9127 /* If we can leave the frame pointer where it is, do so; this also
9128 returns the establisher frame for __builtin_frame_address (0). */
9129 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9130 if (diff <= SEH_MAX_FRAME_SIZE
9131 && (diff > 240 || (diff & 15) != 0)
9132 && !crtl->accesses_prior_frames)
9134 /* Ideally we'd determine what portion of the local stack frame
9135 (within the constraint of the lowest 240) is most heavily used.
9136 But without that complication, simply bias the frame pointer
9137 by 128 bytes so as to maximize the amount of the local stack
9138 frame that is addressable with 8-bit offsets. */
9139 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
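      /* Illustrative example (added): if frame->stack_pointer_offset is
         1024 and the natural hard frame pointer location would be more
         than 240 bytes away, the assignment above places the frame
         pointer 128 bytes above the final stack pointer, so the bottom
         256 bytes of the local frame stay reachable with signed 8-bit
         displacements in [-128, +127] from the frame pointer.  */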
9144 /* This is semi-inlined memory_address_length, but simplified
9145 since we know that we're always dealing with reg+offset, and
9146 to avoid having to create and discard all that rtl. */
9148 static inline int
9149 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9151 int len = 4;
9153 if (offset == 0)
9155 /* EBP and R13 cannot be encoded without an offset. */
9156 len = (regno == BP_REG || regno == R13_REG);
9158 else if (IN_RANGE (offset, -128, 127))
9159 len = 1;
9161 /* ESP and R12 must be encoded with a SIB byte. */
9162 if (regno == SP_REG || regno == R12_REG)
9163 len++;
9165 return len;
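/* Illustrative values (added), following the rules above:
     choose_baseaddr_len (AX_REG, 0)    == 0   no displacement byte needed
     choose_baseaddr_len (BP_REG, 0)    == 1   EBP needs an explicit disp8 of 0
     choose_baseaddr_len (SP_REG, 0)    == 1   ESP needs a SIB byte
     choose_baseaddr_len (BP_REG, -64)  == 1   disp8
     choose_baseaddr_len (SP_REG, 100)  == 2   disp8 plus SIB byte
     choose_baseaddr_len (AX_REG, 4096) == 4   disp32  */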
9168 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9169 The valid base registers are taken from CFUN->MACHINE->FS. */
9171 static rtx
9172 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9174 const struct machine_function *m = cfun->machine;
9175 rtx base_reg = NULL;
9176 HOST_WIDE_INT base_offset = 0;
9178 if (m->use_fast_prologue_epilogue)
9180 /* Choose the base register most likely to allow the most scheduling
9181 opportunities. Generally FP is valid throughout the function,
9182 while DRAP must be reloaded within the epilogue. But choose either
9183 over the SP due to increased encoding size. */
9185 if (m->fs.fp_valid)
9187 base_reg = hard_frame_pointer_rtx;
9188 base_offset = m->fs.fp_offset - cfa_offset;
9190 else if (m->fs.drap_valid)
9192 base_reg = crtl->drap_reg;
9193 base_offset = 0 - cfa_offset;
9195 else if (m->fs.sp_valid)
9197 base_reg = stack_pointer_rtx;
9198 base_offset = m->fs.sp_offset - cfa_offset;
9201 else
9203 HOST_WIDE_INT toffset;
9204 int len = 16, tlen;
9206 /* Choose the base register with the smallest address encoding.
9207 With a tie, choose FP > DRAP > SP. */
9208 if (m->fs.sp_valid)
9210 base_reg = stack_pointer_rtx;
9211 base_offset = m->fs.sp_offset - cfa_offset;
9212 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9214 if (m->fs.drap_valid)
9216 toffset = 0 - cfa_offset;
9217 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9218 if (tlen <= len)
9220 base_reg = crtl->drap_reg;
9221 base_offset = toffset;
9222 len = tlen;
9225 if (m->fs.fp_valid)
9227 toffset = m->fs.fp_offset - cfa_offset;
9228 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9229 if (tlen <= len)
9231 base_reg = hard_frame_pointer_rtx;
9232 base_offset = toffset;
9233 len = tlen;
9237 gcc_assert (base_reg != NULL);
9239 return plus_constant (Pmode, base_reg, base_offset);
9242 /* Emit code to save registers in the prologue. */
9244 static void
9245 ix86_emit_save_regs (void)
9247 unsigned int regno;
9248 rtx insn;
9250 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9251 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9253 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9254 RTX_FRAME_RELATED_P (insn) = 1;
9258 /* Emit a single register save at CFA - CFA_OFFSET. */
9260 static void
9261 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9262 HOST_WIDE_INT cfa_offset)
9264 struct machine_function *m = cfun->machine;
9265 rtx reg = gen_rtx_REG (mode, regno);
9266 rtx mem, addr, base, insn;
9268 addr = choose_baseaddr (cfa_offset);
9269 mem = gen_frame_mem (mode, addr);
9271 /* For SSE saves, we need to indicate the 128-bit alignment. */
9272 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9274 insn = emit_move_insn (mem, reg);
9275 RTX_FRAME_RELATED_P (insn) = 1;
9277 base = addr;
9278 if (GET_CODE (base) == PLUS)
9279 base = XEXP (base, 0);
9280 gcc_checking_assert (REG_P (base));
9282 /* When saving registers into a re-aligned local stack frame, avoid
9283 any tricky guessing by dwarf2out. */
9284 if (m->fs.realigned)
9286 gcc_checking_assert (stack_realign_drap);
9288 if (regno == REGNO (crtl->drap_reg))
9290 /* A bit of a hack. We force the DRAP register to be saved in
9291 the re-aligned stack frame, which provides us with a copy
9292 of the CFA that will last past the prologue. Install it. */
9293 gcc_checking_assert (cfun->machine->fs.fp_valid);
9294 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9295 cfun->machine->fs.fp_offset - cfa_offset);
9296 mem = gen_rtx_MEM (mode, addr);
9297 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9299 else
9301 /* The frame pointer is a stable reference within the
9302 aligned frame. Use it. */
9303 gcc_checking_assert (cfun->machine->fs.fp_valid);
9304 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9305 cfun->machine->fs.fp_offset - cfa_offset);
9306 mem = gen_rtx_MEM (mode, addr);
9307 add_reg_note (insn, REG_CFA_EXPRESSION,
9308 gen_rtx_SET (VOIDmode, mem, reg));
9312 /* The memory may not be relative to the current CFA register,
9313 which means that we may need to generate a new pattern for
9314 use by the unwind info. */
9315 else if (base != m->fs.cfa_reg)
9317 addr = plus_constant (Pmode, m->fs.cfa_reg,
9318 m->fs.cfa_offset - cfa_offset);
9319 mem = gen_rtx_MEM (mode, addr);
9320 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9324 /* Emit code to save registers using MOV insns.
9325 First register is stored at CFA - CFA_OFFSET. */
9326 static void
9327 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9329 unsigned int regno;
9331 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9332 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9334 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9335 cfa_offset -= UNITS_PER_WORD;
9339 /* Emit code to save SSE registers using MOV insns.
9340 First register is stored at CFA - CFA_OFFSET. */
9341 static void
9342 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9344 unsigned int regno;
9346 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9347 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9349 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9350 cfa_offset -= 16;
9354 static GTY(()) rtx queued_cfa_restores;
9356 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9357 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9358 Don't add the note if the previously saved value will be left untouched
9359 within the stack red zone until return, as unwinders can find the same value
9360 in the register and on the stack. */
9362 static void
9363 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9365 if (!crtl->shrink_wrapped
9366 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9367 return;
9369 if (insn)
9371 add_reg_note (insn, REG_CFA_RESTORE, reg);
9372 RTX_FRAME_RELATED_P (insn) = 1;
9374 else
9375 queued_cfa_restores
9376 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9379 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9381 static void
9382 ix86_add_queued_cfa_restore_notes (rtx insn)
9384 rtx last;
9385 if (!queued_cfa_restores)
9386 return;
9387 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9389 XEXP (last, 1) = REG_NOTES (insn);
9390 REG_NOTES (insn) = queued_cfa_restores;
9391 queued_cfa_restores = NULL_RTX;
9392 RTX_FRAME_RELATED_P (insn) = 1;
9395 /* Expand prologue or epilogue stack adjustment.
9396 The pattern exists to put a dependency on all ebp-based memory accesses.
9397 STYLE should be negative if instructions should be marked as frame related,
9398 zero if the %r11 register is live and cannot be freely used, and positive
9399 otherwise. */
9401 static void
9402 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9403 int style, bool set_cfa)
9405 struct machine_function *m = cfun->machine;
9406 rtx insn;
9407 bool add_frame_related_expr = false;
9409 if (Pmode == SImode)
9410 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9411 else if (x86_64_immediate_operand (offset, DImode))
9412 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9413 else
9415 rtx tmp;
9416 /* r11 is used by indirect sibcall return as well, set before the
9417 epilogue and used after the epilogue. */
9418 if (style)
9419 tmp = gen_rtx_REG (DImode, R11_REG);
9420 else
9422 gcc_assert (src != hard_frame_pointer_rtx
9423 && dest != hard_frame_pointer_rtx);
9424 tmp = hard_frame_pointer_rtx;
9426 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9427 if (style < 0)
9428 add_frame_related_expr = true;
9430 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9433 insn = emit_insn (insn);
9434 if (style >= 0)
9435 ix86_add_queued_cfa_restore_notes (insn);
9437 if (set_cfa)
9439 rtx r;
9441 gcc_assert (m->fs.cfa_reg == src);
9442 m->fs.cfa_offset += INTVAL (offset);
9443 m->fs.cfa_reg = dest;
9445 r = gen_rtx_PLUS (Pmode, src, offset);
9446 r = gen_rtx_SET (VOIDmode, dest, r);
9447 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9448 RTX_FRAME_RELATED_P (insn) = 1;
9450 else if (style < 0)
9452 RTX_FRAME_RELATED_P (insn) = 1;
9453 if (add_frame_related_expr)
9455 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9456 r = gen_rtx_SET (VOIDmode, dest, r);
9457 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9461 if (dest == stack_pointer_rtx)
9463 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9464 bool valid = m->fs.sp_valid;
9466 if (src == hard_frame_pointer_rtx)
9468 valid = m->fs.fp_valid;
9469 ooffset = m->fs.fp_offset;
9471 else if (src == crtl->drap_reg)
9473 valid = m->fs.drap_valid;
9474 ooffset = 0;
9476 else
9478 /* Else there are two possibilities: SP itself, which we set
9479 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9480 taken care of by hand along the eh_return path. */
9481 gcc_checking_assert (src == stack_pointer_rtx
9482 || offset == const0_rtx);
9485 m->fs.sp_offset = ooffset - INTVAL (offset);
9486 m->fs.sp_valid = valid;
9490 /* Find an available register to be used as the dynamic realign argument
9491 pointer register. Such a register will be written in the prologue and
9492 used at the beginning of the body, so it must not be
9493 1. a parameter passing register.
9494 2. the GOT pointer.
9495 We reuse the static-chain register if it is available. Otherwise, we
9496 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9497 shorter encoding.
9499 Return: the regno of the chosen register. */
9501 static unsigned int
9502 find_drap_reg (void)
9504 tree decl = cfun->decl;
9506 if (TARGET_64BIT)
9508 /* Use R13 for a nested function or a function that needs a static chain.
9509 Since a function with a tail call may use any caller-saved
9510 register in the epilogue, DRAP must not use a caller-saved
9511 register in that case. */
9512 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9513 return R13_REG;
9515 return R10_REG;
9517 else
9519 /* Use DI for a nested function or a function that needs a static chain.
9520 Since a function with a tail call may use any caller-saved
9521 register in the epilogue, DRAP must not use a caller-saved
9522 register in that case. */
9523 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9524 return DI_REG;
9526 /* Reuse static chain register if it isn't used for parameter
9527 passing. */
9528 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9530 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9531 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9532 return CX_REG;
9534 return DI_REG;
9538 /* Return minimum incoming stack alignment. */
9540 static unsigned int
9541 ix86_minimum_incoming_stack_boundary (bool sibcall)
9543 unsigned int incoming_stack_boundary;
9545 /* Prefer the one specified at command line. */
9546 if (ix86_user_incoming_stack_boundary)
9547 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9548 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9549 if -mstackrealign is used, this isn't a sibcall check, and the
9550 estimated stack alignment is 128 bits. */
9551 else if (!sibcall
9552 && !TARGET_64BIT
9553 && ix86_force_align_arg_pointer
9554 && crtl->stack_alignment_estimated == 128)
9555 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9556 else
9557 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9559 /* Incoming stack alignment can be changed on individual functions
9560 via force_align_arg_pointer attribute. We use the smallest
9561 incoming stack boundary. */
9562 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9563 && lookup_attribute (ix86_force_align_arg_pointer_string,
9564 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9565 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9567 /* The incoming stack frame has to be aligned at least at
9568 parm_stack_boundary. */
9569 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9570 incoming_stack_boundary = crtl->parm_stack_boundary;
9572 /* Stack at entrance of main is aligned by runtime. We use the
9573 smallest incoming stack boundary. */
9574 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9575 && DECL_NAME (current_function_decl)
9576 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9577 && DECL_FILE_SCOPE_P (current_function_decl))
9578 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9580 return incoming_stack_boundary;
9583 /* Update incoming stack boundary and estimated stack alignment. */
9585 static void
9586 ix86_update_stack_boundary (void)
9588 ix86_incoming_stack_boundary
9589 = ix86_minimum_incoming_stack_boundary (false);
9591 /* x86_64 varargs needs 16-byte stack alignment for the register save
9592 area. */
9593 if (TARGET_64BIT
9594 && cfun->stdarg
9595 && crtl->stack_alignment_estimated < 128)
9596 crtl->stack_alignment_estimated = 128;
9599 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9600 needed or an rtx for DRAP otherwise. */
9602 static rtx
9603 ix86_get_drap_rtx (void)
9605 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9606 crtl->need_drap = true;
9608 if (stack_realign_drap)
9610 /* Assign DRAP to vDRAP and return vDRAP. */
9611 unsigned int regno = find_drap_reg ();
9612 rtx drap_vreg;
9613 rtx arg_ptr;
9614 rtx seq, insn;
9616 arg_ptr = gen_rtx_REG (Pmode, regno);
9617 crtl->drap_reg = arg_ptr;
9619 start_sequence ();
9620 drap_vreg = copy_to_reg (arg_ptr);
9621 seq = get_insns ();
9622 end_sequence ();
9624 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9625 if (!optimize)
9627 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9628 RTX_FRAME_RELATED_P (insn) = 1;
9630 return drap_vreg;
9632 else
9633 return NULL;
9636 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9638 static rtx
9639 ix86_internal_arg_pointer (void)
9641 return virtual_incoming_args_rtx;
9644 struct scratch_reg {
9645 rtx reg;
9646 bool saved;
9649 /* Return a short-lived scratch register for use on function entry.
9650 In 32-bit mode, it is valid only after the registers are saved
9651 in the prologue. This register must be released by means of
9652 release_scratch_register_on_entry once it is dead. */
9654 static void
9655 get_scratch_register_on_entry (struct scratch_reg *sr)
9657 int regno;
9659 sr->saved = false;
9661 if (TARGET_64BIT)
9663 /* We always use R11 in 64-bit mode. */
9664 regno = R11_REG;
9666 else
9668 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9669 bool fastcall_p
9670 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9671 bool thiscall_p
9672 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9673 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9674 int regparm = ix86_function_regparm (fntype, decl);
9675 int drap_regno
9676 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9678 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9679 for the static chain register. */
9680 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9681 && drap_regno != AX_REG)
9682 regno = AX_REG;
9683 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9684 for the static chain register. */
9685 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9686 regno = AX_REG;
9687 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9688 regno = DX_REG;
9689 /* ecx is the static chain register. */
9690 else if (regparm < 3 && !fastcall_p && !thiscall_p
9691 && !static_chain_p
9692 && drap_regno != CX_REG)
9693 regno = CX_REG;
9694 else if (ix86_save_reg (BX_REG, true))
9695 regno = BX_REG;
9696 /* esi is the static chain register. */
9697 else if (!(regparm == 3 && static_chain_p)
9698 && ix86_save_reg (SI_REG, true))
9699 regno = SI_REG;
9700 else if (ix86_save_reg (DI_REG, true))
9701 regno = DI_REG;
9702 else
9704 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9705 sr->saved = true;
9709 sr->reg = gen_rtx_REG (Pmode, regno);
9710 if (sr->saved)
9712 rtx insn = emit_insn (gen_push (sr->reg));
9713 RTX_FRAME_RELATED_P (insn) = 1;
9717 /* Release a scratch register obtained from the preceding function. */
9719 static void
9720 release_scratch_register_on_entry (struct scratch_reg *sr)
9722 if (sr->saved)
9724 struct machine_function *m = cfun->machine;
9725 rtx x, insn = emit_insn (gen_pop (sr->reg));
9727 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9728 RTX_FRAME_RELATED_P (insn) = 1;
9729 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9730 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9731 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9732 m->fs.sp_offset -= UNITS_PER_WORD;
9736 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9738 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9740 static void
9741 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9743 /* We skip the probe for the first interval + a small dope of 4 words and
9744 probe that many bytes past the specified size to maintain a protection
9745 area at the bottom of the stack. */
9746 const int dope = 4 * UNITS_PER_WORD;
9747 rtx size_rtx = GEN_INT (size), last;
9749 /* See if we have a constant small number of probes to generate. If so,
9750 that's the easy case. The run-time loop is made up of 11 insns in the
9751 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9752 for n # of intervals. */
9753 if (size <= 5 * PROBE_INTERVAL)
9755 HOST_WIDE_INT i, adjust;
9756 bool first_probe = true;
9758 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9759 values of N from 1 until it exceeds SIZE. If only one probe is
9760 needed, this will not generate any code. Then adjust and probe
9761 to PROBE_INTERVAL + SIZE. */
9762 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9764 if (first_probe)
9766 adjust = 2 * PROBE_INTERVAL + dope;
9767 first_probe = false;
9769 else
9770 adjust = PROBE_INTERVAL;
9772 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9773 plus_constant (Pmode, stack_pointer_rtx,
9774 -adjust)));
9775 emit_stack_probe (stack_pointer_rtx);
9778 if (first_probe)
9779 adjust = size + PROBE_INTERVAL + dope;
9780 else
9781 adjust = size + PROBE_INTERVAL - i;
9783 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9784 plus_constant (Pmode, stack_pointer_rtx,
9785 -adjust)));
9786 emit_stack_probe (stack_pointer_rtx);
9788 /* Adjust back to account for the additional first interval. */
9789 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9790 plus_constant (Pmode, stack_pointer_rtx,
9791 PROBE_INTERVAL + dope)));
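      /* Worked example (added for illustration), assuming the default
         4096-byte PROBE_INTERVAL and 64-bit words (so dope = 32): for
         size = 8192 the code above emits
           sp -= 8224   (2 * PROBE_INTERVAL + dope), probe
           sp -= 4096   (size + PROBE_INTERVAL - 8192), probe
           sp += 4128   (PROBE_INTERVAL + dope)
         for a net adjustment of exactly -8192 bytes.  */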
9794 /* Otherwise, do the same as above, but in a loop. Note that we must be
9795 extra careful with variables wrapping around because we might be at
9796 the very top (or the very bottom) of the address space and we have
9797 to be able to handle this case properly; in particular, we use an
9798 equality test for the loop condition. */
9799 else
9801 HOST_WIDE_INT rounded_size;
9802 struct scratch_reg sr;
9804 get_scratch_register_on_entry (&sr);
9807 /* Step 1: round SIZE to the previous multiple of the interval. */
9809 rounded_size = size & -PROBE_INTERVAL;
9812 /* Step 2: compute initial and final value of the loop counter. */
9814 /* SP = SP_0 + PROBE_INTERVAL. */
9815 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9816 plus_constant (Pmode, stack_pointer_rtx,
9817 - (PROBE_INTERVAL + dope))));
9819 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9820 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9821 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9822 gen_rtx_PLUS (Pmode, sr.reg,
9823 stack_pointer_rtx)));
9826 /* Step 3: the loop
9828 while (SP != LAST_ADDR)
9830 SP = SP + PROBE_INTERVAL
9831 probe at SP
9834 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9835 values of N from 1 until it is equal to ROUNDED_SIZE. */
9837 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9840 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9841 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9843 if (size != rounded_size)
9845 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9846 plus_constant (Pmode, stack_pointer_rtx,
9847 rounded_size - size)));
9848 emit_stack_probe (stack_pointer_rtx);
9851 /* Adjust back to account for the additional first interval. */
9852 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9853 plus_constant (Pmode, stack_pointer_rtx,
9854 PROBE_INTERVAL + dope)));
9856 release_scratch_register_on_entry (&sr);
9859 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9861 /* Even if the stack pointer isn't the CFA register, we need to correctly
9862 describe the adjustments made to it, in particular differentiate the
9863 frame-related ones from the frame-unrelated ones. */
9864 if (size > 0)
9866 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9867 XVECEXP (expr, 0, 0)
9868 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9869 plus_constant (Pmode, stack_pointer_rtx, -size));
9870 XVECEXP (expr, 0, 1)
9871 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9872 plus_constant (Pmode, stack_pointer_rtx,
9873 PROBE_INTERVAL + dope + size));
9874 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9875 RTX_FRAME_RELATED_P (last) = 1;
9877 cfun->machine->fs.sp_offset += size;
9880 /* Make sure nothing is scheduled before we are done. */
9881 emit_insn (gen_blockage ());
9884 /* Adjust the stack pointer up to REG while probing it. */
9886 const char *
9887 output_adjust_stack_and_probe (rtx reg)
9889 static int labelno = 0;
9890 char loop_lab[32], end_lab[32];
9891 rtx xops[2];
9893 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9894 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9896 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9898 /* Jump to END_LAB if SP == LAST_ADDR. */
9899 xops[0] = stack_pointer_rtx;
9900 xops[1] = reg;
9901 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9902 fputs ("\tje\t", asm_out_file);
9903 assemble_name_raw (asm_out_file, end_lab);
9904 fputc ('\n', asm_out_file);
9906 /* SP = SP + PROBE_INTERVAL. */
9907 xops[1] = GEN_INT (PROBE_INTERVAL);
9908 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9910 /* Probe at SP. */
9911 xops[1] = const0_rtx;
9912 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9914 fprintf (asm_out_file, "\tjmp\t");
9915 assemble_name_raw (asm_out_file, loop_lab);
9916 fputc ('\n', asm_out_file);
9918 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9920 return "";
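/* Rough illustration (added): on a 64-bit ELF target with AT&T syntax,
   and assuming the default 4096-byte PROBE_INTERVAL, the loop printed
   above looks approximately like

       .LPSRL0:
               cmpq    %r11, %rsp
               je      .LPSRE0
               subq    $4096, %rsp
               orq     $0, (%rsp)
               jmp     .LPSRL0
       .LPSRE0:

   where %r11 is the scratch register holding LAST_ADDR (the register
   chosen by get_scratch_register_on_entry in 64-bit mode).  */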
9923 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9924 inclusive. These are offsets from the current stack pointer. */
9926 static void
9927 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9929 /* See if we have a constant small number of probes to generate. If so,
9930 that's the easy case. The run-time loop is made up of 7 insns in the
9931 generic case while the compile-time loop is made up of n insns for n #
9932 of intervals. */
9933 if (size <= 7 * PROBE_INTERVAL)
9935 HOST_WIDE_INT i;
9937 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9938 it exceeds SIZE. If only one probe is needed, this will not
9939 generate any code. Then probe at FIRST + SIZE. */
9940 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9941 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9942 -(first + i)));
9944 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9945 -(first + size)));
9948 /* Otherwise, do the same as above, but in a loop. Note that we must be
9949 extra careful with variables wrapping around because we might be at
9950 the very top (or the very bottom) of the address space and we have
9951 to be able to handle this case properly; in particular, we use an
9952 equality test for the loop condition. */
9953 else
9955 HOST_WIDE_INT rounded_size, last;
9956 struct scratch_reg sr;
9958 get_scratch_register_on_entry (&sr);
9961 /* Step 1: round SIZE to the previous multiple of the interval. */
9963 rounded_size = size & -PROBE_INTERVAL;
9966 /* Step 2: compute initial and final value of the loop counter. */
9968 /* TEST_OFFSET = FIRST. */
9969 emit_move_insn (sr.reg, GEN_INT (-first));
9971 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9972 last = first + rounded_size;
9975 /* Step 3: the loop
9977 while (TEST_ADDR != LAST_ADDR)
9979 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9980 probe at TEST_ADDR
9983 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9984 until it is equal to ROUNDED_SIZE. */
9986 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9989 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9990 that SIZE is equal to ROUNDED_SIZE. */
9992 if (size != rounded_size)
9993 emit_stack_probe (plus_constant (Pmode,
9994 gen_rtx_PLUS (Pmode,
9995 stack_pointer_rtx,
9996 sr.reg),
9997 rounded_size - size));
9999 release_scratch_register_on_entry (&sr);
10002 /* Make sure nothing is scheduled before we are done. */
10003 emit_insn (gen_blockage ());
10006 /* Probe a range of stack addresses from REG to END, inclusive. These are
10007 offsets from the current stack pointer. */
10009 const char *
10010 output_probe_stack_range (rtx reg, rtx end)
10012 static int labelno = 0;
10013 char loop_lab[32], end_lab[32];
10014 rtx xops[3];
10016 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10017 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10019 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10021 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10022 xops[0] = reg;
10023 xops[1] = end;
10024 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10025 fputs ("\tje\t", asm_out_file);
10026 assemble_name_raw (asm_out_file, end_lab);
10027 fputc ('\n', asm_out_file);
10029 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10030 xops[1] = GEN_INT (PROBE_INTERVAL);
10031 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10033 /* Probe at TEST_ADDR. */
10034 xops[0] = stack_pointer_rtx;
10035 xops[1] = reg;
10036 xops[2] = const0_rtx;
10037 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10039 fprintf (asm_out_file, "\tjmp\t");
10040 assemble_name_raw (asm_out_file, loop_lab);
10041 fputc ('\n', asm_out_file);
10043 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10045 return "";
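/* Rough illustration (added): unlike output_adjust_stack_and_probe, the
   stack pointer itself is not moved here.  REG holds a negated offset
   that walks from -FIRST towards -(FIRST + ROUNDED_SIZE) (hence the
   "sub" even though the comment speaks of adding PROBE_INTERVAL), and
   each probe touches (%rsp,%reg), e.g. "orq $0, (%rsp,%r11)" in AT&T
   syntax on x86-64.  */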
10048 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10049 to be generated in correct form. */
10050 static void
10051 ix86_finalize_stack_realign_flags (void)
10053 /* Check if stack realignment is really needed after reload, and
10054 store the result in cfun. */
10055 unsigned int incoming_stack_boundary
10056 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10057 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10058 unsigned int stack_realign = (incoming_stack_boundary
10059 < (crtl->is_leaf
10060 ? crtl->max_used_stack_slot_alignment
10061 : crtl->stack_alignment_needed));
10063 if (crtl->stack_realign_finalized)
10065 /* After stack_realign_needed is finalized, we can no longer
10066 change it. */
10067 gcc_assert (crtl->stack_realign_needed == stack_realign);
10068 return;
10071 /* If the only reason for frame_pointer_needed is that we conservatively
10072 assumed stack realignment might be needed, but in the end nothing that
10073 needed the stack alignment had been spilled, clear frame_pointer_needed
10074 and say we don't need stack realignment. */
10075 if (stack_realign
10076 && !crtl->need_drap
10077 && frame_pointer_needed
10078 && crtl->is_leaf
10079 && flag_omit_frame_pointer
10080 && crtl->sp_is_unchanging
10081 && !ix86_current_function_calls_tls_descriptor
10082 && !crtl->accesses_prior_frames
10083 && !cfun->calls_alloca
10084 && !crtl->calls_eh_return
10085 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10086 && !ix86_frame_pointer_required ()
10087 && get_frame_size () == 0
10088 && ix86_nsaved_sseregs () == 0
10089 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10091 HARD_REG_SET set_up_by_prologue, prologue_used;
10092 basic_block bb;
10094 CLEAR_HARD_REG_SET (prologue_used);
10095 CLEAR_HARD_REG_SET (set_up_by_prologue);
10096 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10097 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10098 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10099 HARD_FRAME_POINTER_REGNUM);
10100 FOR_EACH_BB (bb)
10102 rtx insn;
10103 FOR_BB_INSNS (bb, insn)
10104 if (NONDEBUG_INSN_P (insn)
10105 && requires_stack_frame_p (insn, prologue_used,
10106 set_up_by_prologue))
10108 crtl->stack_realign_needed = stack_realign;
10109 crtl->stack_realign_finalized = true;
10110 return;
10114 frame_pointer_needed = false;
10115 stack_realign = false;
10116 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10117 crtl->stack_alignment_needed = incoming_stack_boundary;
10118 crtl->stack_alignment_estimated = incoming_stack_boundary;
10119 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10120 crtl->preferred_stack_boundary = incoming_stack_boundary;
10121 df_finish_pass (true);
10122 df_scan_alloc (NULL);
10123 df_scan_blocks ();
10124 df_compute_regs_ever_live (true);
10125 df_analyze ();
10128 crtl->stack_realign_needed = stack_realign;
10129 crtl->stack_realign_finalized = true;
10132 /* Expand the prologue into a bunch of separate insns. */
10134 void
10135 ix86_expand_prologue (void)
10137 struct machine_function *m = cfun->machine;
10138 rtx insn, t;
10139 bool pic_reg_used;
10140 struct ix86_frame frame;
10141 HOST_WIDE_INT allocate;
10142 bool int_registers_saved;
10143 bool sse_registers_saved;
10145 ix86_finalize_stack_realign_flags ();
10147 /* DRAP should not coexist with stack_realign_fp */
10148 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10150 memset (&m->fs, 0, sizeof (m->fs));
10152 /* Initialize CFA state for before the prologue. */
10153 m->fs.cfa_reg = stack_pointer_rtx;
10154 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10156 /* Track SP offset to the CFA. We continue tracking this after we've
10157 swapped the CFA register away from SP. In the case of re-alignment
10158 this is fudged; we're interested in offsets within the local frame. */
10159 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10160 m->fs.sp_valid = true;
10162 ix86_compute_frame_layout (&frame);
10164 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10166 /* We should have already generated an error for any use of
10167 ms_hook on a nested function. */
10168 gcc_checking_assert (!ix86_static_chain_on_stack);
10170 /* Check if profiling is active and whether we shall use the
10171 profiling-before-prologue variant. If so, sorry. */
10172 if (crtl->profile && flag_fentry != 0)
10173 sorry ("ms_hook_prologue attribute isn%'t compatible "
10174 "with -mfentry for 32-bit");
10176 /* In ix86_asm_output_function_label we emitted:
10177 8b ff movl.s %edi,%edi
10178 55 push %ebp
10179 8b ec movl.s %esp,%ebp
10181 This matches the hookable function prologue in Win32 API
10182 functions in Microsoft Windows XP Service Pack 2 and newer.
10183 Wine uses this to enable Windows apps to hook the Win32 API
10184 functions provided by Wine.
10186 What that means is that we've already set up the frame pointer. */
10188 if (frame_pointer_needed
10189 && !(crtl->drap_reg && crtl->stack_realign_needed))
10191 rtx push, mov;
10193 /* We've decided to use the frame pointer already set up.
10194 Describe this to the unwinder by pretending that both
10195 push and mov insns happen right here.
10197 Putting the unwind info here at the end of the ms_hook
10198 is done so that we can make absolutely certain we get
10199 the required byte sequence at the start of the function,
10200 rather than relying on an assembler that can produce
10201 the exact encoding required.
10203 However it does mean (in the unpatched case) that we have
10204 a 1 insn window where the asynchronous unwind info is
10205 incorrect. However, if we placed the unwind info at
10206 its correct location we would have incorrect unwind info
10207 in the patched case. Which is probably all moot since
10208 I don't expect Wine generates dwarf2 unwind info for the
10209 system libraries that use this feature. */
10211 insn = emit_insn (gen_blockage ());
10213 push = gen_push (hard_frame_pointer_rtx);
10214 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10215 stack_pointer_rtx);
10216 RTX_FRAME_RELATED_P (push) = 1;
10217 RTX_FRAME_RELATED_P (mov) = 1;
10219 RTX_FRAME_RELATED_P (insn) = 1;
10220 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10221 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10223 /* Note that gen_push incremented m->fs.cfa_offset, even
10224 though we didn't emit the push insn here. */
10225 m->fs.cfa_reg = hard_frame_pointer_rtx;
10226 m->fs.fp_offset = m->fs.cfa_offset;
10227 m->fs.fp_valid = true;
10229 else
10231 /* The frame pointer is not needed so pop %ebp again.
10232 This leaves us with a pristine state. */
10233 emit_insn (gen_pop (hard_frame_pointer_rtx));
10237 /* The first insn of a function that accepts its static chain on the
10238 stack is to push the register that would be filled in by a direct
10239 call. This insn will be skipped by the trampoline. */
10240 else if (ix86_static_chain_on_stack)
10242 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10243 emit_insn (gen_blockage ());
10245 /* We don't want to interpret this push insn as a register save,
10246 only as a stack adjustment. The real copy of the register as
10247 a save will be done later, if needed. */
10248 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10249 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10250 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10251 RTX_FRAME_RELATED_P (insn) = 1;
10254 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10255 DRAP is needed and stack realignment is really needed after reload. */
10256 if (stack_realign_drap)
10258 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10260 /* Only need to push parameter pointer reg if it is caller saved. */
10261 if (!call_used_regs[REGNO (crtl->drap_reg)])
10263 /* Push arg pointer reg */
10264 insn = emit_insn (gen_push (crtl->drap_reg));
10265 RTX_FRAME_RELATED_P (insn) = 1;
10268 /* Grab the argument pointer. */
10269 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10270 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10271 RTX_FRAME_RELATED_P (insn) = 1;
10272 m->fs.cfa_reg = crtl->drap_reg;
10273 m->fs.cfa_offset = 0;
10275 /* Align the stack. */
10276 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10277 stack_pointer_rtx,
10278 GEN_INT (-align_bytes)));
10279 RTX_FRAME_RELATED_P (insn) = 1;
10281 /* Replicate the return address on the stack so that return
10282 address can be reached via (argp - 1) slot. This is needed
10283 to implement macro RETURN_ADDR_RTX and intrinsic function
10284 expand_builtin_return_addr etc. */
10285 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10286 t = gen_frame_mem (word_mode, t);
10287 insn = emit_insn (gen_push (t));
10288 RTX_FRAME_RELATED_P (insn) = 1;
10290 /* For the purposes of frame and register save area addressing,
10291 we've started over with a new frame. */
10292 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10293 m->fs.realigned = true;
10296 int_registers_saved = (frame.nregs == 0);
10297 sse_registers_saved = (frame.nsseregs == 0);
10299 if (frame_pointer_needed && !m->fs.fp_valid)
10301 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10302 slower on all targets. Also sdb doesn't like it. */
10303 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10304 RTX_FRAME_RELATED_P (insn) = 1;
10306 /* Push registers now, before setting the frame pointer
10307 on SEH target. */
10308 if (!int_registers_saved
10309 && TARGET_SEH
10310 && !frame.save_regs_using_mov)
10312 ix86_emit_save_regs ();
10313 int_registers_saved = true;
10314 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10317 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10319 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10320 RTX_FRAME_RELATED_P (insn) = 1;
10322 if (m->fs.cfa_reg == stack_pointer_rtx)
10323 m->fs.cfa_reg = hard_frame_pointer_rtx;
10324 m->fs.fp_offset = m->fs.sp_offset;
10325 m->fs.fp_valid = true;
10329 if (!int_registers_saved)
10331 /* If saving registers via PUSH, do so now. */
10332 if (!frame.save_regs_using_mov)
10334 ix86_emit_save_regs ();
10335 int_registers_saved = true;
10336 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10339 /* When using the red zone we may start register saving before allocating
10340 the stack frame, saving one cycle of the prologue. However, avoid
10341 doing this if we have to probe the stack; at least on x86_64 the
10342 stack probe can turn into a call that clobbers a red zone location. */
10343 else if (ix86_using_red_zone ()
10344 && (! TARGET_STACK_PROBE
10345 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10347 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10348 int_registers_saved = true;
10352 if (stack_realign_fp)
10354 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10355 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10357 /* The computation of the size of the re-aligned stack frame means
10358 that we must allocate the size of the register save area before
10359 performing the actual alignment. Otherwise we cannot guarantee
10360 that there's enough storage above the realignment point. */
10361 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10362 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10363 GEN_INT (m->fs.sp_offset
10364 - frame.sse_reg_save_offset),
10365 -1, false);
10367 /* Align the stack. */
10368 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10369 stack_pointer_rtx,
10370 GEN_INT (-align_bytes)));
10372 /* For the purposes of register save area addressing, the stack
10373 pointer is no longer valid. As for the value of sp_offset,
10374 see ix86_compute_frame_layout, which we need to match in order
10375 to pass verification of stack_pointer_offset at the end. */
10376 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10377 m->fs.sp_valid = false;
10380 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10382 if (flag_stack_usage_info)
10384 /* We start to count from ARG_POINTER. */
10385 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10387 /* If it was realigned, take into account the fake frame. */
10388 if (stack_realign_drap)
10390 if (ix86_static_chain_on_stack)
10391 stack_size += UNITS_PER_WORD;
10393 if (!call_used_regs[REGNO (crtl->drap_reg)])
10394 stack_size += UNITS_PER_WORD;
10396 /* This over-estimates by 1 minimal-stack-alignment-unit but
10397 mitigates that by counting in the new return address slot. */
10398 current_function_dynamic_stack_size
10399 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10402 current_function_static_stack_size = stack_size;
10405 /* On SEH target with very large frame size, allocate an area to save
10406 SSE registers (as the very large allocation won't be described). */
10407 if (TARGET_SEH
10408 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10409 && !sse_registers_saved)
10411 HOST_WIDE_INT sse_size =
10412 frame.sse_reg_save_offset - frame.reg_save_offset;
10414 gcc_assert (int_registers_saved);
10416 /* No need to do stack checking as the area will be immediately
10417 written. */
10418 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10419 GEN_INT (-sse_size), -1,
10420 m->fs.cfa_reg == stack_pointer_rtx);
10421 allocate -= sse_size;
10422 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10423 sse_registers_saved = true;
10426 /* The stack has already been decremented by the instruction calling us
10427 so probe if the size is non-negative to preserve the protection area. */
10428 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10430 /* We expect the registers to be saved when probes are used. */
10431 gcc_assert (int_registers_saved);
10433 if (STACK_CHECK_MOVING_SP)
10435 ix86_adjust_stack_and_probe (allocate);
10436 allocate = 0;
10438 else
10440 HOST_WIDE_INT size = allocate;
10442 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10443 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10445 if (TARGET_STACK_PROBE)
10446 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10447 else
10448 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10452 if (allocate == 0)
10454 else if (!ix86_target_stack_probe ()
10455 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10457 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10458 GEN_INT (-allocate), -1,
10459 m->fs.cfa_reg == stack_pointer_rtx);
10461 else
10463 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10464 rtx r10 = NULL;
10465 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10466 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10467 bool eax_live = false;
10468 bool r10_live = false;
10470 if (TARGET_64BIT)
10471 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10472 if (!TARGET_64BIT_MS_ABI)
10473 eax_live = ix86_eax_live_at_start_p ();
10475 /* Note that SEH directives need to continue tracking the stack
10476 pointer even after the frame pointer has been set up. */
10477 if (eax_live)
10479 insn = emit_insn (gen_push (eax));
10480 allocate -= UNITS_PER_WORD;
10481 if (sp_is_cfa_reg || TARGET_SEH)
10483 if (sp_is_cfa_reg)
10484 m->fs.cfa_offset += UNITS_PER_WORD;
10485 RTX_FRAME_RELATED_P (insn) = 1;
10489 if (r10_live)
10491 r10 = gen_rtx_REG (Pmode, R10_REG);
10492 insn = emit_insn (gen_push (r10));
10493 allocate -= UNITS_PER_WORD;
10494 if (sp_is_cfa_reg || TARGET_SEH)
10496 if (sp_is_cfa_reg)
10497 m->fs.cfa_offset += UNITS_PER_WORD;
10498 RTX_FRAME_RELATED_P (insn) = 1;
10502 emit_move_insn (eax, GEN_INT (allocate));
10503 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10505 /* Use the fact that AX still contains ALLOCATE. */
10506 adjust_stack_insn = (Pmode == DImode
10507 ? gen_pro_epilogue_adjust_stack_di_sub
10508 : gen_pro_epilogue_adjust_stack_si_sub);
10510 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10511 stack_pointer_rtx, eax));
10513 if (sp_is_cfa_reg || TARGET_SEH)
10515 if (sp_is_cfa_reg)
10516 m->fs.cfa_offset += allocate;
10517 RTX_FRAME_RELATED_P (insn) = 1;
10518 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10519 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10520 plus_constant (Pmode, stack_pointer_rtx,
10521 -allocate)));
10523 m->fs.sp_offset += allocate;
10525 /* Use stack_pointer_rtx for relative addressing so that code
10526 works for realigned stack, too. */
10527 if (r10_live && eax_live)
10529 t = plus_constant (Pmode, stack_pointer_rtx, allocate);
10530 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10531 gen_frame_mem (word_mode, t));
10532 t = plus_constant (Pmode, stack_pointer_rtx,
10533 allocate - UNITS_PER_WORD);
10534 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10535 gen_frame_mem (word_mode, t));
10537 else if (eax_live || r10_live)
10539 t = plus_constant (Pmode, stack_pointer_rtx, allocate);
10540 emit_move_insn (gen_rtx_REG (word_mode,
10541 (eax_live ? AX_REG : R10_REG)),
10542 gen_frame_mem (word_mode, t));
10545 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10547 /* If we haven't already set up the frame pointer, do so now. */
10548 if (frame_pointer_needed && !m->fs.fp_valid)
10550 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10551 GEN_INT (frame.stack_pointer_offset
10552 - frame.hard_frame_pointer_offset));
10553 insn = emit_insn (insn);
10554 RTX_FRAME_RELATED_P (insn) = 1;
10555 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10557 if (m->fs.cfa_reg == stack_pointer_rtx)
10558 m->fs.cfa_reg = hard_frame_pointer_rtx;
10559 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10560 m->fs.fp_valid = true;
10563 if (!int_registers_saved)
10564 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10565 if (!sse_registers_saved)
10566 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10568 pic_reg_used = false;
10569 if (pic_offset_table_rtx
10570 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10571 || crtl->profile))
10573 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10575 if (alt_pic_reg_used != INVALID_REGNUM)
10576 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10578 pic_reg_used = true;
10581 if (pic_reg_used)
10583 if (TARGET_64BIT)
10585 if (ix86_cmodel == CM_LARGE_PIC)
10587 rtx label, tmp_reg;
10589 gcc_assert (Pmode == DImode);
10590 label = gen_label_rtx ();
10591 emit_label (label);
10592 LABEL_PRESERVE_P (label) = 1;
10593 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10594 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10595 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10596 label));
10597 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10598 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10599 pic_offset_table_rtx, tmp_reg));
10601 else
10602 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10604 else
10606 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10607 RTX_FRAME_RELATED_P (insn) = 1;
10608 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10612 /* In the pic_reg_used case, make sure that the got load isn't deleted
10613 when mcount needs it. Blockage to avoid call movement across mcount
10614 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10615 note. */
10616 if (crtl->profile && !flag_fentry && pic_reg_used)
10617 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10619 if (crtl->drap_reg && !crtl->stack_realign_needed)
10621 /* vDRAP is set up, but after reload it turns out stack realignment
10622 isn't necessary; here we emit the prologue to set up DRAP
10623 without the stack realignment adjustment. */
10624 t = choose_baseaddr (0);
10625 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10628 /* Prevent instructions from being scheduled into the register save push
10629 sequence when access to the red-zone area is done through the frame pointer.
10630 The offset between the frame pointer and the stack pointer is calculated
10631 relative to the value of the stack pointer at the end of the function
10632 prologue, and moving instructions that access the red-zone area via the frame
10633 pointer inside the push sequence violates this assumption. */
10634 if (frame_pointer_needed && frame.red_zone_size)
10635 emit_insn (gen_memory_blockage ());
10637 /* Emit cld instruction if stringops are used in the function. */
10638 if (TARGET_CLD && ix86_current_function_needs_cld)
10639 emit_insn (gen_cld ());
10641 /* SEH requires that the prologue end within 256 bytes of the start of
10642 the function. Prevent instruction schedules that would extend that.
10643 Further, prevent alloca modifications to the stack pointer from being
10644 combined with prologue modifications. */
10645 if (TARGET_SEH)
10646 emit_insn (gen_prologue_use (stack_pointer_rtx));
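/* As a rough illustration, a small 32-bit PIC function whose prologue
   takes the paths above might end up emitting

       pushl %ebp
       movl  %esp, %ebp
       pushl %ebx
       subl  $24, %esp
       call  __x86.get_pc_thunk.bx
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx

   where the 24-byte allocation is a made-up example and the thunk
   suffix follows the chosen PIC register.  */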
10649 /* Emit code to restore REG using a POP insn. */
10651 static void
10652 ix86_emit_restore_reg_using_pop (rtx reg)
10654 struct machine_function *m = cfun->machine;
10655 rtx insn = emit_insn (gen_pop (reg));
10657 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10658 m->fs.sp_offset -= UNITS_PER_WORD;
10660 if (m->fs.cfa_reg == crtl->drap_reg
10661 && REGNO (reg) == REGNO (crtl->drap_reg))
10663 /* Previously we'd represented the CFA as an expression
10664 like *(%ebp - 8). We've just popped that value from
10665 the stack, which means we need to reset the CFA to
10666 the drap register. This will remain until we restore
10667 the stack pointer. */
10668 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10669 RTX_FRAME_RELATED_P (insn) = 1;
10671 /* This means that the DRAP register is valid for addressing too. */
10672 m->fs.drap_valid = true;
10673 return;
10676 if (m->fs.cfa_reg == stack_pointer_rtx)
10678 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10679 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10680 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10681 RTX_FRAME_RELATED_P (insn) = 1;
10683 m->fs.cfa_offset -= UNITS_PER_WORD;
10686 /* When the frame pointer is the CFA, and we pop it, we are
10687 swapping back to the stack pointer as the CFA. This happens
10688 for stack frames that don't allocate other data, so we assume
10689 the stack pointer is now pointing at the return address, i.e.
10690 the function entry state, which makes the offset one word. */
10691 if (reg == hard_frame_pointer_rtx)
10693 m->fs.fp_valid = false;
10694 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10696 m->fs.cfa_reg = stack_pointer_rtx;
10697 m->fs.cfa_offset -= UNITS_PER_WORD;
10699 add_reg_note (insn, REG_CFA_DEF_CFA,
10700 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10701 GEN_INT (m->fs.cfa_offset)));
10702 RTX_FRAME_RELATED_P (insn) = 1;
10707 /* Emit code to restore saved registers using POP insns. */
10709 static void
10710 ix86_emit_restore_regs_using_pop (void)
10712 unsigned int regno;
10714 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10715 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10716 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10719 /* Emit code and notes for the LEAVE instruction. */
10721 static void
10722 ix86_emit_leave (void)
10724 struct machine_function *m = cfun->machine;
10725 rtx insn = emit_insn (ix86_gen_leave ());
10727 ix86_add_queued_cfa_restore_notes (insn);
10729 gcc_assert (m->fs.fp_valid);
10730 m->fs.sp_valid = true;
10731 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10732 m->fs.fp_valid = false;
10734 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10736 m->fs.cfa_reg = stack_pointer_rtx;
10737 m->fs.cfa_offset = m->fs.sp_offset;
10739 add_reg_note (insn, REG_CFA_DEF_CFA,
10740 plus_constant (Pmode, stack_pointer_rtx,
10741 m->fs.sp_offset));
10742 RTX_FRAME_RELATED_P (insn) = 1;
10744 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10745 m->fs.fp_offset);
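/* As an illustration, on a 64-bit target "leave" behaves roughly like

       movq %rbp, %rsp
       popq %rbp

   so afterwards the stack pointer is valid again, one word closer to
   the CFA than the old frame pointer, which is what the frame-state
   update above records.  */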
10748 /* Emit code to restore saved registers using MOV insns.
10749 First register is restored from CFA - CFA_OFFSET. */
10750 static void
10751 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10752 bool maybe_eh_return)
10754 struct machine_function *m = cfun->machine;
10755 unsigned int regno;
10757 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10758 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10760 rtx reg = gen_rtx_REG (word_mode, regno);
10761 rtx insn, mem;
10763 mem = choose_baseaddr (cfa_offset);
10764 mem = gen_frame_mem (word_mode, mem);
10765 insn = emit_move_insn (reg, mem);
10767 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10769 /* Previously we'd represented the CFA as an expression
10770 like *(%ebp - 8). We've just popped that value from
10771 the stack, which means we need to reset the CFA to
10772 the drap register. This will remain until we restore
10773 the stack pointer. */
10774 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10775 RTX_FRAME_RELATED_P (insn) = 1;
10777 /* This means that the DRAP register is valid for addressing. */
10778 m->fs.drap_valid = true;
10780 else
10781 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10783 cfa_offset -= UNITS_PER_WORD;
10787 /* Emit code to restore saved SSE registers using MOV insns.
10788 The first register is restored from CFA - CFA_OFFSET. */
10789 static void
10790 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10791 bool maybe_eh_return)
10793 unsigned int regno;
10795 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10796 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10798 rtx reg = gen_rtx_REG (V4SFmode, regno);
10799 rtx mem;
10801 mem = choose_baseaddr (cfa_offset);
10802 mem = gen_rtx_MEM (V4SFmode, mem);
10803 set_mem_align (mem, 128);
10804 emit_move_insn (reg, mem);
10806 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10808 cfa_offset -= 16;
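/* For example, an MS-ABI function that saved %xmm6 gets it back as an
   aligned vector load, roughly "movaps <offset>(%rsp), %xmm6"; the base
   register and offset are only illustrative and come from choose_baseaddr
   above, with each further SSE register 16 bytes apart.  */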
10812 /* Restore function stack, frame, and registers. */
10814 void
10815 ix86_expand_epilogue (int style)
10817 struct machine_function *m = cfun->machine;
10818 struct machine_frame_state frame_state_save = m->fs;
10819 struct ix86_frame frame;
10820 bool restore_regs_via_mov;
10821 bool using_drap;
10823 ix86_finalize_stack_realign_flags ();
10824 ix86_compute_frame_layout (&frame);
10826 m->fs.sp_valid = (!frame_pointer_needed
10827 || (crtl->sp_is_unchanging
10828 && !stack_realign_fp));
10829 gcc_assert (!m->fs.sp_valid
10830 || m->fs.sp_offset == frame.stack_pointer_offset);
10832 /* The FP must be valid if the frame pointer is present. */
10833 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10834 gcc_assert (!m->fs.fp_valid
10835 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10837 /* We must have *some* valid pointer to the stack frame. */
10838 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10840 /* The DRAP is never valid at this point. */
10841 gcc_assert (!m->fs.drap_valid);
10843 /* See the comment about red zone and frame
10844 pointer usage in ix86_expand_prologue. */
10845 if (frame_pointer_needed && frame.red_zone_size)
10846 emit_insn (gen_memory_blockage ());
10848 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10849 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10851 /* Determine the CFA offset of the end of the red-zone. */
10852 m->fs.red_zone_offset = 0;
10853 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10855 /* The red-zone begins below the return address. */
10856 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10858 /* When the register save area is in the aligned portion of
10859 the stack, determine the maximum runtime displacement that
10860 matches up with the aligned frame. */
10861 if (stack_realign_drap)
10862 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10863 + UNITS_PER_WORD);
10866 /* Special care must be taken for the normal return case of a function
10867 using eh_return: the eax and edx registers are marked as saved, but
10868 not restored along this path. Adjust the save location to match. */
10869 if (crtl->calls_eh_return && style != 2)
10870 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10872 /* EH_RETURN requires the use of moves to function properly. */
10873 if (crtl->calls_eh_return)
10874 restore_regs_via_mov = true;
10875 /* SEH requires the use of pops to identify the epilogue. */
10876 else if (TARGET_SEH)
10877 restore_regs_via_mov = false;
10878 /* If we're only restoring one register and sp is not valid, then
10879 use a move instruction to restore the register, since it's
10880 less work than reloading sp and popping the register. */
10881 else if (!m->fs.sp_valid && frame.nregs <= 1)
10882 restore_regs_via_mov = true;
10883 else if (TARGET_EPILOGUE_USING_MOVE
10884 && cfun->machine->use_fast_prologue_epilogue
10885 && (frame.nregs > 1
10886 || m->fs.sp_offset != frame.reg_save_offset))
10887 restore_regs_via_mov = true;
10888 else if (frame_pointer_needed
10889 && !frame.nregs
10890 && m->fs.sp_offset != frame.reg_save_offset)
10891 restore_regs_via_mov = true;
10892 else if (frame_pointer_needed
10893 && TARGET_USE_LEAVE
10894 && cfun->machine->use_fast_prologue_epilogue
10895 && frame.nregs == 1)
10896 restore_regs_via_mov = true;
10897 else
10898 restore_regs_via_mov = false;
10900 if (restore_regs_via_mov || frame.nsseregs)
10902 /* Ensure that the entire register save area is addressable via
10903 the stack pointer, if we will restore via sp. */
10904 if (TARGET_64BIT
10905 && m->fs.sp_offset > 0x7fffffff
10906 && !(m->fs.fp_valid || m->fs.drap_valid)
10907 && (frame.nsseregs + frame.nregs) != 0)
10909 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10910 GEN_INT (m->fs.sp_offset
10911 - frame.sse_reg_save_offset),
10912 style,
10913 m->fs.cfa_reg == stack_pointer_rtx);
10917 /* If there are any SSE registers to restore, then we have to do it
10918 via moves, since there's obviously no pop for SSE regs. */
10919 if (frame.nsseregs)
10920 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10921 style == 2);
10923 if (restore_regs_via_mov)
10925 rtx t;
10927 if (frame.nregs)
10928 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10930 /* eh_return epilogues need %ecx added to the stack pointer. */
10931 if (style == 2)
10933 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10935 /* Stack align doesn't work with eh_return. */
10936 gcc_assert (!stack_realign_drap);
10938 /* Neither do regparm nested functions. */
10938 gcc_assert (!ix86_static_chain_on_stack);
10940 if (frame_pointer_needed)
10942 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10943 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10944 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10946 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10947 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10949 /* Note that we use SA as a temporary CFA, as the return
10950 address is at the proper place relative to it. We
10951 pretend this happens at the FP restore insn because
10952 prior to this insn the FP would be stored at the wrong
10953 offset relative to SA, and after this insn we have no
10954 other reasonable register to use for the CFA. We don't
10955 bother resetting the CFA to the SP for the duration of
10956 the return insn. */
10957 add_reg_note (insn, REG_CFA_DEF_CFA,
10958 plus_constant (Pmode, sa, UNITS_PER_WORD));
10959 ix86_add_queued_cfa_restore_notes (insn);
10960 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10961 RTX_FRAME_RELATED_P (insn) = 1;
10963 m->fs.cfa_reg = sa;
10964 m->fs.cfa_offset = UNITS_PER_WORD;
10965 m->fs.fp_valid = false;
10967 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10968 const0_rtx, style, false);
10970 else
10972 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10973 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10974 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10975 ix86_add_queued_cfa_restore_notes (insn);
10977 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10978 if (m->fs.cfa_offset != UNITS_PER_WORD)
10980 m->fs.cfa_offset = UNITS_PER_WORD;
10981 add_reg_note (insn, REG_CFA_DEF_CFA,
10982 plus_constant (Pmode, stack_pointer_rtx,
10983 UNITS_PER_WORD));
10984 RTX_FRAME_RELATED_P (insn) = 1;
10987 m->fs.sp_offset = UNITS_PER_WORD;
10988 m->fs.sp_valid = true;
10991 else
10993 /* SEH requires that the function end with (1) a stack adjustment
10994 if necessary, (2) a sequence of pops, and (3) a return or
10995 jump instruction. Prevent insns from the function body from
10996 being scheduled into this sequence. */
10997 if (TARGET_SEH)
10999 /* Prevent a catch region from being adjacent to the standard
11000 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11001 several other flags that would be interesting to test are
11002 set up yet. */
11003 if (flag_non_call_exceptions)
11004 emit_insn (gen_nops (const1_rtx));
11005 else
11006 emit_insn (gen_blockage ());
11009 /* The first step is to deallocate the stack frame so that we can
11010 pop the registers. Also do it on SEH targets for very large
11011 frames, as the emitted instructions aren't allowed by the ABI in
11012 epilogues. */
11013 if (!m->fs.sp_valid
11014 || (TARGET_SEH
11015 && (m->fs.sp_offset - frame.reg_save_offset
11016 >= SEH_MAX_FRAME_SIZE)))
11018 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11019 GEN_INT (m->fs.fp_offset
11020 - frame.reg_save_offset),
11021 style, false);
11023 else if (m->fs.sp_offset != frame.reg_save_offset)
11025 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11026 GEN_INT (m->fs.sp_offset
11027 - frame.reg_save_offset),
11028 style,
11029 m->fs.cfa_reg == stack_pointer_rtx);
11032 ix86_emit_restore_regs_using_pop ();
11035 /* If we used a frame pointer and haven't already got rid of it,
11036 then do so now. */
11037 if (m->fs.fp_valid)
11039 /* If the stack pointer is valid and pointing at the frame
11040 pointer store address, then we only need a pop. */
11041 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11042 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11043 /* Leave results in shorter dependency chains on CPUs that are
11044 able to grok it fast. */
11045 else if (TARGET_USE_LEAVE
11046 || optimize_function_for_size_p (cfun)
11047 || !cfun->machine->use_fast_prologue_epilogue)
11048 ix86_emit_leave ();
11049 else
11051 pro_epilogue_adjust_stack (stack_pointer_rtx,
11052 hard_frame_pointer_rtx,
11053 const0_rtx, style, !using_drap);
11054 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11058 if (using_drap)
11060 int param_ptr_offset = UNITS_PER_WORD;
11061 rtx insn;
11063 gcc_assert (stack_realign_drap);
11065 if (ix86_static_chain_on_stack)
11066 param_ptr_offset += UNITS_PER_WORD;
11067 if (!call_used_regs[REGNO (crtl->drap_reg)])
11068 param_ptr_offset += UNITS_PER_WORD;
11070 insn = emit_insn (gen_rtx_SET
11071 (VOIDmode, stack_pointer_rtx,
11072 gen_rtx_PLUS (Pmode,
11073 crtl->drap_reg,
11074 GEN_INT (-param_ptr_offset))));
11075 m->fs.cfa_reg = stack_pointer_rtx;
11076 m->fs.cfa_offset = param_ptr_offset;
11077 m->fs.sp_offset = param_ptr_offset;
11078 m->fs.realigned = false;
11080 add_reg_note (insn, REG_CFA_DEF_CFA,
11081 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11082 GEN_INT (param_ptr_offset)));
11083 RTX_FRAME_RELATED_P (insn) = 1;
11085 if (!call_used_regs[REGNO (crtl->drap_reg)])
11086 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11089 /* At this point the stack pointer must be valid, and we must have
11090 restored all of the registers. We may not have deallocated the
11091 entire stack frame. We've delayed this until now because it may
11092 be possible to merge the local stack deallocation with the
11093 deallocation forced by ix86_static_chain_on_stack. */
11094 gcc_assert (m->fs.sp_valid);
11095 gcc_assert (!m->fs.fp_valid);
11096 gcc_assert (!m->fs.realigned);
11097 if (m->fs.sp_offset != UNITS_PER_WORD)
11099 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11100 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11101 style, true);
11103 else
11104 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11106 /* Sibcall epilogues don't want a return instruction. */
11107 if (style == 0)
11109 m->fs = frame_state_save;
11110 return;
11113 if (crtl->args.pops_args && crtl->args.size)
11115 rtx popc = GEN_INT (crtl->args.pops_args);
11117 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11118 address, do an explicit add, and jump indirectly to the caller. */
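/* Concretely, for a (hypothetical) 70000-byte pops_args the tail emitted
   below is roughly

       popl  %ecx
       addl  $70000, %esp
       jmp   *%ecx

   while the common small case is simply "ret $N".  */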
11120 if (crtl->args.pops_args >= 65536)
11122 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11123 rtx insn;
11125 /* There is no "pascal" calling convention in any 64bit ABI. */
11126 gcc_assert (!TARGET_64BIT);
11128 insn = emit_insn (gen_pop (ecx));
11129 m->fs.cfa_offset -= UNITS_PER_WORD;
11130 m->fs.sp_offset -= UNITS_PER_WORD;
11132 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11133 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11134 add_reg_note (insn, REG_CFA_REGISTER,
11135 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11136 RTX_FRAME_RELATED_P (insn) = 1;
11138 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11139 popc, -1, true);
11140 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11142 else
11143 emit_jump_insn (gen_simple_return_pop_internal (popc));
11145 else
11146 emit_jump_insn (gen_simple_return_internal ());
11148 /* Restore the state back to the state from the prologue,
11149 so that it's correct for the next epilogue. */
11150 m->fs = frame_state_save;
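/* Putting it together, a typical frame-pointer-based function that saved
   %rbx often ends with roughly

       movq  -8(%rbp), %rbx
       leave
       ret

   or with a pop-based variant; which form is chosen follows the
   restore_regs_via_mov logic above.  */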
11153 /* Reset from the function's potential modifications. */
11155 static void
11156 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11157 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11159 if (pic_offset_table_rtx)
11160 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11161 #if TARGET_MACHO
11162 /* Mach-O doesn't support labels at the end of objects, so if
11163 it looks like we might want one, insert a NOP. */
11165 rtx insn = get_last_insn ();
11166 rtx deleted_debug_label = NULL_RTX;
11167 while (insn
11168 && NOTE_P (insn)
11169 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11171 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11172 notes only, instead set their CODE_LABEL_NUMBER to -1,
11173 otherwise there would be code generation differences
11174 in between -g and -g0. */
11175 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11176 deleted_debug_label = insn;
11177 insn = PREV_INSN (insn);
11179 if (insn
11180 && (LABEL_P (insn)
11181 || (NOTE_P (insn)
11182 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11183 fputs ("\tnop\n", file);
11184 else if (deleted_debug_label)
11185 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11186 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11187 CODE_LABEL_NUMBER (insn) = -1;
11189 #endif
11193 /* Return a scratch register to use in the split stack prologue. The
11194 split stack prologue is used for -fsplit-stack. It is the first
11195 instructions in the function, even before the regular prologue.
11196 The scratch register can be any caller-saved register which is not
11197 used for parameters or for the static chain. */
11199 static unsigned int
11200 split_stack_prologue_scratch_regno (void)
11202 if (TARGET_64BIT)
11203 return R11_REG;
11204 else
11206 bool is_fastcall, is_thiscall;
11207 int regparm;
11209 is_fastcall = (lookup_attribute ("fastcall",
11210 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11211 != NULL);
11212 is_thiscall = (lookup_attribute ("thiscall",
11213 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11214 != NULL);
11215 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11217 if (is_fastcall)
11219 if (DECL_STATIC_CHAIN (cfun->decl))
11221 sorry ("-fsplit-stack does not support fastcall with "
11222 "nested function");
11223 return INVALID_REGNUM;
11225 return AX_REG;
11227 else if (is_thiscall)
11229 if (!DECL_STATIC_CHAIN (cfun->decl))
11230 return DX_REG;
11231 return AX_REG;
11233 else if (regparm < 3)
11235 if (!DECL_STATIC_CHAIN (cfun->decl))
11236 return CX_REG;
11237 else
11239 if (regparm >= 2)
11241 sorry ("-fsplit-stack does not support 2 register "
11242 " parameters for a nested function");
11243 return INVALID_REGNUM;
11245 return DX_REG;
11248 else
11250 /* FIXME: We could make this work by pushing a register
11251 around the addition and comparison. */
11252 sorry ("-fsplit-stack does not support 3 register parameters");
11253 return INVALID_REGNUM;
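/* Summarizing the cases above: 64-bit code always uses %r11; for 32-bit
   code, fastcall gets %eax, thiscall gets %edx (or %eax with a static
   chain), fewer than three register parameters get %ecx (or %edx for a
   nested function with at most one register parameter), and three
   register parameters are not supported at all.  */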
11258 /* A SYMBOL_REF for the function which allocates new stack space for
11259 -fsplit-stack. */
11261 static GTY(()) rtx split_stack_fn;
11263 /* A SYMBOL_REF for the more stack function when using the large
11264 model. */
11266 static GTY(()) rtx split_stack_fn_large;
11268 /* Handle -fsplit-stack. These are the first instructions in the
11269 function, even before the regular prologue. */
11271 void
11272 ix86_expand_split_stack_prologue (void)
11274 struct ix86_frame frame;
11275 HOST_WIDE_INT allocate;
11276 unsigned HOST_WIDE_INT args_size;
11277 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11278 rtx scratch_reg = NULL_RTX;
11279 rtx varargs_label = NULL_RTX;
11280 rtx fn;
11282 gcc_assert (flag_split_stack && reload_completed);
11284 ix86_finalize_stack_realign_flags ();
11285 ix86_compute_frame_layout (&frame);
11286 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11288 /* This is the label we will branch to if we have enough stack
11289 space. We expect the basic block reordering pass to reverse this
11290 branch if optimizing, so that we branch in the unlikely case. */
11291 label = gen_label_rtx ();
11293 /* We need to compare the stack pointer minus the frame size with
11294 the stack boundary in the TCB. The stack boundary always gives
11295 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11296 can compare directly. Otherwise we need to do an addition. */
11298 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11299 UNSPEC_STACK_CHECK);
11300 limit = gen_rtx_CONST (Pmode, limit);
11301 limit = gen_rtx_MEM (Pmode, limit);
11302 if (allocate < SPLIT_STACK_AVAILABLE)
11303 current = stack_pointer_rtx;
11304 else
11306 unsigned int scratch_regno;
11307 rtx offset;
11309 /* We need a scratch register to hold the stack pointer minus
11310 the required frame size. Since this is the very start of the
11311 function, the scratch register can be any caller-saved
11312 register which is not used for parameters. */
11313 offset = GEN_INT (- allocate);
11314 scratch_regno = split_stack_prologue_scratch_regno ();
11315 if (scratch_regno == INVALID_REGNUM)
11316 return;
11317 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11318 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11320 /* We don't use ix86_gen_add3 in this case because it will
11321 want to split to lea, but when not optimizing the insn
11322 will not be split after this point. */
11323 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11324 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11325 offset)));
11327 else
11329 emit_move_insn (scratch_reg, offset);
11330 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11331 stack_pointer_rtx));
11333 current = scratch_reg;
11336 ix86_expand_branch (GEU, current, limit, label);
11337 jump_insn = get_last_insn ();
11338 JUMP_LABEL (jump_insn) = label;
11340 /* Mark the jump as very likely to be taken. */
11341 add_reg_note (jump_insn, REG_BR_PROB,
11342 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11344 if (split_stack_fn == NULL_RTX)
11345 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11346 fn = split_stack_fn;
11348 /* Get more stack space. We pass in the desired stack space and the
11349 size of the arguments to copy to the new stack. In 32-bit mode
11350 we push the parameters; __morestack will return on a new stack
11351 anyhow. In 64-bit mode we pass the parameters in r10 and
11352 r11. */
11353 allocate_rtx = GEN_INT (allocate);
11354 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11355 call_fusage = NULL_RTX;
11356 if (TARGET_64BIT)
11358 rtx reg10, reg11;
11360 reg10 = gen_rtx_REG (Pmode, R10_REG);
11361 reg11 = gen_rtx_REG (Pmode, R11_REG);
11363 /* If this function uses a static chain, it will be in %r10.
11364 Preserve it across the call to __morestack. */
11365 if (DECL_STATIC_CHAIN (cfun->decl))
11367 rtx rax;
11369 rax = gen_rtx_REG (word_mode, AX_REG);
11370 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11371 use_reg (&call_fusage, rax);
11374 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11376 HOST_WIDE_INT argval;
11378 gcc_assert (Pmode == DImode);
11379 /* When using the large model we need to load the address
11380 into a register, and we've run out of registers. So we
11381 switch to a different calling convention, and we call a
11382 different function: __morestack_large_model. We pass the
11383 argument size in the upper 32 bits of r10 and pass the
11384 frame size in the lower 32 bits. */
11385 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11386 gcc_assert ((args_size & 0xffffffff) == args_size);
11388 if (split_stack_fn_large == NULL_RTX)
11389 split_stack_fn_large =
11390 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11392 if (ix86_cmodel == CM_LARGE_PIC)
11394 rtx label, x;
11396 label = gen_label_rtx ();
11397 emit_label (label);
11398 LABEL_PRESERVE_P (label) = 1;
11399 emit_insn (gen_set_rip_rex64 (reg10, label));
11400 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11401 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11402 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11403 UNSPEC_GOT);
11404 x = gen_rtx_CONST (Pmode, x);
11405 emit_move_insn (reg11, x);
11406 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11407 x = gen_const_mem (Pmode, x);
11408 emit_move_insn (reg11, x);
11410 else
11411 emit_move_insn (reg11, split_stack_fn_large);
11413 fn = reg11;
11415 argval = ((args_size << 16) << 16) + allocate;
11416 emit_move_insn (reg10, GEN_INT (argval));
11418 else
11420 emit_move_insn (reg10, allocate_rtx);
11421 emit_move_insn (reg11, GEN_INT (args_size));
11422 use_reg (&call_fusage, reg11);
11425 use_reg (&call_fusage, reg10);
11427 else
11429 emit_insn (gen_push (GEN_INT (args_size)));
11430 emit_insn (gen_push (allocate_rtx));
11432 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11433 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11434 NULL_RTX, false);
11435 add_function_usage_to (call_insn, call_fusage);
11437 /* In order to make call/return prediction work right, we now need
11438 to execute a return instruction. See
11439 libgcc/config/i386/morestack.S for the details on how this works.
11441 For flow purposes gcc must not see this as a return
11442 instruction--we need control flow to continue at the subsequent
11443 label. Therefore, we use an unspec. */
11444 gcc_assert (crtl->args.pops_args < 65536);
11445 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11447 /* If we are in 64-bit mode and this function uses a static chain,
11448 we saved %r10 in %rax before calling __morestack.
11449 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11450 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11451 gen_rtx_REG (word_mode, AX_REG));
11453 /* If this function calls va_start, we need to store a pointer to
11454 the arguments on the old stack, because they may not have been
11455 all copied to the new stack. At this point the old stack can be
11456 found at the frame pointer value used by __morestack, because
11457 __morestack has set that up before calling back to us. Here we
11458 store that pointer in a scratch register, and in
11459 ix86_expand_prologue we store the scratch register in a stack
11460 slot. */
11461 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11463 unsigned int scratch_regno;
11464 rtx frame_reg;
11465 int words;
11467 scratch_regno = split_stack_prologue_scratch_regno ();
11468 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11469 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11471 /* 64-bit:
11472 fp -> old fp value
11473 return address within this function
11474 return address of caller of this function
11475 stack arguments
11476 So we add three words to get to the stack arguments.
11478 32-bit:
11479 fp -> old fp value
11480 return address within this function
11481 first argument to __morestack
11482 second argument to __morestack
11483 return address of caller of this function
11484 stack arguments
11485 So we add five words to get to the stack arguments.
11487 words = TARGET_64BIT ? 3 : 5;
11488 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11489 gen_rtx_PLUS (Pmode, frame_reg,
11490 GEN_INT (words * UNITS_PER_WORD))));
11492 varargs_label = gen_label_rtx ();
11493 emit_jump_insn (gen_jump (varargs_label));
11494 JUMP_LABEL (get_last_insn ()) = varargs_label;
11496 emit_barrier ();
11499 emit_label (label);
11500 LABEL_NUSES (label) = 1;
11502 /* If this function calls va_start, we now have to set the scratch
11503 register for the case where we do not call __morestack. In this
11504 case we need to set it based on the stack pointer. */
11505 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11507 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11508 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11509 GEN_INT (UNITS_PER_WORD))));
11511 emit_label (varargs_label);
11512 LABEL_NUSES (varargs_label) = 1;
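/* For a 64-bit small-model function the sequence built above therefore
   looks roughly like

       cmpq  %fs:<limit-offset>, %rsp
       jae   .Lenough
       movq  $<frame-size>, %r10
       movq  $<args-size>, %r11
       callq __morestack
       retq
   .Lenough:

   with the operands in angle brackets standing in for target- and
   function-specific values.  */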
11516 /* We may have to tell the dataflow pass that the split stack prologue
11517 is initializing a scratch register. */
11519 static void
11520 ix86_live_on_entry (bitmap regs)
11522 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11524 gcc_assert (flag_split_stack);
11525 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11529 /* Extract the parts of an RTL expression that is a valid memory address
11530 for an instruction. Return 0 if the structure of the address is
11531 grossly off. Return -1 if the address contains ASHIFT, so it is not
11532 strictly valid, but still used for computing the length of the lea instruction. */
11535 ix86_decompose_address (rtx addr, struct ix86_address *out)
11537 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11538 rtx base_reg, index_reg;
11539 HOST_WIDE_INT scale = 1;
11540 rtx scale_rtx = NULL_RTX;
11541 rtx tmp;
11542 int retval = 1;
11543 enum ix86_address_seg seg = SEG_DEFAULT;
11545 /* Allow zero-extended SImode addresses,
11546 they will be emitted with addr32 prefix. */
11547 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11549 if (GET_CODE (addr) == ZERO_EXTEND
11550 && GET_MODE (XEXP (addr, 0)) == SImode)
11552 addr = XEXP (addr, 0);
11553 if (CONST_INT_P (addr))
11554 return 0;
11556 else if (GET_CODE (addr) == AND
11557 && const_32bit_mask (XEXP (addr, 1), DImode))
11559 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11560 if (addr == NULL_RTX)
11561 return 0;
11563 if (CONST_INT_P (addr))
11564 return 0;
11568 /* Allow SImode subregs of DImode addresses,
11569 they will be emitted with addr32 prefix. */
11570 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11572 if (GET_CODE (addr) == SUBREG
11573 && GET_MODE (SUBREG_REG (addr)) == DImode)
11575 addr = SUBREG_REG (addr);
11576 if (CONST_INT_P (addr))
11577 return 0;
11581 if (REG_P (addr))
11582 base = addr;
11583 else if (GET_CODE (addr) == SUBREG)
11585 if (REG_P (SUBREG_REG (addr)))
11586 base = addr;
11587 else
11588 return 0;
11590 else if (GET_CODE (addr) == PLUS)
11592 rtx addends[4], op;
11593 int n = 0, i;
11595 op = addr;
11598 if (n >= 4)
11599 return 0;
11600 addends[n++] = XEXP (op, 1);
11601 op = XEXP (op, 0);
11603 while (GET_CODE (op) == PLUS);
11604 if (n >= 4)
11605 return 0;
11606 addends[n] = op;
11608 for (i = n; i >= 0; --i)
11610 op = addends[i];
11611 switch (GET_CODE (op))
11613 case MULT:
11614 if (index)
11615 return 0;
11616 index = XEXP (op, 0);
11617 scale_rtx = XEXP (op, 1);
11618 break;
11620 case ASHIFT:
11621 if (index)
11622 return 0;
11623 index = XEXP (op, 0);
11624 tmp = XEXP (op, 1);
11625 if (!CONST_INT_P (tmp))
11626 return 0;
11627 scale = INTVAL (tmp);
11628 if ((unsigned HOST_WIDE_INT) scale > 3)
11629 return 0;
11630 scale = 1 << scale;
11631 break;
11633 case ZERO_EXTEND:
11634 op = XEXP (op, 0);
11635 if (GET_CODE (op) != UNSPEC)
11636 return 0;
11637 /* FALLTHRU */
11639 case UNSPEC:
11640 if (XINT (op, 1) == UNSPEC_TP
11641 && TARGET_TLS_DIRECT_SEG_REFS
11642 && seg == SEG_DEFAULT)
11643 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11644 else
11645 return 0;
11646 break;
11648 case SUBREG:
11649 if (!REG_P (SUBREG_REG (op)))
11650 return 0;
11651 /* FALLTHRU */
11653 case REG:
11654 if (!base)
11655 base = op;
11656 else if (!index)
11657 index = op;
11658 else
11659 return 0;
11660 break;
11662 case CONST:
11663 case CONST_INT:
11664 case SYMBOL_REF:
11665 case LABEL_REF:
11666 if (disp)
11667 return 0;
11668 disp = op;
11669 break;
11671 default:
11672 return 0;
11676 else if (GET_CODE (addr) == MULT)
11678 index = XEXP (addr, 0); /* index*scale */
11679 scale_rtx = XEXP (addr, 1);
11681 else if (GET_CODE (addr) == ASHIFT)
11683 /* We're called for lea too, which implements ashift on occasion. */
11684 index = XEXP (addr, 0);
11685 tmp = XEXP (addr, 1);
11686 if (!CONST_INT_P (tmp))
11687 return 0;
11688 scale = INTVAL (tmp);
11689 if ((unsigned HOST_WIDE_INT) scale > 3)
11690 return 0;
11691 scale = 1 << scale;
11692 retval = -1;
11694 else
11695 disp = addr; /* displacement */
11697 if (index)
11699 if (REG_P (index))
11701 else if (GET_CODE (index) == SUBREG
11702 && REG_P (SUBREG_REG (index)))
11704 else
11705 return 0;
11708 /* Extract the integral value of scale. */
11709 if (scale_rtx)
11711 if (!CONST_INT_P (scale_rtx))
11712 return 0;
11713 scale = INTVAL (scale_rtx);
11716 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11717 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11719 /* Avoid useless 0 displacement. */
11720 if (disp == const0_rtx && (base || index))
11721 disp = NULL_RTX;
11723 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11724 if (base_reg && index_reg && scale == 1
11725 && (index_reg == arg_pointer_rtx
11726 || index_reg == frame_pointer_rtx
11727 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11729 rtx tmp;
11730 tmp = base, base = index, index = tmp;
11731 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11734 /* Special case: %ebp cannot be encoded as a base without a displacement.
11735 Similarly %r13. */
11736 if (!disp
11737 && base_reg
11738 && (base_reg == hard_frame_pointer_rtx
11739 || base_reg == frame_pointer_rtx
11740 || base_reg == arg_pointer_rtx
11741 || (REG_P (base_reg)
11742 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11743 || REGNO (base_reg) == R13_REG))))
11744 disp = const0_rtx;
11746 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11747 Avoid this by transforming to [%esi+0].
11748 Reload calls address legitimization without cfun defined, so we need
11749 to test cfun for being non-NULL. */
11750 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11751 && base_reg && !index_reg && !disp
11752 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11753 disp = const0_rtx;
11755 /* Special case: encode reg+reg instead of reg*2. */
11756 if (!base && index && scale == 2)
11757 base = index, base_reg = index_reg, scale = 1;
11759 /* Special case: scaling cannot be encoded without base or displacement. */
11760 if (!base && !disp && index && scale != 1)
11761 disp = const0_rtx;
11763 out->base = base;
11764 out->index = index;
11765 out->disp = disp;
11766 out->scale = scale;
11767 out->seg = seg;
11769 return retval;
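/* For instance, the operand 12(%ebx,%eax,4), i.e. (plus (plus (reg %ebx)
   (mult (reg %eax) (const_int 4))) (const_int 12)), decomposes into
   base = %ebx, index = %eax, scale = 4, disp = 12, seg = SEG_DEFAULT,
   while a bare (ashift (reg) (const_int 2)) address is accepted only
   for computing the length of a lea and makes the function return -1.  */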
11772 /* Return cost of the memory address x.
11773 For i386, it is better to use a complex address than let gcc copy
11774 the address into a reg and make a new pseudo. But not if the address
11775 requires two regs - that would mean more pseudos with longer
11776 lifetimes. */
11777 static int
11778 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11779 addr_space_t as ATTRIBUTE_UNUSED,
11780 bool speed ATTRIBUTE_UNUSED)
11782 struct ix86_address parts;
11783 int cost = 1;
11784 int ok = ix86_decompose_address (x, &parts);
11786 gcc_assert (ok);
11788 if (parts.base && GET_CODE (parts.base) == SUBREG)
11789 parts.base = SUBREG_REG (parts.base);
11790 if (parts.index && GET_CODE (parts.index) == SUBREG)
11791 parts.index = SUBREG_REG (parts.index);
11793 /* Attempt to minimize number of registers in the address. */
11794 if ((parts.base
11795 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11796 || (parts.index
11797 && (!REG_P (parts.index)
11798 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11799 cost++;
11801 if (parts.base
11802 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11803 && parts.index
11804 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11805 && parts.base != parts.index)
11806 cost++;
11808 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11809 since its predecode logic can't detect the length of instructions
11810 and decoding degenerates to the vector decoder. Increase the cost of such
11811 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11812 to split such addresses or even refuse such addresses at all.
11814 Following addressing modes are affected:
11815 [base+scale*index]
11816 [scale*index+disp]
11817 [base+index]
11819 The first and last case may be avoidable by explicitly coding the zero into
11820 the memory address, but I don't have an AMD-K6 machine handy to check this
11821 theory. */
11823 if (TARGET_K6
11824 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11825 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11826 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11827 cost += 10;
11829 return cost;
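/* So, for instance, an address built only from hard registers keeps the
   base cost of 1, one that still involves a pseudo costs 2, one with two
   distinct pseudos costs 3, and on the K6 the ModR/M forms listed above
   pay an extra 10 on top of that.  */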
11832 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11833 this is used to form addresses to local data when -fPIC is in
11834 use. */
11836 static bool
11837 darwin_local_data_pic (rtx disp)
11839 return (GET_CODE (disp) == UNSPEC
11840 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11843 /* Determine if a given RTX is a valid constant. We already know this
11844 satisfies CONSTANT_P. */
11846 static bool
11847 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11849 switch (GET_CODE (x))
11851 case CONST:
11852 x = XEXP (x, 0);
11854 if (GET_CODE (x) == PLUS)
11856 if (!CONST_INT_P (XEXP (x, 1)))
11857 return false;
11858 x = XEXP (x, 0);
11861 if (TARGET_MACHO && darwin_local_data_pic (x))
11862 return true;
11864 /* Only some unspecs are valid as "constants". */
11865 if (GET_CODE (x) == UNSPEC)
11866 switch (XINT (x, 1))
11868 case UNSPEC_GOT:
11869 case UNSPEC_GOTOFF:
11870 case UNSPEC_PLTOFF:
11871 return TARGET_64BIT;
11872 case UNSPEC_TPOFF:
11873 case UNSPEC_NTPOFF:
11874 x = XVECEXP (x, 0, 0);
11875 return (GET_CODE (x) == SYMBOL_REF
11876 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11877 case UNSPEC_DTPOFF:
11878 x = XVECEXP (x, 0, 0);
11879 return (GET_CODE (x) == SYMBOL_REF
11880 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11881 default:
11882 return false;
11885 /* We must have drilled down to a symbol. */
11886 if (GET_CODE (x) == LABEL_REF)
11887 return true;
11888 if (GET_CODE (x) != SYMBOL_REF)
11889 return false;
11890 /* FALLTHRU */
11892 case SYMBOL_REF:
11893 /* TLS symbols are never valid. */
11894 if (SYMBOL_REF_TLS_MODEL (x))
11895 return false;
11897 /* DLLIMPORT symbols are never valid. */
11898 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11899 && SYMBOL_REF_DLLIMPORT_P (x))
11900 return false;
11902 #if TARGET_MACHO
11903 /* mdynamic-no-pic */
11904 if (MACHO_DYNAMIC_NO_PIC_P)
11905 return machopic_symbol_defined_p (x);
11906 #endif
11907 break;
11909 case CONST_DOUBLE:
11910 if (GET_MODE (x) == TImode
11911 && x != CONST0_RTX (TImode)
11912 && !TARGET_64BIT)
11913 return false;
11914 break;
11916 case CONST_VECTOR:
11917 if (!standard_sse_constant_p (x))
11918 return false;
11920 default:
11921 break;
11924 /* Otherwise we handle everything else in the move patterns. */
11925 return true;
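/* By way of example: "foo" and "foo+16" are legitimate; GOT, GOTOFF and
   PLTOFF unspecs only on 64-bit targets; NTPOFF/TPOFF unspecs only for a
   local-exec TLS symbol; a bare TLS SYMBOL_REF never; and a non-zero
   TImode CONST_DOUBLE never on a 32-bit target.  */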
11928 /* Determine if it's legal to put X into the constant pool. This
11929 is not possible for the address of thread-local symbols, which
11930 is checked above. */
11932 static bool
11933 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11935 /* We can always put integral constants and vectors in memory. */
11936 switch (GET_CODE (x))
11938 case CONST_INT:
11939 case CONST_DOUBLE:
11940 case CONST_VECTOR:
11941 return false;
11943 default:
11944 break;
11946 return !ix86_legitimate_constant_p (mode, x);
11950 /* Nonzero if the constant value X is a legitimate general operand
11951 when generating PIC code. It is given that flag_pic is on and
11952 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11954 bool
11955 legitimate_pic_operand_p (rtx x)
11957 rtx inner;
11959 switch (GET_CODE (x))
11961 case CONST:
11962 inner = XEXP (x, 0);
11963 if (GET_CODE (inner) == PLUS
11964 && CONST_INT_P (XEXP (inner, 1)))
11965 inner = XEXP (inner, 0);
11967 /* Only some unspecs are valid as "constants". */
11968 if (GET_CODE (inner) == UNSPEC)
11969 switch (XINT (inner, 1))
11971 case UNSPEC_GOT:
11972 case UNSPEC_GOTOFF:
11973 case UNSPEC_PLTOFF:
11974 return TARGET_64BIT;
11975 case UNSPEC_TPOFF:
11976 x = XVECEXP (inner, 0, 0);
11977 return (GET_CODE (x) == SYMBOL_REF
11978 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11979 case UNSPEC_MACHOPIC_OFFSET:
11980 return legitimate_pic_address_disp_p (x);
11981 default:
11982 return false;
11984 /* FALLTHRU */
11986 case SYMBOL_REF:
11987 case LABEL_REF:
11988 return legitimate_pic_address_disp_p (x);
11990 default:
11991 return true;
11995 /* Determine if a given CONST RTX is a valid memory displacement
11996 in PIC mode. */
11998 bool
11999 legitimate_pic_address_disp_p (rtx disp)
12001 bool saw_plus;
12003 /* In 64bit mode we can allow direct addresses of symbols and labels
12004 when they are not dynamic symbols. */
12005 if (TARGET_64BIT)
12007 rtx op0 = disp, op1;
12009 switch (GET_CODE (disp))
12011 case LABEL_REF:
12012 return true;
12014 case CONST:
12015 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12016 break;
12017 op0 = XEXP (XEXP (disp, 0), 0);
12018 op1 = XEXP (XEXP (disp, 0), 1);
12019 if (!CONST_INT_P (op1)
12020 || INTVAL (op1) >= 16*1024*1024
12021 || INTVAL (op1) < -16*1024*1024)
12022 break;
12023 if (GET_CODE (op0) == LABEL_REF)
12024 return true;
12025 if (GET_CODE (op0) == CONST
12026 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12027 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12028 return true;
12029 if (GET_CODE (op0) == UNSPEC
12030 && XINT (op0, 1) == UNSPEC_PCREL)
12031 return true;
12032 if (GET_CODE (op0) != SYMBOL_REF)
12033 break;
12034 /* FALLTHRU */
12036 case SYMBOL_REF:
12037 /* TLS references should always be enclosed in UNSPEC. */
12038 if (SYMBOL_REF_TLS_MODEL (op0))
12039 return false;
12040 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12041 && ix86_cmodel != CM_LARGE_PIC)
12042 return true;
12043 break;
12045 default:
12046 break;
12049 if (GET_CODE (disp) != CONST)
12050 return false;
12051 disp = XEXP (disp, 0);
12053 if (TARGET_64BIT)
12055 /* It is not safe to allow PLUS expressions. This limits the allowed distance
12056 of GOT tables. We should not need these anyway. */
12057 if (GET_CODE (disp) != UNSPEC
12058 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12059 && XINT (disp, 1) != UNSPEC_GOTOFF
12060 && XINT (disp, 1) != UNSPEC_PCREL
12061 && XINT (disp, 1) != UNSPEC_PLTOFF))
12062 return false;
12064 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12065 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12066 return false;
12067 return true;
12070 saw_plus = false;
12071 if (GET_CODE (disp) == PLUS)
12073 if (!CONST_INT_P (XEXP (disp, 1)))
12074 return false;
12075 disp = XEXP (disp, 0);
12076 saw_plus = true;
12079 if (TARGET_MACHO && darwin_local_data_pic (disp))
12080 return true;
12082 if (GET_CODE (disp) != UNSPEC)
12083 return false;
12085 switch (XINT (disp, 1))
12087 case UNSPEC_GOT:
12088 if (saw_plus)
12089 return false;
12090 /* We need to check for both symbols and labels because VxWorks loads
12091 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12092 details. */
12093 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12094 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12095 case UNSPEC_GOTOFF:
12096 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12097 While the ABI also specifies a 32bit relocation, we don't produce it in the
12098 small PIC model at all. */
12099 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12100 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12101 && !TARGET_64BIT)
12102 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12103 return false;
12104 case UNSPEC_GOTTPOFF:
12105 case UNSPEC_GOTNTPOFF:
12106 case UNSPEC_INDNTPOFF:
12107 if (saw_plus)
12108 return false;
12109 disp = XVECEXP (disp, 0, 0);
12110 return (GET_CODE (disp) == SYMBOL_REF
12111 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12112 case UNSPEC_NTPOFF:
12113 disp = XVECEXP (disp, 0, 0);
12114 return (GET_CODE (disp) == SYMBOL_REF
12115 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12116 case UNSPEC_DTPOFF:
12117 disp = XVECEXP (disp, 0, 0);
12118 return (GET_CODE (disp) == SYMBOL_REF
12119 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12122 return false;
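/* Roughly speaking, 32-bit PIC therefore accepts displacements of the
   shape symbol@GOTOFF, symbol@GOT (with no addend) and the TLS forms
   @GOTTPOFF, @GOTNTPOFF, @INDNTPOFF, @NTPOFF and @DTPOFF, while 64-bit
   PIC only accepts GOTPCREL, GOTOFF, PCREL and PLTOFF unspecs wrapping
   a symbol or label.  */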
12125 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12126 replace the input X, or the original X if no replacement is called for.
12127 The output parameter *WIN is 1 if the calling macro should goto WIN,
12128 0 if it should not. */
12130 bool
12131 ix86_legitimize_reload_address (rtx x,
12132 enum machine_mode mode ATTRIBUTE_UNUSED,
12133 int opnum, int type,
12134 int ind_levels ATTRIBUTE_UNUSED)
12136 /* Reload can generate:
12138 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12139 (reg:DI 97))
12140 (reg:DI 2 cx))
12142 This RTX is rejected from ix86_legitimate_address_p due to
12143 non-strictness of base register 97. Following this rejection,
12144 reload pushes all three components into separate registers,
12145 creating invalid memory address RTX.
12147 Following code reloads only the invalid part of the
12148 memory address RTX. */
12150 if (GET_CODE (x) == PLUS
12151 && REG_P (XEXP (x, 1))
12152 && GET_CODE (XEXP (x, 0)) == PLUS
12153 && REG_P (XEXP (XEXP (x, 0), 1)))
12155 rtx base, index;
12156 bool something_reloaded = false;
12158 base = XEXP (XEXP (x, 0), 1);
12159 if (!REG_OK_FOR_BASE_STRICT_P (base))
12161 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12162 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12163 opnum, (enum reload_type) type);
12164 something_reloaded = true;
12167 index = XEXP (x, 1);
12168 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12170 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12171 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12172 opnum, (enum reload_type) type);
12173 something_reloaded = true;
12176 gcc_assert (something_reloaded);
12177 return true;
12180 return false;
12183 /* Determine if op is suitable RTX for an address register.
12184 Return naked register if a register or a register subreg is
12185 found, otherwise return NULL_RTX. */
12187 static rtx
12188 ix86_validate_address_register (rtx op)
12190 enum machine_mode mode = GET_MODE (op);
12192 /* Only SImode or DImode registers can form the address. */
12193 if (mode != SImode && mode != DImode)
12194 return NULL_RTX;
12196 if (REG_P (op))
12197 return op;
12198 else if (GET_CODE (op) == SUBREG)
12200 rtx reg = SUBREG_REG (op);
12202 if (!REG_P (reg))
12203 return NULL_RTX;
12205 mode = GET_MODE (reg);
12207 /* Don't allow SUBREGs that span more than a word. It can
12208 lead to spill failures when the register is one word out
12209 of a two word structure. */
12210 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12211 return NULL_RTX;
12213 /* Allow only SUBREGs of non-eliminable hard registers. */
12214 if (register_no_elim_operand (reg, mode))
12215 return reg;
12218 /* Op is not a register. */
12219 return NULL_RTX;
12222 /* Recognizes RTL expressions that are valid memory addresses for an
12223 instruction. The MODE argument is the machine mode for the MEM
12224 expression that wants to use this address.
12226 It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should
12227 convert common non-canonical forms to canonical form so that they will
12228 be recognized. */
12230 static bool
12231 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12232 rtx addr, bool strict)
12234 struct ix86_address parts;
12235 rtx base, index, disp;
12236 HOST_WIDE_INT scale;
12237 enum ix86_address_seg seg;
12239 if (ix86_decompose_address (addr, &parts) <= 0)
12240 /* Decomposition failed. */
12241 return false;
12243 base = parts.base;
12244 index = parts.index;
12245 disp = parts.disp;
12246 scale = parts.scale;
12247 seg = parts.seg;
12249 /* Validate base register. */
12250 if (base)
12252 rtx reg = ix86_validate_address_register (base);
12254 if (reg == NULL_RTX)
12255 return false;
12257 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12258 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12259 /* Base is not valid. */
12260 return false;
12263 /* Validate index register. */
12264 if (index)
12266 rtx reg = ix86_validate_address_register (index);
12268 if (reg == NULL_RTX)
12269 return false;
12271 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12272 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12273 /* Index is not valid. */
12274 return false;
12277 /* Index and base should have the same mode. */
12278 if (base && index
12279 && GET_MODE (base) != GET_MODE (index))
12280 return false;
12282 /* Address override works only on the (%reg) part of %fs:(%reg). */
12283 if (seg != SEG_DEFAULT
12284 && ((base && GET_MODE (base) != word_mode)
12285 || (index && GET_MODE (index) != word_mode)))
12286 return false;
12288 /* Validate scale factor. */
12289 if (scale != 1)
12291 if (!index)
12292 /* Scale without index. */
12293 return false;
12295 if (scale != 2 && scale != 4 && scale != 8)
12296 /* Scale is not a valid multiplier. */
12297 return false;
12300 /* Validate displacement. */
12301 if (disp)
12303 if (GET_CODE (disp) == CONST
12304 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12305 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12306 switch (XINT (XEXP (disp, 0), 1))
12308 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12309 used. While the ABI also specifies 32bit relocations, we don't produce
12310 them at all and use IP-relative addressing instead. */
12311 case UNSPEC_GOT:
12312 case UNSPEC_GOTOFF:
12313 gcc_assert (flag_pic);
12314 if (!TARGET_64BIT)
12315 goto is_legitimate_pic;
12317 /* 64bit address unspec. */
12318 return false;
12320 case UNSPEC_GOTPCREL:
12321 case UNSPEC_PCREL:
12322 gcc_assert (flag_pic);
12323 goto is_legitimate_pic;
12325 case UNSPEC_GOTTPOFF:
12326 case UNSPEC_GOTNTPOFF:
12327 case UNSPEC_INDNTPOFF:
12328 case UNSPEC_NTPOFF:
12329 case UNSPEC_DTPOFF:
12330 break;
12332 case UNSPEC_STACK_CHECK:
12333 gcc_assert (flag_split_stack);
12334 break;
12336 default:
12337 /* Invalid address unspec. */
12338 return false;
12341 else if (SYMBOLIC_CONST (disp)
12342 && (flag_pic
12343 || (TARGET_MACHO
12344 #if TARGET_MACHO
12345 && MACHOPIC_INDIRECT
12346 && !machopic_operand_p (disp)
12347 #endif
12351 is_legitimate_pic:
12352 if (TARGET_64BIT && (index || base))
12354 /* foo@dtpoff(%rX) is ok. */
12355 if (GET_CODE (disp) != CONST
12356 || GET_CODE (XEXP (disp, 0)) != PLUS
12357 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12358 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12359 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12360 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12361 /* Non-constant pic memory reference. */
12362 return false;
12364 else if ((!TARGET_MACHO || flag_pic)
12365 && ! legitimate_pic_address_disp_p (disp))
12366 /* Displacement is an invalid pic construct. */
12367 return false;
12368 #if TARGET_MACHO
12369 else if (MACHO_DYNAMIC_NO_PIC_P
12370 && !ix86_legitimate_constant_p (Pmode, disp))
12371 /* displacement must be referenced via non_lazy_pointer */
12372 return false;
12373 #endif
12375 /* This code used to verify that a symbolic pic displacement
12376 includes the pic_offset_table_rtx register.
12378 While this is a good idea, unfortunately these constructs may
12379 be created by the "adds using lea" optimization for incorrect
12380 code like:
12382 int a;
12383 int foo(int i)
12385 return *(&a+i);
12388 This code is nonsensical, but results in addressing the
12389 GOT table with a pic_offset_table_rtx base. We can't
12390 just refuse it easily, since it gets matched by the
12391 "addsi3" pattern, which later gets split to lea in the
12392 case the output register differs from the input. While this
12393 could be handled by a separate addsi pattern for this case
12394 that never results in lea, disabling this test seems to be
12395 the easier and correct fix for the crash. */
12397 else if (GET_CODE (disp) != LABEL_REF
12398 && !CONST_INT_P (disp)
12399 && (GET_CODE (disp) != CONST
12400 || !ix86_legitimate_constant_p (Pmode, disp))
12401 && (GET_CODE (disp) != SYMBOL_REF
12402 || !ix86_legitimate_constant_p (Pmode, disp)))
12403 /* Displacement is not constant. */
12404 return false;
12405 else if (TARGET_64BIT
12406 && !x86_64_immediate_operand (disp, VOIDmode))
12407 /* Displacement is out of range. */
12408 return false;
12409 /* In x32 mode, constant addresses are sign extended to 64bit, so
12410 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12411 else if (TARGET_X32 && !(index || base)
12412 && CONST_INT_P (disp)
12413 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12414 return false;
12417 /* Everything looks valid. */
12418 return true;
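/* Thus, for example, 8(%rbx,%rcx,4) is accepted, a scale of 3 or a scale
   without an index is rejected, a base and index of different modes are
   rejected, and a segment override such as %fs:(%eax) is rejected in
   64-bit mode because the register is not word_mode.  */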
12421 /* Determine if a given RTX is a valid constant address. */
12423 bool
12424 constant_address_p (rtx x)
12426 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12429 /* Return a unique alias set for the GOT. */
12431 static alias_set_type
12432 ix86_GOT_alias_set (void)
12434 static alias_set_type set = -1;
12435 if (set == -1)
12436 set = new_alias_set ();
12437 return set;
12440 /* Return a legitimate reference for ORIG (an address) using the
12441 register REG. If REG is 0, a new pseudo is generated.
12443 There are two types of references that must be handled:
12445 1. Global data references must load the address from the GOT, via
12446 the PIC reg. An insn is emitted to do this load, and the reg is
12447 returned.
12449 2. Static data references, constant pool addresses, and code labels
12450 compute the address as an offset from the GOT, whose base is in
12451 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12452 differentiate them from global data objects. The returned
12453 address is the PIC reg + an unspec constant.
12455 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12456 reg also appears in the address. */
12458 static rtx
12459 legitimize_pic_address (rtx orig, rtx reg)
12461 rtx addr = orig;
12462 rtx new_rtx = orig;
12464 #if TARGET_MACHO
12465 if (TARGET_MACHO && !TARGET_64BIT)
12467 if (reg == 0)
12468 reg = gen_reg_rtx (Pmode);
12469 /* Use the generic Mach-O PIC machinery. */
12470 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12472 #endif
12474 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12475 new_rtx = addr;
12476 else if (TARGET_64BIT
12477 && ix86_cmodel != CM_SMALL_PIC
12478 && gotoff_operand (addr, Pmode))
12480 rtx tmpreg;
12481 /* This symbol may be referenced via a displacement from the PIC
12482 base address (@GOTOFF). */
12484 if (reload_in_progress)
12485 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12486 if (GET_CODE (addr) == CONST)
12487 addr = XEXP (addr, 0);
12488 if (GET_CODE (addr) == PLUS)
12490 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12491 UNSPEC_GOTOFF);
12492 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12494 else
12495 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12496 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12497 if (!reg)
12498 tmpreg = gen_reg_rtx (Pmode);
12499 else
12500 tmpreg = reg;
12501 emit_move_insn (tmpreg, new_rtx);
12503 if (reg != 0)
12505 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12506 tmpreg, 1, OPTAB_DIRECT);
12507 new_rtx = reg;
12509 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12511 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12513 /* This symbol may be referenced via a displacement from the PIC
12514 base address (@GOTOFF). */
12516 if (reload_in_progress)
12517 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12518 if (GET_CODE (addr) == CONST)
12519 addr = XEXP (addr, 0);
12520 if (GET_CODE (addr) == PLUS)
12522 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12523 UNSPEC_GOTOFF);
12524 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12526 else
12527 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12528 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12529 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12531 if (reg != 0)
12533 emit_move_insn (reg, new_rtx);
12534 new_rtx = reg;
12537 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12538 /* We can't use @GOTOFF for text labels on VxWorks;
12539 see gotoff_operand. */
12540 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12542 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12544 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12545 return legitimize_dllimport_symbol (addr, true);
12546 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12547 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12548 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12550 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12551 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12555 /* For x64 PE-COFF there is no GOT table, so we use the address
12556 directly. */
12557 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12559 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12560 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12562 if (reg == 0)
12563 reg = gen_reg_rtx (Pmode);
12564 emit_move_insn (reg, new_rtx);
12565 new_rtx = reg;
12567 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12569 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12570 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12571 new_rtx = gen_const_mem (Pmode, new_rtx);
12572 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12574 if (reg == 0)
12575 reg = gen_reg_rtx (Pmode);
12576 /* Use gen_movsi directly; otherwise the address is loaded
12577 into a register for CSE. We don't want to CSE these addresses;
12578 instead we CSE the addresses loaded from the GOT table, so skip this. */
12579 emit_insn (gen_movsi (reg, new_rtx));
12580 new_rtx = reg;
12582 else
12584 /* This symbol must be referenced via a load from the
12585 Global Offset Table (@GOT). */
12587 if (reload_in_progress)
12588 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12589 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12590 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12591 if (TARGET_64BIT)
12592 new_rtx = force_reg (Pmode, new_rtx);
12593 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12594 new_rtx = gen_const_mem (Pmode, new_rtx);
12595 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12597 if (reg == 0)
12598 reg = gen_reg_rtx (Pmode);
12599 emit_move_insn (reg, new_rtx);
12600 new_rtx = reg;
12603 else
12605 if (CONST_INT_P (addr)
12606 && !x86_64_immediate_operand (addr, VOIDmode))
12608 if (reg)
12610 emit_move_insn (reg, addr);
12611 new_rtx = reg;
12613 else
12614 new_rtx = force_reg (Pmode, addr);
12616 else if (GET_CODE (addr) == CONST)
12618 addr = XEXP (addr, 0);
12620 /* We must match stuff we generate before. Assume the only
12621 unspecs that can get here are ours. Not that we could do
12622 anything with them anyway.... */
12623 if (GET_CODE (addr) == UNSPEC
12624 || (GET_CODE (addr) == PLUS
12625 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12626 return orig;
12627 gcc_assert (GET_CODE (addr) == PLUS);
12629 if (GET_CODE (addr) == PLUS)
12631 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12633 /* Check first to see if this is a constant offset from a @GOTOFF
12634 symbol reference. */
12635 if (gotoff_operand (op0, Pmode)
12636 && CONST_INT_P (op1))
12638 if (!TARGET_64BIT)
12640 if (reload_in_progress)
12641 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12642 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12643 UNSPEC_GOTOFF);
12644 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12645 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12646 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12648 if (reg != 0)
12650 emit_move_insn (reg, new_rtx);
12651 new_rtx = reg;
12654 else
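      /* Added note (assumed rationale): on 64-bit targets a constant offset
         outside the +/-16MB window accepted for PIC displacements is split
         out of the address and recombined through registers below.  */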
12656 if (INTVAL (op1) < -16*1024*1024
12657 || INTVAL (op1) >= 16*1024*1024)
12659 if (!x86_64_immediate_operand (op1, Pmode))
12660 op1 = force_reg (Pmode, op1);
12661 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12665 else
12667 rtx base = legitimize_pic_address (op0, reg);
12668 enum machine_mode mode = GET_MODE (base);
12669 new_rtx
12670 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12672 if (CONST_INT_P (new_rtx))
12674 if (INTVAL (new_rtx) < -16*1024*1024
12675 || INTVAL (new_rtx) >= 16*1024*1024)
12677 if (!x86_64_immediate_operand (new_rtx, mode))
12678 new_rtx = force_reg (mode, new_rtx);
12679 new_rtx
12680 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12682 else
12683 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12685 else
12687 if (GET_CODE (new_rtx) == PLUS
12688 && CONSTANT_P (XEXP (new_rtx, 1)))
12690 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12691 new_rtx = XEXP (new_rtx, 1);
12693 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12698 return new_rtx;
12701 /* Load the thread pointer. If TO_REG is true, force it into a register. */
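/* (Added note, assuming the usual GNU/Linux TLS layout: the UNSPEC_TP built
   below ultimately expands to a %gs:0 access on 32-bit targets and a %fs:0
   access on 64-bit targets.)  */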
12703 static rtx
12704 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12706 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12708 if (GET_MODE (tp) != tp_mode)
12710 gcc_assert (GET_MODE (tp) == SImode);
12711 gcc_assert (tp_mode == DImode);
12713 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12716 if (to_reg)
12717 tp = copy_to_mode_reg (tp_mode, tp);
12719 return tp;
12722 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12724 static GTY(()) rtx ix86_tls_symbol;
12726 static rtx
12727 ix86_tls_get_addr (void)
12729 if (!ix86_tls_symbol)
12731 const char *sym
12732 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12733 ? "___tls_get_addr" : "__tls_get_addr");
12735 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12738 return ix86_tls_symbol;
12741 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12743 static GTY(()) rtx ix86_tls_module_base_symbol;
12745 static rtx
12746 ix86_tls_module_base (void)
12748 if (!ix86_tls_module_base_symbol)
12750 ix86_tls_module_base_symbol
12751 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12753 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12754 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12757 return ix86_tls_module_base_symbol;
12760 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12761 false if we expect this to be used for a memory address and true if
12762 we expect to load the address into a register. */
12764 static rtx
12765 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12767 rtx dest, base, off;
12768 rtx pic = NULL_RTX, tp = NULL_RTX;
12769 enum machine_mode tp_mode = Pmode;
12770 int type;
12772 switch (model)
12774 case TLS_MODEL_GLOBAL_DYNAMIC:
12775 dest = gen_reg_rtx (Pmode);
12777 if (!TARGET_64BIT)
12779 if (flag_pic)
12780 pic = pic_offset_table_rtx;
12781 else
12783 pic = gen_reg_rtx (Pmode);
12784 emit_insn (gen_set_got (pic));
12788 if (TARGET_GNU2_TLS)
12790 if (TARGET_64BIT)
12791 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12792 else
12793 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12795 tp = get_thread_pointer (Pmode, true);
12796 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12798 if (GET_MODE (x) != Pmode)
12799 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12801 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12803 else
12805 rtx caddr = ix86_tls_get_addr ();
12807 if (TARGET_64BIT)
12809 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12810 rtx insns;
12812 start_sequence ();
12813 emit_call_insn
12814 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12815 insns = get_insns ();
12816 end_sequence ();
12818 if (GET_MODE (x) != Pmode)
12819 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12821 RTL_CONST_CALL_P (insns) = 1;
12822 emit_libcall_block (insns, dest, rax, x);
12824 else
12825 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12827 break;
12829 case TLS_MODEL_LOCAL_DYNAMIC:
12830 base = gen_reg_rtx (Pmode);
12832 if (!TARGET_64BIT)
12834 if (flag_pic)
12835 pic = pic_offset_table_rtx;
12836 else
12838 pic = gen_reg_rtx (Pmode);
12839 emit_insn (gen_set_got (pic));
12843 if (TARGET_GNU2_TLS)
12845 rtx tmp = ix86_tls_module_base ();
12847 if (TARGET_64BIT)
12848 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12849 else
12850 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12852 tp = get_thread_pointer (Pmode, true);
12853 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12854 gen_rtx_MINUS (Pmode, tmp, tp));
12856 else
12858 rtx caddr = ix86_tls_get_addr ();
12860 if (TARGET_64BIT)
12862 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12863 rtx insns, eqv;
12865 start_sequence ();
12866 emit_call_insn
12867 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12868 insns = get_insns ();
12869 end_sequence ();
12871 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12872 share the LD_BASE result with other LD model accesses. */
12873 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12874 UNSPEC_TLS_LD_BASE);
12876 RTL_CONST_CALL_P (insns) = 1;
12877 emit_libcall_block (insns, base, rax, eqv);
12879 else
12880 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12883 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12884 off = gen_rtx_CONST (Pmode, off);
12886 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12888 if (TARGET_GNU2_TLS)
12890 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12892 if (GET_MODE (x) != Pmode)
12893 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12895 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12897 break;
12899 case TLS_MODEL_INITIAL_EXEC:
12900 if (TARGET_64BIT)
12902 if (TARGET_SUN_TLS && !TARGET_X32)
12904 /* The Sun linker took the AMD64 TLS spec literally
12905 and can only handle %rax as the destination of the
12906 initial-exec code sequence. */
12908 dest = gen_reg_rtx (DImode);
12909 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12910 return dest;
12913 /* Generate DImode references to avoid %fs:(%reg32)
12914 problems and a linker IE->LE relaxation bug. */
12915 tp_mode = DImode;
12916 pic = NULL;
12917 type = UNSPEC_GOTNTPOFF;
12919 else if (flag_pic)
12921 if (reload_in_progress)
12922 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12923 pic = pic_offset_table_rtx;
12924 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12926 else if (!TARGET_ANY_GNU_TLS)
12928 pic = gen_reg_rtx (Pmode);
12929 emit_insn (gen_set_got (pic));
12930 type = UNSPEC_GOTTPOFF;
12932 else
12934 pic = NULL;
12935 type = UNSPEC_INDNTPOFF;
12938 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12939 off = gen_rtx_CONST (tp_mode, off);
12940 if (pic)
12941 off = gen_rtx_PLUS (tp_mode, pic, off);
12942 off = gen_const_mem (tp_mode, off);
12943 set_mem_alias_set (off, ix86_GOT_alias_set ());
12945 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12947 base = get_thread_pointer (tp_mode,
12948 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12949 off = force_reg (tp_mode, off);
12950 return gen_rtx_PLUS (tp_mode, base, off);
12952 else
12954 base = get_thread_pointer (Pmode, true);
12955 dest = gen_reg_rtx (Pmode);
12956 emit_insn (ix86_gen_sub3 (dest, base, off));
12958 break;
12960 case TLS_MODEL_LOCAL_EXEC:
12961 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12962 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12963 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12964 off = gen_rtx_CONST (Pmode, off);
12966 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12968 base = get_thread_pointer (Pmode,
12969 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12970 return gen_rtx_PLUS (Pmode, base, off);
12972 else
12974 base = get_thread_pointer (Pmode, true);
12975 dest = gen_reg_rtx (Pmode);
12976 emit_insn (ix86_gen_sub3 (dest, base, off));
12978 break;
12980 default:
12981 gcc_unreachable ();
12984 return dest;
12987 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12988 to symbol DECL. */
12990 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12991 htab_t dllimport_map;
12993 static tree
12994 get_dllimport_decl (tree decl)
12996 struct tree_map *h, in;
12997 void **loc;
12998 const char *name;
12999 const char *prefix;
13000 size_t namelen, prefixlen;
13001 char *imp_name;
13002 tree to;
13003 rtx rtl;
13005 if (!dllimport_map)
13006 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13008 in.hash = htab_hash_pointer (decl);
13009 in.base.from = decl;
13010 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13011 h = (struct tree_map *) *loc;
13012 if (h)
13013 return h->to;
13015 *loc = h = ggc_alloc_tree_map ();
13016 h->hash = in.hash;
13017 h->base.from = decl;
13018 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13019 VAR_DECL, NULL, ptr_type_node);
13020 DECL_ARTIFICIAL (to) = 1;
13021 DECL_IGNORED_P (to) = 1;
13022 DECL_EXTERNAL (to) = 1;
13023 TREE_READONLY (to) = 1;
13025 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13026 name = targetm.strip_name_encoding (name);
13027 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13028 ? "*__imp_" : "*__imp__";
13029 namelen = strlen (name);
13030 prefixlen = strlen (prefix);
13031 imp_name = (char *) alloca (namelen + prefixlen + 1);
13032 memcpy (imp_name, prefix, prefixlen);
13033 memcpy (imp_name + prefixlen, name, namelen + 1);
13035 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13036 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13037 SET_SYMBOL_REF_DECL (rtl, to);
13038 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13040 rtl = gen_const_mem (Pmode, rtl);
13041 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13043 SET_DECL_RTL (to, rtl);
13044 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13046 return to;
13049 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13050 true if we require the result be a register. */
13052 static rtx
13053 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13055 tree imp_decl;
13056 rtx x;
13058 gcc_assert (SYMBOL_REF_DECL (symbol));
13059 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13061 x = DECL_RTL (imp_decl);
13062 if (want_reg)
13063 x = force_reg (Pmode, x);
13064 return x;
13067 /* Try machine-dependent ways of modifying an illegitimate address
13068 to be legitimate. If we find one, return the new, valid address.
13069 This macro is used in only one place: `memory_address' in explow.c.
13071 OLDX is the address as it was before break_out_memory_refs was called.
13072 In some cases it is useful to look at this to decide what needs to be done.
13074 It is always safe for this macro to do nothing. It exists to recognize
13075 opportunities to optimize the output.
13077 For the 80386, we handle X+REG by loading X into a register R and
13078 using R+REG. R will go in a general reg and indexing will be used.
13079 However, if REG is a broken-out memory address or multiplication,
13080 nothing needs to be done because REG can certainly go in a general reg.
13082 When -fpic is used, special handling is needed for symbolic references.
13083 See comments by legitimize_pic_address in i386.c for details. */
13085 static rtx
13086 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13087 enum machine_mode mode)
13089 int changed = 0;
13090 unsigned log;
13092 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13093 if (log)
13094 return legitimize_tls_address (x, (enum tls_model) log, false);
13095 if (GET_CODE (x) == CONST
13096 && GET_CODE (XEXP (x, 0)) == PLUS
13097 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13098 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13100 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13101 (enum tls_model) log, false);
13102 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13105 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13107 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13108 return legitimize_dllimport_symbol (x, true);
13109 if (GET_CODE (x) == CONST
13110 && GET_CODE (XEXP (x, 0)) == PLUS
13111 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13112 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13114 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13115 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13119 if (flag_pic && SYMBOLIC_CONST (x))
13120 return legitimize_pic_address (x, 0);
13122 #if TARGET_MACHO
13123 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13124 return machopic_indirect_data_reference (x, 0);
13125 #endif
13127 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13128 if (GET_CODE (x) == ASHIFT
13129 && CONST_INT_P (XEXP (x, 1))
13130 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13132 changed = 1;
13133 log = INTVAL (XEXP (x, 1));
13134 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13135 GEN_INT (1 << log));
13138 if (GET_CODE (x) == PLUS)
13140 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13142 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13143 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13144 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13146 changed = 1;
13147 log = INTVAL (XEXP (XEXP (x, 0), 1));
13148 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13149 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13150 GEN_INT (1 << log));
13153 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13154 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13155 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13157 changed = 1;
13158 log = INTVAL (XEXP (XEXP (x, 1), 1));
13159 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13160 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13161 GEN_INT (1 << log));
13164 /* Put multiply first if it isn't already. */
13165 if (GET_CODE (XEXP (x, 1)) == MULT)
13167 rtx tmp = XEXP (x, 0);
13168 XEXP (x, 0) = XEXP (x, 1);
13169 XEXP (x, 1) = tmp;
13170 changed = 1;
13173 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13174 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13175 created by virtual register instantiation, register elimination, and
13176 similar optimizations. */
13177 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13179 changed = 1;
13180 x = gen_rtx_PLUS (Pmode,
13181 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13182 XEXP (XEXP (x, 1), 0)),
13183 XEXP (XEXP (x, 1), 1));
13186 /* Canonicalize
13187 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13188 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13189 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13190 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13191 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13192 && CONSTANT_P (XEXP (x, 1)))
13194 rtx constant;
13195 rtx other = NULL_RTX;
13197 if (CONST_INT_P (XEXP (x, 1)))
13199 constant = XEXP (x, 1);
13200 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13202 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13204 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13205 other = XEXP (x, 1);
13207 else
13208 constant = 0;
13210 if (constant)
13212 changed = 1;
13213 x = gen_rtx_PLUS (Pmode,
13214 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13215 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13216 plus_constant (Pmode, other,
13217 INTVAL (constant)));
13221 if (changed && ix86_legitimate_address_p (mode, x, false))
13222 return x;
13224 if (GET_CODE (XEXP (x, 0)) == MULT)
13226 changed = 1;
13227 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13230 if (GET_CODE (XEXP (x, 1)) == MULT)
13232 changed = 1;
13233 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13236 if (changed
13237 && REG_P (XEXP (x, 1))
13238 && REG_P (XEXP (x, 0)))
13239 return x;
13241 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13243 changed = 1;
13244 x = legitimize_pic_address (x, 0);
13247 if (changed && ix86_legitimate_address_p (mode, x, false))
13248 return x;
13250 if (REG_P (XEXP (x, 0)))
13252 rtx temp = gen_reg_rtx (Pmode);
13253 rtx val = force_operand (XEXP (x, 1), temp);
13254 if (val != temp)
13256 val = convert_to_mode (Pmode, val, 1);
13257 emit_move_insn (temp, val);
13260 XEXP (x, 1) = temp;
13261 return x;
13264 else if (REG_P (XEXP (x, 1)))
13266 rtx temp = gen_reg_rtx (Pmode);
13267 rtx val = force_operand (XEXP (x, 0), temp);
13268 if (val != temp)
13270 val = convert_to_mode (Pmode, val, 1);
13271 emit_move_insn (temp, val);
13274 XEXP (x, 0) = temp;
13275 return x;
13279 return x;
13282 /* Print an integer constant expression in assembler syntax. Addition
13283 and subtraction are the only arithmetic that may appear in these
13284 expressions. FILE is the stdio stream to write to, X is the rtx, and
13285 CODE is the operand print code from the output string. */
13287 static void
13288 output_pic_addr_const (FILE *file, rtx x, int code)
13290 char buf[256];
13292 switch (GET_CODE (x))
13294 case PC:
13295 gcc_assert (flag_pic);
13296 putc ('.', file);
13297 break;
13299 case SYMBOL_REF:
13300 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13301 output_addr_const (file, x);
13302 else
13304 const char *name = XSTR (x, 0);
13306 /* Mark the decl as referenced so that cgraph will
13307 output the function. */
13308 if (SYMBOL_REF_DECL (x))
13309 mark_decl_referenced (SYMBOL_REF_DECL (x));
13311 #if TARGET_MACHO
13312 if (MACHOPIC_INDIRECT
13313 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13314 name = machopic_indirection_name (x, /*stub_p=*/true);
13315 #endif
13316 assemble_name (file, name);
13318 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13319 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13320 fputs ("@PLT", file);
13321 break;
13323 case LABEL_REF:
13324 x = XEXP (x, 0);
13325 /* FALLTHRU */
13326 case CODE_LABEL:
13327 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13328 assemble_name (asm_out_file, buf);
13329 break;
13331 case CONST_INT:
13332 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13333 break;
13335 case CONST:
13336 /* This used to output parentheses around the expression,
13337 but that does not work on the 386 (either ATT or BSD assembler). */
13338 output_pic_addr_const (file, XEXP (x, 0), code);
13339 break;
13341 case CONST_DOUBLE:
13342 if (GET_MODE (x) == VOIDmode)
13344 /* We can use %d if the number is <32 bits and positive. */
13345 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13346 fprintf (file, "0x%lx%08lx",
13347 (unsigned long) CONST_DOUBLE_HIGH (x),
13348 (unsigned long) CONST_DOUBLE_LOW (x));
13349 else
13350 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13352 else
13353 /* We can't handle floating point constants;
13354 TARGET_PRINT_OPERAND must handle them. */
13355 output_operand_lossage ("floating constant misused");
13356 break;
13358 case PLUS:
13359 /* Some assemblers need integer constants to appear first. */
13360 if (CONST_INT_P (XEXP (x, 0)))
13362 output_pic_addr_const (file, XEXP (x, 0), code);
13363 putc ('+', file);
13364 output_pic_addr_const (file, XEXP (x, 1), code);
13366 else
13368 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13369 output_pic_addr_const (file, XEXP (x, 1), code);
13370 putc ('+', file);
13371 output_pic_addr_const (file, XEXP (x, 0), code);
13373 break;
13375 case MINUS:
13376 if (!TARGET_MACHO)
13377 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13378 output_pic_addr_const (file, XEXP (x, 0), code);
13379 putc ('-', file);
13380 output_pic_addr_const (file, XEXP (x, 1), code);
13381 if (!TARGET_MACHO)
13382 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13383 break;
13385 case UNSPEC:
13386 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13388 bool f = i386_asm_output_addr_const_extra (file, x);
13389 gcc_assert (f);
13390 break;
13393 gcc_assert (XVECLEN (x, 0) == 1);
13394 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13395 switch (XINT (x, 1))
13397 case UNSPEC_GOT:
13398 fputs ("@GOT", file);
13399 break;
13400 case UNSPEC_GOTOFF:
13401 fputs ("@GOTOFF", file);
13402 break;
13403 case UNSPEC_PLTOFF:
13404 fputs ("@PLTOFF", file);
13405 break;
13406 case UNSPEC_PCREL:
13407 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13408 "(%rip)" : "[rip]", file);
13409 break;
13410 case UNSPEC_GOTPCREL:
13411 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13412 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13413 break;
13414 case UNSPEC_GOTTPOFF:
13415 /* FIXME: This might be @TPOFF in Sun ld too. */
13416 fputs ("@gottpoff", file);
13417 break;
13418 case UNSPEC_TPOFF:
13419 fputs ("@tpoff", file);
13420 break;
13421 case UNSPEC_NTPOFF:
13422 if (TARGET_64BIT)
13423 fputs ("@tpoff", file);
13424 else
13425 fputs ("@ntpoff", file);
13426 break;
13427 case UNSPEC_DTPOFF:
13428 fputs ("@dtpoff", file);
13429 break;
13430 case UNSPEC_GOTNTPOFF:
13431 if (TARGET_64BIT)
13432 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13433 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13434 else
13435 fputs ("@gotntpoff", file);
13436 break;
13437 case UNSPEC_INDNTPOFF:
13438 fputs ("@indntpoff", file);
13439 break;
13440 #if TARGET_MACHO
13441 case UNSPEC_MACHOPIC_OFFSET:
13442 putc ('-', file);
13443 machopic_output_function_base_name (file);
13444 break;
13445 #endif
13446 default:
13447 output_operand_lossage ("invalid UNSPEC as operand");
13448 break;
13450 break;
13452 default:
13453 output_operand_lossage ("invalid expression as operand");
13457 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13458 We need to emit DTP-relative relocations. */
13460 static void ATTRIBUTE_UNUSED
13461 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13463 fputs (ASM_LONG, file);
13464 output_addr_const (file, x);
13465 fputs ("@dtpoff", file);
13466 switch (size)
13468 case 4:
13469 break;
13470 case 8:
13471 fputs (", 0", file);
13472 break;
13473 default:
13474 gcc_unreachable ();
13478 /* Return true if X is a representation of the PIC register. This copes
13479 with calls from ix86_find_base_term, where the register might have
13480 been replaced by a cselib value. */
13482 static bool
13483 ix86_pic_register_p (rtx x)
13485 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13486 return (pic_offset_table_rtx
13487 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13488 else
13489 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13492 /* Helper function for ix86_delegitimize_address.
13493 Attempt to delegitimize TLS local-exec accesses. */
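/* (Added sketch, assuming the usual GNU/Linux forms: an address such as
   %fs:x@tpoff on 64-bit, or x@ntpoff with %gs on 32-bit, is turned back
   into the plain symbol x plus whatever base/index was attached to it.)  */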
13495 static rtx
13496 ix86_delegitimize_tls_address (rtx orig_x)
13498 rtx x = orig_x, unspec;
13499 struct ix86_address addr;
13501 if (!TARGET_TLS_DIRECT_SEG_REFS)
13502 return orig_x;
13503 if (MEM_P (x))
13504 x = XEXP (x, 0);
13505 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13506 return orig_x;
13507 if (ix86_decompose_address (x, &addr) == 0
13508 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13509 || addr.disp == NULL_RTX
13510 || GET_CODE (addr.disp) != CONST)
13511 return orig_x;
13512 unspec = XEXP (addr.disp, 0);
13513 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13514 unspec = XEXP (unspec, 0);
13515 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13516 return orig_x;
13517 x = XVECEXP (unspec, 0, 0);
13518 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13519 if (unspec != XEXP (addr.disp, 0))
13520 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13521 if (addr.index)
13523 rtx idx = addr.index;
13524 if (addr.scale != 1)
13525 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13526 x = gen_rtx_PLUS (Pmode, idx, x);
13528 if (addr.base)
13529 x = gen_rtx_PLUS (Pmode, addr.base, x);
13530 if (MEM_P (orig_x))
13531 x = replace_equiv_address_nv (orig_x, x);
13532 return x;
13535 /* In the name of slightly smaller debug output, and to cater to
13536 general assembler lossage, recognize PIC+GOTOFF and turn it back
13537 into a direct symbol reference.
13539 On Darwin, this is necessary to avoid a crash, because Darwin
13540 has a different PIC label for each routine but the DWARF debugging
13541 information is not associated with any particular routine, so it's
13542 necessary to remove references to the PIC label from RTL stored by
13543 the DWARF output code. */
13545 static rtx
13546 ix86_delegitimize_address (rtx x)
13548 rtx orig_x = delegitimize_mem_from_attrs (x);
13549 /* addend is NULL or some rtx if x is something+GOTOFF where
13550 something doesn't include the PIC register. */
13551 rtx addend = NULL_RTX;
13552 /* reg_addend is NULL or a multiple of some register. */
13553 rtx reg_addend = NULL_RTX;
13554 /* const_addend is NULL or a const_int. */
13555 rtx const_addend = NULL_RTX;
13556 /* This is the result, or NULL. */
13557 rtx result = NULL_RTX;
13559 x = orig_x;
13561 if (MEM_P (x))
13562 x = XEXP (x, 0);
13564 if (TARGET_64BIT)
13566 if (GET_CODE (x) == CONST
13567 && GET_CODE (XEXP (x, 0)) == PLUS
13568 && GET_MODE (XEXP (x, 0)) == Pmode
13569 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13570 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13571 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13573 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13574 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13575 if (MEM_P (orig_x))
13576 x = replace_equiv_address_nv (orig_x, x);
13577 return x;
13580 if (GET_CODE (x) == CONST
13581 && GET_CODE (XEXP (x, 0)) == UNSPEC
13582 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13583 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13584 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13586 x = XVECEXP (XEXP (x, 0), 0, 0);
13587 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13589 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13590 GET_MODE (x), 0);
13591 if (x == NULL_RTX)
13592 return orig_x;
13594 return x;
13597 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13598 return ix86_delegitimize_tls_address (orig_x);
13600 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13601 and -mcmodel=medium -fpic. */
13604 if (GET_CODE (x) != PLUS
13605 || GET_CODE (XEXP (x, 1)) != CONST)
13606 return ix86_delegitimize_tls_address (orig_x);
13608 if (ix86_pic_register_p (XEXP (x, 0)))
13609 /* %ebx + GOT/GOTOFF */
13611 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13613 /* %ebx + %reg * scale + GOT/GOTOFF */
13614 reg_addend = XEXP (x, 0);
13615 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13616 reg_addend = XEXP (reg_addend, 1);
13617 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13618 reg_addend = XEXP (reg_addend, 0);
13619 else
13621 reg_addend = NULL_RTX;
13622 addend = XEXP (x, 0);
13625 else
13626 addend = XEXP (x, 0);
13628 x = XEXP (XEXP (x, 1), 0);
13629 if (GET_CODE (x) == PLUS
13630 && CONST_INT_P (XEXP (x, 1)))
13632 const_addend = XEXP (x, 1);
13633 x = XEXP (x, 0);
13636 if (GET_CODE (x) == UNSPEC
13637 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13638 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13639 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13640 && !MEM_P (orig_x) && !addend)))
13641 result = XVECEXP (x, 0, 0);
13643 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13644 && !MEM_P (orig_x))
13645 result = XVECEXP (x, 0, 0);
13647 if (! result)
13648 return ix86_delegitimize_tls_address (orig_x);
13650 if (const_addend)
13651 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13652 if (reg_addend)
13653 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13654 if (addend)
13656 /* If the rest of original X doesn't involve the PIC register, add
13657 addend and subtract pic_offset_table_rtx. This can happen e.g.
13658 for code like:
13659 leal (%ebx, %ecx, 4), %ecx
13661 movl foo@GOTOFF(%ecx), %edx
13662 in which case we return (%ecx - %ebx) + foo. */
13663 if (pic_offset_table_rtx)
13664 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13665 pic_offset_table_rtx),
13666 result);
13667 else
13668 return orig_x;
13670 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13672 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13673 if (result == NULL_RTX)
13674 return orig_x;
13676 return result;
13679 /* If X is a machine specific address (i.e. a symbol or label being
13680 referenced as a displacement from the GOT implemented using an
13681 UNSPEC), then return the base term. Otherwise return X. */
13683 rtx
13684 ix86_find_base_term (rtx x)
13686 rtx term;
13688 if (TARGET_64BIT)
13690 if (GET_CODE (x) != CONST)
13691 return x;
13692 term = XEXP (x, 0);
13693 if (GET_CODE (term) == PLUS
13694 && (CONST_INT_P (XEXP (term, 1))
13695 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13696 term = XEXP (term, 0);
13697 if (GET_CODE (term) != UNSPEC
13698 || (XINT (term, 1) != UNSPEC_GOTPCREL
13699 && XINT (term, 1) != UNSPEC_PCREL))
13700 return x;
13702 return XVECEXP (term, 0, 0);
13705 return ix86_delegitimize_address (x);
13708 static void
13709 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13710 bool fp, FILE *file)
13712 const char *suffix;
13714 if (mode == CCFPmode || mode == CCFPUmode)
13716 code = ix86_fp_compare_code_to_integer (code);
13717 mode = CCmode;
13719 if (reverse)
13720 code = reverse_condition (code);
13722 switch (code)
13724 case EQ:
13725 switch (mode)
13727 case CCAmode:
13728 suffix = "a";
13729 break;
13731 case CCCmode:
13732 suffix = "c";
13733 break;
13735 case CCOmode:
13736 suffix = "o";
13737 break;
13739 case CCSmode:
13740 suffix = "s";
13741 break;
13743 default:
13744 suffix = "e";
13746 break;
13747 case NE:
13748 switch (mode)
13750 case CCAmode:
13751 suffix = "na";
13752 break;
13754 case CCCmode:
13755 suffix = "nc";
13756 break;
13758 case CCOmode:
13759 suffix = "no";
13760 break;
13762 case CCSmode:
13763 suffix = "ns";
13764 break;
13766 default:
13767 suffix = "ne";
13769 break;
13770 case GT:
13771 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13772 suffix = "g";
13773 break;
13774 case GTU:
13775 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13776 Those same assemblers have the same but opposite lossage on cmov. */
13777 if (mode == CCmode)
13778 suffix = fp ? "nbe" : "a";
13779 else
13780 gcc_unreachable ();
13781 break;
13782 case LT:
13783 switch (mode)
13785 case CCNOmode:
13786 case CCGOCmode:
13787 suffix = "s";
13788 break;
13790 case CCmode:
13791 case CCGCmode:
13792 suffix = "l";
13793 break;
13795 default:
13796 gcc_unreachable ();
13798 break;
13799 case LTU:
13800 if (mode == CCmode)
13801 suffix = "b";
13802 else if (mode == CCCmode)
13803 suffix = "c";
13804 else
13805 gcc_unreachable ();
13806 break;
13807 case GE:
13808 switch (mode)
13810 case CCNOmode:
13811 case CCGOCmode:
13812 suffix = "ns";
13813 break;
13815 case CCmode:
13816 case CCGCmode:
13817 suffix = "ge";
13818 break;
13820 default:
13821 gcc_unreachable ();
13823 break;
13824 case GEU:
13825 if (mode == CCmode)
13826 suffix = fp ? "nb" : "ae";
13827 else if (mode == CCCmode)
13828 suffix = "nc";
13829 else
13830 gcc_unreachable ();
13831 break;
13832 case LE:
13833 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13834 suffix = "le";
13835 break;
13836 case LEU:
13837 if (mode == CCmode)
13838 suffix = "be";
13839 else
13840 gcc_unreachable ();
13841 break;
13842 case UNORDERED:
13843 suffix = fp ? "u" : "p";
13844 break;
13845 case ORDERED:
13846 suffix = fp ? "nu" : "np";
13847 break;
13848 default:
13849 gcc_unreachable ();
13851 fputs (suffix, file);
13854 /* Print the name of register X to FILE based on its machine mode and number.
13855 If CODE is 'w', pretend the mode is HImode.
13856 If CODE is 'b', pretend the mode is QImode.
13857 If CODE is 'k', pretend the mode is SImode.
13858 If CODE is 'q', pretend the mode is DImode.
13859 If CODE is 'x', pretend the mode is V4SFmode.
13860 If CODE is 't', pretend the mode is V8SFmode.
13861 If CODE is 'h', pretend the reg is the 'high' byte register.
13862 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13863 If CODE is 'd', duplicate the operand for an AVX instruction.
13866 void
13867 print_reg (rtx x, int code, FILE *file)
13869 const char *reg;
13870 unsigned int regno;
13871 bool duplicated = code == 'd' && TARGET_AVX;
13873 if (ASSEMBLER_DIALECT == ASM_ATT)
13874 putc ('%', file);
13876 if (x == pc_rtx)
13878 gcc_assert (TARGET_64BIT);
13879 fputs ("rip", file);
13880 return;
13883 regno = true_regnum (x);
13884 gcc_assert (regno != ARG_POINTER_REGNUM
13885 && regno != FRAME_POINTER_REGNUM
13886 && regno != FLAGS_REG
13887 && regno != FPSR_REG
13888 && regno != FPCR_REG);
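  /* Added note: from here on CODE is reused to hold the operand size in
     bytes, with 0 meaning a high-byte register and 3 meaning st(0).  */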
13890 if (code == 'w' || MMX_REG_P (x))
13891 code = 2;
13892 else if (code == 'b')
13893 code = 1;
13894 else if (code == 'k')
13895 code = 4;
13896 else if (code == 'q')
13897 code = 8;
13898 else if (code == 'y')
13899 code = 3;
13900 else if (code == 'h')
13901 code = 0;
13902 else if (code == 'x')
13903 code = 16;
13904 else if (code == 't')
13905 code = 32;
13906 else
13907 code = GET_MODE_SIZE (GET_MODE (x));
13909 /* Irritatingly, AMD extended registers use a different naming convention
13910 from the normal registers: "r%d[bwd]" */
13911 if (REX_INT_REGNO_P (regno))
13913 gcc_assert (TARGET_64BIT);
13914 putc ('r', file);
13915 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13916 switch (code)
13918 case 0:
13919 error ("extended registers have no high halves");
13920 break;
13921 case 1:
13922 putc ('b', file);
13923 break;
13924 case 2:
13925 putc ('w', file);
13926 break;
13927 case 4:
13928 putc ('d', file);
13929 break;
13930 case 8:
13931 /* no suffix */
13932 break;
13933 default:
13934 error ("unsupported operand size for extended register");
13935 break;
13937 return;
13940 reg = NULL;
13941 switch (code)
13943 case 3:
13944 if (STACK_TOP_P (x))
13946 reg = "st(0)";
13947 break;
13949 /* FALLTHRU */
13950 case 8:
13951 case 4:
13952 case 12:
13953 if (! ANY_FP_REG_P (x))
13954 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13955 /* FALLTHRU */
13956 case 16:
13957 case 2:
13958 normal:
13959 reg = hi_reg_name[regno];
13960 break;
13961 case 1:
13962 if (regno >= ARRAY_SIZE (qi_reg_name))
13963 goto normal;
13964 reg = qi_reg_name[regno];
13965 break;
13966 case 0:
13967 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13968 goto normal;
13969 reg = qi_high_reg_name[regno];
13970 break;
13971 case 32:
13972 if (SSE_REG_P (x))
13974 gcc_assert (!duplicated);
13975 putc ('y', file);
13976 fputs (hi_reg_name[regno] + 1, file);
13977 return;
13979 break;
13980 default:
13981 gcc_unreachable ();
13984 fputs (reg, file);
13985 if (duplicated)
13987 if (ASSEMBLER_DIALECT == ASM_ATT)
13988 fprintf (file, ", %%%s", reg);
13989 else
13990 fprintf (file, ", %s", reg);
13994 /* Locate some local-dynamic symbol still in use by this function
13995 so that we can print its name in some tls_local_dynamic_base
13996 pattern. */
13998 static int
13999 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14001 rtx x = *px;
14003 if (GET_CODE (x) == SYMBOL_REF
14004 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14006 cfun->machine->some_ld_name = XSTR (x, 0);
14007 return 1;
14010 return 0;
14013 static const char *
14014 get_some_local_dynamic_name (void)
14016 rtx insn;
14018 if (cfun->machine->some_ld_name)
14019 return cfun->machine->some_ld_name;
14021 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14022 if (NONDEBUG_INSN_P (insn)
14023 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14024 return cfun->machine->some_ld_name;
14026 return NULL;
14029 /* Meaning of CODE:
14030 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14031 C -- print opcode suffix for set/cmov insn.
14032 c -- like C, but print reversed condition
14033 F,f -- likewise, but for floating-point.
14034 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14035 otherwise nothing
14036 R -- print the prefix for register names.
14037 z -- print the opcode suffix for the size of the current operand.
14038 Z -- likewise, with special suffixes for x87 instructions.
14039 * -- print a star (in certain assembler syntax)
14040 A -- print an absolute memory reference.
14041 E -- print address with DImode register names if TARGET_64BIT.
14042 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14043 s -- print a shift double count, followed by the assembler's argument
14044 delimiter.
14045 b -- print the QImode name of the register for the indicated operand.
14046 %b0 would print %al if operands[0] is reg 0.
14047 w -- likewise, print the HImode name of the register.
14048 k -- likewise, print the SImode name of the register.
14049 q -- likewise, print the DImode name of the register.
14050 x -- likewise, print the V4SFmode name of the register.
14051 t -- likewise, print the V8SFmode name of the register.
14052 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14053 y -- print "st(0)" instead of "st" as a register.
14054 d -- print duplicated register operand for AVX instruction.
14055 D -- print condition for SSE cmp instruction.
14056 P -- if PIC, print an @PLT suffix.
14057 p -- print raw symbol name.
14058 X -- don't print any sort of PIC '@' suffix for a symbol.
14059 & -- print some in-use local-dynamic symbol name.
14060 H -- print a memory address offset by 8; used for sse high-parts
14061 Y -- print condition for XOP pcom* instruction.
14062 + -- print a branch hint as 'cs' or 'ds' prefix
14063 ; -- print a semicolon (after prefixes due to bug in older gas).
14064 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14065 @ -- print a segment register of thread base pointer load
14066 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14069 void
14070 ix86_print_operand (FILE *file, rtx x, int code)
14072 if (code)
14074 switch (code)
14076 case 'A':
14077 switch (ASSEMBLER_DIALECT)
14079 case ASM_ATT:
14080 putc ('*', file);
14081 break;
14083 case ASM_INTEL:
14084 /* Intel syntax. For absolute addresses, registers should not
14085 be surrounded by braces. */
14086 if (!REG_P (x))
14088 putc ('[', file);
14089 ix86_print_operand (file, x, 0);
14090 putc (']', file);
14091 return;
14093 break;
14095 default:
14096 gcc_unreachable ();
14099 ix86_print_operand (file, x, 0);
14100 return;
14102 case 'E':
14103 /* Wrap address in an UNSPEC to declare special handling. */
14104 if (TARGET_64BIT)
14105 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14107 output_address (x);
14108 return;
14110 case 'L':
14111 if (ASSEMBLER_DIALECT == ASM_ATT)
14112 putc ('l', file);
14113 return;
14115 case 'W':
14116 if (ASSEMBLER_DIALECT == ASM_ATT)
14117 putc ('w', file);
14118 return;
14120 case 'B':
14121 if (ASSEMBLER_DIALECT == ASM_ATT)
14122 putc ('b', file);
14123 return;
14125 case 'Q':
14126 if (ASSEMBLER_DIALECT == ASM_ATT)
14127 putc ('l', file);
14128 return;
14130 case 'S':
14131 if (ASSEMBLER_DIALECT == ASM_ATT)
14132 putc ('s', file);
14133 return;
14135 case 'T':
14136 if (ASSEMBLER_DIALECT == ASM_ATT)
14137 putc ('t', file);
14138 return;
14140 case 'O':
14141 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14142 if (ASSEMBLER_DIALECT != ASM_ATT)
14143 return;
14145 switch (GET_MODE_SIZE (GET_MODE (x)))
14147 case 2:
14148 putc ('w', file);
14149 break;
14151 case 4:
14152 putc ('l', file);
14153 break;
14155 case 8:
14156 putc ('q', file);
14157 break;
14159 default:
14160 output_operand_lossage
14161 ("invalid operand size for operand code 'O'");
14162 return;
14165 putc ('.', file);
14166 #endif
14167 return;
14169 case 'z':
14170 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14172 /* Opcodes don't get size suffixes if using Intel opcodes. */
14173 if (ASSEMBLER_DIALECT == ASM_INTEL)
14174 return;
14176 switch (GET_MODE_SIZE (GET_MODE (x)))
14178 case 1:
14179 putc ('b', file);
14180 return;
14182 case 2:
14183 putc ('w', file);
14184 return;
14186 case 4:
14187 putc ('l', file);
14188 return;
14190 case 8:
14191 putc ('q', file);
14192 return;
14194 default:
14195 output_operand_lossage
14196 ("invalid operand size for operand code 'z'");
14197 return;
14201 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14202 warning
14203 (0, "non-integer operand used with operand code 'z'");
14204 /* FALLTHRU */
14206 case 'Z':
14207 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14208 if (ASSEMBLER_DIALECT == ASM_INTEL)
14209 return;
14211 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14213 switch (GET_MODE_SIZE (GET_MODE (x)))
14215 case 2:
14216 #ifdef HAVE_AS_IX86_FILDS
14217 putc ('s', file);
14218 #endif
14219 return;
14221 case 4:
14222 putc ('l', file);
14223 return;
14225 case 8:
14226 #ifdef HAVE_AS_IX86_FILDQ
14227 putc ('q', file);
14228 #else
14229 fputs ("ll", file);
14230 #endif
14231 return;
14233 default:
14234 break;
14237 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14239 /* 387 opcodes don't get size suffixes
14240 if the operands are registers. */
14241 if (STACK_REG_P (x))
14242 return;
14244 switch (GET_MODE_SIZE (GET_MODE (x)))
14246 case 4:
14247 putc ('s', file);
14248 return;
14250 case 8:
14251 putc ('l', file);
14252 return;
14254 case 12:
14255 case 16:
14256 putc ('t', file);
14257 return;
14259 default:
14260 break;
14263 else
14265 output_operand_lossage
14266 ("invalid operand type used with operand code 'Z'");
14267 return;
14270 output_operand_lossage
14271 ("invalid operand size for operand code 'Z'");
14272 return;
14274 case 'd':
14275 case 'b':
14276 case 'w':
14277 case 'k':
14278 case 'q':
14279 case 'h':
14280 case 't':
14281 case 'y':
14282 case 'x':
14283 case 'X':
14284 case 'P':
14285 case 'p':
14286 break;
14288 case 's':
14289 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14291 ix86_print_operand (file, x, 0);
14292 fputs (", ", file);
14294 return;
14296 case 'Y':
14297 switch (GET_CODE (x))
14299 case NE:
14300 fputs ("neq", file);
14301 break;
14302 case EQ:
14303 fputs ("eq", file);
14304 break;
14305 case GE:
14306 case GEU:
14307 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14308 break;
14309 case GT:
14310 case GTU:
14311 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14312 break;
14313 case LE:
14314 case LEU:
14315 fputs ("le", file);
14316 break;
14317 case LT:
14318 case LTU:
14319 fputs ("lt", file);
14320 break;
14321 case UNORDERED:
14322 fputs ("unord", file);
14323 break;
14324 case ORDERED:
14325 fputs ("ord", file);
14326 break;
14327 case UNEQ:
14328 fputs ("ueq", file);
14329 break;
14330 case UNGE:
14331 fputs ("nlt", file);
14332 break;
14333 case UNGT:
14334 fputs ("nle", file);
14335 break;
14336 case UNLE:
14337 fputs ("ule", file);
14338 break;
14339 case UNLT:
14340 fputs ("ult", file);
14341 break;
14342 case LTGT:
14343 fputs ("une", file);
14344 break;
14345 default:
14346 output_operand_lossage ("operand is not a condition code, "
14347 "invalid operand code 'Y'");
14348 return;
14350 return;
14352 case 'D':
14353 /* Little bit of braindamage here. The SSE compare instructions
14354 use completely different names for the comparisons than the
14355 fp conditional moves do. */
14356 switch (GET_CODE (x))
14358 case UNEQ:
14359 if (TARGET_AVX)
14361 fputs ("eq_us", file);
14362 break;
14364 case EQ:
14365 fputs ("eq", file);
14366 break;
14367 case UNLT:
14368 if (TARGET_AVX)
14370 fputs ("nge", file);
14371 break;
14373 case LT:
14374 fputs ("lt", file);
14375 break;
14376 case UNLE:
14377 if (TARGET_AVX)
14379 fputs ("ngt", file);
14380 break;
14382 case LE:
14383 fputs ("le", file);
14384 break;
14385 case UNORDERED:
14386 fputs ("unord", file);
14387 break;
14388 case LTGT:
14389 if (TARGET_AVX)
14391 fputs ("neq_oq", file);
14392 break;
14394 case NE:
14395 fputs ("neq", file);
14396 break;
14397 case GE:
14398 if (TARGET_AVX)
14400 fputs ("ge", file);
14401 break;
14403 case UNGE:
14404 fputs ("nlt", file);
14405 break;
14406 case GT:
14407 if (TARGET_AVX)
14409 fputs ("gt", file);
14410 break;
14412 case UNGT:
14413 fputs ("nle", file);
14414 break;
14415 case ORDERED:
14416 fputs ("ord", file);
14417 break;
14418 default:
14419 output_operand_lossage ("operand is not a condition code, "
14420 "invalid operand code 'D'");
14421 return;
14423 return;
14425 case 'F':
14426 case 'f':
14427 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14428 if (ASSEMBLER_DIALECT == ASM_ATT)
14429 putc ('.', file);
14430 #endif
14432 case 'C':
14433 case 'c':
14434 if (!COMPARISON_P (x))
14436 output_operand_lossage ("operand is not a condition code, "
14437 "invalid operand code '%c'", code);
14438 return;
14440 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14441 code == 'c' || code == 'f',
14442 code == 'F' || code == 'f',
14443 file);
14444 return;
14446 case 'H':
14447 if (!offsettable_memref_p (x))
14449 output_operand_lossage ("operand is not an offsettable memory "
14450 "reference, invalid operand code 'H'");
14451 return;
14453 /* It doesn't actually matter what mode we use here, as we're
14454 only going to use this for printing. */
14455 x = adjust_address_nv (x, DImode, 8);
14456 break;
14458 case 'K':
14459 gcc_assert (CONST_INT_P (x));
14461 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14462 #ifdef HAVE_AS_IX86_HLE
14463 fputs ("xacquire ", file);
14464 #else
14465 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14466 #endif
14467 else if (INTVAL (x) & IX86_HLE_RELEASE)
14468 #ifdef HAVE_AS_IX86_HLE
14469 fputs ("xrelease ", file);
14470 #else
14471 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14472 #endif
14473 /* We do not want to print value of the operand. */
14474 return;
14476 case '*':
14477 if (ASSEMBLER_DIALECT == ASM_ATT)
14478 putc ('*', file);
14479 return;
14481 case '&':
14483 const char *name = get_some_local_dynamic_name ();
14484 if (name == NULL)
14485 output_operand_lossage ("'%%&' used without any "
14486 "local dynamic TLS references");
14487 else
14488 assemble_name (file, name);
14489 return;
14492 case '+':
14494 rtx x;
14496 if (!optimize
14497 || optimize_function_for_size_p (cfun)
14498 || !TARGET_BRANCH_PREDICTION_HINTS)
14499 return;
14501 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14502 if (x)
14504 int pred_val = INTVAL (XEXP (x, 0));
14506 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14507 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14509 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14510 bool cputaken
14511 = final_forward_branch_p (current_output_insn) == 0;
14513 /* Emit hints only in the case default branch prediction
14514 heuristics would fail. */
14515 if (taken != cputaken)
14517 /* We use 3e (DS) prefix for taken branches and
14518 2e (CS) prefix for not taken branches. */
14519 if (taken)
14520 fputs ("ds ; ", file);
14521 else
14522 fputs ("cs ; ", file);
14526 return;
14529 case ';':
14530 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14531 putc (';', file);
14532 #endif
14533 return;
14535 case '@':
14536 if (ASSEMBLER_DIALECT == ASM_ATT)
14537 putc ('%', file);
14539 /* The kernel uses a different segment register for performance
14540 reasons; a system call would not have to trash the userspace
14541 segment register, which would be expensive. */
14542 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14543 fputs ("fs", file);
14544 else
14545 fputs ("gs", file);
14546 return;
14548 case '~':
14549 putc (TARGET_AVX2 ? 'i' : 'f', file);
14550 return;
14552 case '^':
14553 if (TARGET_64BIT && Pmode != word_mode)
14554 fputs ("addr32 ", file);
14555 return;
14557 default:
14558 output_operand_lossage ("invalid operand code '%c'", code);
14562 if (REG_P (x))
14563 print_reg (x, code, file);
14565 else if (MEM_P (x))
14567 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14568 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14569 && GET_MODE (x) != BLKmode)
14571 const char * size;
14572 switch (GET_MODE_SIZE (GET_MODE (x)))
14574 case 1: size = "BYTE"; break;
14575 case 2: size = "WORD"; break;
14576 case 4: size = "DWORD"; break;
14577 case 8: size = "QWORD"; break;
14578 case 12: size = "TBYTE"; break;
14579 case 16:
14580 if (GET_MODE (x) == XFmode)
14581 size = "TBYTE";
14582 else
14583 size = "XMMWORD";
14584 break;
14585 case 32: size = "YMMWORD"; break;
14586 default:
14587 gcc_unreachable ();
14590 /* Check for explicit size override (codes 'b', 'w', 'k',
14591 'q' and 'x') */
14592 if (code == 'b')
14593 size = "BYTE";
14594 else if (code == 'w')
14595 size = "WORD";
14596 else if (code == 'k')
14597 size = "DWORD";
14598 else if (code == 'q')
14599 size = "QWORD";
14600 else if (code == 'x')
14601 size = "XMMWORD";
14603 fputs (size, file);
14604 fputs (" PTR ", file);
14607 x = XEXP (x, 0);
14608 /* Avoid (%rip) for call operands. */
14609 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14610 && !CONST_INT_P (x))
14611 output_addr_const (file, x);
14612 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14613 output_operand_lossage ("invalid constraints for operand");
14614 else
14615 output_address (x);
14618 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14620 REAL_VALUE_TYPE r;
14621 long l;
14623 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14624 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14626 if (ASSEMBLER_DIALECT == ASM_ATT)
14627 putc ('$', file);
14628 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14629 if (code == 'q')
14630 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14631 (unsigned long long) (int) l);
14632 else
14633 fprintf (file, "0x%08x", (unsigned int) l);
14636 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14638 REAL_VALUE_TYPE r;
14639 long l[2];
14641 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14642 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14644 if (ASSEMBLER_DIALECT == ASM_ATT)
14645 putc ('$', file);
14646 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14649 /* These float cases don't actually occur as immediate operands. */
14650 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14652 char dstr[30];
14654 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14655 fputs (dstr, file);
14658 else
14660 /* We have patterns that allow zero sets of memory, for instance.
14661 In 64-bit mode, we should probably support all 8-byte vectors,
14662 since we can in fact encode that into an immediate. */
14663 if (GET_CODE (x) == CONST_VECTOR)
14665 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14666 x = const0_rtx;
14669 if (code != 'P' && code != 'p')
14671 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14673 if (ASSEMBLER_DIALECT == ASM_ATT)
14674 putc ('$', file);
14676 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14677 || GET_CODE (x) == LABEL_REF)
14679 if (ASSEMBLER_DIALECT == ASM_ATT)
14680 putc ('$', file);
14681 else
14682 fputs ("OFFSET FLAT:", file);
14685 if (CONST_INT_P (x))
14686 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14687 else if (flag_pic || MACHOPIC_INDIRECT)
14688 output_pic_addr_const (file, x, code);
14689 else
14690 output_addr_const (file, x);
14694 static bool
14695 ix86_print_operand_punct_valid_p (unsigned char code)
14697 return (code == '@' || code == '*' || code == '+' || code == '&'
14698 || code == ';' || code == '~' || code == '^');
14701 /* Print a memory operand whose address is ADDR. */
14703 static void
14704 ix86_print_operand_address (FILE *file, rtx addr)
14706 struct ix86_address parts;
14707 rtx base, index, disp;
14708 int scale;
14709 int ok;
14710 bool vsib = false;
14711 int code = 0;
14713 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14715 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14716 gcc_assert (parts.index == NULL_RTX);
14717 parts.index = XVECEXP (addr, 0, 1);
14718 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14719 addr = XVECEXP (addr, 0, 0);
14720 vsib = true;
14722 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14724 gcc_assert (TARGET_64BIT);
14725 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14726 code = 'q';
14728 else
14729 ok = ix86_decompose_address (addr, &parts);
14731 gcc_assert (ok);
14733 base = parts.base;
14734 index = parts.index;
14735 disp = parts.disp;
14736 scale = parts.scale;
14738 switch (parts.seg)
14740 case SEG_DEFAULT:
14741 break;
14742 case SEG_FS:
14743 case SEG_GS:
14744 if (ASSEMBLER_DIALECT == ASM_ATT)
14745 putc ('%', file);
14746 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14747 break;
14748 default:
14749 gcc_unreachable ();
14752 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14753 if (TARGET_64BIT && !base && !index)
14755 rtx symbol = disp;
14757 if (GET_CODE (disp) == CONST
14758 && GET_CODE (XEXP (disp, 0)) == PLUS
14759 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14760 symbol = XEXP (XEXP (disp, 0), 0);
14762 if (GET_CODE (symbol) == LABEL_REF
14763 || (GET_CODE (symbol) == SYMBOL_REF
14764 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14765 base = pc_rtx;
14767 if (!base && !index)
14769 /* A displacement-only address requires special attention. */
14771 if (CONST_INT_P (disp))
14773 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14774 fputs ("ds:", file);
14775 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14777 else if (flag_pic)
14778 output_pic_addr_const (file, disp, 0);
14779 else
14780 output_addr_const (file, disp);
14782 else
14784 /* Print SImode register names to force addr32 prefix. */
14785 if (SImode_address_operand (addr, VOIDmode))
14787 #ifdef ENABLE_CHECKING
14788 gcc_assert (TARGET_64BIT);
14789 switch (GET_CODE (addr))
14791 case SUBREG:
14792 gcc_assert (GET_MODE (addr) == SImode);
14793 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14794 break;
14795 case ZERO_EXTEND:
14796 case AND:
14797 gcc_assert (GET_MODE (addr) == DImode);
14798 break;
14799 default:
14800 gcc_unreachable ();
14802 #endif
14803 gcc_assert (!code);
14804 code = 'k';
14806 else if (code == 0
14807 && TARGET_X32
14808 && disp
14809 && CONST_INT_P (disp)
14810 && INTVAL (disp) < -16*1024*1024)
14812 /* X32 runs in 64-bit mode, where displacement, DISP, in
14813 address DISP(%r64), is encoded as 32-bit immediate sign-
14814 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14815 address is %r64 + 0xffffffffbffffd00. When %r64 <
14816 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14817 which is invalid for x32. The correct address is %r64
14818 - 0x40000300 == 0xf7ffdd64. To properly encode
14819 -0x40000300(%r64) for x32, we zero-extend negative
14820 displacement by forcing addr32 prefix which truncates
14821 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14822 zero-extend all negative displacements, including -1(%rsp).
14823 However, for small negative displacements, sign-extension
14824 won't cause overflow. We only zero-extend negative
14825 displacements if they < -16*1024*1024, which is also used
14826 to check legitimate address displacements for PIC. */
14827 code = 'k';
14830 if (ASSEMBLER_DIALECT == ASM_ATT)
14832 if (disp)
14834 if (flag_pic)
14835 output_pic_addr_const (file, disp, 0);
14836 else if (GET_CODE (disp) == LABEL_REF)
14837 output_asm_label (disp);
14838 else
14839 output_addr_const (file, disp);
14842 putc ('(', file);
14843 if (base)
14844 print_reg (base, code, file);
14845 if (index)
14847 putc (',', file);
14848 print_reg (index, vsib ? 0 : code, file);
14849 if (scale != 1 || vsib)
14850 fprintf (file, ",%d", scale);
14852 putc (')', file);
14854 else
14856 rtx offset = NULL_RTX;
14858 if (disp)
14860 /* Pull out the offset of a symbol; print any symbol itself. */
14861 if (GET_CODE (disp) == CONST
14862 && GET_CODE (XEXP (disp, 0)) == PLUS
14863 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14865 offset = XEXP (XEXP (disp, 0), 1);
14866 disp = gen_rtx_CONST (VOIDmode,
14867 XEXP (XEXP (disp, 0), 0));
14870 if (flag_pic)
14871 output_pic_addr_const (file, disp, 0);
14872 else if (GET_CODE (disp) == LABEL_REF)
14873 output_asm_label (disp);
14874 else if (CONST_INT_P (disp))
14875 offset = disp;
14876 else
14877 output_addr_const (file, disp);
14880 putc ('[', file);
14881 if (base)
14883 print_reg (base, code, file);
14884 if (offset)
14886 if (INTVAL (offset) >= 0)
14887 putc ('+', file);
14888 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14891 else if (offset)
14892 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14893 else
14894 putc ('0', file);
14896 if (index)
14898 putc ('+', file);
14899 print_reg (index, vsib ? 0 : code, file);
14900 if (scale != 1 || vsib)
14901 fprintf (file, "*%d", scale);
14903 putc (']', file);
14908 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14910 static bool
14911 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14913 rtx op;
14915 if (GET_CODE (x) != UNSPEC)
14916 return false;
14918 op = XVECEXP (x, 0, 0);
14919 switch (XINT (x, 1))
14921 case UNSPEC_GOTTPOFF:
14922 output_addr_const (file, op);
14923 /* FIXME: This might be @TPOFF in Sun ld. */
14924 fputs ("@gottpoff", file);
14925 break;
14926 case UNSPEC_TPOFF:
14927 output_addr_const (file, op);
14928 fputs ("@tpoff", file);
14929 break;
14930 case UNSPEC_NTPOFF:
14931 output_addr_const (file, op);
14932 if (TARGET_64BIT)
14933 fputs ("@tpoff", file);
14934 else
14935 fputs ("@ntpoff", file);
14936 break;
14937 case UNSPEC_DTPOFF:
14938 output_addr_const (file, op);
14939 fputs ("@dtpoff", file);
14940 break;
14941 case UNSPEC_GOTNTPOFF:
14942 output_addr_const (file, op);
14943 if (TARGET_64BIT)
14944 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14945 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14946 else
14947 fputs ("@gotntpoff", file);
14948 break;
14949 case UNSPEC_INDNTPOFF:
14950 output_addr_const (file, op);
14951 fputs ("@indntpoff", file);
14952 break;
14953 #if TARGET_MACHO
14954 case UNSPEC_MACHOPIC_OFFSET:
14955 output_addr_const (file, op);
14956 putc ('-', file);
14957 machopic_output_function_base_name (file);
14958 break;
14959 #endif
14961 case UNSPEC_STACK_CHECK:
14963 int offset;
14965 gcc_assert (flag_split_stack);
14967 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14968 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14969 #else
14970 gcc_unreachable ();
14971 #endif
14973 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14975 break;
14977 default:
14978 return false;
14981 return true;
14984 /* Split one or more double-mode RTL references into pairs of half-mode
14985 references. The RTL can be REG, offsettable MEM, integer constant, or
14986 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14987 split and "num" is its length. lo_half and hi_half are output arrays
14988 that parallel "operands". */
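/* Illustrative example (assuming the little-endian x86 layout): splitting a
   single DImode pseudo yields two SImode halves, the low word at byte
   offset 0 and the high word at byte offset 4:

     rtx lo, hi;
     split_double_mode (DImode, &op, 1, &lo, &hi);

   MEM operands are split with adjust_address rather than simplify_subreg
   so that volatile accesses keep working.  */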
14990 void
14991 split_double_mode (enum machine_mode mode, rtx operands[],
14992 int num, rtx lo_half[], rtx hi_half[])
14994 enum machine_mode half_mode;
14995 unsigned int byte;
14997 switch (mode)
14999 case TImode:
15000 half_mode = DImode;
15001 break;
15002 case DImode:
15003 half_mode = SImode;
15004 break;
15005 default:
15006 gcc_unreachable ();
15009 byte = GET_MODE_SIZE (half_mode);
15011 while (num--)
15013 rtx op = operands[num];
15015 /* simplify_subreg refuses to split volatile memory addresses,
15016 but we still have to handle them. */
15017 if (MEM_P (op))
15019 lo_half[num] = adjust_address (op, half_mode, 0);
15020 hi_half[num] = adjust_address (op, half_mode, byte);
15022 else
15024 lo_half[num] = simplify_gen_subreg (half_mode, op,
15025 GET_MODE (op) == VOIDmode
15026 ? mode : GET_MODE (op), 0);
15027 hi_half[num] = simplify_gen_subreg (half_mode, op,
15028 GET_MODE (op) == VOIDmode
15029 ? mode : GET_MODE (op), byte);
15034 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15035 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15036 is the expression of the binary operation. The output may either be
15037 emitted here, or returned to the caller, like all output_* functions.
15039 There is no guarantee that the operands are the same mode, as they
15040 might be within FLOAT or FLOAT_EXTEND expressions. */
15042 #ifndef SYSV386_COMPAT
15043 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15044 wants to fix the assemblers because that causes incompatibility
15045 with gcc. No-one wants to fix gcc because that causes
15046 incompatibility with assemblers... You can use the option of
15047 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15048 #define SYSV386_COMPAT 1
15049 #endif
15051 const char *
15052 output_387_binary_op (rtx insn, rtx *operands)
15054 static char buf[40];
15055 const char *p;
15056 const char *ssep;
15057 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15059 #ifdef ENABLE_CHECKING
15060 /* Even if we do not want to check the inputs, this documents input
15061 constraints, which helps in understanding the following code. */
15062 if (STACK_REG_P (operands[0])
15063 && ((REG_P (operands[1])
15064 && REGNO (operands[0]) == REGNO (operands[1])
15065 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15066 || (REG_P (operands[2])
15067 && REGNO (operands[0]) == REGNO (operands[2])
15068 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15069 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15070 ; /* ok */
15071 else
15072 gcc_assert (is_sse);
15073 #endif
15075 switch (GET_CODE (operands[3]))
15077 case PLUS:
15078 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15079 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15080 p = "fiadd";
15081 else
15082 p = "fadd";
15083 ssep = "vadd";
15084 break;
15086 case MINUS:
15087 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15088 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15089 p = "fisub";
15090 else
15091 p = "fsub";
15092 ssep = "vsub";
15093 break;
15095 case MULT:
15096 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15097 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15098 p = "fimul";
15099 else
15100 p = "fmul";
15101 ssep = "vmul";
15102 break;
15104 case DIV:
15105 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15106 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15107 p = "fidiv";
15108 else
15109 p = "fdiv";
15110 ssep = "vdiv";
15111 break;
15113 default:
15114 gcc_unreachable ();
15117 if (is_sse)
15119 if (TARGET_AVX)
15121 strcpy (buf, ssep);
15122 if (GET_MODE (operands[0]) == SFmode)
15123 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15124 else
15125 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15127 else
15129 strcpy (buf, ssep + 1);
15130 if (GET_MODE (operands[0]) == SFmode)
15131 strcat (buf, "ss\t{%2, %0|%0, %2}");
15132 else
15133 strcat (buf, "sd\t{%2, %0|%0, %2}");
15135 return buf;
15137 strcpy (buf, p);
15139 switch (GET_CODE (operands[3]))
15141 case MULT:
15142 case PLUS:
15143 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15145 rtx temp = operands[2];
15146 operands[2] = operands[1];
15147 operands[1] = temp;
15150 /* We know operands[0] == operands[1]. */
15152 if (MEM_P (operands[2]))
15154 p = "%Z2\t%2";
15155 break;
15158 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15160 if (STACK_TOP_P (operands[0]))
15161 /* How is it that we are storing to a dead operand[2]?
15162 Well, presumably operands[1] is dead too. We can't
15163 store the result to st(0) as st(0) gets popped on this
15164 instruction. Instead store to operands[2] (which I
15165 think has to be st(1)). st(1) will be popped later.
15166 gcc <= 2.8.1 didn't have this check and generated
15167 assembly code that the Unixware assembler rejected. */
15168 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15169 else
15170 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15171 break;
15174 if (STACK_TOP_P (operands[0]))
15175 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15176 else
15177 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15178 break;
15180 case MINUS:
15181 case DIV:
15182 if (MEM_P (operands[1]))
15184 p = "r%Z1\t%1";
15185 break;
15188 if (MEM_P (operands[2]))
15190 p = "%Z2\t%2";
15191 break;
15194 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15196 #if SYSV386_COMPAT
15197 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15198 derived assemblers, confusingly reverse the direction of
15199 the operation for fsub{r} and fdiv{r} when the
15200 destination register is not st(0). The Intel assembler
15201 doesn't have this brain damage. Read !SYSV386_COMPAT to
15202 figure out what the hardware really does. */
15203 if (STACK_TOP_P (operands[0]))
15204 p = "{p\t%0, %2|rp\t%2, %0}";
15205 else
15206 p = "{rp\t%2, %0|p\t%0, %2}";
15207 #else
15208 if (STACK_TOP_P (operands[0]))
15209 /* As above for fmul/fadd, we can't store to st(0). */
15210 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15211 else
15212 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15213 #endif
15214 break;
15217 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15219 #if SYSV386_COMPAT
15220 if (STACK_TOP_P (operands[0]))
15221 p = "{rp\t%0, %1|p\t%1, %0}";
15222 else
15223 p = "{p\t%1, %0|rp\t%0, %1}";
15224 #else
15225 if (STACK_TOP_P (operands[0]))
15226 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15227 else
15228 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15229 #endif
15230 break;
15233 if (STACK_TOP_P (operands[0]))
15235 if (STACK_TOP_P (operands[1]))
15236 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15237 else
15238 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15239 break;
15241 else if (STACK_TOP_P (operands[1]))
15243 #if SYSV386_COMPAT
15244 p = "{\t%1, %0|r\t%0, %1}";
15245 #else
15246 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15247 #endif
15249 else
15251 #if SYSV386_COMPAT
15252 p = "{r\t%2, %0|\t%0, %2}";
15253 #else
15254 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15255 #endif
15257 break;
15259 default:
15260 gcc_unreachable ();
15263 strcat (buf, p);
15264 return buf;
15267 /* Check if a 256bit AVX register is referenced inside of EXP. */
15269 static int
15270 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15272 rtx exp = *pexp;
15274 if (GET_CODE (exp) == SUBREG)
15275 exp = SUBREG_REG (exp);
15277 if (REG_P (exp)
15278 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15279 return 1;
15281 return 0;
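/* Background for the AVX_U128 entity handled below: it tracks whether the
   upper 128 bits of the ymm registers may hold live data (AVX_U128_DIRTY)
   or are known to be zero (AVX_U128_CLEAN).  On some microarchitectures,
   executing legacy (non-VEX) SSE code while the upper halves are dirty
   incurs a transition penalty, so the mode-switching pass uses these hooks
   to decide where a vzeroupper must be inserted.  */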
15284 /* Return needed mode for entity in optimize_mode_switching pass. */
15286 static int
15287 ix86_avx_u128_mode_needed (rtx insn)
15289 if (CALL_P (insn))
15291 rtx link;
15293 /* Needed mode is set to AVX_U128_CLEAN if there are
15294 no 256bit modes used in function arguments. */
15295 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15296 link;
15297 link = XEXP (link, 1))
15299 if (GET_CODE (XEXP (link, 0)) == USE)
15301 rtx arg = XEXP (XEXP (link, 0), 0);
15303 if (ix86_check_avx256_register (&arg, NULL))
15304 return AVX_U128_DIRTY;
15308 return AVX_U128_CLEAN;
15311 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15312 changes state only when a 256bit register is written to, but we need
15313 to prevent the compiler from moving the optimal insertion point above
15314 an eventual read from a 256bit register. */
15315 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15316 return AVX_U128_DIRTY;
15318 return AVX_U128_ANY;
15321 /* Return mode that i387 must be switched into
15322 prior to the execution of insn. */
15324 static int
15325 ix86_i387_mode_needed (int entity, rtx insn)
15327 enum attr_i387_cw mode;
15329 /* The mode UNINITIALIZED is used to store the control word after a
15330 function call or ASM pattern. The mode ANY specifies that the function
15331 has no requirements on the control word and makes no changes to the
15332 bits we are interested in. */
15334 if (CALL_P (insn)
15335 || (NONJUMP_INSN_P (insn)
15336 && (asm_noperands (PATTERN (insn)) >= 0
15337 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15338 return I387_CW_UNINITIALIZED;
15340 if (recog_memoized (insn) < 0)
15341 return I387_CW_ANY;
15343 mode = get_attr_i387_cw (insn);
15345 switch (entity)
15347 case I387_TRUNC:
15348 if (mode == I387_CW_TRUNC)
15349 return mode;
15350 break;
15352 case I387_FLOOR:
15353 if (mode == I387_CW_FLOOR)
15354 return mode;
15355 break;
15357 case I387_CEIL:
15358 if (mode == I387_CW_CEIL)
15359 return mode;
15360 break;
15362 case I387_MASK_PM:
15363 if (mode == I387_CW_MASK_PM)
15364 return mode;
15365 break;
15367 default:
15368 gcc_unreachable ();
15371 return I387_CW_ANY;
15374 /* Return mode that entity must be switched into
15375 prior to the execution of insn. */
15378 ix86_mode_needed (int entity, rtx insn)
15380 switch (entity)
15382 case AVX_U128:
15383 return ix86_avx_u128_mode_needed (insn);
15384 case I387_TRUNC:
15385 case I387_FLOOR:
15386 case I387_CEIL:
15387 case I387_MASK_PM:
15388 return ix86_i387_mode_needed (entity, insn);
15389 default:
15390 gcc_unreachable ();
15392 return 0;
15395 /* Check if a 256bit AVX register is referenced in stores. */
15397 static void
15398 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15400 if (ix86_check_avx256_register (&dest, NULL))
15402 bool *used = (bool *) data;
15403 *used = true;
15407 /* Calculate mode of upper 128bit AVX registers after the insn. */
15409 static int
15410 ix86_avx_u128_mode_after (int mode, rtx insn)
15412 rtx pat = PATTERN (insn);
15414 if (vzeroupper_operation (pat, VOIDmode)
15415 || vzeroall_operation (pat, VOIDmode))
15416 return AVX_U128_CLEAN;
15418 /* We know that the state is clean after a CALL insn if no 256bit
15419 register is used for the function return value. */
15420 if (CALL_P (insn))
15422 bool avx_reg256_found = false;
15423 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15425 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15428 /* Otherwise, return current mode. Remember that if insn
15429 references AVX 256bit registers, the mode was already changed
15430 to DIRTY from MODE_NEEDED. */
15431 return mode;
15434 /* Return the mode that an insn results in. */
15437 ix86_mode_after (int entity, int mode, rtx insn)
15439 switch (entity)
15441 case AVX_U128:
15442 return ix86_avx_u128_mode_after (mode, insn);
15443 case I387_TRUNC:
15444 case I387_FLOOR:
15445 case I387_CEIL:
15446 case I387_MASK_PM:
15447 return mode;
15448 default:
15449 gcc_unreachable ();
15453 static int
15454 ix86_avx_u128_mode_entry (void)
15456 tree arg;
15458 /* Entry mode is set to AVX_U128_DIRTY if there are
15459 256bit modes used in function arguments. */
15460 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15461 arg = TREE_CHAIN (arg))
15463 rtx incoming = DECL_INCOMING_RTL (arg);
15465 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15466 return AVX_U128_DIRTY;
15469 return AVX_U128_CLEAN;
15472 /* Return a mode that ENTITY is assumed to be
15473 switched to at function entry. */
15476 ix86_mode_entry (int entity)
15478 switch (entity)
15480 case AVX_U128:
15481 return ix86_avx_u128_mode_entry ();
15482 case I387_TRUNC:
15483 case I387_FLOOR:
15484 case I387_CEIL:
15485 case I387_MASK_PM:
15486 return I387_CW_ANY;
15487 default:
15488 gcc_unreachable ();
15492 static int
15493 ix86_avx_u128_mode_exit (void)
15495 rtx reg = crtl->return_rtx;
15497 /* Exit mode is set to AVX_U128_DIRTY if a 256bit mode
15498 is used for the function return value. */
15499 if (reg && ix86_check_avx256_register (&reg, NULL))
15500 return AVX_U128_DIRTY;
15502 return AVX_U128_CLEAN;
15505 /* Return a mode that ENTITY is assumed to be
15506 switched to at function exit. */
15509 ix86_mode_exit (int entity)
15511 switch (entity)
15513 case AVX_U128:
15514 return ix86_avx_u128_mode_exit ();
15515 case I387_TRUNC:
15516 case I387_FLOOR:
15517 case I387_CEIL:
15518 case I387_MASK_PM:
15519 return I387_CW_ANY;
15520 default:
15521 gcc_unreachable ();
15525 /* Output code to initialize control word copies used by trunc?f?i and
15526 rounding patterns. CURRENT_MODE is set to the current control word,
15527 while NEW_MODE is set to the new control word. */
15529 static void
15530 emit_i387_cw_initialization (int mode)
15532 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15533 rtx new_mode;
15535 enum ix86_stack_slot slot;
15537 rtx reg = gen_reg_rtx (HImode);
15539 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15540 emit_move_insn (reg, copy_rtx (stored_mode));
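/* For reference: the x87 control word keeps the rounding-control field in
   bits 10-11 (00 = to nearest, 01 = down, 10 = up, 11 = truncate) and the
   precision-exception mask in bit 5 (0x0020).  The constants below modify
   exactly those bits of the saved control word.  */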
15542 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15543 || optimize_function_for_size_p (cfun))
15545 switch (mode)
15547 case I387_CW_TRUNC:
15548 /* round toward zero (truncate) */
15549 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15550 slot = SLOT_CW_TRUNC;
15551 break;
15553 case I387_CW_FLOOR:
15554 /* round down toward -oo */
15555 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15556 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15557 slot = SLOT_CW_FLOOR;
15558 break;
15560 case I387_CW_CEIL:
15561 /* round up toward +oo */
15562 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15563 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15564 slot = SLOT_CW_CEIL;
15565 break;
15567 case I387_CW_MASK_PM:
15568 /* mask precision exception for nearbyint() */
15569 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15570 slot = SLOT_CW_MASK_PM;
15571 break;
15573 default:
15574 gcc_unreachable ();
15577 else
15579 switch (mode)
15581 case I387_CW_TRUNC:
15582 /* round toward zero (truncate) */
15583 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15584 slot = SLOT_CW_TRUNC;
15585 break;
15587 case I387_CW_FLOOR:
15588 /* round down toward -oo */
15589 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15590 slot = SLOT_CW_FLOOR;
15591 break;
15593 case I387_CW_CEIL:
15594 /* round up toward +oo */
15595 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15596 slot = SLOT_CW_CEIL;
15597 break;
15599 case I387_CW_MASK_PM:
15600 /* mask precision exception for nearbyint() */
15601 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15602 slot = SLOT_CW_MASK_PM;
15603 break;
15605 default:
15606 gcc_unreachable ();
15610 gcc_assert (slot < MAX_386_STACK_LOCALS);
15612 new_mode = assign_386_stack_local (HImode, slot);
15613 emit_move_insn (new_mode, reg);
15616 /* Emit vzeroupper. */
15618 void
15619 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15621 int i;
15623 /* Cancel automatic vzeroupper insertion if there are
15624 live call-saved SSE registers at the insertion point. */
15626 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15627 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15628 return;
15630 if (TARGET_64BIT)
15631 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15632 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15633 return;
15635 emit_insn (gen_avx_vzeroupper ());
15638 /* Generate one or more insns to set ENTITY to MODE. */
15640 void
15641 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15643 switch (entity)
15645 case AVX_U128:
15646 if (mode == AVX_U128_CLEAN)
15647 ix86_avx_emit_vzeroupper (regs_live);
15648 break;
15649 case I387_TRUNC:
15650 case I387_FLOOR:
15651 case I387_CEIL:
15652 case I387_MASK_PM:
15653 if (mode != I387_CW_ANY
15654 && mode != I387_CW_UNINITIALIZED)
15655 emit_i387_cw_initialization (mode);
15656 break;
15657 default:
15658 gcc_unreachable ();
15662 /* Output code for INSN to convert a float to a signed int. OPERANDS
15663 are the insn operands. The output may be [HSD]Imode and the input
15664 operand may be [SDX]Fmode. */
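/* Note: fisttp (available from SSE3 on) always converts with truncation
   regardless of the rounding-control bits, which is why the fldcw
   save/restore below is only emitted on the plain fist/fistp path.  */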
15666 const char *
15667 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15669 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15670 int dimode_p = GET_MODE (operands[0]) == DImode;
15671 int round_mode = get_attr_i387_cw (insn);
15673 /* Jump through a hoop or two for DImode, since the hardware has no
15674 non-popping instruction. We used to do this a different way, but
15675 that was somewhat fragile and broke with post-reload splitters. */
15676 if ((dimode_p || fisttp) && !stack_top_dies)
15677 output_asm_insn ("fld\t%y1", operands);
15679 gcc_assert (STACK_TOP_P (operands[1]));
15680 gcc_assert (MEM_P (operands[0]));
15681 gcc_assert (GET_MODE (operands[1]) != TFmode);
15683 if (fisttp)
15684 output_asm_insn ("fisttp%Z0\t%0", operands);
15685 else
15687 if (round_mode != I387_CW_ANY)
15688 output_asm_insn ("fldcw\t%3", operands);
15689 if (stack_top_dies || dimode_p)
15690 output_asm_insn ("fistp%Z0\t%0", operands);
15691 else
15692 output_asm_insn ("fist%Z0\t%0", operands);
15693 if (round_mode != I387_CW_ANY)
15694 output_asm_insn ("fldcw\t%2", operands);
15697 return "";
15700 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15701 have the values zero or one, indicates the ffreep insn's operand
15702 from the OPERANDS array. */
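/* Encoding note (for the path without assembler support): ffreep %st(i)
   is the two-byte opcode 0xDF 0xC0+i.  It is emitted below as a raw
   ASM_SHORT value such as 0xc0df, whose little-endian byte order places
   the 0xDF opcode byte first.  */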
15704 static const char *
15705 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15707 if (TARGET_USE_FFREEP)
15708 #ifdef HAVE_AS_IX86_FFREEP
15709 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15710 #else
15712 static char retval[32];
15713 int regno = REGNO (operands[opno]);
15715 gcc_assert (STACK_REGNO_P (regno));
15717 regno -= FIRST_STACK_REG;
15719 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15720 return retval;
15722 #endif
15724 return opno ? "fstp\t%y1" : "fstp\t%y0";
15728 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15729 should be used. UNORDERED_P is true when fucom should be used. */
15731 const char *
15732 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15734 int stack_top_dies;
15735 rtx cmp_op0, cmp_op1;
15736 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15738 if (eflags_p)
15740 cmp_op0 = operands[0];
15741 cmp_op1 = operands[1];
15743 else
15745 cmp_op0 = operands[1];
15746 cmp_op1 = operands[2];
15749 if (is_sse)
15751 if (GET_MODE (operands[0]) == SFmode)
15752 if (unordered_p)
15753 return "%vucomiss\t{%1, %0|%0, %1}";
15754 else
15755 return "%vcomiss\t{%1, %0|%0, %1}";
15756 else
15757 if (unordered_p)
15758 return "%vucomisd\t{%1, %0|%0, %1}";
15759 else
15760 return "%vcomisd\t{%1, %0|%0, %1}";
15763 gcc_assert (STACK_TOP_P (cmp_op0));
15765 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15767 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15769 if (stack_top_dies)
15771 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15772 return output_387_ffreep (operands, 1);
15774 else
15775 return "ftst\n\tfnstsw\t%0";
15778 if (STACK_REG_P (cmp_op1)
15779 && stack_top_dies
15780 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15781 && REGNO (cmp_op1) != FIRST_STACK_REG)
15783 /* If the top of the 387 stack dies, and the other operand
15784 is also a stack register that dies, then this must be a
15785 `fcompp' float compare. */
15787 if (eflags_p)
15789 /* There is no double popping fcomi variant. Fortunately,
15790 eflags is immune to the fstp's cc clobbering. */
15791 if (unordered_p)
15792 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15793 else
15794 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15795 return output_387_ffreep (operands, 0);
15797 else
15799 if (unordered_p)
15800 return "fucompp\n\tfnstsw\t%0";
15801 else
15802 return "fcompp\n\tfnstsw\t%0";
15805 else
15807 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15809 static const char * const alt[16] =
15811 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15812 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15813 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15814 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15816 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15817 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15818 NULL,
15819 NULL,
15821 "fcomi\t{%y1, %0|%0, %y1}",
15822 "fcomip\t{%y1, %0|%0, %y1}",
15823 "fucomi\t{%y1, %0|%0, %y1}",
15824 "fucomip\t{%y1, %0|%0, %y1}",
15826 NULL,
15827 NULL,
15828 NULL,
15829 NULL
15832 int mask;
15833 const char *ret;
15835 mask = eflags_p << 3;
15836 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15837 mask |= unordered_p << 1;
15838 mask |= stack_top_dies;
15840 gcc_assert (mask < 16);
15841 ret = alt[mask];
15842 gcc_assert (ret);
15844 return ret;
15848 void
15849 ix86_output_addr_vec_elt (FILE *file, int value)
15851 const char *directive = ASM_LONG;
15853 #ifdef ASM_QUAD
15854 if (TARGET_LP64)
15855 directive = ASM_QUAD;
15856 #else
15857 gcc_assert (!TARGET_64BIT);
15858 #endif
15860 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15863 void
15864 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15866 const char *directive = ASM_LONG;
15868 #ifdef ASM_QUAD
15869 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15870 directive = ASM_QUAD;
15871 #else
15872 gcc_assert (!TARGET_64BIT);
15873 #endif
15874 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15875 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15876 fprintf (file, "%s%s%d-%s%d\n",
15877 directive, LPREFIX, value, LPREFIX, rel);
15878 else if (HAVE_AS_GOTOFF_IN_DATA)
15879 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15880 #if TARGET_MACHO
15881 else if (TARGET_MACHO)
15883 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15884 machopic_output_function_base_name (file);
15885 putc ('\n', file);
15887 #endif
15888 else
15889 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15890 GOT_SYMBOL_NAME, LPREFIX, value);
15893 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15894 for the target. */
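/* Design note: "xor reg, reg" is shorter than "mov $0, reg" and breaks
   any dependency on the register's previous contents, but it clobbers
   EFLAGS -- hence the explicit flags clobber attached to the set below
   when the xor form is chosen.  */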
15896 void
15897 ix86_expand_clear (rtx dest)
15899 rtx tmp;
15901 /* We play register width games, which are only valid after reload. */
15902 gcc_assert (reload_completed);
15904 /* Avoid HImode and its attendant prefix byte. */
15905 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15906 dest = gen_rtx_REG (SImode, REGNO (dest));
15907 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15909 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15910 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15912 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15913 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15916 emit_insn (tmp);
15919 /* X is an unchanging MEM. If it is a constant pool reference, return
15920 the constant pool rtx, else NULL. */
15923 maybe_get_pool_constant (rtx x)
15925 x = ix86_delegitimize_address (XEXP (x, 0));
15927 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15928 return get_pool_constant (x);
15930 return NULL_RTX;
15933 void
15934 ix86_expand_move (enum machine_mode mode, rtx operands[])
15936 rtx op0, op1;
15937 enum tls_model model;
15939 op0 = operands[0];
15940 op1 = operands[1];
15942 if (GET_CODE (op1) == SYMBOL_REF)
15944 model = SYMBOL_REF_TLS_MODEL (op1);
15945 if (model)
15947 op1 = legitimize_tls_address (op1, model, true);
15948 op1 = force_operand (op1, op0);
15949 if (op1 == op0)
15950 return;
15951 op1 = convert_to_mode (mode, op1, 1);
15953 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15954 && SYMBOL_REF_DLLIMPORT_P (op1))
15955 op1 = legitimize_dllimport_symbol (op1, false);
15957 else if (GET_CODE (op1) == CONST
15958 && GET_CODE (XEXP (op1, 0)) == PLUS
15959 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15961 rtx addend = XEXP (XEXP (op1, 0), 1);
15962 rtx symbol = XEXP (XEXP (op1, 0), 0);
15963 rtx tmp = NULL;
15965 model = SYMBOL_REF_TLS_MODEL (symbol);
15966 if (model)
15967 tmp = legitimize_tls_address (symbol, model, true);
15968 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15969 && SYMBOL_REF_DLLIMPORT_P (symbol))
15970 tmp = legitimize_dllimport_symbol (symbol, true);
15972 if (tmp)
15974 tmp = force_operand (tmp, NULL);
15975 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15976 op0, 1, OPTAB_DIRECT);
15977 if (tmp == op0)
15978 return;
15979 op1 = convert_to_mode (mode, tmp, 1);
15983 if ((flag_pic || MACHOPIC_INDIRECT)
15984 && symbolic_operand (op1, mode))
15986 if (TARGET_MACHO && !TARGET_64BIT)
15988 #if TARGET_MACHO
15989 /* dynamic-no-pic */
15990 if (MACHOPIC_INDIRECT)
15992 rtx temp = ((reload_in_progress
15993 || ((op0 && REG_P (op0))
15994 && mode == Pmode))
15995 ? op0 : gen_reg_rtx (Pmode));
15996 op1 = machopic_indirect_data_reference (op1, temp);
15997 if (MACHOPIC_PURE)
15998 op1 = machopic_legitimize_pic_address (op1, mode,
15999 temp == op1 ? 0 : temp);
16001 if (op0 != op1 && GET_CODE (op0) != MEM)
16003 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16004 emit_insn (insn);
16005 return;
16007 if (GET_CODE (op0) == MEM)
16008 op1 = force_reg (Pmode, op1);
16009 else
16011 rtx temp = op0;
16012 if (GET_CODE (temp) != REG)
16013 temp = gen_reg_rtx (Pmode);
16014 temp = legitimize_pic_address (op1, temp);
16015 if (temp == op0)
16016 return;
16017 op1 = temp;
16019 /* dynamic-no-pic */
16020 #endif
16022 else
16024 if (MEM_P (op0))
16025 op1 = force_reg (mode, op1);
16026 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16028 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16029 op1 = legitimize_pic_address (op1, reg);
16030 if (op0 == op1)
16031 return;
16032 op1 = convert_to_mode (mode, op1, 1);
16036 else
16038 if (MEM_P (op0)
16039 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16040 || !push_operand (op0, mode))
16041 && MEM_P (op1))
16042 op1 = force_reg (mode, op1);
16044 if (push_operand (op0, mode)
16045 && ! general_no_elim_operand (op1, mode))
16046 op1 = copy_to_mode_reg (mode, op1);
16048 /* Force large constants in 64bit compilation into a register
16049 to get them CSEed. */
16050 if (can_create_pseudo_p ()
16051 && (mode == DImode) && TARGET_64BIT
16052 && immediate_operand (op1, mode)
16053 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16054 && !register_operand (op0, mode)
16055 && optimize)
16056 op1 = copy_to_mode_reg (mode, op1);
16058 if (can_create_pseudo_p ()
16059 && FLOAT_MODE_P (mode)
16060 && GET_CODE (op1) == CONST_DOUBLE)
16062 /* If we are loading a floating point constant to a register,
16063 force the value to memory now, since we'll get better code
16064 out of the back end. */
16066 op1 = validize_mem (force_const_mem (mode, op1));
16067 if (!register_operand (op0, mode))
16069 rtx temp = gen_reg_rtx (mode);
16070 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16071 emit_move_insn (op0, temp);
16072 return;
16077 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16080 void
16081 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16083 rtx op0 = operands[0], op1 = operands[1];
16084 unsigned int align = GET_MODE_ALIGNMENT (mode);
16086 /* Force constants other than zero into memory. We do not know how
16087 the instructions used to build constants modify the upper 64 bits
16088 of the register; once we have that information we may be able
16089 to handle some of them more efficiently. */
16090 if (can_create_pseudo_p ()
16091 && register_operand (op0, mode)
16092 && (CONSTANT_P (op1)
16093 || (GET_CODE (op1) == SUBREG
16094 && CONSTANT_P (SUBREG_REG (op1))))
16095 && !standard_sse_constant_p (op1))
16096 op1 = validize_mem (force_const_mem (mode, op1));
16098 /* We need to check memory alignment for SSE mode since attributes
16099 can make operands unaligned. */
16100 if (can_create_pseudo_p ()
16101 && SSE_REG_MODE_P (mode)
16102 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16103 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16105 rtx tmp[2];
16107 /* ix86_expand_vector_move_misalign() does not like constants ... */
16108 if (CONSTANT_P (op1)
16109 || (GET_CODE (op1) == SUBREG
16110 && CONSTANT_P (SUBREG_REG (op1))))
16111 op1 = validize_mem (force_const_mem (mode, op1));
16113 /* ... nor both arguments in memory. */
16114 if (!register_operand (op0, mode)
16115 && !register_operand (op1, mode))
16116 op1 = force_reg (mode, op1);
16118 tmp[0] = op0; tmp[1] = op1;
16119 ix86_expand_vector_move_misalign (mode, tmp);
16120 return;
16123 /* Make operand1 a register if it isn't already. */
16124 if (can_create_pseudo_p ()
16125 && !register_operand (op0, mode)
16126 && !register_operand (op1, mode))
16128 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16129 return;
16132 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16135 /* Split 32-byte AVX unaligned load and store if needed. */
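/* Sketch of the strategy below: on targets where unaligned 32-byte
   accesses are slow, the move is done in two 16-byte halves -- a load
   becomes two 128-bit loads combined through VEC_CONCAT (typically
   assembled as vmovups plus vinsertf128), and a store becomes two
   vextractf128 stores.  Otherwise a single unaligned 256-bit move
   instruction is emitted.  */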
16137 static void
16138 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16140 rtx m;
16141 rtx (*extract) (rtx, rtx, rtx);
16142 rtx (*load_unaligned) (rtx, rtx);
16143 rtx (*store_unaligned) (rtx, rtx);
16144 enum machine_mode mode;
16146 switch (GET_MODE (op0))
16148 default:
16149 gcc_unreachable ();
16150 case V32QImode:
16151 extract = gen_avx_vextractf128v32qi;
16152 load_unaligned = gen_avx_loaddqu256;
16153 store_unaligned = gen_avx_storedqu256;
16154 mode = V16QImode;
16155 break;
16156 case V8SFmode:
16157 extract = gen_avx_vextractf128v8sf;
16158 load_unaligned = gen_avx_loadups256;
16159 store_unaligned = gen_avx_storeups256;
16160 mode = V4SFmode;
16161 break;
16162 case V4DFmode:
16163 extract = gen_avx_vextractf128v4df;
16164 load_unaligned = gen_avx_loadupd256;
16165 store_unaligned = gen_avx_storeupd256;
16166 mode = V2DFmode;
16167 break;
16170 if (MEM_P (op1))
16172 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16174 rtx r = gen_reg_rtx (mode);
16175 m = adjust_address (op1, mode, 0);
16176 emit_move_insn (r, m);
16177 m = adjust_address (op1, mode, 16);
16178 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16179 emit_move_insn (op0, r);
16181 else
16182 emit_insn (load_unaligned (op0, op1));
16184 else if (MEM_P (op0))
16186 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16188 m = adjust_address (op0, mode, 0);
16189 emit_insn (extract (m, op1, const0_rtx));
16190 m = adjust_address (op0, mode, 16);
16191 emit_insn (extract (m, op1, const1_rtx));
16193 else
16194 emit_insn (store_unaligned (op0, op1));
16196 else
16197 gcc_unreachable ();
16200 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16201 straight to ix86_expand_vector_move. */
16202 /* Code generation for scalar reg-reg moves of single and double precision data:
16203 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16204 movaps reg, reg
16205 else
16206 movss reg, reg
16207 if (x86_sse_partial_reg_dependency == true)
16208 movapd reg, reg
16209 else
16210 movsd reg, reg
16212 Code generation for scalar loads of double precision data:
16213 if (x86_sse_split_regs == true)
16214 movlpd mem, reg (gas syntax)
16215 else
16216 movsd mem, reg
16218 Code generation for unaligned packed loads of single precision data
16219 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16220 if (x86_sse_unaligned_move_optimal)
16221 movups mem, reg
16223 if (x86_sse_partial_reg_dependency == true)
16225 xorps reg, reg
16226 movlps mem, reg
16227 movhps mem+8, reg
16229 else
16231 movlps mem, reg
16232 movhps mem+8, reg
16235 Code generation for unaligned packed loads of double precision data
16236 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16237 if (x86_sse_unaligned_move_optimal)
16238 movupd mem, reg
16240 if (x86_sse_split_regs == true)
16242 movlpd mem, reg
16243 movhpd mem+8, reg
16245 else
16247 movsd mem, reg
16248 movhpd mem+8, reg
16252 void
16253 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16255 rtx op0, op1, m;
16257 op0 = operands[0];
16258 op1 = operands[1];
16260 if (TARGET_AVX
16261 && GET_MODE_SIZE (mode) == 32)
16263 switch (GET_MODE_CLASS (mode))
16265 case MODE_VECTOR_INT:
16266 case MODE_INT:
16267 op0 = gen_lowpart (V32QImode, op0);
16268 op1 = gen_lowpart (V32QImode, op1);
16269 /* FALLTHRU */
16271 case MODE_VECTOR_FLOAT:
16272 ix86_avx256_split_vector_move_misalign (op0, op1);
16273 break;
16275 default:
16276 gcc_unreachable ();
16279 return;
16282 if (MEM_P (op1))
16284 /* ??? If we have typed data, then it would appear that using
16285 movdqu is the only way to get unaligned data loaded with
16286 integer type. */
16287 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16289 op0 = gen_lowpart (V16QImode, op0);
16290 op1 = gen_lowpart (V16QImode, op1);
16291 /* We will eventually emit movups based on insn attributes. */
16292 emit_insn (gen_sse2_loaddqu (op0, op1));
16294 else if (TARGET_SSE2 && mode == V2DFmode)
16296 rtx zero;
16298 if (TARGET_AVX
16299 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16300 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16301 || optimize_function_for_size_p (cfun))
16303 /* We will eventually emit movups based on insn attributes. */
16304 emit_insn (gen_sse2_loadupd (op0, op1));
16305 return;
16308 /* When SSE registers are split into halves, we can avoid
16309 writing to the top half twice. */
16310 if (TARGET_SSE_SPLIT_REGS)
16312 emit_clobber (op0);
16313 zero = op0;
16315 else
16317 /* ??? Not sure about the best option for the Intel chips.
16318 The following would seem to satisfy; the register is
16319 entirely cleared, breaking the dependency chain. We
16320 then store to the upper half, with a dependency depth
16321 of one. A rumor has it that Intel recommends two movsd
16322 followed by an unpacklpd, but this is unconfirmed. And
16323 given that the dependency depth of the unpacklpd would
16324 still be one, I'm not sure why this would be better. */
16325 zero = CONST0_RTX (V2DFmode);
16328 m = adjust_address (op1, DFmode, 0);
16329 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16330 m = adjust_address (op1, DFmode, 8);
16331 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16333 else
16335 if (TARGET_AVX
16336 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16337 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16338 || optimize_function_for_size_p (cfun))
16340 op0 = gen_lowpart (V4SFmode, op0);
16341 op1 = gen_lowpart (V4SFmode, op1);
16342 emit_insn (gen_sse_loadups (op0, op1));
16343 return;
16346 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16347 emit_move_insn (op0, CONST0_RTX (mode));
16348 else
16349 emit_clobber (op0);
16351 if (mode != V4SFmode)
16352 op0 = gen_lowpart (V4SFmode, op0);
16354 m = adjust_address (op1, V2SFmode, 0);
16355 emit_insn (gen_sse_loadlps (op0, op0, m));
16356 m = adjust_address (op1, V2SFmode, 8);
16357 emit_insn (gen_sse_loadhps (op0, op0, m));
16360 else if (MEM_P (op0))
16362 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16364 op0 = gen_lowpart (V16QImode, op0);
16365 op1 = gen_lowpart (V16QImode, op1);
16366 /* We will eventually emit movups based on insn attributes. */
16367 emit_insn (gen_sse2_storedqu (op0, op1));
16369 else if (TARGET_SSE2 && mode == V2DFmode)
16371 if (TARGET_AVX
16372 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16373 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16374 || optimize_function_for_size_p (cfun))
16375 /* We will eventually emit movups based on insn attributes. */
16376 emit_insn (gen_sse2_storeupd (op0, op1));
16377 else
16379 m = adjust_address (op0, DFmode, 0);
16380 emit_insn (gen_sse2_storelpd (m, op1));
16381 m = adjust_address (op0, DFmode, 8);
16382 emit_insn (gen_sse2_storehpd (m, op1));
16385 else
16387 if (mode != V4SFmode)
16388 op1 = gen_lowpart (V4SFmode, op1);
16390 if (TARGET_AVX
16391 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16392 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16393 || optimize_function_for_size_p (cfun))
16395 op0 = gen_lowpart (V4SFmode, op0);
16396 emit_insn (gen_sse_storeups (op0, op1));
16398 else
16400 m = adjust_address (op0, V2SFmode, 0);
16401 emit_insn (gen_sse_storelps (m, op1));
16402 m = adjust_address (op0, V2SFmode, 8);
16403 emit_insn (gen_sse_storehps (m, op1));
16407 else
16408 gcc_unreachable ();
16411 /* Expand a push in MODE. This is some mode for which we do not support
16412 proper push instructions, at least from the registers that we expect
16413 the value to live in. */
16415 void
16416 ix86_expand_push (enum machine_mode mode, rtx x)
16418 rtx tmp;
16420 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16421 GEN_INT (-GET_MODE_SIZE (mode)),
16422 stack_pointer_rtx, 1, OPTAB_DIRECT);
16423 if (tmp != stack_pointer_rtx)
16424 emit_move_insn (stack_pointer_rtx, tmp);
16426 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16428 /* When we push an operand onto the stack, it has to be aligned at least
16429 at the function argument boundary. However, since we don't have
16430 the argument type, we can't determine the actual argument
16431 boundary. */
16432 emit_move_insn (tmp, x);
16435 /* Helper function of ix86_fixup_binary_operands to canonicalize
16436 operand order. Returns true if the operands should be swapped. */
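/* Rationale, roughly: x86 integer instructions are two-address, with the
   destination doubling as the first source, so making src1 match dst
   avoids an extra copy.  Immediates and (usually) memory operands have to
   appear as the second source, which motivates the lower-priority rules
   below.  */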
16438 static bool
16439 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16440 rtx operands[])
16442 rtx dst = operands[0];
16443 rtx src1 = operands[1];
16444 rtx src2 = operands[2];
16446 /* If the operation is not commutative, we can't do anything. */
16447 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16448 return false;
16450 /* Highest priority is that src1 should match dst. */
16451 if (rtx_equal_p (dst, src1))
16452 return false;
16453 if (rtx_equal_p (dst, src2))
16454 return true;
16456 /* Next highest priority is that immediate constants come second. */
16457 if (immediate_operand (src2, mode))
16458 return false;
16459 if (immediate_operand (src1, mode))
16460 return true;
16462 /* Lowest priority is that memory references should come second. */
16463 if (MEM_P (src2))
16464 return false;
16465 if (MEM_P (src1))
16466 return true;
16468 return false;
16472 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16473 destination to use for the operation. If different from the true
16474 destination in operands[0], a copy operation will be required. */
16477 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16478 rtx operands[])
16480 rtx dst = operands[0];
16481 rtx src1 = operands[1];
16482 rtx src2 = operands[2];
16484 /* Canonicalize operand order. */
16485 if (ix86_swap_binary_operands_p (code, mode, operands))
16487 rtx temp;
16489 /* It is invalid to swap operands of different modes. */
16490 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16492 temp = src1;
16493 src1 = src2;
16494 src2 = temp;
16497 /* Both source operands cannot be in memory. */
16498 if (MEM_P (src1) && MEM_P (src2))
16500 /* Optimization: Only read from memory once. */
16501 if (rtx_equal_p (src1, src2))
16503 src2 = force_reg (mode, src2);
16504 src1 = src2;
16506 else
16507 src2 = force_reg (mode, src2);
16510 /* If the destination is memory, and we do not have matching source
16511 operands, do things in registers. */
16512 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16513 dst = gen_reg_rtx (mode);
16515 /* Source 1 cannot be a constant. */
16516 if (CONSTANT_P (src1))
16517 src1 = force_reg (mode, src1);
16519 /* Source 1 cannot be a non-matching memory. */
16520 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16521 src1 = force_reg (mode, src1);
16523 /* Improve address combine. */
16524 if (code == PLUS
16525 && GET_MODE_CLASS (mode) == MODE_INT
16526 && MEM_P (src2))
16527 src2 = force_reg (mode, src2);
16529 operands[1] = src1;
16530 operands[2] = src2;
16531 return dst;
16534 /* Similarly, but assume that the destination has already been
16535 set up properly. */
16537 void
16538 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16539 enum machine_mode mode, rtx operands[])
16541 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16542 gcc_assert (dst == operands[0]);
16545 /* Attempt to expand a binary operator. Make the expansion closer to the
16546 actual machine than just general_operand, which would allow 3 separate
16547 memory references (one output, two inputs) in a single insn. */
16549 void
16550 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16551 rtx operands[])
16553 rtx src1, src2, dst, op, clob;
16555 dst = ix86_fixup_binary_operands (code, mode, operands);
16556 src1 = operands[1];
16557 src2 = operands[2];
16559 /* Emit the instruction. */
16561 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16562 if (reload_in_progress)
16564 /* Reload doesn't know about the flags register, and doesn't know that
16565 it doesn't want to clobber it. We can only do this with PLUS. */
16566 gcc_assert (code == PLUS);
16567 emit_insn (op);
16569 else if (reload_completed
16570 && code == PLUS
16571 && !rtx_equal_p (dst, src1))
16573 /* This is going to be an LEA; avoid splitting it later. */
16574 emit_insn (op);
16576 else
16578 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16579 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16582 /* Fix up the destination if needed. */
16583 if (dst != operands[0])
16584 emit_move_insn (operands[0], dst);
16587 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16588 the given OPERANDS. */
16590 void
16591 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16592 rtx operands[])
16594 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16595 if (GET_CODE (operands[1]) == SUBREG)
16597 op1 = operands[1];
16598 op2 = operands[2];
16600 else if (GET_CODE (operands[2]) == SUBREG)
16602 op1 = operands[2];
16603 op2 = operands[1];
16605 /* Optimize (__m128i) d | (__m128i) e and similar code
16606 when d and e are float vectors into a float vector logical
16607 insn. In C/C++, without using intrinsics, there is no other way
16608 to express a vector logical operation on float vectors than
16609 to cast them temporarily to integer vectors. */
16610 if (op1
16611 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16612 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16613 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16614 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16615 && SUBREG_BYTE (op1) == 0
16616 && (GET_CODE (op2) == CONST_VECTOR
16617 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16618 && SUBREG_BYTE (op2) == 0))
16619 && can_create_pseudo_p ())
16621 rtx dst;
16622 switch (GET_MODE (SUBREG_REG (op1)))
16624 case V4SFmode:
16625 case V8SFmode:
16626 case V2DFmode:
16627 case V4DFmode:
16628 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16629 if (GET_CODE (op2) == CONST_VECTOR)
16631 op2 = gen_lowpart (GET_MODE (dst), op2);
16632 op2 = force_reg (GET_MODE (dst), op2);
16634 else
16636 op1 = operands[1];
16637 op2 = SUBREG_REG (operands[2]);
16638 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16639 op2 = force_reg (GET_MODE (dst), op2);
16641 op1 = SUBREG_REG (op1);
16642 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16643 op1 = force_reg (GET_MODE (dst), op1);
16644 emit_insn (gen_rtx_SET (VOIDmode, dst,
16645 gen_rtx_fmt_ee (code, GET_MODE (dst),
16646 op1, op2)));
16647 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16648 return;
16649 default:
16650 break;
16653 if (!nonimmediate_operand (operands[1], mode))
16654 operands[1] = force_reg (mode, operands[1]);
16655 if (!nonimmediate_operand (operands[2], mode))
16656 operands[2] = force_reg (mode, operands[2]);
16657 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16658 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16659 gen_rtx_fmt_ee (code, mode, operands[1],
16660 operands[2])));
16663 /* Return TRUE or FALSE depending on whether the binary operator meets the
16664 appropriate constraints. */
16666 bool
16667 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16668 rtx operands[3])
16670 rtx dst = operands[0];
16671 rtx src1 = operands[1];
16672 rtx src2 = operands[2];
16674 /* Both source operands cannot be in memory. */
16675 if (MEM_P (src1) && MEM_P (src2))
16676 return false;
16678 /* Canonicalize operand order for commutative operators. */
16679 if (ix86_swap_binary_operands_p (code, mode, operands))
16681 rtx temp = src1;
16682 src1 = src2;
16683 src2 = temp;
16686 /* If the destination is memory, we must have a matching source operand. */
16687 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16688 return false;
16690 /* Source 1 cannot be a constant. */
16691 if (CONSTANT_P (src1))
16692 return false;
16694 /* Source 1 cannot be a non-matching memory. */
16695 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16696 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16697 return (code == AND
16698 && (mode == HImode
16699 || mode == SImode
16700 || (TARGET_64BIT && mode == DImode))
16701 && satisfies_constraint_L (src2));
16703 return true;
16706 /* Attempt to expand a unary operator. Make the expansion closer to the
16707 actual machine than just general_operand, which would allow 2 separate
16708 memory references (one output, one input) in a single insn. */
16710 void
16711 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16712 rtx operands[])
16714 int matching_memory;
16715 rtx src, dst, op, clob;
16717 dst = operands[0];
16718 src = operands[1];
16720 /* If the destination is memory, and we do not have matching source
16721 operands, do things in registers. */
16722 matching_memory = 0;
16723 if (MEM_P (dst))
16725 if (rtx_equal_p (dst, src))
16726 matching_memory = 1;
16727 else
16728 dst = gen_reg_rtx (mode);
16731 /* When source operand is memory, destination must match. */
16732 if (MEM_P (src) && !matching_memory)
16733 src = force_reg (mode, src);
16735 /* Emit the instruction. */
16737 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16738 if (reload_in_progress || code == NOT)
16740 /* Reload doesn't know about the flags register, and doesn't know that
16741 it doesn't want to clobber it. */
16742 gcc_assert (code == NOT);
16743 emit_insn (op);
16745 else
16747 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16748 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16751 /* Fix up the destination if needed. */
16752 if (dst != operands[0])
16753 emit_move_insn (operands[0], dst);
16756 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16757 divisor are within the range [0-255]. */
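/* Rough shape of the emitted code: OR the dividend and divisor into a
   scratch register and test it against ~0xff; if both values fit in
   [0, 255], branch to a path that uses the 8bit divide (udivmodhiqi3,
   i.e. DIV r/m8), which leaves the quotient in AL and the remainder in
   AH; otherwise fall through to the full-width divide.  */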
16759 void
16760 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16761 bool signed_p)
16763 rtx end_label, qimode_label;
16764 rtx insn, div, mod;
16765 rtx scratch, tmp0, tmp1, tmp2;
16766 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16767 rtx (*gen_zero_extend) (rtx, rtx);
16768 rtx (*gen_test_ccno_1) (rtx, rtx);
16770 switch (mode)
16772 case SImode:
16773 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16774 gen_test_ccno_1 = gen_testsi_ccno_1;
16775 gen_zero_extend = gen_zero_extendqisi2;
16776 break;
16777 case DImode:
16778 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16779 gen_test_ccno_1 = gen_testdi_ccno_1;
16780 gen_zero_extend = gen_zero_extendqidi2;
16781 break;
16782 default:
16783 gcc_unreachable ();
16786 end_label = gen_label_rtx ();
16787 qimode_label = gen_label_rtx ();
16789 scratch = gen_reg_rtx (mode);
16791 /* Use 8bit unsigned divmod if dividend and divisor are within
16792 the range [0-255]. */
16793 emit_move_insn (scratch, operands[2]);
16794 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16795 scratch, 1, OPTAB_DIRECT);
16796 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16797 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16798 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16799 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16800 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16801 pc_rtx);
16802 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16803 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16804 JUMP_LABEL (insn) = qimode_label;
16806 /* Generate original signed/unsigned divmod. */
16807 div = gen_divmod4_1 (operands[0], operands[1],
16808 operands[2], operands[3]);
16809 emit_insn (div);
16811 /* Branch to the end. */
16812 emit_jump_insn (gen_jump (end_label));
16813 emit_barrier ();
16815 /* Generate 8bit unsigned divide. */
16816 emit_label (qimode_label);
16817 /* Don't use operands[0] for result of 8bit divide since not all
16818 registers support QImode ZERO_EXTRACT. */
16819 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16820 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16821 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16822 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16824 if (signed_p)
16826 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16827 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16829 else
16831 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16832 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16835 /* Extract remainder from AH. */
16836 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16837 if (REG_P (operands[1]))
16838 insn = emit_move_insn (operands[1], tmp1);
16839 else
16841 /* Need a new scratch register since the old one has the result
16842 of the 8bit divide. */
16843 scratch = gen_reg_rtx (mode);
16844 emit_move_insn (scratch, tmp1);
16845 insn = emit_move_insn (operands[1], scratch);
16847 set_unique_reg_note (insn, REG_EQUAL, mod);
16849 /* Zero extend quotient from AL. */
16850 tmp1 = gen_lowpart (QImode, tmp0);
16851 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16852 set_unique_reg_note (insn, REG_EQUAL, div);
16854 emit_label (end_label);
16857 #define LEA_MAX_STALL (3)
16858 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
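/* These constants bound the backward searches below: distances are
   measured in half-cycles, so LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
   corresponds to LEA_MAX_STALL full cycles of slack before an LEA is
   assumed not to stall on its address-generation inputs.  */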
16860 /* Increase given DISTANCE in half-cycles according to
16861 dependencies between PREV and NEXT instructions.
16862 Add 1 half-cycle if there is no dependency and
16863 go to the next cycle if there is some dependency. */
16865 static unsigned int
16866 increase_distance (rtx prev, rtx next, unsigned int distance)
16868 df_ref *use_rec;
16869 df_ref *def_rec;
16871 if (!prev || !next)
16872 return distance + (distance & 1) + 2;
16874 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16875 return distance + 1;
16877 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16878 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16879 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16880 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16881 return distance + (distance & 1) + 2;
16883 return distance + 1;
16886 /* Check whether instruction INSN defines register number
16887 REGNO1 or REGNO2. */
16889 static bool
16890 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16891 rtx insn)
16893 df_ref *def_rec;
16895 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16896 if (DF_REF_REG_DEF_P (*def_rec)
16897 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16898 && (regno1 == DF_REF_REGNO (*def_rec)
16899 || regno2 == DF_REF_REGNO (*def_rec)))
16901 return true;
16904 return false;
16907 /* Check whether instruction INSN uses register number
16908 REGNO as part of an address expression. */
16910 static bool
16911 insn_uses_reg_mem (unsigned int regno, rtx insn)
16913 df_ref *use_rec;
16915 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16916 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16917 return true;
16919 return false;
16922 /* Search backward for non-agu definition of register number REGNO1
16923 or register number REGNO2 in basic block starting from instruction
16924 START up to head of basic block or instruction INSN.
16926 Store true in *FOUND if a definition was found
16927 and false otherwise.
16929 The distance in half-cycles between START and the found instruction or
16930 the head of the BB is added to DISTANCE and returned. */
16932 static int
16933 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16934 rtx insn, int distance,
16935 rtx start, bool *found)
16937 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16938 rtx prev = start;
16939 rtx next = NULL;
16941 *found = false;
16943 while (prev
16944 && prev != insn
16945 && distance < LEA_SEARCH_THRESHOLD)
16947 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16949 distance = increase_distance (prev, next, distance);
16950 if (insn_defines_reg (regno1, regno2, prev))
16952 if (recog_memoized (prev) < 0
16953 || get_attr_type (prev) != TYPE_LEA)
16955 *found = true;
16956 return distance;
16960 next = prev;
16962 if (prev == BB_HEAD (bb))
16963 break;
16965 prev = PREV_INSN (prev);
16968 return distance;
16971 /* Search backward for non-agu definition of register number REGNO1
16972 or register number REGNO2 in INSN's basic block until
16973 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16974 2. Reach a neighbouring BB boundary, or
16975 3. Reach agu definition.
16976 Returns the distance between the non-agu definition point and INSN.
16977 If there is no definition point, return -1. */
16979 static int
16980 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16981 rtx insn)
16983 basic_block bb = BLOCK_FOR_INSN (insn);
16984 int distance = 0;
16985 bool found = false;
16987 if (insn != BB_HEAD (bb))
16988 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16989 distance, PREV_INSN (insn),
16990 &found);
16992 if (!found && distance < LEA_SEARCH_THRESHOLD)
16994 edge e;
16995 edge_iterator ei;
16996 bool simple_loop = false;
16998 FOR_EACH_EDGE (e, ei, bb->preds)
16999 if (e->src == bb)
17001 simple_loop = true;
17002 break;
17005 if (simple_loop)
17006 distance = distance_non_agu_define_in_bb (regno1, regno2,
17007 insn, distance,
17008 BB_END (bb), &found);
17009 else
17011 int shortest_dist = -1;
17012 bool found_in_bb = false;
17014 FOR_EACH_EDGE (e, ei, bb->preds)
17016 int bb_dist
17017 = distance_non_agu_define_in_bb (regno1, regno2,
17018 insn, distance,
17019 BB_END (e->src),
17020 &found_in_bb);
17021 if (found_in_bb)
17023 if (shortest_dist < 0)
17024 shortest_dist = bb_dist;
17025 else if (bb_dist > 0)
17026 shortest_dist = MIN (bb_dist, shortest_dist);
17028 found = true;
17032 distance = shortest_dist;
17036 /* get_attr_type may modify recog data. We want to make sure
17037 that recog data is valid for instruction INSN, on which
17038 distance_non_agu_define is called. INSN is unchanged here. */
17039 extract_insn_cached (insn);
17041 if (!found)
17042 return -1;
17044 return distance >> 1;
17047 /* Return the distance in half-cycles between INSN and the next
17048 insn that uses register number REGNO in memory address added
17049 to DISTANCE. Return -1 if REGNO is set.
17051 Put true value into *FOUND if register usage was found and
17052 false otherwise.
17053 Put true value into *REDEFINED if register redefinition was
17054 found and false otherwise. */
17056 static int
17057 distance_agu_use_in_bb (unsigned int regno,
17058 rtx insn, int distance, rtx start,
17059 bool *found, bool *redefined)
17061 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17062 rtx next = start;
17063 rtx prev = NULL;
17065 *found = false;
17066 *redefined = false;
17068 while (next
17069 && next != insn
17070 && distance < LEA_SEARCH_THRESHOLD)
17072 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17074 distance = increase_distance (prev, next, distance);
17075 if (insn_uses_reg_mem (regno, next))
17077 /* Return DISTANCE if OP0 is used in memory
17078 address in NEXT. */
17079 *found = true;
17080 return distance;
17083 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17085 /* Return -1 if OP0 is set in NEXT. */
17086 *redefined = true;
17087 return -1;
17090 prev = next;
17093 if (next == BB_END (bb))
17094 break;
17096 next = NEXT_INSN (next);
17099 return distance;
17102 /* Return the distance between INSN and the next insn that uses
17103 register number REGNO0 in a memory address. Return -1 if no such
17104 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17106 static int
17107 distance_agu_use (unsigned int regno0, rtx insn)
17109 basic_block bb = BLOCK_FOR_INSN (insn);
17110 int distance = 0;
17111 bool found = false;
17112 bool redefined = false;
17114 if (insn != BB_END (bb))
17115 distance = distance_agu_use_in_bb (regno0, insn, distance,
17116 NEXT_INSN (insn),
17117 &found, &redefined);
17119 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17121 edge e;
17122 edge_iterator ei;
17123 bool simple_loop = false;
17125 FOR_EACH_EDGE (e, ei, bb->succs)
17126 if (e->dest == bb)
17128 simple_loop = true;
17129 break;
17132 if (simple_loop)
17133 distance = distance_agu_use_in_bb (regno0, insn,
17134 distance, BB_HEAD (bb),
17135 &found, &redefined);
17136 else
17138 int shortest_dist = -1;
17139 bool found_in_bb = false;
17140 bool redefined_in_bb = false;
17142 FOR_EACH_EDGE (e, ei, bb->succs)
17144 int bb_dist
17145 = distance_agu_use_in_bb (regno0, insn,
17146 distance, BB_HEAD (e->dest),
17147 &found_in_bb, &redefined_in_bb);
17148 if (found_in_bb)
17150 if (shortest_dist < 0)
17151 shortest_dist = bb_dist;
17152 else if (bb_dist > 0)
17153 shortest_dist = MIN (bb_dist, shortest_dist);
17155 found = true;
17159 distance = shortest_dist;
17163 if (!found || redefined)
17164 return -1;
17166 return distance >> 1;
17169 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17170 there is a dilemma of choosing LEA or ADD.
17171 Negative value: ADD is preferred over LEA.
17172 Zero: Neutral.
17173 Positive value: LEA is preferred over ADD. */
17174 #define IX86_LEA_PRIORITY 0
17176 /* Return true if using the lea INSN has a performance advantage
17177 over a sequence of instructions. The instruction sequence has
17178 SPLIT_COST cycles higher latency than the lea latency. */
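/* Illustrative example with assumed timings: if the last non-AGU producer
   of a lea input is 1 cycle before the lea (dist_define = 1), the first use
   of the lea result in an address comes 2 cycles after it (dist_use = 2)
   and SPLIT_COST is 1, the adjusted define distance becomes 1 + 1 + 0 = 2,
   and since 2 >= 2 the lea is considered profitable and kept.  */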
17180 static bool
17181 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17182 unsigned int regno2, int split_cost)
17184 int dist_define, dist_use;
17186 dist_define = distance_non_agu_define (regno1, regno2, insn);
17187 dist_use = distance_agu_use (regno0, insn);
17189 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17191 /* If there is no non-AGU operand definition, no AGU
17192 operand use and the split cost is 0, then both the lea
17193 and non-lea variants have the same priority. Currently
17194 we prefer lea for 64-bit code and non-lea for 32-bit
17195 code. */
17196 if (dist_use < 0 && split_cost == 0)
17197 return TARGET_64BIT || IX86_LEA_PRIORITY;
17198 else
17199 return true;
17202 /* With a longer definition distance, lea is preferable.
17203 Here we adjust it to take into account the splitting cost
17204 and lea priority. */
17205 dist_define += split_cost + IX86_LEA_PRIORITY;
17207 /* If there is no use in a memory address, then we just check
17208 that the split cost exceeds the AGU stall. */
17209 if (dist_use < 0)
17210 return dist_define > LEA_MAX_STALL;
17212 /* If this insn has both backward non-agu dependence and forward
17213 agu dependence, the one with the shorter distance takes effect. */
17214 return dist_define >= dist_use;
17217 /* Return true if it is legal to clobber flags by INSN and
17218 false otherwise. */
17220 static bool
17221 ix86_ok_to_clobber_flags (rtx insn)
17223 basic_block bb = BLOCK_FOR_INSN (insn);
17224 df_ref *use;
17225 bitmap live;
17227 while (insn)
17229 if (NONDEBUG_INSN_P (insn))
17231 for (use = DF_INSN_USES (insn); *use; use++)
17232 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17233 return false;
17235 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17236 return true;
17239 if (insn == BB_END (bb))
17240 break;
17242 insn = NEXT_INSN (insn);
17245 live = df_get_live_out (bb);
17246 return !REGNO_REG_SET_P (live, FLAGS_REG);
17249 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17250 move and add to avoid AGU stalls. */
17252 bool
17253 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17255 unsigned int regno0, regno1, regno2;
17257 /* Check if we need to optimize. */
17258 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17259 return false;
17261 /* Check it is correct to split here. */
17262 if (!ix86_ok_to_clobber_flags (insn))
17263 return false;
17265 regno0 = true_regnum (operands[0]);
17266 regno1 = true_regnum (operands[1]);
17267 regno2 = true_regnum (operands[2]);
17269 /* We only need to split adds with a non-destructive
17270 destination operand. */
17271 if (regno0 == regno1 || regno0 == regno2)
17272 return false;
17273 else
17274 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17277 /* Return true if we should emit lea instruction instead of mov
17278 instruction. */
17280 bool
17281 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17283 unsigned int regno0, regno1;
17285 /* Check if we need to optimize. */
17286 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17287 return false;
17289 /* Use lea for reg to reg moves only. */
17290 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17291 return false;
17293 regno0 = true_regnum (operands[0]);
17294 regno1 = true_regnum (operands[1]);
17296 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17299 /* Return true if we need to split lea into a sequence of
17300 instructions to avoid AGU stalls. */
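/* Illustrative split-cost computation with assumed operands: for
   "leal 4(%ebx,%ecx,2), %eax" with %eax distinct from both sources, the
   cost computed in ix86_avoid_lea_for_addr below is 1 (mov into the
   destination) + 1 (add base and index) + 1 (shift for the scale)
   + 1 (add the displacement) - 1 (the lea itself) = 3 extra cycles,
   which is then weighed against the AGU stall by ix86_lea_outperforms.  */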
17302 bool
17303 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17305 unsigned int regno0, regno1, regno2;
17306 int split_cost;
17307 struct ix86_address parts;
17308 int ok;
17310 /* Check if we need to optimize. */
17311 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17312 return false;
17314 /* The "at least two components" test below might not catch simple
17315 move or zero extension insns if parts.base is non-NULL and parts.disp
17316 is const0_rtx as the only components in the address, e.g. if the
17317 register is %rbp or %r13. As this test is much cheaper and moves or
17318 zero extensions are the common case, do this check first. */
17319 if (REG_P (operands[1])
17320 || (SImode_address_operand (operands[1], VOIDmode)
17321 && REG_P (XEXP (operands[1], 0))))
17322 return false;
17324 /* Check if it is OK to split here. */
17325 if (!ix86_ok_to_clobber_flags (insn))
17326 return false;
17328 ok = ix86_decompose_address (operands[1], &parts);
17329 gcc_assert (ok);
17331 /* There should be at least two components in the address. */
17332 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17333 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17334 return false;
17336 /* We should not split into add if a non-legitimate PIC
17337 operand is used as the displacement. */
17338 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17339 return false;
17341 regno0 = true_regnum (operands[0]);
17342 regno1 = INVALID_REGNUM;
17343 regno2 = INVALID_REGNUM;
17345 if (parts.base)
17346 regno1 = true_regnum (parts.base);
17347 if (parts.index)
17348 regno2 = true_regnum (parts.index);
17350 split_cost = 0;
17352 /* Compute how many cycles we will add to the execution time
17353 if we split the lea into a sequence of instructions. */
17354 if (parts.base || parts.index)
17356 /* Have to use a mov instruction if the non-destructive
17357 destination form is used. */
17358 if (regno1 != regno0 && regno2 != regno0)
17359 split_cost += 1;
17361 /* Have to add index to base if both exist. */
17362 if (parts.base && parts.index)
17363 split_cost += 1;
17365 /* Have to use shift and adds if scale is 2 or greater. */
17366 if (parts.scale > 1)
17368 if (regno0 != regno1)
17369 split_cost += 1;
17370 else if (regno2 == regno0)
17371 split_cost += 4;
17372 else
17373 split_cost += parts.scale;
17376 /* Have to use an add instruction with an immediate if
17377 disp is nonzero. */
17378 if (parts.disp && parts.disp != const0_rtx)
17379 split_cost += 1;
17381 /* Subtract the price of lea. */
17382 split_cost -= 1;
17385 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17388 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17389 matches the destination. The RTX includes a clobber of FLAGS_REG. */
17391 static void
17392 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17393 rtx dst, rtx src)
17395 rtx op, clob;
17397 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17398 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17400 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17403 /* Return true if the definition of REGNO1 is nearest to the insn. */
17405 static bool
17406 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17408 rtx prev = insn;
17409 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17411 if (insn == start)
17412 return false;
17413 while (prev && prev != start)
17415 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17417 prev = PREV_INSN (prev);
17418 continue;
17420 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17421 return true;
17422 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17423 return false;
17424 prev = PREV_INSN (prev);
17427 /* None of the regs is defined in the bb. */
17428 return false;
17431 /* Split lea instructions into a sequence of instructions
17432 which are executed on the ALU, to avoid AGU stalls.
17433 It is assumed that it is allowed to clobber the flags register
17434 at the lea position. */
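/* Illustrative result of the split, assuming %eax, %ebx and %ecx are all
   distinct: "leal 4(%ebx,%ecx,2), %eax" becomes

        movl    %ecx, %eax      ; copy the index
        sall    $1, %eax        ; shift by log2 of the scale
        addl    %ebx, %eax      ; add the base
        addl    $4, %eax        ; add the displacement

   all of which execute on the ALU rather than the AGU.  */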
17436 void
17437 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17439 unsigned int regno0, regno1, regno2;
17440 struct ix86_address parts;
17441 rtx target, tmp;
17442 int ok, adds;
17444 ok = ix86_decompose_address (operands[1], &parts);
17445 gcc_assert (ok);
17447 target = gen_lowpart (mode, operands[0]);
17449 regno0 = true_regnum (target);
17450 regno1 = INVALID_REGNUM;
17451 regno2 = INVALID_REGNUM;
17453 if (parts.base)
17455 parts.base = gen_lowpart (mode, parts.base);
17456 regno1 = true_regnum (parts.base);
17459 if (parts.index)
17461 parts.index = gen_lowpart (mode, parts.index);
17462 regno2 = true_regnum (parts.index);
17465 if (parts.disp)
17466 parts.disp = gen_lowpart (mode, parts.disp);
17468 if (parts.scale > 1)
17470 /* Case r1 = r1 + ... */
17471 if (regno1 == regno0)
17473 /* If we have the case r1 = r1 + C * r1 then we
17474 would have to use multiplication, which is very
17475 expensive. Assume the cost model is wrong if we
17476 end up with such a case here. */
17477 gcc_assert (regno2 != regno0);
17479 for (adds = parts.scale; adds > 0; adds--)
17480 ix86_emit_binop (PLUS, mode, target, parts.index);
17482 else
17484 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17485 if (regno0 != regno2)
17486 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17488 /* Use shift for scaling. */
17489 ix86_emit_binop (ASHIFT, mode, target,
17490 GEN_INT (exact_log2 (parts.scale)));
17492 if (parts.base)
17493 ix86_emit_binop (PLUS, mode, target, parts.base);
17495 if (parts.disp && parts.disp != const0_rtx)
17496 ix86_emit_binop (PLUS, mode, target, parts.disp);
17499 else if (!parts.base && !parts.index)
17501 gcc_assert (parts.disp);
17502 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17504 else
17506 if (!parts.base)
17508 if (regno0 != regno2)
17509 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17511 else if (!parts.index)
17513 if (regno0 != regno1)
17514 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17516 else
17518 if (regno0 == regno1)
17519 tmp = parts.index;
17520 else if (regno0 == regno2)
17521 tmp = parts.base;
17522 else
17524 rtx tmp1;
17526 /* Find the better operand for the SET instruction, depending
17527 on which definition is farther from the insn. */
17528 if (find_nearest_reg_def (insn, regno1, regno2))
17529 tmp = parts.index, tmp1 = parts.base;
17530 else
17531 tmp = parts.base, tmp1 = parts.index;
17533 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17535 if (parts.disp && parts.disp != const0_rtx)
17536 ix86_emit_binop (PLUS, mode, target, parts.disp);
17538 ix86_emit_binop (PLUS, mode, target, tmp1);
17539 return;
17542 ix86_emit_binop (PLUS, mode, target, tmp);
17545 if (parts.disp && parts.disp != const0_rtx)
17546 ix86_emit_binop (PLUS, mode, target, parts.disp);
17550 /* Return true if it is OK to optimize an ADD operation to a LEA
17551 operation to avoid flag register consumption. For most processors,
17552 ADD is faster than LEA. For processors like Atom, if the
17553 destination register of the LEA holds an actual address which will be
17554 used soon, LEA is better; otherwise ADD is better. */
17556 bool
17557 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17559 unsigned int regno0 = true_regnum (operands[0]);
17560 unsigned int regno1 = true_regnum (operands[1]);
17561 unsigned int regno2 = true_regnum (operands[2]);
17563 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17564 if (regno0 != regno1 && regno0 != regno2)
17565 return true;
17567 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17568 return false;
17570 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17573 /* Return true if destination reg of SET_BODY is shift count of
17574 USE_BODY. */
17576 static bool
17577 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17579 rtx set_dest;
17580 rtx shift_rtx;
17581 int i;
17583 /* Retrieve destination of SET_BODY. */
17584 switch (GET_CODE (set_body))
17586 case SET:
17587 set_dest = SET_DEST (set_body);
17588 if (!set_dest || !REG_P (set_dest))
17589 return false;
17590 break;
17591 case PARALLEL:
17592 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17593 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17594 use_body))
17595 return true;
17596 default:
17597 return false;
17598 break;
17601 /* Retrieve shift count of USE_BODY. */
17602 switch (GET_CODE (use_body))
17604 case SET:
17605 shift_rtx = XEXP (use_body, 1);
17606 break;
17607 case PARALLEL:
17608 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17609 if (ix86_dep_by_shift_count_body (set_body,
17610 XVECEXP (use_body, 0, i)))
17611 return true;
17612 default:
17613 return false;
17614 break;
17617 if (shift_rtx
17618 && (GET_CODE (shift_rtx) == ASHIFT
17619 || GET_CODE (shift_rtx) == LSHIFTRT
17620 || GET_CODE (shift_rtx) == ASHIFTRT
17621 || GET_CODE (shift_rtx) == ROTATE
17622 || GET_CODE (shift_rtx) == ROTATERT))
17624 rtx shift_count = XEXP (shift_rtx, 1);
17626 /* Return true if shift count is dest of SET_BODY. */
17627 if (REG_P (shift_count))
17629 /* Add a check since this can be invoked before register
17630 allocation by the pre-reload scheduler. */
17631 if (reload_completed
17632 && true_regnum (set_dest) == true_regnum (shift_count))
17633 return true;
17634 else if (REGNO (set_dest) == REGNO (shift_count))
17635 return true;
17639 return false;
17642 /* Return true if destination reg of SET_INSN is shift count of
17643 USE_INSN. */
17645 bool
17646 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17648 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17649 PATTERN (use_insn));
17652 /* Return TRUE or FALSE depending on whether the unary operator meets the
17653 appropriate constraints. */
17655 bool
17656 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17657 enum machine_mode mode ATTRIBUTE_UNUSED,
17658 rtx operands[2] ATTRIBUTE_UNUSED)
17660 /* If one of operands is memory, source and destination must match. */
17661 if ((MEM_P (operands[0])
17662 || MEM_P (operands[1]))
17663 && ! rtx_equal_p (operands[0], operands[1]))
17664 return false;
17665 return true;
17668 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17669 are ok, keeping in mind the possible movddup alternative. */
17671 bool
17672 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17674 if (MEM_P (operands[0]))
17675 return rtx_equal_p (operands[0], operands[1 + high]);
17676 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17677 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17678 return true;
17681 /* Post-reload splitter for converting an SF or DFmode value in an
17682 SSE register into an unsigned SImode. */
17684 void
17685 ix86_split_convert_uns_si_sse (rtx operands[])
17687 enum machine_mode vecmode;
17688 rtx value, large, zero_or_two31, input, two31, x;
17690 large = operands[1];
17691 zero_or_two31 = operands[2];
17692 input = operands[3];
17693 two31 = operands[4];
17694 vecmode = GET_MODE (large);
17695 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17697 /* Load up the value into the low element. We must ensure that the other
17698 elements are valid floats -- zero is the easiest such value. */
17699 if (MEM_P (input))
17701 if (vecmode == V4SFmode)
17702 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17703 else
17704 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17706 else
17708 input = gen_rtx_REG (vecmode, REGNO (input));
17709 emit_move_insn (value, CONST0_RTX (vecmode));
17710 if (vecmode == V4SFmode)
17711 emit_insn (gen_sse_movss (value, value, input));
17712 else
17713 emit_insn (gen_sse2_movsd (value, value, input));
17716 emit_move_insn (large, two31);
17717 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17719 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17720 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17722 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17723 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17725 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17726 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17728 large = gen_rtx_REG (V4SImode, REGNO (large));
17729 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17731 x = gen_rtx_REG (V4SImode, REGNO (value));
17732 if (vecmode == V4SFmode)
17733 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17734 else
17735 emit_insn (gen_sse2_cvttpd2dq (x, value));
17736 value = x;
17738 emit_insn (gen_xorv4si3 (value, value, large));
17741 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17742 Expects the 64-bit DImode to be supplied in a pair of integral
17743 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17744 -mfpmath=sse, !optimize_size only. */
17746 void
17747 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17749 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17750 rtx int_xmm, fp_xmm;
17751 rtx biases, exponents;
17752 rtx x;
17754 int_xmm = gen_reg_rtx (V4SImode);
17755 if (TARGET_INTER_UNIT_MOVES)
17756 emit_insn (gen_movdi_to_sse (int_xmm, input));
17757 else if (TARGET_SSE_SPLIT_REGS)
17759 emit_clobber (int_xmm);
17760 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17762 else
17764 x = gen_reg_rtx (V2DImode);
17765 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17766 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17769 x = gen_rtx_CONST_VECTOR (V4SImode,
17770 gen_rtvec (4, GEN_INT (0x43300000UL),
17771 GEN_INT (0x45300000UL),
17772 const0_rtx, const0_rtx));
17773 exponents = validize_mem (force_const_mem (V4SImode, x));
17775 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17776 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17778 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17779 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17780 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17781 (0x1.0p84 + double(fp_value_hi_xmm)).
17782 Note these exponents differ by 32. */
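/* Worked example (illustrative): for the input 0x0000000100000002 the two
   doubles are 2^52 + 2 and 2^84 + 1*2^32; after subtracting the 2^52 and
   2^84 biases below and adding the two halves, the result is 2^32 + 2 =
   4294967298.0, i.e. the original unsigned 64-bit value.  */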
17784 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17786 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17787 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17788 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17789 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17790 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17791 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17792 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17793 biases = validize_mem (force_const_mem (V2DFmode, biases));
17794 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17796 /* Add the upper and lower DFmode values together. */
17797 if (TARGET_SSE3)
17798 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17799 else
17801 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17802 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17803 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17806 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17809 /* Not used, but eases macroization of patterns. */
17810 void
17811 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17812 rtx input ATTRIBUTE_UNUSED)
17814 gcc_unreachable ();
17817 /* Convert an unsigned SImode value into a DFmode. Only currently used
17818 for SSE, but applicable anywhere. */
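/* The trick: adding -2^31 merely flips the sign bit, so the result fits a
   signed SImode; the signed int->double conversion is then exact, and
   adding 2^31.0 back yields the unsigned value. Worked example
   (illustrative): 0xffffffff -> 0x7fffffff = 2147483647 ->
   2147483647.0 + 2147483648.0 = 4294967295.0.  */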
17820 void
17821 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17823 REAL_VALUE_TYPE TWO31r;
17824 rtx x, fp;
17826 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17827 NULL, 1, OPTAB_DIRECT);
17829 fp = gen_reg_rtx (DFmode);
17830 emit_insn (gen_floatsidf2 (fp, x));
17832 real_ldexp (&TWO31r, &dconst1, 31);
17833 x = const_double_from_real_value (TWO31r, DFmode);
17835 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17836 if (x != target)
17837 emit_move_insn (target, x);
17840 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17841 32-bit mode; otherwise we have a direct convert instruction. */
17843 void
17844 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17846 REAL_VALUE_TYPE TWO32r;
17847 rtx fp_lo, fp_hi, x;
17849 fp_lo = gen_reg_rtx (DFmode);
17850 fp_hi = gen_reg_rtx (DFmode);
17852 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17854 real_ldexp (&TWO32r, &dconst1, 32);
17855 x = const_double_from_real_value (TWO32r, DFmode);
17856 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17858 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17860 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17861 0, OPTAB_DIRECT);
17862 if (x != target)
17863 emit_move_insn (target, x);
17866 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17867 For x86_32, -mfpmath=sse, !optimize_size only. */
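/* The input is split into 16-bit halves, each of which converts to SFmode
   exactly, and recombined as hi * 2^16 + lo with a single final rounding.
   Worked example (illustrative): 0x0001ffff -> 1.0 * 65536.0 + 65535.0
   = 131071.0.  */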
17868 void
17869 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17871 REAL_VALUE_TYPE ONE16r;
17872 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17874 real_ldexp (&ONE16r, &dconst1, 16);
17875 x = const_double_from_real_value (ONE16r, SFmode);
17876 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17877 NULL, 0, OPTAB_DIRECT);
17878 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17879 NULL, 0, OPTAB_DIRECT);
17880 fp_hi = gen_reg_rtx (SFmode);
17881 fp_lo = gen_reg_rtx (SFmode);
17882 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17883 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17884 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17885 0, OPTAB_DIRECT);
17886 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17887 0, OPTAB_DIRECT);
17888 if (!rtx_equal_p (target, fp_hi))
17889 emit_move_insn (target, fp_hi);
17892 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17893 a vector of unsigned ints VAL to vector of floats TARGET. */
17895 void
17896 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17898 rtx tmp[8];
17899 REAL_VALUE_TYPE TWO16r;
17900 enum machine_mode intmode = GET_MODE (val);
17901 enum machine_mode fltmode = GET_MODE (target);
17902 rtx (*cvt) (rtx, rtx);
17904 if (intmode == V4SImode)
17905 cvt = gen_floatv4siv4sf2;
17906 else
17907 cvt = gen_floatv8siv8sf2;
17908 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17909 tmp[0] = force_reg (intmode, tmp[0]);
17910 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17911 OPTAB_DIRECT);
17912 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17913 NULL_RTX, 1, OPTAB_DIRECT);
17914 tmp[3] = gen_reg_rtx (fltmode);
17915 emit_insn (cvt (tmp[3], tmp[1]));
17916 tmp[4] = gen_reg_rtx (fltmode);
17917 emit_insn (cvt (tmp[4], tmp[2]));
17918 real_ldexp (&TWO16r, &dconst1, 16);
17919 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17920 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17921 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17922 OPTAB_DIRECT);
17923 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17924 OPTAB_DIRECT);
17925 if (tmp[7] != target)
17926 emit_move_insn (target, tmp[7]);
17929 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17930 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17931 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17932 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
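/* Worked example (illustrative), per element: val = 3000000000.0 is >= 2^31,
   so 2^31 is subtracted giving 852516352.0; the signed truncation then
   yields 852516352, and xoring in the 0x80000000 returned via *XORP
   restores 3000000000. Elements below 2^31 are left unchanged and get a
   zero xor mask.  */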
17935 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17937 REAL_VALUE_TYPE TWO31r;
17938 rtx two31r, tmp[4];
17939 enum machine_mode mode = GET_MODE (val);
17940 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17941 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17942 rtx (*cmp) (rtx, rtx, rtx, rtx);
17943 int i;
17945 for (i = 0; i < 3; i++)
17946 tmp[i] = gen_reg_rtx (mode);
17947 real_ldexp (&TWO31r, &dconst1, 31);
17948 two31r = const_double_from_real_value (TWO31r, scalarmode);
17949 two31r = ix86_build_const_vector (mode, 1, two31r);
17950 two31r = force_reg (mode, two31r);
17951 switch (mode)
17953 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17954 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17955 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17956 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17957 default: gcc_unreachable ();
17959 tmp[3] = gen_rtx_LE (mode, two31r, val);
17960 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17961 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17962 0, OPTAB_DIRECT);
17963 if (intmode == V4SImode || TARGET_AVX2)
17964 *xorp = expand_simple_binop (intmode, ASHIFT,
17965 gen_lowpart (intmode, tmp[0]),
17966 GEN_INT (31), NULL_RTX, 0,
17967 OPTAB_DIRECT);
17968 else
17970 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17971 two31 = ix86_build_const_vector (intmode, 1, two31);
17972 *xorp = expand_simple_binop (intmode, AND,
17973 gen_lowpart (intmode, tmp[0]),
17974 two31, NULL_RTX, 0,
17975 OPTAB_DIRECT);
17977 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17978 0, OPTAB_DIRECT);
17981 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17982 then replicate the value for all elements of the vector
17983 register. */
17986 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17988 int i, n_elt;
17989 rtvec v;
17990 enum machine_mode scalar_mode;
17992 switch (mode)
17994 case V32QImode:
17995 case V16QImode:
17996 case V16HImode:
17997 case V8HImode:
17998 case V8SImode:
17999 case V4SImode:
18000 case V4DImode:
18001 case V2DImode:
18002 gcc_assert (vect);
18003 case V8SFmode:
18004 case V4SFmode:
18005 case V4DFmode:
18006 case V2DFmode:
18007 n_elt = GET_MODE_NUNITS (mode);
18008 v = rtvec_alloc (n_elt);
18009 scalar_mode = GET_MODE_INNER (mode);
18011 RTVEC_ELT (v, 0) = value;
18013 for (i = 1; i < n_elt; ++i)
18014 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18016 return gen_rtx_CONST_VECTOR (mode, v);
18018 default:
18019 gcc_unreachable ();
18023 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18024 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18025 for an SSE register. If VECT is true, then replicate the mask for
18026 all elements of the vector register. If INVERT is true, then create
18027 a mask excluding the sign bit. */
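/* Illustrative mask values: for DFmode elements the scalar mask is
   0x8000000000000000 (0x7fffffffffffffff when INVERT); for SFmode elements
   it is 0x80000000 (0x7fffffff). With VECT the chosen value is replicated
   into every element of the vector constant.  */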
18030 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18032 enum machine_mode vec_mode, imode;
18033 HOST_WIDE_INT hi, lo;
18034 int shift = 63;
18035 rtx v;
18036 rtx mask;
18038 /* Find the sign bit, sign extended to 2*HWI. */
18039 switch (mode)
18041 case V8SImode:
18042 case V4SImode:
18043 case V8SFmode:
18044 case V4SFmode:
18045 vec_mode = mode;
18046 mode = GET_MODE_INNER (mode);
18047 imode = SImode;
18048 lo = 0x80000000, hi = lo < 0;
18049 break;
18051 case V4DImode:
18052 case V2DImode:
18053 case V4DFmode:
18054 case V2DFmode:
18055 vec_mode = mode;
18056 mode = GET_MODE_INNER (mode);
18057 imode = DImode;
18058 if (HOST_BITS_PER_WIDE_INT >= 64)
18059 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18060 else
18061 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18062 break;
18064 case TImode:
18065 case TFmode:
18066 vec_mode = VOIDmode;
18067 if (HOST_BITS_PER_WIDE_INT >= 64)
18069 imode = TImode;
18070 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18072 else
18074 rtvec vec;
18076 imode = DImode;
18077 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18079 if (invert)
18081 lo = ~lo, hi = ~hi;
18082 v = constm1_rtx;
18084 else
18085 v = const0_rtx;
18087 mask = immed_double_const (lo, hi, imode);
18089 vec = gen_rtvec (2, v, mask);
18090 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18091 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18093 return v;
18095 break;
18097 default:
18098 gcc_unreachable ();
18101 if (invert)
18102 lo = ~lo, hi = ~hi;
18104 /* Force this value into the low part of a fp vector constant. */
18105 mask = immed_double_const (lo, hi, imode);
18106 mask = gen_lowpart (mode, mask);
18108 if (vec_mode == VOIDmode)
18109 return force_reg (mode, mask);
18111 v = ix86_build_const_vector (vec_mode, vect, mask);
18112 return force_reg (vec_mode, v);
18115 /* Generate code for floating point ABS or NEG. */
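/* With SSE this reduces to bitwise masking: NEG xors the value with a mask
   holding only the sign bit, and ABS ands it with the inverted mask that
   clears the sign bit. The mask built here is attached via USE, and the
   actual and/xor is emitted by the matching absneg patterns in i386.md.  */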
18117 void
18118 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18119 rtx operands[])
18121 rtx mask, set, dst, src;
18122 bool use_sse = false;
18123 bool vector_mode = VECTOR_MODE_P (mode);
18124 enum machine_mode vmode = mode;
18126 if (vector_mode)
18127 use_sse = true;
18128 else if (mode == TFmode)
18129 use_sse = true;
18130 else if (TARGET_SSE_MATH)
18132 use_sse = SSE_FLOAT_MODE_P (mode);
18133 if (mode == SFmode)
18134 vmode = V4SFmode;
18135 else if (mode == DFmode)
18136 vmode = V2DFmode;
18139 /* NEG and ABS performed with SSE use bitwise mask operations.
18140 Create the appropriate mask now. */
18141 if (use_sse)
18142 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18143 else
18144 mask = NULL_RTX;
18146 dst = operands[0];
18147 src = operands[1];
18149 set = gen_rtx_fmt_e (code, mode, src);
18150 set = gen_rtx_SET (VOIDmode, dst, set);
18152 if (mask)
18154 rtx use, clob;
18155 rtvec par;
18157 use = gen_rtx_USE (VOIDmode, mask);
18158 if (vector_mode)
18159 par = gen_rtvec (2, set, use);
18160 else
18162 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18163 par = gen_rtvec (3, set, use, clob);
18165 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18167 else
18168 emit_insn (set);
18171 /* Expand a copysign operation. Special case operand 0 being a constant. */
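/* The underlying identity is
     copysign (x, y) = (x & ~signmask) | (y & signmask).
   When x is a constant its masked value can be folded at compile time, so
   only one mask is needed (the *_const variant below); otherwise both the
   mask and its inverse are required (the *_var variant).  */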
18173 void
18174 ix86_expand_copysign (rtx operands[])
18176 enum machine_mode mode, vmode;
18177 rtx dest, op0, op1, mask, nmask;
18179 dest = operands[0];
18180 op0 = operands[1];
18181 op1 = operands[2];
18183 mode = GET_MODE (dest);
18185 if (mode == SFmode)
18186 vmode = V4SFmode;
18187 else if (mode == DFmode)
18188 vmode = V2DFmode;
18189 else
18190 vmode = mode;
18192 if (GET_CODE (op0) == CONST_DOUBLE)
18194 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18196 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18197 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18199 if (mode == SFmode || mode == DFmode)
18201 if (op0 == CONST0_RTX (mode))
18202 op0 = CONST0_RTX (vmode);
18203 else
18205 rtx v = ix86_build_const_vector (vmode, false, op0);
18207 op0 = force_reg (vmode, v);
18210 else if (op0 != CONST0_RTX (mode))
18211 op0 = force_reg (mode, op0);
18213 mask = ix86_build_signbit_mask (vmode, 0, 0);
18215 if (mode == SFmode)
18216 copysign_insn = gen_copysignsf3_const;
18217 else if (mode == DFmode)
18218 copysign_insn = gen_copysigndf3_const;
18219 else
18220 copysign_insn = gen_copysigntf3_const;
18222 emit_insn (copysign_insn (dest, op0, op1, mask));
18224 else
18226 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18228 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18229 mask = ix86_build_signbit_mask (vmode, 0, 0);
18231 if (mode == SFmode)
18232 copysign_insn = gen_copysignsf3_var;
18233 else if (mode == DFmode)
18234 copysign_insn = gen_copysigndf3_var;
18235 else
18236 copysign_insn = gen_copysigntf3_var;
18238 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18242 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18243 be a constant, and so has already been expanded into a vector constant. */
18245 void
18246 ix86_split_copysign_const (rtx operands[])
18248 enum machine_mode mode, vmode;
18249 rtx dest, op0, mask, x;
18251 dest = operands[0];
18252 op0 = operands[1];
18253 mask = operands[3];
18255 mode = GET_MODE (dest);
18256 vmode = GET_MODE (mask);
18258 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18259 x = gen_rtx_AND (vmode, dest, mask);
18260 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18262 if (op0 != CONST0_RTX (vmode))
18264 x = gen_rtx_IOR (vmode, dest, op0);
18265 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18269 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18270 so we have to do two masks. */
18272 void
18273 ix86_split_copysign_var (rtx operands[])
18275 enum machine_mode mode, vmode;
18276 rtx dest, scratch, op0, op1, mask, nmask, x;
18278 dest = operands[0];
18279 scratch = operands[1];
18280 op0 = operands[2];
18281 op1 = operands[3];
18282 nmask = operands[4];
18283 mask = operands[5];
18285 mode = GET_MODE (dest);
18286 vmode = GET_MODE (mask);
18288 if (rtx_equal_p (op0, op1))
18290 /* Shouldn't happen often (it's useless, obviously), but when it does
18291 we'd generate incorrect code if we continue below. */
18292 emit_move_insn (dest, op0);
18293 return;
18296 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18298 gcc_assert (REGNO (op1) == REGNO (scratch));
18300 x = gen_rtx_AND (vmode, scratch, mask);
18301 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18303 dest = mask;
18304 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18305 x = gen_rtx_NOT (vmode, dest);
18306 x = gen_rtx_AND (vmode, x, op0);
18307 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18309 else
18311 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18313 x = gen_rtx_AND (vmode, scratch, mask);
18315 else /* alternative 2,4 */
18317 gcc_assert (REGNO (mask) == REGNO (scratch));
18318 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18319 x = gen_rtx_AND (vmode, scratch, op1);
18321 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18323 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18325 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18326 x = gen_rtx_AND (vmode, dest, nmask);
18328 else /* alternative 3,4 */
18330 gcc_assert (REGNO (nmask) == REGNO (dest));
18331 dest = nmask;
18332 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18333 x = gen_rtx_AND (vmode, dest, op0);
18335 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18338 x = gen_rtx_IOR (vmode, dest, scratch);
18339 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18342 /* Return TRUE or FALSE depending on whether the first SET in INSN
18343 has source and destination with matching CC modes, and whether the
18344 CC mode is at least as constrained as REQ_MODE. */
18346 bool
18347 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18349 rtx set;
18350 enum machine_mode set_mode;
18352 set = PATTERN (insn);
18353 if (GET_CODE (set) == PARALLEL)
18354 set = XVECEXP (set, 0, 0);
18355 gcc_assert (GET_CODE (set) == SET);
18356 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18358 set_mode = GET_MODE (SET_DEST (set));
18359 switch (set_mode)
18361 case CCNOmode:
18362 if (req_mode != CCNOmode
18363 && (req_mode != CCmode
18364 || XEXP (SET_SRC (set), 1) != const0_rtx))
18365 return false;
18366 break;
18367 case CCmode:
18368 if (req_mode == CCGCmode)
18369 return false;
18370 /* FALLTHRU */
18371 case CCGCmode:
18372 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18373 return false;
18374 /* FALLTHRU */
18375 case CCGOCmode:
18376 if (req_mode == CCZmode)
18377 return false;
18378 /* FALLTHRU */
18379 case CCZmode:
18380 break;
18382 case CCAmode:
18383 case CCCmode:
18384 case CCOmode:
18385 case CCSmode:
18386 if (set_mode != req_mode)
18387 return false;
18388 break;
18390 default:
18391 gcc_unreachable ();
18394 return GET_MODE (SET_SRC (set)) == set_mode;
18397 /* Generate insn patterns to do an integer compare of OPERANDS. */
18399 static rtx
18400 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18402 enum machine_mode cmpmode;
18403 rtx tmp, flags;
18405 cmpmode = SELECT_CC_MODE (code, op0, op1);
18406 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18408 /* This is very simple, but making the interface the same as in the
18409 FP case makes the rest of the code easier. */
18410 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18411 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18413 /* Return the test that should be put into the flags user, i.e.
18414 the bcc, scc, or cmov instruction. */
18415 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18418 /* Figure out whether to use ordered or unordered fp comparisons.
18419 Return the appropriate mode to use. */
18421 enum machine_mode
18422 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18424 /* ??? In order to make all comparisons reversible, we do all comparisons
18425 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18426 all forms of trapping and nontrapping comparisons, we can make inequality
18427 comparisons trapping again, since that results in better code when using
18428 FCOM-based compares. */
18429 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18432 enum machine_mode
18433 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18435 enum machine_mode mode = GET_MODE (op0);
18437 if (SCALAR_FLOAT_MODE_P (mode))
18439 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18440 return ix86_fp_compare_mode (code);
18443 switch (code)
18445 /* Only zero flag is needed. */
18446 case EQ: /* ZF=0 */
18447 case NE: /* ZF!=0 */
18448 return CCZmode;
18449 /* Codes needing carry flag. */
18450 case GEU: /* CF=0 */
18451 case LTU: /* CF=1 */
18452 /* Detect overflow checks. They need just the carry flag. */
18453 if (GET_CODE (op0) == PLUS
18454 && rtx_equal_p (op1, XEXP (op0, 0)))
18455 return CCCmode;
18456 else
18457 return CCmode;
18458 case GTU: /* CF=0 & ZF=0 */
18459 case LEU: /* CF=1 | ZF=1 */
18460 return CCmode;
18461 /* Codes possibly doable only with sign flag when
18462 comparing against zero. */
18463 case GE: /* SF=OF or SF=0 */
18464 case LT: /* SF<>OF or SF=1 */
18465 if (op1 == const0_rtx)
18466 return CCGOCmode;
18467 else
18468 /* For other cases Carry flag is not required. */
18469 return CCGCmode;
18470 /* Codes doable only with the sign flag when comparing
18471 against zero, but we lack a jump instruction for that,
18472 so we need to use relational tests against overflow,
18473 which thus needs to be zero. */
18474 case GT: /* ZF=0 & SF=OF */
18475 case LE: /* ZF=1 | SF<>OF */
18476 if (op1 == const0_rtx)
18477 return CCNOmode;
18478 else
18479 return CCGCmode;
18480 /* The strcmp pattern does (use flags) and combine may ask us for the
18481 proper mode. */
18482 case USE:
18483 return CCmode;
18484 default:
18485 gcc_unreachable ();
18489 /* Return the fixed registers used for condition codes. */
18491 static bool
18492 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18494 *p1 = FLAGS_REG;
18495 *p2 = FPSR_REG;
18496 return true;
18499 /* If two condition code modes are compatible, return a condition code
18500 mode which is compatible with both. Otherwise, return
18501 VOIDmode. */
18503 static enum machine_mode
18504 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18506 if (m1 == m2)
18507 return m1;
18509 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18510 return VOIDmode;
18512 if ((m1 == CCGCmode && m2 == CCGOCmode)
18513 || (m1 == CCGOCmode && m2 == CCGCmode))
18514 return CCGCmode;
18516 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18517 return m2;
18518 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18519 return m1;
18521 switch (m1)
18523 default:
18524 gcc_unreachable ();
18526 case CCmode:
18527 case CCGCmode:
18528 case CCGOCmode:
18529 case CCNOmode:
18530 case CCAmode:
18531 case CCCmode:
18532 case CCOmode:
18533 case CCSmode:
18534 case CCZmode:
18535 switch (m2)
18537 default:
18538 return VOIDmode;
18540 case CCmode:
18541 case CCGCmode:
18542 case CCGOCmode:
18543 case CCNOmode:
18544 case CCAmode:
18545 case CCCmode:
18546 case CCOmode:
18547 case CCSmode:
18548 case CCZmode:
18549 return CCmode;
18552 case CCFPmode:
18553 case CCFPUmode:
18554 /* These are only compatible with themselves, which we already
18555 checked above. */
18556 return VOIDmode;
18561 /* Return a comparison we can do that is equivalent to
18562 swap_condition (code), apart possibly from orderedness.
18563 But never change orderedness if TARGET_IEEE_FP, returning
18564 UNKNOWN in that case if necessary. */
18566 static enum rtx_code
18567 ix86_fp_swap_condition (enum rtx_code code)
18569 switch (code)
18571 case GT: /* GTU - CF=0 & ZF=0 */
18572 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18573 case GE: /* GEU - CF=0 */
18574 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18575 case UNLT: /* LTU - CF=1 */
18576 return TARGET_IEEE_FP ? UNKNOWN : GT;
18577 case UNLE: /* LEU - CF=1 | ZF=1 */
18578 return TARGET_IEEE_FP ? UNKNOWN : GE;
18579 default:
18580 return swap_condition (code);
18584 /* Return the cost of comparison CODE using the best strategy for performance.
18585 All the following functions use the number of instructions as the cost metric.
18586 In the future this should be tweaked to compute bytes for optimize_size and
18587 take into account the performance of various instructions on various CPUs. */
18589 static int
18590 ix86_fp_comparison_cost (enum rtx_code code)
18592 int arith_cost;
18594 /* The cost of code using bit-twiddling on %ah. */
18595 switch (code)
18597 case UNLE:
18598 case UNLT:
18599 case LTGT:
18600 case GT:
18601 case GE:
18602 case UNORDERED:
18603 case ORDERED:
18604 case UNEQ:
18605 arith_cost = 4;
18606 break;
18607 case LT:
18608 case NE:
18609 case EQ:
18610 case UNGE:
18611 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18612 break;
18613 case LE:
18614 case UNGT:
18615 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18616 break;
18617 default:
18618 gcc_unreachable ();
18621 switch (ix86_fp_comparison_strategy (code))
18623 case IX86_FPCMP_COMI:
18624 return arith_cost > 4 ? 3 : 2;
18625 case IX86_FPCMP_SAHF:
18626 return arith_cost > 4 ? 4 : 3;
18627 default:
18628 return arith_cost;
18632 /* Return the strategy to use for floating-point comparisons. We assume that
18633 fcomi is always preferable where available, since that is also true when
18634 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18636 enum ix86_fpcmp_strategy
18637 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18639 /* Do fcomi/sahf based test when profitable. */
18641 if (TARGET_CMOVE)
18642 return IX86_FPCMP_COMI;
18644 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18645 return IX86_FPCMP_SAHF;
18647 return IX86_FPCMP_ARITH;
18650 /* Swap, force into registers, or otherwise massage the two operands
18651 to a fp comparison. The operands are updated in place; the new
18652 comparison code is returned. */
18654 static enum rtx_code
18655 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18657 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18658 rtx op0 = *pop0, op1 = *pop1;
18659 enum machine_mode op_mode = GET_MODE (op0);
18660 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18662 /* All of the unordered compare instructions only work on registers.
18663 The same is true of the fcomi compare instructions. The XFmode
18664 compare instructions require registers except when comparing
18665 against zero or when converting operand 1 from fixed point to
18666 floating point. */
18668 if (!is_sse
18669 && (fpcmp_mode == CCFPUmode
18670 || (op_mode == XFmode
18671 && ! (standard_80387_constant_p (op0) == 1
18672 || standard_80387_constant_p (op1) == 1)
18673 && GET_CODE (op1) != FLOAT)
18674 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18676 op0 = force_reg (op_mode, op0);
18677 op1 = force_reg (op_mode, op1);
18679 else
18681 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18682 things around if they appear profitable, otherwise force op0
18683 into a register. */
18685 if (standard_80387_constant_p (op0) == 0
18686 || (MEM_P (op0)
18687 && ! (standard_80387_constant_p (op1) == 0
18688 || MEM_P (op1))))
18690 enum rtx_code new_code = ix86_fp_swap_condition (code);
18691 if (new_code != UNKNOWN)
18693 rtx tmp;
18694 tmp = op0, op0 = op1, op1 = tmp;
18695 code = new_code;
18699 if (!REG_P (op0))
18700 op0 = force_reg (op_mode, op0);
18702 if (CONSTANT_P (op1))
18704 int tmp = standard_80387_constant_p (op1);
18705 if (tmp == 0)
18706 op1 = validize_mem (force_const_mem (op_mode, op1));
18707 else if (tmp == 1)
18709 if (TARGET_CMOVE)
18710 op1 = force_reg (op_mode, op1);
18712 else
18713 op1 = force_reg (op_mode, op1);
18717 /* Try to rearrange the comparison to make it cheaper. */
18718 if (ix86_fp_comparison_cost (code)
18719 > ix86_fp_comparison_cost (swap_condition (code))
18720 && (REG_P (op1) || can_create_pseudo_p ()))
18722 rtx tmp;
18723 tmp = op0, op0 = op1, op1 = tmp;
18724 code = swap_condition (code);
18725 if (!REG_P (op0))
18726 op0 = force_reg (op_mode, op0);
18729 *pop0 = op0;
18730 *pop1 = op1;
18731 return code;
18734 /* Convert comparison codes we use to represent FP comparison to integer
18735 code that will result in proper branch. Return UNKNOWN if no such code
18736 is available. */
18738 enum rtx_code
18739 ix86_fp_compare_code_to_integer (enum rtx_code code)
18741 switch (code)
18743 case GT:
18744 return GTU;
18745 case GE:
18746 return GEU;
18747 case ORDERED:
18748 case UNORDERED:
18749 return code;
18750 break;
18751 case UNEQ:
18752 return EQ;
18753 break;
18754 case UNLT:
18755 return LTU;
18756 break;
18757 case UNLE:
18758 return LEU;
18759 break;
18760 case LTGT:
18761 return NE;
18762 break;
18763 default:
18764 return UNKNOWN;
18768 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18770 static rtx
18771 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18773 enum machine_mode fpcmp_mode, intcmp_mode;
18774 rtx tmp, tmp2;
18776 fpcmp_mode = ix86_fp_compare_mode (code);
18777 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18779 /* Do fcomi/sahf based test when profitable. */
18780 switch (ix86_fp_comparison_strategy (code))
18782 case IX86_FPCMP_COMI:
18783 intcmp_mode = fpcmp_mode;
18784 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18785 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18786 tmp);
18787 emit_insn (tmp);
18788 break;
18790 case IX86_FPCMP_SAHF:
18791 intcmp_mode = fpcmp_mode;
18792 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18793 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18794 tmp);
18796 if (!scratch)
18797 scratch = gen_reg_rtx (HImode);
18798 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18799 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18800 break;
18802 case IX86_FPCMP_ARITH:
18803 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18804 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18805 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18806 if (!scratch)
18807 scratch = gen_reg_rtx (HImode);
18808 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18810 /* In the unordered case, we have to check C2 for NaNs, which
18811 doesn't happen to work out to anything nice combination-wise.
18812 So do some bit twiddling on the value we've got in AH to come
18813 up with an appropriate set of condition codes. */
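/* After fnstsw the x87 condition bits land in AH as C0 = 0x01, C2 = 0x04
   and C3 = 0x40; fcom sets C0 for "less", C3 for "equal" and all of
   C0/C2/C3 for "unordered". The masks below therefore test combinations
   of those bits, e.g. 0x45 = C0|C2|C3, 0x40 = C3, 0x05 = C0|C2,
   0x04 = C2 and 0x01 = C0.  */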
18815 intcmp_mode = CCNOmode;
18816 switch (code)
18818 case GT:
18819 case UNGT:
18820 if (code == GT || !TARGET_IEEE_FP)
18822 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18823 code = EQ;
18825 else
18827 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18828 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18829 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18830 intcmp_mode = CCmode;
18831 code = GEU;
18833 break;
18834 case LT:
18835 case UNLT:
18836 if (code == LT && TARGET_IEEE_FP)
18838 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18839 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18840 intcmp_mode = CCmode;
18841 code = EQ;
18843 else
18845 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18846 code = NE;
18848 break;
18849 case GE:
18850 case UNGE:
18851 if (code == GE || !TARGET_IEEE_FP)
18853 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18854 code = EQ;
18856 else
18858 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18859 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18860 code = NE;
18862 break;
18863 case LE:
18864 case UNLE:
18865 if (code == LE && TARGET_IEEE_FP)
18867 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18868 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18869 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18870 intcmp_mode = CCmode;
18871 code = LTU;
18873 else
18875 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18876 code = NE;
18878 break;
18879 case EQ:
18880 case UNEQ:
18881 if (code == EQ && TARGET_IEEE_FP)
18883 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18884 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18885 intcmp_mode = CCmode;
18886 code = EQ;
18888 else
18890 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18891 code = NE;
18893 break;
18894 case NE:
18895 case LTGT:
18896 if (code == NE && TARGET_IEEE_FP)
18898 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18899 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18900 GEN_INT (0x40)));
18901 code = NE;
18903 else
18905 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18906 code = EQ;
18908 break;
18910 case UNORDERED:
18911 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18912 code = NE;
18913 break;
18914 case ORDERED:
18915 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18916 code = EQ;
18917 break;
18919 default:
18920 gcc_unreachable ();
18922 break;
18924 default:
18925 gcc_unreachable ();
18928 /* Return the test that should be put into the flags user, i.e.
18929 the bcc, scc, or cmov instruction. */
18930 return gen_rtx_fmt_ee (code, VOIDmode,
18931 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18932 const0_rtx);
18935 static rtx
18936 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18938 rtx ret;
18940 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18941 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18943 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18945 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18946 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18948 else
18949 ret = ix86_expand_int_compare (code, op0, op1);
18951 return ret;
18954 void
18955 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18957 enum machine_mode mode = GET_MODE (op0);
18958 rtx tmp;
18960 switch (mode)
18962 case SFmode:
18963 case DFmode:
18964 case XFmode:
18965 case QImode:
18966 case HImode:
18967 case SImode:
18968 simple:
18969 tmp = ix86_expand_compare (code, op0, op1);
18970 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18971 gen_rtx_LABEL_REF (VOIDmode, label),
18972 pc_rtx);
18973 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18974 return;
18976 case DImode:
18977 if (TARGET_64BIT)
18978 goto simple;
18979 case TImode:
18980 /* Expand DImode branch into multiple compare+branch. */
18982 rtx lo[2], hi[2], label2;
18983 enum rtx_code code1, code2, code3;
18984 enum machine_mode submode;
18986 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18988 tmp = op0, op0 = op1, op1 = tmp;
18989 code = swap_condition (code);
18992 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18993 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18995 submode = mode == DImode ? SImode : DImode;
18997 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18998 avoid two branches. This costs one extra insn, so disable when
18999 optimizing for size. */
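/* Rough sketch (added for illustration) of the resulting sequence for
   a DImode "a == b" on a 32-bit target:
     xorl  hi(b), hi(a)
     xorl  lo(b), lo(a)
     orl   hi(a), lo(a)
     jz    label
   i.e. a single branch instead of two.  */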
19001 if ((code == EQ || code == NE)
19002 && (!optimize_insn_for_size_p ()
19003 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19005 rtx xor0, xor1;
19007 xor1 = hi[0];
19008 if (hi[1] != const0_rtx)
19009 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19010 NULL_RTX, 0, OPTAB_WIDEN);
19012 xor0 = lo[0];
19013 if (lo[1] != const0_rtx)
19014 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19015 NULL_RTX, 0, OPTAB_WIDEN);
19017 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19018 NULL_RTX, 0, OPTAB_WIDEN);
19020 ix86_expand_branch (code, tmp, const0_rtx, label);
19021 return;
19024 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
19025 op1 is a constant and the low word is zero, then we can just
19026 examine the high word. Similarly for a low word of -1 and
19027 less-or-equal or greater-than. */
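/* For example (an added illustration): for "a < 0x500000000ULL" on a
   32-bit target the constant's low word is zero, so a single
   "cmpl $5, hi(a); jb label" decides the result; the low word of A
   cannot change it.  */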
19029 if (CONST_INT_P (hi[1]))
19030 switch (code)
19032 case LT: case LTU: case GE: case GEU:
19033 if (lo[1] == const0_rtx)
19035 ix86_expand_branch (code, hi[0], hi[1], label);
19036 return;
19038 break;
19039 case LE: case LEU: case GT: case GTU:
19040 if (lo[1] == constm1_rtx)
19042 ix86_expand_branch (code, hi[0], hi[1], label);
19043 return;
19045 break;
19046 default:
19047 break;
19050 /* Otherwise, we need two or three jumps. */
19052 label2 = gen_label_rtx ();
19054 code1 = code;
19055 code2 = swap_condition (code);
19056 code3 = unsigned_condition (code);
19058 switch (code)
19060 case LT: case GT: case LTU: case GTU:
19061 break;
19063 case LE: code1 = LT; code2 = GT; break;
19064 case GE: code1 = GT; code2 = LT; break;
19065 case LEU: code1 = LTU; code2 = GTU; break;
19066 case GEU: code1 = GTU; code2 = LTU; break;
19068 case EQ: code1 = UNKNOWN; code2 = NE; break;
19069 case NE: code2 = UNKNOWN; break;
19071 default:
19072 gcc_unreachable ();
19076 * a < b =>
19077 * if (hi(a) < hi(b)) goto true;
19078 * if (hi(a) > hi(b)) goto false;
19079 * if (lo(a) < lo(b)) goto true;
19080 * false:
19083 if (code1 != UNKNOWN)
19084 ix86_expand_branch (code1, hi[0], hi[1], label);
19085 if (code2 != UNKNOWN)
19086 ix86_expand_branch (code2, hi[0], hi[1], label2);
19088 ix86_expand_branch (code3, lo[0], lo[1], label);
19090 if (code2 != UNKNOWN)
19091 emit_label (label2);
19092 return;
19095 default:
19096 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19097 goto simple;
19101 /* Split branch based on floating point condition. */
19102 void
19103 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19104 rtx target1, rtx target2, rtx tmp, rtx pushed)
19106 rtx condition;
19107 rtx i;
19109 if (target2 != pc_rtx)
19111 rtx tmp = target2;
19112 code = reverse_condition_maybe_unordered (code);
19113 target2 = target1;
19114 target1 = tmp;
19117 condition = ix86_expand_fp_compare (code, op1, op2,
19118 tmp);
19120 /* Remove pushed operand from stack. */
19121 if (pushed)
19122 ix86_free_from_memory (GET_MODE (pushed));
19124 i = emit_jump_insn (gen_rtx_SET
19125 (VOIDmode, pc_rtx,
19126 gen_rtx_IF_THEN_ELSE (VOIDmode,
19127 condition, target1, target2)));
19128 if (split_branch_probability >= 0)
19129 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19132 void
19133 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19135 rtx ret;
19137 gcc_assert (GET_MODE (dest) == QImode);
19139 ret = ix86_expand_compare (code, op0, op1);
19140 PUT_MODE (ret, QImode);
19141 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19144 /* Expand a comparison setting or clearing the carry flag. Return true when
19145 successful, and set *POP to the compare operation. */
19146 static bool
19147 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19149 enum machine_mode mode =
19150 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19152 /* Do not handle double-mode compares that go through special path. */
19153 if (mode == (TARGET_64BIT ? TImode : DImode))
19154 return false;
19156 if (SCALAR_FLOAT_MODE_P (mode))
19158 rtx compare_op, compare_seq;
19160 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19162 /* Shortcut: following common codes never translate
19163 into carry flag compares. */
19164 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19165 || code == ORDERED || code == UNORDERED)
19166 return false;
19168 /* These comparisons require the zero flag; swap the operands so they no longer do. */
19169 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19170 && !TARGET_IEEE_FP)
19172 rtx tmp = op0;
19173 op0 = op1;
19174 op1 = tmp;
19175 code = swap_condition (code);
19178 /* Try to expand the comparison and verify that we end up with
19179 a carry-flag-based comparison. This fails only when we decide
19180 to expand the comparison using arithmetic, which is not a very
19181 common scenario. */
19182 start_sequence ();
19183 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19184 compare_seq = get_insns ();
19185 end_sequence ();
19187 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19188 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19189 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19190 else
19191 code = GET_CODE (compare_op);
19193 if (code != LTU && code != GEU)
19194 return false;
19196 emit_insn (compare_seq);
19197 *pop = compare_op;
19198 return true;
19201 if (!INTEGRAL_MODE_P (mode))
19202 return false;
19204 switch (code)
19206 case LTU:
19207 case GEU:
19208 break;
19210 /* Convert a==0 into (unsigned)a<1. */
19211 case EQ:
19212 case NE:
19213 if (op1 != const0_rtx)
19214 return false;
19215 op1 = const1_rtx;
19216 code = (code == EQ ? LTU : GEU);
19217 break;
19219 /* Convert a>b into b<a or a>=b-1. */
19220 case GTU:
19221 case LEU:
19222 if (CONST_INT_P (op1))
19224 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19225 /* Bail out on overflow. We can still swap the operands, but that
19226 would force loading the constant into a register. */
19227 if (op1 == const0_rtx
19228 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19229 return false;
19230 code = (code == GTU ? GEU : LTU);
19232 else
19234 rtx tmp = op1;
19235 op1 = op0;
19236 op0 = tmp;
19237 code = (code == GTU ? LTU : GEU);
19239 break;
19241 /* Convert a>=0 into (unsigned)a<0x80000000. */
19242 case LT:
19243 case GE:
19244 if (mode == DImode || op1 != const0_rtx)
19245 return false;
19246 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19247 code = (code == LT ? GEU : LTU);
19248 break;
19249 case LE:
19250 case GT:
19251 if (mode == DImode || op1 != constm1_rtx)
19252 return false;
19253 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19254 code = (code == LE ? GEU : LTU);
19255 break;
19257 default:
19258 return false;
19260 /* Swapping operands may cause a constant to appear as the first operand. */
19261 if (!nonimmediate_operand (op0, VOIDmode))
19263 if (!can_create_pseudo_p ())
19264 return false;
19265 op0 = force_reg (mode, op0);
19267 *pop = ix86_expand_compare (code, op0, op1);
19268 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19269 return true;
19272 bool
19273 ix86_expand_int_movcc (rtx operands[])
19275 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19276 rtx compare_seq, compare_op;
19277 enum machine_mode mode = GET_MODE (operands[0]);
19278 bool sign_bit_compare_p = false;
19279 rtx op0 = XEXP (operands[1], 0);
19280 rtx op1 = XEXP (operands[1], 1);
19282 if (GET_MODE (op0) == TImode
19283 || (GET_MODE (op0) == DImode
19284 && !TARGET_64BIT))
19285 return false;
19287 start_sequence ();
19288 compare_op = ix86_expand_compare (code, op0, op1);
19289 compare_seq = get_insns ();
19290 end_sequence ();
19292 compare_code = GET_CODE (compare_op);
19294 if ((op1 == const0_rtx && (code == GE || code == LT))
19295 || (op1 == constm1_rtx && (code == GT || code == LE)))
19296 sign_bit_compare_p = true;
19298 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19299 HImode insns, we'd be swallowed in word prefix ops. */
19301 if ((mode != HImode || TARGET_FAST_PREFIX)
19302 && (mode != (TARGET_64BIT ? TImode : DImode))
19303 && CONST_INT_P (operands[2])
19304 && CONST_INT_P (operands[3]))
19306 rtx out = operands[0];
19307 HOST_WIDE_INT ct = INTVAL (operands[2]);
19308 HOST_WIDE_INT cf = INTVAL (operands[3]);
19309 HOST_WIDE_INT diff;
19311 diff = ct - cf;
19312 /* Sign-bit compares are better done using shifts than by using
19313 sbb. */
19314 if (sign_bit_compare_p
19315 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19317 /* Detect overlap between destination and compare sources. */
19318 rtx tmp = out;
19320 if (!sign_bit_compare_p)
19322 rtx flags;
19323 bool fpcmp = false;
19325 compare_code = GET_CODE (compare_op);
19327 flags = XEXP (compare_op, 0);
19329 if (GET_MODE (flags) == CCFPmode
19330 || GET_MODE (flags) == CCFPUmode)
19332 fpcmp = true;
19333 compare_code
19334 = ix86_fp_compare_code_to_integer (compare_code);
19337 /* To simplify rest of code, restrict to the GEU case. */
19338 if (compare_code == LTU)
19340 HOST_WIDE_INT tmp = ct;
19341 ct = cf;
19342 cf = tmp;
19343 compare_code = reverse_condition (compare_code);
19344 code = reverse_condition (code);
19346 else
19348 if (fpcmp)
19349 PUT_CODE (compare_op,
19350 reverse_condition_maybe_unordered
19351 (GET_CODE (compare_op)));
19352 else
19353 PUT_CODE (compare_op,
19354 reverse_condition (GET_CODE (compare_op)));
19356 diff = ct - cf;
19358 if (reg_overlap_mentioned_p (out, op0)
19359 || reg_overlap_mentioned_p (out, op1))
19360 tmp = gen_reg_rtx (mode);
19362 if (mode == DImode)
19363 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19364 else
19365 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19366 flags, compare_op));
19368 else
19370 if (code == GT || code == GE)
19371 code = reverse_condition (code);
19372 else
19374 HOST_WIDE_INT tmp = ct;
19375 ct = cf;
19376 cf = tmp;
19377 diff = ct - cf;
19379 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19382 if (diff == 1)
19385 * cmpl op0,op1
19386 * sbbl dest,dest
19387 * [addl dest, ct]
19389 * Size 5 - 8.
19391 if (ct)
19392 tmp = expand_simple_binop (mode, PLUS,
19393 tmp, GEN_INT (ct),
19394 copy_rtx (tmp), 1, OPTAB_DIRECT);
19396 else if (cf == -1)
19399 * cmpl op0,op1
19400 * sbbl dest,dest
19401 * orl $ct, dest
19403 * Size 8.
19405 tmp = expand_simple_binop (mode, IOR,
19406 tmp, GEN_INT (ct),
19407 copy_rtx (tmp), 1, OPTAB_DIRECT);
19409 else if (diff == -1 && ct)
19412 * cmpl op0,op1
19413 * sbbl dest,dest
19414 * notl dest
19415 * [addl dest, cf]
19417 * Size 8 - 11.
19419 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19420 if (cf)
19421 tmp = expand_simple_binop (mode, PLUS,
19422 copy_rtx (tmp), GEN_INT (cf),
19423 copy_rtx (tmp), 1, OPTAB_DIRECT);
19425 else
19428 * cmpl op0,op1
19429 * sbbl dest,dest
19430 * [notl dest]
19431 * andl cf - ct, dest
19432 * [addl dest, ct]
19434 * Size 8 - 11.
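 *
 * Worked example (an added illustration): with ct = 5 and cf = 12 the
 * sbb/sar mask is 0 or -1; "andl $7, dest" then yields 0 or 7 and
 * "addl $5, dest" yields 5 or 12, selecting ct or cf without a branch.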
19437 if (cf == 0)
19439 cf = ct;
19440 ct = 0;
19441 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19444 tmp = expand_simple_binop (mode, AND,
19445 copy_rtx (tmp),
19446 gen_int_mode (cf - ct, mode),
19447 copy_rtx (tmp), 1, OPTAB_DIRECT);
19448 if (ct)
19449 tmp = expand_simple_binop (mode, PLUS,
19450 copy_rtx (tmp), GEN_INT (ct),
19451 copy_rtx (tmp), 1, OPTAB_DIRECT);
19454 if (!rtx_equal_p (tmp, out))
19455 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19457 return true;
19460 if (diff < 0)
19462 enum machine_mode cmp_mode = GET_MODE (op0);
19464 HOST_WIDE_INT tmp;
19465 tmp = ct, ct = cf, cf = tmp;
19466 diff = -diff;
19468 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19470 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19472 /* We may be reversing an unordered compare to a normal compare, which
19473 is not valid in general (we may convert a non-trapping condition
19474 into a trapping one); however, on i386 we currently emit all
19475 comparisons unordered. */
19476 compare_code = reverse_condition_maybe_unordered (compare_code);
19477 code = reverse_condition_maybe_unordered (code);
19479 else
19481 compare_code = reverse_condition (compare_code);
19482 code = reverse_condition (code);
19486 compare_code = UNKNOWN;
19487 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19488 && CONST_INT_P (op1))
19490 if (op1 == const0_rtx
19491 && (code == LT || code == GE))
19492 compare_code = code;
19493 else if (op1 == constm1_rtx)
19495 if (code == LE)
19496 compare_code = LT;
19497 else if (code == GT)
19498 compare_code = GE;
19502 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19503 if (compare_code != UNKNOWN
19504 && GET_MODE (op0) == GET_MODE (out)
19505 && (cf == -1 || ct == -1))
19507 /* If lea code below could be used, only optimize
19508 if it results in a 2 insn sequence. */
19510 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19511 || diff == 3 || diff == 5 || diff == 9)
19512 || (compare_code == LT && ct == -1)
19513 || (compare_code == GE && cf == -1))
19516 * notl op1 (if necessary)
19517 * sarl $31, op1
19518 * orl cf, op1
19520 if (ct != -1)
19522 cf = ct;
19523 ct = -1;
19524 code = reverse_condition (code);
19527 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19529 out = expand_simple_binop (mode, IOR,
19530 out, GEN_INT (cf),
19531 out, 1, OPTAB_DIRECT);
19532 if (out != operands[0])
19533 emit_move_insn (operands[0], out);
19535 return true;
19540 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19541 || diff == 3 || diff == 5 || diff == 9)
19542 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19543 && (mode != DImode
19544 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19547 * xorl dest,dest
19548 * cmpl op1,op2
19549 * setcc dest
19550 * lea cf(dest*(ct-cf)),dest
19552 * Size 14.
19554 * This also catches the degenerate setcc-only case.
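 *
 * Illustration (an added note): with ct = 7 and cf = 3, diff = 4, so
 * after setcc leaves 0 or 1 in dest, "leal 3(,dest,4), dest" produces
 * 3 or 7 in a single instruction.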
19557 rtx tmp;
19558 int nops;
19560 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19562 nops = 0;
19563 /* On x86_64 the lea instruction operates on Pmode, so we need
19564 the arithmetic done in the proper mode to match. */
19565 if (diff == 1)
19566 tmp = copy_rtx (out);
19567 else
19569 rtx out1;
19570 out1 = copy_rtx (out);
19571 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19572 nops++;
19573 if (diff & 1)
19575 tmp = gen_rtx_PLUS (mode, tmp, out1);
19576 nops++;
19579 if (cf != 0)
19581 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19582 nops++;
19584 if (!rtx_equal_p (tmp, out))
19586 if (nops == 1)
19587 out = force_operand (tmp, copy_rtx (out));
19588 else
19589 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19591 if (!rtx_equal_p (out, operands[0]))
19592 emit_move_insn (operands[0], copy_rtx (out));
19594 return true;
19598 * General case: Jumpful:
19599 * xorl dest,dest cmpl op1, op2
19600 * cmpl op1, op2 movl ct, dest
19601 * setcc dest jcc 1f
19602 * decl dest movl cf, dest
19603 * andl (cf-ct),dest 1:
19604 * addl ct,dest
19606 * Size 20. Size 14.
19608 * This is reasonably steep, but branch mispredict costs are
19609 * high on modern cpus, so consider failing only if optimizing
19610 * for space.
19613 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19614 && BRANCH_COST (optimize_insn_for_speed_p (),
19615 false) >= 2)
19617 if (cf == 0)
19619 enum machine_mode cmp_mode = GET_MODE (op0);
19621 cf = ct;
19622 ct = 0;
19624 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19626 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19628 /* We may be reversing an unordered compare to a normal compare,
19629 which is not valid in general (we may convert a non-trapping
19630 condition into a trapping one); however, on i386 we currently
19631 emit all comparisons unordered. */
19632 code = reverse_condition_maybe_unordered (code);
19634 else
19636 code = reverse_condition (code);
19637 if (compare_code != UNKNOWN)
19638 compare_code = reverse_condition (compare_code);
19642 if (compare_code != UNKNOWN)
19644 /* notl op1 (if needed)
19645 sarl $31, op1
19646 andl (cf-ct), op1
19647 addl ct, op1
19649 For x < 0 (resp. x <= -1) there will be no notl,
19650 so if possible swap the constants to get rid of the
19651 complement.
19652 True/false will be -1/0 while code below (store flag
19653 followed by decrement) is 0/-1, so the constants need
19654 to be exchanged once more. */
19656 if (compare_code == GE || !cf)
19658 code = reverse_condition (code);
19659 compare_code = LT;
19661 else
19663 HOST_WIDE_INT tmp = cf;
19664 cf = ct;
19665 ct = tmp;
19668 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19670 else
19672 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19674 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19675 constm1_rtx,
19676 copy_rtx (out), 1, OPTAB_DIRECT);
19679 out = expand_simple_binop (mode, AND, copy_rtx (out),
19680 gen_int_mode (cf - ct, mode),
19681 copy_rtx (out), 1, OPTAB_DIRECT);
19682 if (ct)
19683 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19684 copy_rtx (out), 1, OPTAB_DIRECT);
19685 if (!rtx_equal_p (out, operands[0]))
19686 emit_move_insn (operands[0], copy_rtx (out));
19688 return true;
19692 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19694 /* Try a few things more with specific constants and a variable. */
19696 optab op;
19697 rtx var, orig_out, out, tmp;
19699 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19700 return false;
19702 /* If one of the two operands is an interesting constant, load a 0/-1
19703 constant using the code above and mask the variable in with a logical operation. */
19705 if (CONST_INT_P (operands[2]))
19707 var = operands[3];
19708 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19709 operands[3] = constm1_rtx, op = and_optab;
19710 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19711 operands[3] = const0_rtx, op = ior_optab;
19712 else
19713 return false;
19715 else if (CONST_INT_P (operands[3]))
19717 var = operands[2];
19718 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19719 operands[2] = constm1_rtx, op = and_optab;
19720 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19721 operands[2] = const0_rtx, op = ior_optab;
19722 else
19723 return false;
19725 else
19726 return false;
19728 orig_out = operands[0];
19729 tmp = gen_reg_rtx (mode);
19730 operands[0] = tmp;
19732 /* Recurse to get the constant loaded. */
19733 if (ix86_expand_int_movcc (operands) == 0)
19734 return false;
19736 /* Mask in the interesting variable. */
19737 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19738 OPTAB_WIDEN);
19739 if (!rtx_equal_p (out, orig_out))
19740 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19742 return true;
19746 * For comparison with above,
19748 * movl cf,dest
19749 * movl ct,tmp
19750 * cmpl op1,op2
19751 * cmovcc tmp,dest
19753 * Size 15.
19756 if (! nonimmediate_operand (operands[2], mode))
19757 operands[2] = force_reg (mode, operands[2]);
19758 if (! nonimmediate_operand (operands[3], mode))
19759 operands[3] = force_reg (mode, operands[3]);
19761 if (! register_operand (operands[2], VOIDmode)
19762 && (mode == QImode
19763 || ! register_operand (operands[3], VOIDmode)))
19764 operands[2] = force_reg (mode, operands[2]);
19766 if (mode == QImode
19767 && ! register_operand (operands[3], VOIDmode))
19768 operands[3] = force_reg (mode, operands[3]);
19770 emit_insn (compare_seq);
19771 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19772 gen_rtx_IF_THEN_ELSE (mode,
19773 compare_op, operands[2],
19774 operands[3])));
19775 return true;
19778 /* Swap, force into registers, or otherwise massage the two operands
19779 to an sse comparison with a mask result. Thus we differ a bit from
19780 ix86_prepare_fp_compare_args which expects to produce a flags result.
19782 The DEST operand exists to help determine whether to commute commutative
19783 operators. The POP0/POP1 operands are updated in place. The new
19784 comparison code is returned, or UNKNOWN if not implementable. */
19786 static enum rtx_code
19787 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19788 rtx *pop0, rtx *pop1)
19790 rtx tmp;
19792 switch (code)
19794 case LTGT:
19795 case UNEQ:
19796 /* AVX supports all the needed comparisons. */
19797 if (TARGET_AVX)
19798 break;
19799 /* We have no LTGT as an operator. We could implement it with
19800 NE & ORDERED, but this requires an extra temporary. It's
19801 not clear that it's worth it. */
19802 return UNKNOWN;
19804 case LT:
19805 case LE:
19806 case UNGT:
19807 case UNGE:
19808 /* These are supported directly. */
19809 break;
19811 case EQ:
19812 case NE:
19813 case UNORDERED:
19814 case ORDERED:
19815 /* AVX has 3 operand comparisons, no need to swap anything. */
19816 if (TARGET_AVX)
19817 break;
19818 /* For commutative operators, try to canonicalize the destination
19819 operand to be first in the comparison - this helps reload to
19820 avoid extra moves. */
19821 if (!dest || !rtx_equal_p (dest, *pop1))
19822 break;
19823 /* FALLTHRU */
19825 case GE:
19826 case GT:
19827 case UNLE:
19828 case UNLT:
19829 /* These are not supported directly before AVX, and furthermore
19830 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19831 comparison operands to transform into something that is
19832 supported. */
19833 tmp = *pop0;
19834 *pop0 = *pop1;
19835 *pop1 = tmp;
19836 code = swap_condition (code);
19837 break;
19839 default:
19840 gcc_unreachable ();
19843 return code;
19846 /* Detect conditional moves that exactly match min/max operational
19847 semantics. Note that this is IEEE safe, as long as we don't
19848 interchange the operands.
19850 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19851 and TRUE if the operation is successful and instructions are emitted. */
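/* For instance (an added summary of the code below): "a < b ? a : b"
   matches a MIN and "a < b ? b : a" matches a MAX.  When NaNs or signed
   zeros must be honoured, the UNSPEC_IEEE_MIN/MAX path is used instead
   of plain SMIN/SMAX so that operand order is preserved.  */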
19853 static bool
19854 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19855 rtx cmp_op1, rtx if_true, rtx if_false)
19857 enum machine_mode mode;
19858 bool is_min;
19859 rtx tmp;
19861 if (code == LT)
19863 else if (code == UNGE)
19865 tmp = if_true;
19866 if_true = if_false;
19867 if_false = tmp;
19869 else
19870 return false;
19872 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19873 is_min = true;
19874 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19875 is_min = false;
19876 else
19877 return false;
19879 mode = GET_MODE (dest);
19881 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19882 but MODE may be a vector mode and thus not appropriate. */
19883 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19885 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19886 rtvec v;
19888 if_true = force_reg (mode, if_true);
19889 v = gen_rtvec (2, if_true, if_false);
19890 tmp = gen_rtx_UNSPEC (mode, v, u);
19892 else
19894 code = is_min ? SMIN : SMAX;
19895 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19898 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19899 return true;
19902 /* Expand an sse vector comparison. Return the register with the result. */
19904 static rtx
19905 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19906 rtx op_true, rtx op_false)
19908 enum machine_mode mode = GET_MODE (dest);
19909 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19910 rtx x;
19912 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19913 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19914 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19916 if (optimize
19917 || reg_overlap_mentioned_p (dest, op_true)
19918 || reg_overlap_mentioned_p (dest, op_false))
19919 dest = gen_reg_rtx (mode);
19921 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19922 if (cmp_mode != mode)
19924 x = force_reg (cmp_mode, x);
19925 convert_move (dest, x, false);
19927 else
19928 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19930 return dest;
19933 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19934 operations. This is used for both scalar and vector conditional moves. */
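/* When no blend instruction applies, the fallback below computes the
   classic mask select (an added summary):
     DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE)
   which relies on CMP being all-ones or all-zeros in each element.  */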
19936 static void
19937 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19939 enum machine_mode mode = GET_MODE (dest);
19940 rtx t2, t3, x;
19942 if (vector_all_ones_operand (op_true, mode)
19943 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19945 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19947 else if (op_false == CONST0_RTX (mode))
19949 op_true = force_reg (mode, op_true);
19950 x = gen_rtx_AND (mode, cmp, op_true);
19951 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19953 else if (op_true == CONST0_RTX (mode))
19955 op_false = force_reg (mode, op_false);
19956 x = gen_rtx_NOT (mode, cmp);
19957 x = gen_rtx_AND (mode, x, op_false);
19958 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19960 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19962 op_false = force_reg (mode, op_false);
19963 x = gen_rtx_IOR (mode, cmp, op_false);
19964 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19966 else if (TARGET_XOP)
19968 op_true = force_reg (mode, op_true);
19970 if (!nonimmediate_operand (op_false, mode))
19971 op_false = force_reg (mode, op_false);
19973 emit_insn (gen_rtx_SET (mode, dest,
19974 gen_rtx_IF_THEN_ELSE (mode, cmp,
19975 op_true,
19976 op_false)));
19978 else
19980 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19982 if (!nonimmediate_operand (op_true, mode))
19983 op_true = force_reg (mode, op_true);
19985 op_false = force_reg (mode, op_false);
19987 switch (mode)
19989 case V4SFmode:
19990 if (TARGET_SSE4_1)
19991 gen = gen_sse4_1_blendvps;
19992 break;
19993 case V2DFmode:
19994 if (TARGET_SSE4_1)
19995 gen = gen_sse4_1_blendvpd;
19996 break;
19997 case V16QImode:
19998 case V8HImode:
19999 case V4SImode:
20000 case V2DImode:
20001 if (TARGET_SSE4_1)
20003 gen = gen_sse4_1_pblendvb;
20004 dest = gen_lowpart (V16QImode, dest);
20005 op_false = gen_lowpart (V16QImode, op_false);
20006 op_true = gen_lowpart (V16QImode, op_true);
20007 cmp = gen_lowpart (V16QImode, cmp);
20009 break;
20010 case V8SFmode:
20011 if (TARGET_AVX)
20012 gen = gen_avx_blendvps256;
20013 break;
20014 case V4DFmode:
20015 if (TARGET_AVX)
20016 gen = gen_avx_blendvpd256;
20017 break;
20018 case V32QImode:
20019 case V16HImode:
20020 case V8SImode:
20021 case V4DImode:
20022 if (TARGET_AVX2)
20024 gen = gen_avx2_pblendvb;
20025 dest = gen_lowpart (V32QImode, dest);
20026 op_false = gen_lowpart (V32QImode, op_false);
20027 op_true = gen_lowpart (V32QImode, op_true);
20028 cmp = gen_lowpart (V32QImode, cmp);
20030 break;
20031 default:
20032 break;
20035 if (gen != NULL)
20036 emit_insn (gen (dest, op_false, op_true, cmp));
20037 else
20039 op_true = force_reg (mode, op_true);
20041 t2 = gen_reg_rtx (mode);
20042 if (optimize)
20043 t3 = gen_reg_rtx (mode);
20044 else
20045 t3 = dest;
20047 x = gen_rtx_AND (mode, op_true, cmp);
20048 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20050 x = gen_rtx_NOT (mode, cmp);
20051 x = gen_rtx_AND (mode, x, op_false);
20052 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20054 x = gen_rtx_IOR (mode, t3, t2);
20055 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20060 /* Expand a floating-point conditional move. Return true if successful. */
20062 bool
20063 ix86_expand_fp_movcc (rtx operands[])
20065 enum machine_mode mode = GET_MODE (operands[0]);
20066 enum rtx_code code = GET_CODE (operands[1]);
20067 rtx tmp, compare_op;
20068 rtx op0 = XEXP (operands[1], 0);
20069 rtx op1 = XEXP (operands[1], 1);
20071 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20073 enum machine_mode cmode;
20075 /* Since we've no cmove for sse registers, don't force bad register
20076 allocation just to gain access to it. Deny movcc when the
20077 comparison mode doesn't match the move mode. */
20078 cmode = GET_MODE (op0);
20079 if (cmode == VOIDmode)
20080 cmode = GET_MODE (op1);
20081 if (cmode != mode)
20082 return false;
20084 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20085 if (code == UNKNOWN)
20086 return false;
20088 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20089 operands[2], operands[3]))
20090 return true;
20092 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20093 operands[2], operands[3]);
20094 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20095 return true;
20098 if (GET_MODE (op0) == TImode
20099 || (GET_MODE (op0) == DImode
20100 && !TARGET_64BIT))
20101 return false;
20103 /* The floating point conditional move instructions don't directly
20104 support conditions resulting from a signed integer comparison. */
20106 compare_op = ix86_expand_compare (code, op0, op1);
20107 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20109 tmp = gen_reg_rtx (QImode);
20110 ix86_expand_setcc (tmp, code, op0, op1);
20112 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20115 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20116 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20117 operands[2], operands[3])));
20119 return true;
20122 /* Expand a floating-point vector conditional move; a vcond operation
20123 rather than a movcc operation. */
20125 bool
20126 ix86_expand_fp_vcond (rtx operands[])
20128 enum rtx_code code = GET_CODE (operands[3]);
20129 rtx cmp;
20131 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20132 &operands[4], &operands[5]);
20133 if (code == UNKNOWN)
20135 rtx temp;
20136 switch (GET_CODE (operands[3]))
20138 case LTGT:
20139 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20140 operands[5], operands[0], operands[0]);
20141 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20142 operands[5], operands[1], operands[2]);
20143 code = AND;
20144 break;
20145 case UNEQ:
20146 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20147 operands[5], operands[0], operands[0]);
20148 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20149 operands[5], operands[1], operands[2]);
20150 code = IOR;
20151 break;
20152 default:
20153 gcc_unreachable ();
20155 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20156 OPTAB_DIRECT);
20157 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20158 return true;
20161 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20162 operands[5], operands[1], operands[2]))
20163 return true;
20165 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20166 operands[1], operands[2]);
20167 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20168 return true;
20171 /* Expand a signed/unsigned integral vector conditional move. */
20173 bool
20174 ix86_expand_int_vcond (rtx operands[])
20176 enum machine_mode data_mode = GET_MODE (operands[0]);
20177 enum machine_mode mode = GET_MODE (operands[4]);
20178 enum rtx_code code = GET_CODE (operands[3]);
20179 bool negate = false;
20180 rtx x, cop0, cop1;
20182 cop0 = operands[4];
20183 cop1 = operands[5];
20185 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20186 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
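/* E.g. for V4SImode (an added illustration): "x < 0 ? -1 : 0" becomes
   "psrad $31, x" (arithmetic shift) and "x < 0 ? 1 : 0" becomes
   "psrld $31, x" (logical shift).  */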
20187 if ((code == LT || code == GE)
20188 && data_mode == mode
20189 && cop1 == CONST0_RTX (mode)
20190 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20191 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20192 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20193 && (GET_MODE_SIZE (data_mode) == 16
20194 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20196 rtx negop = operands[2 - (code == LT)];
20197 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20198 if (negop == CONST1_RTX (data_mode))
20200 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20201 operands[0], 1, OPTAB_DIRECT);
20202 if (res != operands[0])
20203 emit_move_insn (operands[0], res);
20204 return true;
20206 else if (GET_MODE_INNER (data_mode) != DImode
20207 && vector_all_ones_operand (negop, data_mode))
20209 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20210 operands[0], 0, OPTAB_DIRECT);
20211 if (res != operands[0])
20212 emit_move_insn (operands[0], res);
20213 return true;
20217 if (!nonimmediate_operand (cop1, mode))
20218 cop1 = force_reg (mode, cop1);
20219 if (!general_operand (operands[1], data_mode))
20220 operands[1] = force_reg (data_mode, operands[1]);
20221 if (!general_operand (operands[2], data_mode))
20222 operands[2] = force_reg (data_mode, operands[2]);
20224 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20225 if (TARGET_XOP
20226 && (mode == V16QImode || mode == V8HImode
20227 || mode == V4SImode || mode == V2DImode))
20229 else
20231 /* Canonicalize the comparison to EQ, GT, GTU. */
20232 switch (code)
20234 case EQ:
20235 case GT:
20236 case GTU:
20237 break;
20239 case NE:
20240 case LE:
20241 case LEU:
20242 code = reverse_condition (code);
20243 negate = true;
20244 break;
20246 case GE:
20247 case GEU:
20248 code = reverse_condition (code);
20249 negate = true;
20250 /* FALLTHRU */
20252 case LT:
20253 case LTU:
20254 code = swap_condition (code);
20255 x = cop0, cop0 = cop1, cop1 = x;
20256 break;
20258 default:
20259 gcc_unreachable ();
20262 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20263 if (mode == V2DImode)
20265 switch (code)
20267 case EQ:
20268 /* SSE4.1 supports EQ. */
20269 if (!TARGET_SSE4_1)
20270 return false;
20271 break;
20273 case GT:
20274 case GTU:
20275 /* SSE4.2 supports GT/GTU. */
20276 if (!TARGET_SSE4_2)
20277 return false;
20278 break;
20280 default:
20281 gcc_unreachable ();
20285 /* Unsigned parallel compare is not supported by the hardware.
20286 Play some tricks to turn this into a signed comparison
20287 against 0. */
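/* Sketch of the biasing trick used for SImode/DImode elements below
   (an added note): a >u b iff (a - 0x80000000) >s (b - 0x80000000),
   with two's-complement wraparound.  E.g. a = 0xffffffff, b = 1 gives
   biased values 0x7fffffff and 0x80000001, and the signed compare
   correctly reports "greater".  */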
20288 if (code == GTU)
20290 cop0 = force_reg (mode, cop0);
20292 switch (mode)
20294 case V8SImode:
20295 case V4DImode:
20296 case V4SImode:
20297 case V2DImode:
20299 rtx t1, t2, mask;
20300 rtx (*gen_sub3) (rtx, rtx, rtx);
20302 switch (mode)
20304 case V8SImode: gen_sub3 = gen_subv8si3; break;
20305 case V4DImode: gen_sub3 = gen_subv4di3; break;
20306 case V4SImode: gen_sub3 = gen_subv4si3; break;
20307 case V2DImode: gen_sub3 = gen_subv2di3; break;
20308 default:
20309 gcc_unreachable ();
20311 /* Subtract (-(INT MAX) - 1), i.e. INT_MIN, from both operands to
20312 turn the unsigned comparison into a signed one. */
20313 mask = ix86_build_signbit_mask (mode, true, false);
20314 t1 = gen_reg_rtx (mode);
20315 emit_insn (gen_sub3 (t1, cop0, mask));
20317 t2 = gen_reg_rtx (mode);
20318 emit_insn (gen_sub3 (t2, cop1, mask));
20320 cop0 = t1;
20321 cop1 = t2;
20322 code = GT;
20324 break;
20326 case V32QImode:
20327 case V16HImode:
20328 case V16QImode:
20329 case V8HImode:
20330 /* Perform a parallel unsigned saturating subtraction. */
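/* E.g. for unsigned bytes (an added illustration): a >u b iff the
   saturating difference a -us b is non-zero, so 9,5 gives 4 (greater)
   while 5,9 gives 0 (not greater); the EQ-against-zero compare plus
   the flipped NEGATE below implement exactly that.  */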
20331 x = gen_reg_rtx (mode);
20332 emit_insn (gen_rtx_SET (VOIDmode, x,
20333 gen_rtx_US_MINUS (mode, cop0, cop1)));
20335 cop0 = x;
20336 cop1 = CONST0_RTX (mode);
20337 code = EQ;
20338 negate = !negate;
20339 break;
20341 default:
20342 gcc_unreachable ();
20347 /* Allow the comparison to be done in one mode, but the movcc to
20348 happen in another mode. */
20349 if (data_mode == mode)
20351 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20352 operands[1+negate], operands[2-negate]);
20354 else
20356 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20357 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20358 code, cop0, cop1,
20359 operands[1+negate], operands[2-negate]);
20360 x = gen_lowpart (data_mode, x);
20363 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20364 operands[2-negate]);
20365 return true;
20368 /* Expand a variable vector permutation. */
20370 void
20371 ix86_expand_vec_perm (rtx operands[])
20373 rtx target = operands[0];
20374 rtx op0 = operands[1];
20375 rtx op1 = operands[2];
20376 rtx mask = operands[3];
20377 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20378 enum machine_mode mode = GET_MODE (op0);
20379 enum machine_mode maskmode = GET_MODE (mask);
20380 int w, e, i;
20381 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20383 /* Number of elements in the vector. */
20384 w = GET_MODE_NUNITS (mode);
20385 e = GET_MODE_UNIT_SIZE (mode);
20386 gcc_assert (w <= 32);
20388 if (TARGET_AVX2)
20390 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20392 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20393 a constant shuffle operand. With a tiny bit of effort we can
20394 use VPERMD instead. A re-interpretation stall for V4DFmode is
20395 unfortunate but there's no avoiding it.
20396 Similarly, for V16HImode we don't have instructions for variable
20397 shuffling, while for V32QImode we can use vpshufb; vpshufb;
20398 vpermq; vpor after preparing suitable masks. */
20400 if (mode == V16HImode)
20402 maskmode = mode = V32QImode;
20403 w = 32;
20404 e = 1;
20406 else
20408 maskmode = mode = V8SImode;
20409 w = 8;
20410 e = 4;
20412 t1 = gen_reg_rtx (maskmode);
20414 /* Replicate the low bits of the V4DImode mask into V8SImode:
20415 mask = { A B C D }
20416 t1 = { A A B B C C D D }. */
20417 for (i = 0; i < w / 2; ++i)
20418 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20419 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20420 vt = force_reg (maskmode, vt);
20421 mask = gen_lowpart (maskmode, mask);
20422 if (maskmode == V8SImode)
20423 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20424 else
20425 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20427 /* Multiply the shuffle indices by two. */
20428 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20429 OPTAB_DIRECT);
20431 /* Add one to the odd shuffle indices:
20432 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20433 for (i = 0; i < w / 2; ++i)
20435 vec[i * 2] = const0_rtx;
20436 vec[i * 2 + 1] = const1_rtx;
20438 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20439 vt = validize_mem (force_const_mem (maskmode, vt));
20440 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20441 OPTAB_DIRECT);
20443 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20444 operands[3] = mask = t1;
20445 target = gen_lowpart (mode, target);
20446 op0 = gen_lowpart (mode, op0);
20447 op1 = gen_lowpart (mode, op1);
20450 switch (mode)
20452 case V8SImode:
20453 /* The VPERMD and VPERMPS instructions already properly ignore
20454 the high bits of the shuffle elements. No need for us to
20455 perform an AND ourselves. */
20456 if (one_operand_shuffle)
20457 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20458 else
20460 t1 = gen_reg_rtx (V8SImode);
20461 t2 = gen_reg_rtx (V8SImode);
20462 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20463 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20464 goto merge_two;
20466 return;
20468 case V8SFmode:
20469 mask = gen_lowpart (V8SFmode, mask);
20470 if (one_operand_shuffle)
20471 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20472 else
20474 t1 = gen_reg_rtx (V8SFmode);
20475 t2 = gen_reg_rtx (V8SFmode);
20476 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20477 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20478 goto merge_two;
20480 return;
20482 case V4SImode:
20483 /* By combining the two 128-bit input vectors into one 256-bit
20484 input vector, we can use VPERMD and VPERMPS for the full
20485 two-operand shuffle. */
20486 t1 = gen_reg_rtx (V8SImode);
20487 t2 = gen_reg_rtx (V8SImode);
20488 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20489 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20490 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20491 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20492 return;
20494 case V4SFmode:
20495 t1 = gen_reg_rtx (V8SFmode);
20496 t2 = gen_reg_rtx (V8SImode);
20497 mask = gen_lowpart (V4SImode, mask);
20498 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20499 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20500 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20501 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20502 return;
20504 case V32QImode:
20505 t1 = gen_reg_rtx (V32QImode);
20506 t2 = gen_reg_rtx (V32QImode);
20507 t3 = gen_reg_rtx (V32QImode);
20508 vt2 = GEN_INT (128);
20509 for (i = 0; i < 32; i++)
20510 vec[i] = vt2;
20511 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20512 vt = force_reg (V32QImode, vt);
20513 for (i = 0; i < 32; i++)
20514 vec[i] = i < 16 ? vt2 : const0_rtx;
20515 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20516 vt2 = force_reg (V32QImode, vt2);
20517 /* From mask create two adjusted masks, which contain the same
20518 bits as mask in the low 7 bits of each vector element.
20519 The first mask will have the most significant bit clear
20520 if it requests element from the same 128-bit lane
20521 and MSB set if it requests element from the other 128-bit lane.
20522 The second mask will have the opposite values of the MSB,
20523 and additionally will have its 128-bit lanes swapped.
20524 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20525 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20526 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20527 stands for other 12 bytes. */
20528 /* The bit that says whether an element comes from the same lane or the
20529 other lane is bit 4, so shift it up by 3 to the MSB position. */
20530 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20531 gen_lowpart (V4DImode, mask),
20532 GEN_INT (3)));
20533 /* Clear MSB bits from the mask just in case it had them set. */
20534 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20535 /* After this t1 will have MSB set for elements from other lane. */
20536 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20537 /* Clear bits other than MSB. */
20538 emit_insn (gen_andv32qi3 (t1, t1, vt));
20539 /* Or in the lower bits from mask into t3. */
20540 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20541 /* And invert MSB bits in t1, so MSB is set for elements from the same
20542 lane. */
20543 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20544 /* Swap 128-bit lanes in t3. */
20545 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20546 gen_lowpart (V4DImode, t3),
20547 const2_rtx, GEN_INT (3),
20548 const0_rtx, const1_rtx));
20549 /* And or in the lower bits from mask into t1. */
20550 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20551 if (one_operand_shuffle)
20553 /* Each of these shuffles will put 0s in places where
20554 element from the other 128-bit lane is needed, otherwise
20555 will shuffle in the requested value. */
20556 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20557 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20558 /* For t3 the 128-bit lanes are swapped again. */
20559 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20560 gen_lowpart (V4DImode, t3),
20561 const2_rtx, GEN_INT (3),
20562 const0_rtx, const1_rtx));
20563 /* And oring both together leads to the result. */
20564 emit_insn (gen_iorv32qi3 (target, t1, t3));
20565 return;
20568 t4 = gen_reg_rtx (V32QImode);
20569 /* Similar to the above one_operand_shuffle code,
20570 just repeated twice, once for each operand. The merge_two:
20571 code will merge the two results together. */
20572 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20573 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20574 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20575 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20576 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20577 gen_lowpart (V4DImode, t4),
20578 const2_rtx, GEN_INT (3),
20579 const0_rtx, const1_rtx));
20580 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20581 gen_lowpart (V4DImode, t3),
20582 const2_rtx, GEN_INT (3),
20583 const0_rtx, const1_rtx));
20584 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20585 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20586 t1 = t4;
20587 t2 = t3;
20588 goto merge_two;
20590 default:
20591 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20592 break;
20596 if (TARGET_XOP)
20598 /* The XOP VPPERM insn supports three inputs. By ignoring the
20599 one_operand_shuffle special case, we avoid creating another
20600 set of constant vectors in memory. */
20601 one_operand_shuffle = false;
20603 /* mask = mask & {2*w-1, ...} */
20604 vt = GEN_INT (2*w - 1);
20606 else
20608 /* mask = mask & {w-1, ...} */
20609 vt = GEN_INT (w - 1);
20612 for (i = 0; i < w; i++)
20613 vec[i] = vt;
20614 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20615 mask = expand_simple_binop (maskmode, AND, mask, vt,
20616 NULL_RTX, 0, OPTAB_DIRECT);
20618 /* For non-QImode operations, convert the word permutation control
20619 into a byte permutation control. */
20620 if (mode != V16QImode)
20622 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20623 GEN_INT (exact_log2 (e)),
20624 NULL_RTX, 0, OPTAB_DIRECT);
20626 /* Convert mask to vector of chars. */
20627 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20629 /* Replicate each of the input bytes into byte positions:
20630 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20631 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20632 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20633 for (i = 0; i < 16; ++i)
20634 vec[i] = GEN_INT (i/e * e);
20635 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20636 vt = validize_mem (force_const_mem (V16QImode, vt));
20637 if (TARGET_XOP)
20638 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20639 else
20640 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20642 /* Convert it into the byte positions by doing
20643 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20644 for (i = 0; i < 16; ++i)
20645 vec[i] = GEN_INT (i % e);
20646 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20647 vt = validize_mem (force_const_mem (V16QImode, vt));
20648 emit_insn (gen_addv16qi3 (mask, mask, vt));
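/* Net effect of the steps above (an added summary): word index k in the
   original mask has become the byte indices { k*e, k*e+1, ..., k*e+e-1 }
   in the V16QImode control, which is what pshufb expects.  */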
20651 /* The actual shuffle operations all operate on V16QImode. */
20652 op0 = gen_lowpart (V16QImode, op0);
20653 op1 = gen_lowpart (V16QImode, op1);
20654 target = gen_lowpart (V16QImode, target);
20656 if (TARGET_XOP)
20658 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20660 else if (one_operand_shuffle)
20662 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20664 else
20666 rtx xops[6];
20667 bool ok;
20669 /* Shuffle the two input vectors independently. */
20670 t1 = gen_reg_rtx (V16QImode);
20671 t2 = gen_reg_rtx (V16QImode);
20672 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20673 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20675 merge_two:
20676 /* Then merge them together. The key is whether any given control
20677 element contained a bit set that indicates the second word. */
20678 mask = operands[3];
20679 vt = GEN_INT (w);
20680 if (maskmode == V2DImode && !TARGET_SSE4_1)
20682 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20683 more shuffle to convert the V2DI input mask into a V4SI
20684 input mask, at which point the masking done by expand_int_vcond
20685 will work as desired. */
20686 rtx t3 = gen_reg_rtx (V4SImode);
20687 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20688 const0_rtx, const0_rtx,
20689 const2_rtx, const2_rtx));
20690 mask = t3;
20691 maskmode = V4SImode;
20692 e = w = 4;
20695 for (i = 0; i < w; i++)
20696 vec[i] = vt;
20697 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20698 vt = force_reg (maskmode, vt);
20699 mask = expand_simple_binop (maskmode, AND, mask, vt,
20700 NULL_RTX, 0, OPTAB_DIRECT);
20702 xops[0] = gen_lowpart (mode, operands[0]);
20703 xops[1] = gen_lowpart (mode, t2);
20704 xops[2] = gen_lowpart (mode, t1);
20705 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20706 xops[4] = mask;
20707 xops[5] = vt;
20708 ok = ix86_expand_int_vcond (xops);
20709 gcc_assert (ok);
20713 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
20714 true if we should do zero extension, else sign extension. HIGH_P is
20715 true if we want the N/2 high elements, else the low elements. */
20717 void
20718 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20720 enum machine_mode imode = GET_MODE (src);
20721 rtx tmp;
20723 if (TARGET_SSE4_1)
20725 rtx (*unpack)(rtx, rtx);
20726 rtx (*extract)(rtx, rtx) = NULL;
20727 enum machine_mode halfmode = BLKmode;
20729 switch (imode)
20731 case V32QImode:
20732 if (unsigned_p)
20733 unpack = gen_avx2_zero_extendv16qiv16hi2;
20734 else
20735 unpack = gen_avx2_sign_extendv16qiv16hi2;
20736 halfmode = V16QImode;
20737 extract
20738 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20739 break;
20740 case V16HImode:
20741 if (unsigned_p)
20742 unpack = gen_avx2_zero_extendv8hiv8si2;
20743 else
20744 unpack = gen_avx2_sign_extendv8hiv8si2;
20745 halfmode = V8HImode;
20746 extract
20747 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20748 break;
20749 case V8SImode:
20750 if (unsigned_p)
20751 unpack = gen_avx2_zero_extendv4siv4di2;
20752 else
20753 unpack = gen_avx2_sign_extendv4siv4di2;
20754 halfmode = V4SImode;
20755 extract
20756 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20757 break;
20758 case V16QImode:
20759 if (unsigned_p)
20760 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20761 else
20762 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20763 break;
20764 case V8HImode:
20765 if (unsigned_p)
20766 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20767 else
20768 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20769 break;
20770 case V4SImode:
20771 if (unsigned_p)
20772 unpack = gen_sse4_1_zero_extendv2siv2di2;
20773 else
20774 unpack = gen_sse4_1_sign_extendv2siv2di2;
20775 break;
20776 default:
20777 gcc_unreachable ();
20780 if (GET_MODE_SIZE (imode) == 32)
20782 tmp = gen_reg_rtx (halfmode);
20783 emit_insn (extract (tmp, src));
20785 else if (high_p)
20787 /* Shift higher 8 bytes to lower 8 bytes. */
20788 tmp = gen_reg_rtx (imode);
20789 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20790 gen_lowpart (V1TImode, src),
20791 GEN_INT (64)));
20793 else
20794 tmp = src;
20796 emit_insn (unpack (dest, tmp));
20798 else
20800 rtx (*unpack)(rtx, rtx, rtx);
20802 switch (imode)
20804 case V16QImode:
20805 if (high_p)
20806 unpack = gen_vec_interleave_highv16qi;
20807 else
20808 unpack = gen_vec_interleave_lowv16qi;
20809 break;
20810 case V8HImode:
20811 if (high_p)
20812 unpack = gen_vec_interleave_highv8hi;
20813 else
20814 unpack = gen_vec_interleave_lowv8hi;
20815 break;
20816 case V4SImode:
20817 if (high_p)
20818 unpack = gen_vec_interleave_highv4si;
20819 else
20820 unpack = gen_vec_interleave_lowv4si;
20821 break;
20822 default:
20823 gcc_unreachable ();
20826 if (unsigned_p)
20827 tmp = force_reg (imode, CONST0_RTX (imode));
20828 else
20829 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20830 src, pc_rtx, pc_rtx);
20832 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20836 /* Expand conditional increment or decrement using adc/sbb instructions.
20837 The default case using setcc followed by the conditional move can be
20838 done by generic code. */
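/* For example (an added sketch): "x + (a < b)" with unsigned operands
   becomes a compare of a against b, which leaves the carry flag set
   exactly when a <u b, followed by "adc $0" to add the carry into x.  */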
20839 bool
20840 ix86_expand_int_addcc (rtx operands[])
20842 enum rtx_code code = GET_CODE (operands[1]);
20843 rtx flags;
20844 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20845 rtx compare_op;
20846 rtx val = const0_rtx;
20847 bool fpcmp = false;
20848 enum machine_mode mode;
20849 rtx op0 = XEXP (operands[1], 0);
20850 rtx op1 = XEXP (operands[1], 1);
20852 if (operands[3] != const1_rtx
20853 && operands[3] != constm1_rtx)
20854 return false;
20855 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20856 return false;
20857 code = GET_CODE (compare_op);
20859 flags = XEXP (compare_op, 0);
20861 if (GET_MODE (flags) == CCFPmode
20862 || GET_MODE (flags) == CCFPUmode)
20864 fpcmp = true;
20865 code = ix86_fp_compare_code_to_integer (code);
20868 if (code != LTU)
20870 val = constm1_rtx;
20871 if (fpcmp)
20872 PUT_CODE (compare_op,
20873 reverse_condition_maybe_unordered
20874 (GET_CODE (compare_op)));
20875 else
20876 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20879 mode = GET_MODE (operands[0]);
20881 /* Construct either adc or sbb insn. */
20882 if ((code == LTU) == (operands[3] == constm1_rtx))
20884 switch (mode)
20886 case QImode:
20887 insn = gen_subqi3_carry;
20888 break;
20889 case HImode:
20890 insn = gen_subhi3_carry;
20891 break;
20892 case SImode:
20893 insn = gen_subsi3_carry;
20894 break;
20895 case DImode:
20896 insn = gen_subdi3_carry;
20897 break;
20898 default:
20899 gcc_unreachable ();
20902 else
20904 switch (mode)
20906 case QImode:
20907 insn = gen_addqi3_carry;
20908 break;
20909 case HImode:
20910 insn = gen_addhi3_carry;
20911 break;
20912 case SImode:
20913 insn = gen_addsi3_carry;
20914 break;
20915 case DImode:
20916 insn = gen_adddi3_carry;
20917 break;
20918 default:
20919 gcc_unreachable ();
20922 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20924 return true;
20928 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20929 but works for floating-point parameters and non-offsettable memories.
20930 For pushes, it returns just stack offsets; the values will be saved
20931 in the right order. At most four parts are generated. */
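/* For instance (an added note, assuming the usual mode sizes): on a
   32-bit target DImode and DFmode split into two SImode parts, XFmode
   into three and TFmode into four; on a 64-bit target XFmode and TFmode
   split into two parts.  */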
20933 static int
20934 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20936 int size;
20938 if (!TARGET_64BIT)
20939 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20940 else
20941 size = (GET_MODE_SIZE (mode) + 4) / 8;
20943 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20944 gcc_assert (size >= 2 && size <= 4);
20946 /* Optimize constant pool reference to immediates. This is used by fp
20947 moves, that force all constants to memory to allow combining. */
20948 if (MEM_P (operand) && MEM_READONLY_P (operand))
20950 rtx tmp = maybe_get_pool_constant (operand);
20951 if (tmp)
20952 operand = tmp;
20955 if (MEM_P (operand) && !offsettable_memref_p (operand))
20957 /* The only non-offsettable memories we handle are pushes. */
20958 int ok = push_operand (operand, VOIDmode);
20960 gcc_assert (ok);
20962 operand = copy_rtx (operand);
20963 PUT_MODE (operand, word_mode);
20964 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20965 return size;
20968 if (GET_CODE (operand) == CONST_VECTOR)
20970 enum machine_mode imode = int_mode_for_mode (mode);
20971 /* Caution: if we looked through a constant pool memory above,
20972 the operand may actually have a different mode now. That's
20973 ok, since we want to pun this all the way back to an integer. */
20974 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20975 gcc_assert (operand != NULL);
20976 mode = imode;
20979 if (!TARGET_64BIT)
20981 if (mode == DImode)
20982 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20983 else
20985 int i;
20987 if (REG_P (operand))
20989 gcc_assert (reload_completed);
20990 for (i = 0; i < size; i++)
20991 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20993 else if (offsettable_memref_p (operand))
20995 operand = adjust_address (operand, SImode, 0);
20996 parts[0] = operand;
20997 for (i = 1; i < size; i++)
20998 parts[i] = adjust_address (operand, SImode, 4 * i);
21000 else if (GET_CODE (operand) == CONST_DOUBLE)
21002 REAL_VALUE_TYPE r;
21003 long l[4];
21005 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21006 switch (mode)
21008 case TFmode:
21009 real_to_target (l, &r, mode);
21010 parts[3] = gen_int_mode (l[3], SImode);
21011 parts[2] = gen_int_mode (l[2], SImode);
21012 break;
21013 case XFmode:
21014 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21015 long double may not be 80-bit. */
21016 real_to_target (l, &r, mode);
21017 parts[2] = gen_int_mode (l[2], SImode);
21018 break;
21019 case DFmode:
21020 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21021 break;
21022 default:
21023 gcc_unreachable ();
21025 parts[1] = gen_int_mode (l[1], SImode);
21026 parts[0] = gen_int_mode (l[0], SImode);
21028 else
21029 gcc_unreachable ();
21032 else
21034 if (mode == TImode)
21035 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21036 if (mode == XFmode || mode == TFmode)
21038 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21039 if (REG_P (operand))
21041 gcc_assert (reload_completed);
21042 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21043 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21045 else if (offsettable_memref_p (operand))
21047 operand = adjust_address (operand, DImode, 0);
21048 parts[0] = operand;
21049 parts[1] = adjust_address (operand, upper_mode, 8);
21051 else if (GET_CODE (operand) == CONST_DOUBLE)
21053 REAL_VALUE_TYPE r;
21054 long l[4];
21056 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21057 real_to_target (l, &r, mode);
21059 /* Do not use shift by 32 to avoid warning on 32bit systems. */
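/* Editorial note: ((HOST_WIDE_INT) 2 << 31) - 1 is the low-32-bit mask
   0xffffffff, and the two successive shifts by 31 and by 1 amount to a
   shift by 32 without writing a literal shift count of 32.  */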
21060 if (HOST_BITS_PER_WIDE_INT >= 64)
21061 parts[0]
21062 = gen_int_mode
21063 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21064 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21065 DImode);
21066 else
21067 parts[0] = immed_double_const (l[0], l[1], DImode);
21069 if (upper_mode == SImode)
21070 parts[1] = gen_int_mode (l[2], SImode);
21071 else if (HOST_BITS_PER_WIDE_INT >= 64)
21072 parts[1]
21073 = gen_int_mode
21074 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21075 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21076 DImode);
21077 else
21078 parts[1] = immed_double_const (l[2], l[3], DImode);
21080 else
21081 gcc_unreachable ();
21085 return size;
21088 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21089 The split parts of the destination are placed in operands 2 and up and
21090 those of the source in operands 6 and up, in the correct order, and the
21091 required part-by-part moves are then emitted. */
21093 void
21094 ix86_split_long_move (rtx operands[])
21096 rtx part[2][4];
21097 int nparts, i, j;
21098 int push = 0;
21099 int collisions = 0;
21100 enum machine_mode mode = GET_MODE (operands[0]);
21101 bool collisionparts[4];
21103 /* The DFmode expanders may ask us to move double.
21104 For 64bit target this is single move. By hiding the fact
21105 here we simplify i386.md splitters. */
21106 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21108 /* Optimize constant pool reference to immediates. This is used by
21109 fp moves, that force all constants to memory to allow combining. */
21111 if (MEM_P (operands[1])
21112 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21113 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21114 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21115 if (push_operand (operands[0], VOIDmode))
21117 operands[0] = copy_rtx (operands[0]);
21118 PUT_MODE (operands[0], word_mode);
21120 else
21121 operands[0] = gen_lowpart (DImode, operands[0]);
21122 operands[1] = gen_lowpart (DImode, operands[1]);
21123 emit_move_insn (operands[0], operands[1]);
21124 return;
21127 /* The only non-offsettable memory we handle is push. */
21128 if (push_operand (operands[0], VOIDmode))
21129 push = 1;
21130 else
21131 gcc_assert (!MEM_P (operands[0])
21132 || offsettable_memref_p (operands[0]));
21134 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21135 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21137 /* When emitting push, take care for source operands on the stack. */
21138 if (push && MEM_P (operands[1])
21139 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21141 rtx src_base = XEXP (part[1][nparts - 1], 0);
21143 /* Compensate for the stack decrement by 4. */
21144 if (!TARGET_64BIT && nparts == 3
21145 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21146 src_base = plus_constant (Pmode, src_base, 4);
21148 /* src_base refers to the stack pointer and is
21149 automatically decreased by emitted push. */
21150 for (i = 0; i < nparts; i++)
21151 part[1][i] = change_address (part[1][i],
21152 GET_MODE (part[1][i]), src_base);
21155 /* We need to do copy in the right order in case an address register
21156 of the source overlaps the destination. */
21157 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21159 rtx tmp;
21161 for (i = 0; i < nparts; i++)
21163 collisionparts[i]
21164 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21165 if (collisionparts[i])
21166 collisions++;
21169 /* Collision in the middle part can be handled by reordering. */
21170 if (collisions == 1 && nparts == 3 && collisionparts [1])
21172 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21173 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21175 else if (collisions == 1
21176 && nparts == 4
21177 && (collisionparts [1] || collisionparts [2]))
21179 if (collisionparts [1])
21181 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21182 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21184 else
21186 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21187 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21191 /* If there are more collisions, we can't handle it by reordering.
21192 Do an lea to the last part and use only one colliding move. */
21193 else if (collisions > 1)
21195 rtx base;
21197 collisions = 1;
21199 base = part[0][nparts - 1];
21201 /* Handle the case when the last part isn't valid for lea.
21202 Happens in 64-bit mode storing the 12-byte XFmode. */
21203 if (GET_MODE (base) != Pmode)
21204 base = gen_rtx_REG (Pmode, REGNO (base));
21206 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21207 part[1][0] = replace_equiv_address (part[1][0], base);
21208 for (i = 1; i < nparts; i++)
21210 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21211 part[1][i] = replace_equiv_address (part[1][i], tmp);
21216 if (push)
21218 if (!TARGET_64BIT)
21220 if (nparts == 3)
21222 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21223 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21224 stack_pointer_rtx, GEN_INT (-4)));
21225 emit_move_insn (part[0][2], part[1][2]);
21227 else if (nparts == 4)
21229 emit_move_insn (part[0][3], part[1][3]);
21230 emit_move_insn (part[0][2], part[1][2]);
21233 else
21235 /* In 64-bit mode we don't have a 32-bit push available.  If the operand
21236 is a register, that is OK - we just use the larger counterpart.  We also
21237 retype the memory - this comes from an attempt to avoid a REX prefix on
21238 the move of the second half of a TFmode value. */
21239 if (GET_MODE (part[1][1]) == SImode)
21241 switch (GET_CODE (part[1][1]))
21243 case MEM:
21244 part[1][1] = adjust_address (part[1][1], DImode, 0);
21245 break;
21247 case REG:
21248 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21249 break;
21251 default:
21252 gcc_unreachable ();
21255 if (GET_MODE (part[1][0]) == SImode)
21256 part[1][0] = part[1][1];
21259 emit_move_insn (part[0][1], part[1][1]);
21260 emit_move_insn (part[0][0], part[1][0]);
21261 return;
21264 /* Choose correct order to not overwrite the source before it is copied. */
21265 if ((REG_P (part[0][0])
21266 && REG_P (part[1][1])
21267 && (REGNO (part[0][0]) == REGNO (part[1][1])
21268 || (nparts == 3
21269 && REGNO (part[0][0]) == REGNO (part[1][2]))
21270 || (nparts == 4
21271 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21272 || (collisions > 0
21273 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21275 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21277 operands[2 + i] = part[0][j];
21278 operands[6 + i] = part[1][j];
21281 else
21283 for (i = 0; i < nparts; i++)
21285 operands[2 + i] = part[0][i];
21286 operands[6 + i] = part[1][i];
21290 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21291 if (optimize_insn_for_size_p ())
21293 for (j = 0; j < nparts - 1; j++)
21294 if (CONST_INT_P (operands[6 + j])
21295 && operands[6 + j] != const0_rtx
21296 && REG_P (operands[2 + j]))
21297 for (i = j; i < nparts - 1; i++)
21298 if (CONST_INT_P (operands[7 + i])
21299 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21300 operands[7 + i] = operands[2 + j];
21303 for (i = 0; i < nparts; i++)
21304 emit_move_insn (operands[2 + i], operands[6 + i]);
21306 return;
21309 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21310 left shift by a constant, either using a single shift or
21311 a sequence of add instructions. */
21313 static void
21314 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21316 rtx (*insn)(rtx, rtx, rtx);
21318 if (count == 1
21319 || (count * ix86_cost->add <= ix86_cost->shift_const
21320 && !optimize_insn_for_size_p ()))
21322 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21323 while (count-- > 0)
21324 emit_insn (insn (operand, operand, operand));
21326 else
21328 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21329 emit_insn (insn (operand, operand, GEN_INT (count)));
21333 void
21334 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21336 rtx (*gen_ashl3)(rtx, rtx, rtx);
21337 rtx (*gen_shld)(rtx, rtx, rtx);
21338 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21340 rtx low[2], high[2];
21341 int count;
21343 if (CONST_INT_P (operands[2]))
21345 split_double_mode (mode, operands, 2, low, high);
21346 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21348 if (count >= half_width)
21350 emit_move_insn (high[0], low[1]);
21351 emit_move_insn (low[0], const0_rtx);
21353 if (count > half_width)
21354 ix86_expand_ashl_const (high[0], count - half_width, mode);
21356 else
21358 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21360 if (!rtx_equal_p (operands[0], operands[1]))
21361 emit_move_insn (operands[0], operands[1]);
21363 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21364 ix86_expand_ashl_const (low[0], count, mode);
21366 return;
21369 split_double_mode (mode, operands, 1, low, high);
21371 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21373 if (operands[1] == const1_rtx)
21375 /* Assuming we've chosen QImode-capable registers, 1 << N
21376 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21377 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21379 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21381 ix86_expand_clear (low[0]);
21382 ix86_expand_clear (high[0]);
21383 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21385 d = gen_lowpart (QImode, low[0]);
21386 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21387 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21388 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21390 d = gen_lowpart (QImode, high[0]);
21391 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21392 s = gen_rtx_NE (QImode, flags, const0_rtx);
21393 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21396 /* Otherwise, we can get the same results by manually performing
21397 a bit extract operation on bit 5/6, and then performing the two
21398 shifts. The two methods of getting 0/1 into low/high are exactly
21399 the same size. Avoiding the shift in the bit extract case helps
21400 pentium4 a bit; no one else seems to care much either way. */
21401 else
21403 enum machine_mode half_mode;
21404 rtx (*gen_lshr3)(rtx, rtx, rtx);
21405 rtx (*gen_and3)(rtx, rtx, rtx);
21406 rtx (*gen_xor3)(rtx, rtx, rtx);
21407 HOST_WIDE_INT bits;
21408 rtx x;
21410 if (mode == DImode)
21412 half_mode = SImode;
21413 gen_lshr3 = gen_lshrsi3;
21414 gen_and3 = gen_andsi3;
21415 gen_xor3 = gen_xorsi3;
21416 bits = 5;
21418 else
21420 half_mode = DImode;
21421 gen_lshr3 = gen_lshrdi3;
21422 gen_and3 = gen_anddi3;
21423 gen_xor3 = gen_xordi3;
21424 bits = 6;
21427 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21428 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21429 else
21430 x = gen_lowpart (half_mode, operands[2]);
21431 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21433 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21434 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21435 emit_move_insn (low[0], high[0]);
21436 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
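/* Editorial note: the variable shifts emitted below rely on the x86 shift
   instructions masking their count to the operand width (mod 32 for SImode,
   mod 64 for DImode), so only the half selected above ends up holding the
   single set bit at the right position.  */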
21439 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21440 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21441 return;
21444 if (operands[1] == constm1_rtx)
21446 /* For -1 << N, we can avoid the shld instruction, because we
21447 know that we're shifting 0...31/63 ones into a -1. */
21448 emit_move_insn (low[0], constm1_rtx);
21449 if (optimize_insn_for_size_p ())
21450 emit_move_insn (high[0], low[0]);
21451 else
21452 emit_move_insn (high[0], constm1_rtx);
21454 else
21456 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21458 if (!rtx_equal_p (operands[0], operands[1]))
21459 emit_move_insn (operands[0], operands[1]);
21461 split_double_mode (mode, operands, 1, low, high);
21462 emit_insn (gen_shld (high[0], low[0], operands[2]));
21465 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21467 if (TARGET_CMOVE && scratch)
21469 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21470 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21472 ix86_expand_clear (scratch);
21473 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21475 else
21477 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21478 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21480 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21484 void
21485 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21487 rtx (*gen_ashr3)(rtx, rtx, rtx)
21488 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21489 rtx (*gen_shrd)(rtx, rtx, rtx);
21490 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21492 rtx low[2], high[2];
21493 int count;
21495 if (CONST_INT_P (operands[2]))
21497 split_double_mode (mode, operands, 2, low, high);
21498 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
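/* Editorial note: in the first case below the shift count equals the full
   width minus one, so every result bit is a copy of the sign bit; it is
   enough to compute that sign in the high half and copy it to the low half.  */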
21500 if (count == GET_MODE_BITSIZE (mode) - 1)
21502 emit_move_insn (high[0], high[1]);
21503 emit_insn (gen_ashr3 (high[0], high[0],
21504 GEN_INT (half_width - 1)));
21505 emit_move_insn (low[0], high[0]);
21508 else if (count >= half_width)
21510 emit_move_insn (low[0], high[1]);
21511 emit_move_insn (high[0], low[0]);
21512 emit_insn (gen_ashr3 (high[0], high[0],
21513 GEN_INT (half_width - 1)));
21515 if (count > half_width)
21516 emit_insn (gen_ashr3 (low[0], low[0],
21517 GEN_INT (count - half_width)));
21519 else
21521 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21523 if (!rtx_equal_p (operands[0], operands[1]))
21524 emit_move_insn (operands[0], operands[1]);
21526 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21527 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21530 else
21532 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21534 if (!rtx_equal_p (operands[0], operands[1]))
21535 emit_move_insn (operands[0], operands[1]);
21537 split_double_mode (mode, operands, 1, low, high);
21539 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21540 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21542 if (TARGET_CMOVE && scratch)
21544 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21545 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21547 emit_move_insn (scratch, high[0]);
21548 emit_insn (gen_ashr3 (scratch, scratch,
21549 GEN_INT (half_width - 1)));
21550 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21551 scratch));
21553 else
21555 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21556 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21558 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21563 void
21564 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21566 rtx (*gen_lshr3)(rtx, rtx, rtx)
21567 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21568 rtx (*gen_shrd)(rtx, rtx, rtx);
21569 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21571 rtx low[2], high[2];
21572 int count;
21574 if (CONST_INT_P (operands[2]))
21576 split_double_mode (mode, operands, 2, low, high);
21577 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21579 if (count >= half_width)
21581 emit_move_insn (low[0], high[1]);
21582 ix86_expand_clear (high[0]);
21584 if (count > half_width)
21585 emit_insn (gen_lshr3 (low[0], low[0],
21586 GEN_INT (count - half_width)));
21588 else
21590 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21592 if (!rtx_equal_p (operands[0], operands[1]))
21593 emit_move_insn (operands[0], operands[1]);
21595 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21596 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21599 else
21601 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21603 if (!rtx_equal_p (operands[0], operands[1]))
21604 emit_move_insn (operands[0], operands[1]);
21606 split_double_mode (mode, operands, 1, low, high);
21608 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21609 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21611 if (TARGET_CMOVE && scratch)
21613 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21614 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21616 ix86_expand_clear (scratch);
21617 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21618 scratch));
21620 else
21622 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21623 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21625 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21630 /* Predict just emitted jump instruction to be taken with probability PROB. */
21631 static void
21632 predict_jump (int prob)
21634 rtx insn = get_last_insn ();
21635 gcc_assert (JUMP_P (insn));
21636 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21639 /* Helper function for the string operations below.  Test whether VARIABLE
21640 is aligned to VALUE bytes.  If so, jump to the returned label. */
21641 static rtx
21642 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21644 rtx label = gen_label_rtx ();
21645 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21646 if (GET_MODE (variable) == DImode)
21647 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21648 else
21649 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21650 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21651 1, label);
21652 if (epilogue)
21653 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21654 else
21655 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21656 return label;
21659 /* Adjust COUNTER by the VALUE. */
21660 static void
21661 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21663 rtx (*gen_add)(rtx, rtx, rtx)
21664 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21666 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21669 /* Zero extend possibly SImode EXP to a Pmode register. */
21670 rtx
21671 ix86_zero_extend_to_Pmode (rtx exp)
21673 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21676 /* Divide COUNTREG by SCALE. */
21677 static rtx
21678 scale_counter (rtx countreg, int scale)
21680 rtx sc;
21682 if (scale == 1)
21683 return countreg;
21684 if (CONST_INT_P (countreg))
21685 return GEN_INT (INTVAL (countreg) / scale);
21686 gcc_assert (REG_P (countreg));
21688 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21689 GEN_INT (exact_log2 (scale)),
21690 NULL, 1, OPTAB_DIRECT);
21691 return sc;
21694 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21695 DImode for constant loop counts. */
21697 static enum machine_mode
21698 counter_mode (rtx count_exp)
21700 if (GET_MODE (count_exp) != VOIDmode)
21701 return GET_MODE (count_exp);
21702 if (!CONST_INT_P (count_exp))
21703 return Pmode;
21704 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21705 return DImode;
21706 return SImode;
21709 /* Copy the address to a Pmode register. This is used for x32 to
21710 truncate DImode TLS address to a SImode register. */
21712 static rtx
21713 ix86_copy_addr_to_reg (rtx addr)
21715 if (GET_MODE (addr) == Pmode)
21716 return copy_addr_to_reg (addr);
21717 else
21719 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
21720 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
21724 /* When SRCPTR is non-NULL, output a simple loop to move memory
21725 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21726 the overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output
21727 the equivalent loop to set memory to VALUE (assumed to be in MODE).
21729 The size is rounded down to a whole number of chunks moved at once.
21730 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21733 static void
21734 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21735 rtx destptr, rtx srcptr, rtx value,
21736 rtx count, enum machine_mode mode, int unroll,
21737 int expected_size)
21739 rtx out_label, top_label, iter, tmp;
21740 enum machine_mode iter_mode = counter_mode (count);
21741 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21742 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21743 rtx size;
21744 rtx x_addr;
21745 rtx y_addr;
21746 int i;
21748 top_label = gen_label_rtx ();
21749 out_label = gen_label_rtx ();
21750 iter = gen_reg_rtx (iter_mode);
21752 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21753 NULL, 1, OPTAB_DIRECT);
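/* Editorial note: for example, moving SImode chunks unrolled 4 times gives
   PIECE_SIZE = 16 and PIECE_SIZE_MASK = ~15, so SIZE above is COUNT rounded
   down to a multiple of 16; the leftover bytes are handled by the caller's
   epilogue code.  */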
21754 /* Those two should combine. */
21755 if (piece_size == const1_rtx)
21757 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21758 true, out_label);
21759 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21761 emit_move_insn (iter, const0_rtx);
21763 emit_label (top_label);
21765 tmp = convert_modes (Pmode, iter_mode, iter, true);
21766 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21767 destmem = change_address (destmem, mode, x_addr);
21769 if (srcmem)
21771 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21772 srcmem = change_address (srcmem, mode, y_addr);
21774 /* When unrolling for chips that reorder memory reads and writes,
21775 we can save registers by using a single temporary.  Also, using four
21776 temporaries is overkill in 32-bit mode; the "&& 0" below keeps this path disabled. */
21777 if (!TARGET_64BIT && 0)
21779 for (i = 0; i < unroll; i++)
21781 if (i)
21783 destmem =
21784 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21785 srcmem =
21786 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21788 emit_move_insn (destmem, srcmem);
21791 else
21793 rtx tmpreg[4];
21794 gcc_assert (unroll <= 4);
21795 for (i = 0; i < unroll; i++)
21797 tmpreg[i] = gen_reg_rtx (mode);
21798 if (i)
21800 srcmem =
21801 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21803 emit_move_insn (tmpreg[i], srcmem);
21805 for (i = 0; i < unroll; i++)
21807 if (i)
21809 destmem =
21810 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21812 emit_move_insn (destmem, tmpreg[i]);
21816 else
21817 for (i = 0; i < unroll; i++)
21819 if (i)
21820 destmem =
21821 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21822 emit_move_insn (destmem, value);
21825 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21826 true, OPTAB_LIB_WIDEN);
21827 if (tmp != iter)
21828 emit_move_insn (iter, tmp);
21830 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21831 true, top_label);
21832 if (expected_size != -1)
21834 expected_size /= GET_MODE_SIZE (mode) * unroll;
21835 if (expected_size == 0)
21836 predict_jump (0);
21837 else if (expected_size > REG_BR_PROB_BASE)
21838 predict_jump (REG_BR_PROB_BASE - 1);
21839 else
21840 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21842 else
21843 predict_jump (REG_BR_PROB_BASE * 80 / 100);
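/* Editorial note: in the non-degenerate case above, EXPECTED_SIZE has already
   been divided by the bytes handled per iteration, so the probability of the
   loop-back branch is set to roughly (n - 1) / n for n expected iterations,
   i.e. REG_BR_PROB_BASE minus REG_BR_PROB_BASE / n rounded to nearest.  */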
21844 iter = ix86_zero_extend_to_Pmode (iter);
21845 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21846 true, OPTAB_LIB_WIDEN);
21847 if (tmp != destptr)
21848 emit_move_insn (destptr, tmp);
21849 if (srcptr)
21851 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21852 true, OPTAB_LIB_WIDEN);
21853 if (tmp != srcptr)
21854 emit_move_insn (srcptr, tmp);
21856 emit_label (out_label);
21859 /* Output a "rep; mov" instruction.
21860 Arguments have the same meaning as for the previous function. */
21861 static void
21862 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21863 rtx destptr, rtx srcptr,
21864 rtx count,
21865 enum machine_mode mode)
21867 rtx destexp;
21868 rtx srcexp;
21869 rtx countreg;
21870 HOST_WIDE_INT rounded_count;
21872 /* If the size is known to be a multiple of 4, a dword-sized rep movs is shorter. */
21873 if (mode == QImode && CONST_INT_P (count)
21874 && !(INTVAL (count) & 3))
21875 mode = SImode;
21877 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21878 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21879 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21880 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21881 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21882 if (mode != QImode)
21884 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21885 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21886 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21887 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21888 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21889 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21891 else
21893 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21894 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21896 if (CONST_INT_P (count))
21898 rounded_count = (INTVAL (count)
21899 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21900 destmem = shallow_copy_rtx (destmem);
21901 srcmem = shallow_copy_rtx (srcmem);
21902 set_mem_size (destmem, rounded_count);
21903 set_mem_size (srcmem, rounded_count);
21905 else
21907 if (MEM_SIZE_KNOWN_P (destmem))
21908 clear_mem_size (destmem);
21909 if (MEM_SIZE_KNOWN_P (srcmem))
21910 clear_mem_size (srcmem);
21912 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21913 destexp, srcexp));
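/* Editorial note: DESTEXP and SRCEXP express the final pointer values
   (pointer plus COUNTREG scaled by the chunk size); the rep_mov pattern is
   assumed to use them to describe how rep movs advances the destination and
   source pointer registers.  */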
21916 /* Output a "rep; stos" instruction.
21917 Arguments have the same meaning as for the previous function. */
21918 static void
21919 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21920 rtx count, enum machine_mode mode,
21921 rtx orig_value)
21923 rtx destexp;
21924 rtx countreg;
21925 HOST_WIDE_INT rounded_count;
21927 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21928 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21929 value = force_reg (mode, gen_lowpart (mode, value));
21930 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21931 if (mode != QImode)
21933 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21934 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21935 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21937 else
21938 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21939 if (orig_value == const0_rtx && CONST_INT_P (count))
21941 rounded_count = (INTVAL (count)
21942 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21943 destmem = shallow_copy_rtx (destmem);
21944 set_mem_size (destmem, rounded_count);
21946 else if (MEM_SIZE_KNOWN_P (destmem))
21947 clear_mem_size (destmem);
21948 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21951 static void
21952 emit_strmov (rtx destmem, rtx srcmem,
21953 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21955 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21956 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21957 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21960 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21961 static void
21962 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21963 rtx destptr, rtx srcptr, rtx count, int max_size)
21965 rtx src, dest;
21966 if (CONST_INT_P (count))
21968 HOST_WIDE_INT countval = INTVAL (count);
21969 int offset = 0;
21971 if ((countval & 0x10) && max_size > 16)
21973 if (TARGET_64BIT)
21975 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21976 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21978 else
21979 gcc_unreachable ();
21980 offset += 16;
21982 if ((countval & 0x08) && max_size > 8)
21984 if (TARGET_64BIT)
21985 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21986 else
21988 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21989 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21991 offset += 8;
21993 if ((countval & 0x04) && max_size > 4)
21995 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21996 offset += 4;
21998 if ((countval & 0x02) && max_size > 2)
22000 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
22001 offset += 2;
22003 if ((countval & 0x01) && max_size > 1)
22005 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
22006 offset += 1;
22008 return;
22010 if (max_size > 8)
22012 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22013 count, 1, OPTAB_DIRECT);
22014 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22015 count, QImode, 1, 4);
22016 return;
22019 /* When single stringop instructions are available, we can cheaply advance
22020 the dest and src pointers.  Otherwise we save code size by maintaining an
22021 offset (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
22023 if (TARGET_SINGLE_STRINGOP)
22025 if (max_size > 4)
22027 rtx label = ix86_expand_aligntest (count, 4, true);
22028 src = change_address (srcmem, SImode, srcptr);
22029 dest = change_address (destmem, SImode, destptr);
22030 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22031 emit_label (label);
22032 LABEL_NUSES (label) = 1;
22034 if (max_size > 2)
22036 rtx label = ix86_expand_aligntest (count, 2, true);
22037 src = change_address (srcmem, HImode, srcptr);
22038 dest = change_address (destmem, HImode, destptr);
22039 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22040 emit_label (label);
22041 LABEL_NUSES (label) = 1;
22043 if (max_size > 1)
22045 rtx label = ix86_expand_aligntest (count, 1, true);
22046 src = change_address (srcmem, QImode, srcptr);
22047 dest = change_address (destmem, QImode, destptr);
22048 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22049 emit_label (label);
22050 LABEL_NUSES (label) = 1;
22053 else
22055 rtx offset = force_reg (Pmode, const0_rtx);
22056 rtx tmp;
22058 if (max_size > 4)
22060 rtx label = ix86_expand_aligntest (count, 4, true);
22061 src = change_address (srcmem, SImode, srcptr);
22062 dest = change_address (destmem, SImode, destptr);
22063 emit_move_insn (dest, src);
22064 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22065 true, OPTAB_LIB_WIDEN);
22066 if (tmp != offset)
22067 emit_move_insn (offset, tmp);
22068 emit_label (label);
22069 LABEL_NUSES (label) = 1;
22071 if (max_size > 2)
22073 rtx label = ix86_expand_aligntest (count, 2, true);
22074 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22075 src = change_address (srcmem, HImode, tmp);
22076 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22077 dest = change_address (destmem, HImode, tmp);
22078 emit_move_insn (dest, src);
22079 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22080 true, OPTAB_LIB_WIDEN);
22081 if (tmp != offset)
22082 emit_move_insn (offset, tmp);
22083 emit_label (label);
22084 LABEL_NUSES (label) = 1;
22086 if (max_size > 1)
22088 rtx label = ix86_expand_aligntest (count, 1, true);
22089 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22090 src = change_address (srcmem, QImode, tmp);
22091 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22092 dest = change_address (destmem, QImode, tmp);
22093 emit_move_insn (dest, src);
22094 emit_label (label);
22095 LABEL_NUSES (label) = 1;
22100 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22101 static void
22102 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22103 rtx count, int max_size)
22105 count =
22106 expand_simple_binop (counter_mode (count), AND, count,
22107 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22108 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22109 gen_lowpart (QImode, value), count, QImode,
22110 1, max_size / 2);
22113 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22114 static void
22115 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22117 rtx dest;
22119 if (CONST_INT_P (count))
22121 HOST_WIDE_INT countval = INTVAL (count);
22122 int offset = 0;
22124 if ((countval & 0x10) && max_size > 16)
22126 if (TARGET_64BIT)
22128 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22129 emit_insn (gen_strset (destptr, dest, value));
22130 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22131 emit_insn (gen_strset (destptr, dest, value));
22133 else
22134 gcc_unreachable ();
22135 offset += 16;
22137 if ((countval & 0x08) && max_size > 8)
22139 if (TARGET_64BIT)
22141 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22142 emit_insn (gen_strset (destptr, dest, value));
22144 else
22146 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22147 emit_insn (gen_strset (destptr, dest, value));
22148 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22149 emit_insn (gen_strset (destptr, dest, value));
22151 offset += 8;
22153 if ((countval & 0x04) && max_size > 4)
22155 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22156 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22157 offset += 4;
22159 if ((countval & 0x02) && max_size > 2)
22161 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22162 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22163 offset += 2;
22165 if ((countval & 0x01) && max_size > 1)
22167 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22168 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22169 offset += 1;
22171 return;
22173 if (max_size > 32)
22175 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22176 return;
22178 if (max_size > 16)
22180 rtx label = ix86_expand_aligntest (count, 16, true);
22181 if (TARGET_64BIT)
22183 dest = change_address (destmem, DImode, destptr);
22184 emit_insn (gen_strset (destptr, dest, value));
22185 emit_insn (gen_strset (destptr, dest, value));
22187 else
22189 dest = change_address (destmem, SImode, destptr);
22190 emit_insn (gen_strset (destptr, dest, value));
22191 emit_insn (gen_strset (destptr, dest, value));
22192 emit_insn (gen_strset (destptr, dest, value));
22193 emit_insn (gen_strset (destptr, dest, value));
22195 emit_label (label);
22196 LABEL_NUSES (label) = 1;
22198 if (max_size > 8)
22200 rtx label = ix86_expand_aligntest (count, 8, true);
22201 if (TARGET_64BIT)
22203 dest = change_address (destmem, DImode, destptr);
22204 emit_insn (gen_strset (destptr, dest, value));
22206 else
22208 dest = change_address (destmem, SImode, destptr);
22209 emit_insn (gen_strset (destptr, dest, value));
22210 emit_insn (gen_strset (destptr, dest, value));
22212 emit_label (label);
22213 LABEL_NUSES (label) = 1;
22215 if (max_size > 4)
22217 rtx label = ix86_expand_aligntest (count, 4, true);
22218 dest = change_address (destmem, SImode, destptr);
22219 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22220 emit_label (label);
22221 LABEL_NUSES (label) = 1;
22223 if (max_size > 2)
22225 rtx label = ix86_expand_aligntest (count, 2, true);
22226 dest = change_address (destmem, HImode, destptr);
22227 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22228 emit_label (label);
22229 LABEL_NUSES (label) = 1;
22231 if (max_size > 1)
22233 rtx label = ix86_expand_aligntest (count, 1, true);
22234 dest = change_address (destmem, QImode, destptr);
22235 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22236 emit_label (label);
22237 LABEL_NUSES (label) = 1;
22241 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22242 to DESIRED_ALIGNMENT. */
22243 static void
22244 expand_movmem_prologue (rtx destmem, rtx srcmem,
22245 rtx destptr, rtx srcptr, rtx count,
22246 int align, int desired_alignment)
22248 if (align <= 1 && desired_alignment > 1)
22250 rtx label = ix86_expand_aligntest (destptr, 1, false);
22251 srcmem = change_address (srcmem, QImode, srcptr);
22252 destmem = change_address (destmem, QImode, destptr);
22253 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22254 ix86_adjust_counter (count, 1);
22255 emit_label (label);
22256 LABEL_NUSES (label) = 1;
22258 if (align <= 2 && desired_alignment > 2)
22260 rtx label = ix86_expand_aligntest (destptr, 2, false);
22261 srcmem = change_address (srcmem, HImode, srcptr);
22262 destmem = change_address (destmem, HImode, destptr);
22263 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22264 ix86_adjust_counter (count, 2);
22265 emit_label (label);
22266 LABEL_NUSES (label) = 1;
22268 if (align <= 4 && desired_alignment > 4)
22270 rtx label = ix86_expand_aligntest (destptr, 4, false);
22271 srcmem = change_address (srcmem, SImode, srcptr);
22272 destmem = change_address (destmem, SImode, destptr);
22273 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22274 ix86_adjust_counter (count, 4);
22275 emit_label (label);
22276 LABEL_NUSES (label) = 1;
22278 gcc_assert (desired_alignment <= 8);
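/* Editorial note: the prologue above can fix up at most 1 + 2 + 4 = 7 bytes,
   which is why a desired alignment greater than 8 cannot be reached here.  */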
22281 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22282 ALIGN_BYTES is how many bytes need to be copied. */
22283 static rtx
22284 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22285 int desired_align, int align_bytes)
22287 rtx src = *srcp;
22288 rtx orig_dst = dst;
22289 rtx orig_src = src;
22290 int off = 0;
22291 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22292 if (src_align_bytes >= 0)
22293 src_align_bytes = desired_align - src_align_bytes;
22294 if (align_bytes & 1)
22296 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22297 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22298 off = 1;
22299 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22301 if (align_bytes & 2)
22303 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22304 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22305 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22306 set_mem_align (dst, 2 * BITS_PER_UNIT);
22307 if (src_align_bytes >= 0
22308 && (src_align_bytes & 1) == (align_bytes & 1)
22309 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22310 set_mem_align (src, 2 * BITS_PER_UNIT);
22311 off = 2;
22312 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22314 if (align_bytes & 4)
22316 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22317 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22318 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22319 set_mem_align (dst, 4 * BITS_PER_UNIT);
22320 if (src_align_bytes >= 0)
22322 unsigned int src_align = 0;
22323 if ((src_align_bytes & 3) == (align_bytes & 3))
22324 src_align = 4;
22325 else if ((src_align_bytes & 1) == (align_bytes & 1))
22326 src_align = 2;
22327 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22328 set_mem_align (src, src_align * BITS_PER_UNIT);
22330 off = 4;
22331 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22333 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22334 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22335 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22336 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22337 if (src_align_bytes >= 0)
22339 unsigned int src_align = 0;
22340 if ((src_align_bytes & 7) == (align_bytes & 7))
22341 src_align = 8;
22342 else if ((src_align_bytes & 3) == (align_bytes & 3))
22343 src_align = 4;
22344 else if ((src_align_bytes & 1) == (align_bytes & 1))
22345 src_align = 2;
22346 if (src_align > (unsigned int) desired_align)
22347 src_align = desired_align;
22348 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22349 set_mem_align (src, src_align * BITS_PER_UNIT);
22351 if (MEM_SIZE_KNOWN_P (orig_dst))
22352 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22353 if (MEM_SIZE_KNOWN_P (orig_src))
22354 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22355 *srcp = src;
22356 return dst;
22359 /* Store enough into DEST to align it, known to be aligned by ALIGN, to
22360 DESIRED_ALIGNMENT. */
22361 static void
22362 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22363 int align, int desired_alignment)
22365 if (align <= 1 && desired_alignment > 1)
22367 rtx label = ix86_expand_aligntest (destptr, 1, false);
22368 destmem = change_address (destmem, QImode, destptr);
22369 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22370 ix86_adjust_counter (count, 1);
22371 emit_label (label);
22372 LABEL_NUSES (label) = 1;
22374 if (align <= 2 && desired_alignment > 2)
22376 rtx label = ix86_expand_aligntest (destptr, 2, false);
22377 destmem = change_address (destmem, HImode, destptr);
22378 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22379 ix86_adjust_counter (count, 2);
22380 emit_label (label);
22381 LABEL_NUSES (label) = 1;
22383 if (align <= 4 && desired_alignment > 4)
22385 rtx label = ix86_expand_aligntest (destptr, 4, false);
22386 destmem = change_address (destmem, SImode, destptr);
22387 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22388 ix86_adjust_counter (count, 4);
22389 emit_label (label);
22390 LABEL_NUSES (label) = 1;
22392 gcc_assert (desired_alignment <= 8);
22395 /* Store enough into DST to align it to DESIRED_ALIGN.
22396 ALIGN_BYTES is how many bytes need to be stored. */
22397 static rtx
22398 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22399 int desired_align, int align_bytes)
22401 int off = 0;
22402 rtx orig_dst = dst;
22403 if (align_bytes & 1)
22405 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22406 off = 1;
22407 emit_insn (gen_strset (destreg, dst,
22408 gen_lowpart (QImode, value)));
22410 if (align_bytes & 2)
22412 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22413 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22414 set_mem_align (dst, 2 * BITS_PER_UNIT);
22415 off = 2;
22416 emit_insn (gen_strset (destreg, dst,
22417 gen_lowpart (HImode, value)));
22419 if (align_bytes & 4)
22421 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22422 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22423 set_mem_align (dst, 4 * BITS_PER_UNIT);
22424 off = 4;
22425 emit_insn (gen_strset (destreg, dst,
22426 gen_lowpart (SImode, value)));
22428 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22429 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22430 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22431 if (MEM_SIZE_KNOWN_P (orig_dst))
22432 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22433 return dst;
22436 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22437 static enum stringop_alg
22438 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22439 int *dynamic_check, bool *noalign)
22441 const struct stringop_algs * algs;
22442 bool optimize_for_speed;
22443 /* Algorithms using the rep prefix want at least edi and ecx;
22444 additionally, memset wants eax and memcpy wants esi. Don't
22445 consider such algorithms if the user has appropriated those
22446 registers for their own purposes. */
22447 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22448 || (memset
22449 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22450 *noalign = false;
22452 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22453 || (alg != rep_prefix_1_byte \
22454 && alg != rep_prefix_4_byte \
22455 && alg != rep_prefix_8_byte))
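/* Editorial note: an algorithm is considered usable unless it is one of the
   rep-prefix variants while the fixed registers required by rep movs/stos
   are unavailable (see rep_prefix_usable above).  */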
22456 const struct processor_costs *cost;
22458 /* Even if the string operation call is cold, we still might spend a lot
22459 of time processing large blocks. */
22460 if (optimize_function_for_size_p (cfun)
22461 || (optimize_insn_for_size_p ()
22462 && expected_size != -1 && expected_size < 256))
22463 optimize_for_speed = false;
22464 else
22465 optimize_for_speed = true;
22467 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22469 *dynamic_check = -1;
22470 if (memset)
22471 algs = &cost->memset[TARGET_64BIT != 0];
22472 else
22473 algs = &cost->memcpy[TARGET_64BIT != 0];
22474 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22475 return ix86_stringop_alg;
22476 /* rep; movq or rep; movl is the smallest variant. */
22477 else if (!optimize_for_speed)
22479 if (!count || (count & 3))
22480 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22481 else
22482 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22484 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22486 else if (expected_size != -1 && expected_size < 4)
22487 return loop_1_byte;
22488 else if (expected_size != -1)
22490 unsigned int i;
22491 enum stringop_alg alg = libcall;
22492 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22494 /* We get here if the algorithms that were not libcall-based
22495 were rep-prefix based and we are unable to use rep prefixes
22496 based on global register usage. Break out of the loop and
22497 use the heuristic below. */
22498 if (algs->size[i].max == 0)
22499 break;
22500 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22502 enum stringop_alg candidate = algs->size[i].alg;
22504 if (candidate != libcall && ALG_USABLE_P (candidate))
22505 alg = candidate;
22506 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22507 last non-libcall inline algorithm. */
22508 if (TARGET_INLINE_ALL_STRINGOPS)
22510 /* When the current size is best to be copied by a libcall,
22511 but we are still forced to inline, run the heuristic below
22512 that will pick code for medium sized blocks. */
22513 if (alg != libcall)
22514 return alg;
22515 break;
22517 else if (ALG_USABLE_P (candidate))
22519 *noalign = algs->size[i].noalign;
22520 return candidate;
22524 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22526 /* When asked to inline the call anyway, try to pick a meaningful choice.
22527 We look for the maximal size of block that is faster to copy by hand and
22528 take blocks of at most that size, guessing that the average size will
22529 be roughly half of the block.
22531 If this turns out to be bad, we might simply specify the preferred
22532 choice in ix86_costs. */
22533 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22534 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22536 int max = -1;
22537 enum stringop_alg alg;
22538 int i;
22539 bool any_alg_usable_p = true;
22541 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22543 enum stringop_alg candidate = algs->size[i].alg;
22544 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22546 if (candidate != libcall && candidate
22547 && ALG_USABLE_P (candidate))
22548 max = algs->size[i].max;
22550 /* If there aren't any usable algorithms, then recursing on
22551 smaller sizes isn't going to find anything. Just return the
22552 simple byte-at-a-time copy loop. */
22553 if (!any_alg_usable_p)
22555 /* Pick something reasonable. */
22556 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22557 *dynamic_check = 128;
22558 return loop_1_byte;
22560 if (max == -1)
22561 max = 4096;
22562 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22563 gcc_assert (*dynamic_check == -1);
22564 gcc_assert (alg != libcall);
22565 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22566 *dynamic_check = max;
22567 return alg;
22569 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22570 #undef ALG_USABLE_P
22573 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22574 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22575 static int
22576 decide_alignment (int align,
22577 enum stringop_alg alg,
22578 int expected_size)
22580 int desired_align = 0;
22581 switch (alg)
22583 case no_stringop:
22584 gcc_unreachable ();
22585 case loop:
22586 case unrolled_loop:
22587 desired_align = GET_MODE_SIZE (Pmode);
22588 break;
22589 case rep_prefix_8_byte:
22590 desired_align = 8;
22591 break;
22592 case rep_prefix_4_byte:
22593 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22594 copying a whole cache line at once. */
22595 if (TARGET_PENTIUMPRO)
22596 desired_align = 8;
22597 else
22598 desired_align = 4;
22599 break;
22600 case rep_prefix_1_byte:
22601 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22602 copying a whole cache line at once. */
22603 if (TARGET_PENTIUMPRO)
22604 desired_align = 8;
22605 else
22606 desired_align = 1;
22607 break;
22608 case loop_1_byte:
22609 desired_align = 1;
22610 break;
22611 case libcall:
22612 return 0;
22615 if (optimize_size)
22616 desired_align = 1;
22617 if (desired_align < align)
22618 desired_align = align;
22619 if (expected_size != -1 && expected_size < 4)
22620 desired_align = align;
22621 return desired_align;
22624 /* Return the smallest power of 2 greater than VAL. */
22625 static int
22626 smallest_pow2_greater_than (int val)
22628 int ret = 1;
22629 while (ret <= val)
22630 ret <<= 1;
22631 return ret;
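/* Editorial note: the result is strictly greater than VAL, never equal,
   e.g. smallest_pow2_greater_than (0) == 1, (4) == 8, (15) == 16.  */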
22634 /* Expand string move (memcpy) operation. Use i386 string operations
22635 when profitable. expand_setmem contains similar code. The code
22636 depends upon architecture, block size and alignment, but always has
22637 the same overall structure:
22639 1) Prologue guard: Conditional that jumps to the epilogue for small
22640 blocks that can be handled by the epilogue alone.  This is faster
22641 but also needed for correctness, since the prologue assumes the block
22642 is larger than the desired alignment.
22644 Optional dynamic check for size and libcall for large
22645 blocks is emitted here too, with -minline-stringops-dynamically.
22647 2) Prologue: copy first few bytes in order to get destination
22648 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22649 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22650 copied. We emit either a jump tree on power of two sized
22651 blocks, or a byte loop.
22653 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22654 with specified algorithm.
22656 4) Epilogue: code copying tail of the block that is too small to be
22657 handled by main body (or up to size guarded by prologue guard). */
22659 bool
22660 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22661 rtx expected_align_exp, rtx expected_size_exp)
22663 rtx destreg;
22664 rtx srcreg;
22665 rtx label = NULL;
22666 rtx tmp;
22667 rtx jump_around_label = NULL;
22668 HOST_WIDE_INT align = 1;
22669 unsigned HOST_WIDE_INT count = 0;
22670 HOST_WIDE_INT expected_size = -1;
22671 int size_needed = 0, epilogue_size_needed;
22672 int desired_align = 0, align_bytes = 0;
22673 enum stringop_alg alg;
22674 int dynamic_check;
22675 bool need_zero_guard = false;
22676 bool noalign;
22678 if (CONST_INT_P (align_exp))
22679 align = INTVAL (align_exp);
22680 /* i386 can do misaligned accesses at a reasonably increased cost. */
22681 if (CONST_INT_P (expected_align_exp)
22682 && INTVAL (expected_align_exp) > align)
22683 align = INTVAL (expected_align_exp);
22684 /* ALIGN is the minimum of destination and source alignment, but we care here
22685 just about destination alignment. */
22686 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22687 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22689 if (CONST_INT_P (count_exp))
22690 count = expected_size = INTVAL (count_exp);
22691 if (CONST_INT_P (expected_size_exp) && count == 0)
22692 expected_size = INTVAL (expected_size_exp);
22694 /* Make sure we don't need to care about overflow later on. */
22695 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22696 return false;
22698 /* Step 0: Decide on preferred algorithm, desired alignment and
22699 size of chunks to be copied by main loop. */
22701 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22702 desired_align = decide_alignment (align, alg, expected_size);
22704 if (!TARGET_ALIGN_STRINGOPS || noalign)
22705 align = desired_align;
22707 if (alg == libcall)
22708 return false;
22709 gcc_assert (alg != no_stringop);
22710 if (!count)
22711 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22712 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
22713 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
22714 switch (alg)
22716 case libcall:
22717 case no_stringop:
22718 gcc_unreachable ();
22719 case loop:
22720 need_zero_guard = true;
22721 size_needed = GET_MODE_SIZE (word_mode);
22722 break;
22723 case unrolled_loop:
22724 need_zero_guard = true;
22725 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22726 break;
22727 case rep_prefix_8_byte:
22728 size_needed = 8;
22729 break;
22730 case rep_prefix_4_byte:
22731 size_needed = 4;
22732 break;
22733 case rep_prefix_1_byte:
22734 size_needed = 1;
22735 break;
22736 case loop_1_byte:
22737 need_zero_guard = true;
22738 size_needed = 1;
22739 break;
22742 epilogue_size_needed = size_needed;
22744 /* Step 1: Prologue guard. */
22746 /* Alignment code needs count to be in register. */
22747 if (CONST_INT_P (count_exp) && desired_align > align)
22749 if (INTVAL (count_exp) > desired_align
22750 && INTVAL (count_exp) > size_needed)
22752 align_bytes
22753 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22754 if (align_bytes <= 0)
22755 align_bytes = 0;
22756 else
22757 align_bytes = desired_align - align_bytes;
22759 if (align_bytes == 0)
22760 count_exp = force_reg (counter_mode (count_exp), count_exp);
22762 gcc_assert (desired_align >= 1 && align >= 1);
22764 /* Ensure that alignment prologue won't copy past end of block. */
22765 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22767 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22768 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22769 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
22770 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
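/* Editorial note: for example, with 8-byte chunks the main loop can leave up
   to 7 trailing bytes, so EPILOGUE_SIZE_NEEDED becomes
   smallest_pow2_greater_than (7) == 8 (assuming the alignment term is not
   larger).  */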
22772 if (count)
22774 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22776 /* If main algorithm works on QImode, no epilogue is needed.
22777 For small sizes just don't align anything. */
22778 if (size_needed == 1)
22779 desired_align = align;
22780 else
22781 goto epilogue;
22784 else
22786 label = gen_label_rtx ();
22787 emit_cmp_and_jump_insns (count_exp,
22788 GEN_INT (epilogue_size_needed),
22789 LTU, 0, counter_mode (count_exp), 1, label);
22790 if (expected_size == -1 || expected_size < epilogue_size_needed)
22791 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22792 else
22793 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22797 /* Emit code to decide at runtime whether a library call or inline code should
22798 be used. */
22799 if (dynamic_check != -1)
22801 if (CONST_INT_P (count_exp))
22803 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22805 emit_block_move_via_libcall (dst, src, count_exp, false);
22806 count_exp = const0_rtx;
22807 goto epilogue;
22810 else
22812 rtx hot_label = gen_label_rtx ();
22813 jump_around_label = gen_label_rtx ();
22814 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22815 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22816 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22817 emit_block_move_via_libcall (dst, src, count_exp, false);
22818 emit_jump (jump_around_label);
22819 emit_label (hot_label);
22823 /* Step 2: Alignment prologue. */
22825 if (desired_align > align)
22827 if (align_bytes == 0)
22829 /* Except for the first move in the epilogue, we no longer know
22830 the constant offset in the aliasing info.  It doesn't seem worth
22831 the pain to maintain it for the first move, so throw away
22832 the info early. */
22833 src = change_address (src, BLKmode, srcreg);
22834 dst = change_address (dst, BLKmode, destreg);
22835 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22836 desired_align);
22838 else
22840 /* If we know how many bytes need to be stored before dst is
22841 sufficiently aligned, maintain aliasing info accurately. */
22842 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22843 desired_align, align_bytes);
22844 count_exp = plus_constant (counter_mode (count_exp),
22845 count_exp, -align_bytes);
22846 count -= align_bytes;
22848 if (need_zero_guard
22849 && (count < (unsigned HOST_WIDE_INT) size_needed
22850 || (align_bytes == 0
22851 && count < ((unsigned HOST_WIDE_INT) size_needed
22852 + desired_align - align))))
22854 /* It is possible that we copied enough so the main loop will not
22855 execute. */
22856 gcc_assert (size_needed > 1);
22857 if (label == NULL_RTX)
22858 label = gen_label_rtx ();
22859 emit_cmp_and_jump_insns (count_exp,
22860 GEN_INT (size_needed),
22861 LTU, 0, counter_mode (count_exp), 1, label);
22862 if (expected_size == -1
22863 || expected_size < (desired_align - align) / 2 + size_needed)
22864 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22865 else
22866 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22869 if (label && size_needed == 1)
22871 emit_label (label);
22872 LABEL_NUSES (label) = 1;
22873 label = NULL;
22874 epilogue_size_needed = 1;
22876 else if (label == NULL_RTX)
22877 epilogue_size_needed = size_needed;
22879 /* Step 3: Main loop. */
22881 switch (alg)
22883 case libcall:
22884 case no_stringop:
22885 gcc_unreachable ();
22886 case loop_1_byte:
22887 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22888 count_exp, QImode, 1, expected_size);
22889 break;
22890 case loop:
22891 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22892 count_exp, word_mode, 1, expected_size);
22893 break;
22894 case unrolled_loop:
22895 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22896 registers for 4 temporaries anyway. */
22897 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22898 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22899 expected_size);
22900 break;
22901 case rep_prefix_8_byte:
22902 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22903 DImode);
22904 break;
22905 case rep_prefix_4_byte:
22906 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22907 SImode);
22908 break;
22909 case rep_prefix_1_byte:
22910 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22911 QImode);
22912 break;
22914 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22915 if (CONST_INT_P (count_exp))
22917 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22918 (count / size_needed) * size_needed);
22919 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22920 (count / size_needed) * size_needed);
22922 else
22924 src = change_address (src, BLKmode, srcreg);
22925 dst = change_address (dst, BLKmode, destreg);
22928 /* Step 4: Epilogue to copy the remaining bytes. */
22929 epilogue:
22930 if (label)
22932 /* When the main loop is done, COUNT_EXP might hold the original count,
22933 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22934 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22935 bytes. Compensate if needed. */
22937 if (size_needed < epilogue_size_needed)
22939 tmp =
22940 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22941 GEN_INT (size_needed - 1), count_exp, 1,
22942 OPTAB_DIRECT);
22943 if (tmp != count_exp)
22944 emit_move_insn (count_exp, tmp);
22946 emit_label (label);
22947 LABEL_NUSES (label) = 1;
22950 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22951 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22952 epilogue_size_needed);
22953 if (jump_around_label)
22954 emit_label (jump_around_label);
22955 return true;
22958 /* Helper function for memset. For a QImode value 0xXY produce
22959 0xXYXYXYXY of the width specified by MODE. This is essentially
22960 a * 0x01010101, but we can do slightly better than
22961 synth_mult by unwinding the sequence by hand on CPUs with
22962 slow multiply. */
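/* Illustrative arithmetic (added commentary): with VAL == 0x5A,
   0x5A * 0x01010101 == 0x5A5A5A5A; the hand-unwound sequence below
   computes the same value as
     x |= x << 8;  x |= x << 16;  (and x |= x << 32 for DImode)
   when the multiply would be slower.  */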
22963 static rtx
22964 promote_duplicated_reg (enum machine_mode mode, rtx val)
22966 enum machine_mode valmode = GET_MODE (val);
22967 rtx tmp;
22968 int nops = mode == DImode ? 3 : 2;
22970 gcc_assert (mode == SImode || mode == DImode);
22971 if (val == const0_rtx)
22972 return copy_to_mode_reg (mode, const0_rtx);
22973 if (CONST_INT_P (val))
22975 HOST_WIDE_INT v = INTVAL (val) & 255;
22977 v |= v << 8;
22978 v |= v << 16;
22979 if (mode == DImode)
22980 v |= (v << 16) << 16;
22981 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22984 if (valmode == VOIDmode)
22985 valmode = QImode;
22986 if (valmode != QImode)
22987 val = gen_lowpart (QImode, val);
22988 if (mode == QImode)
22989 return val;
22990 if (!TARGET_PARTIAL_REG_STALL)
22991 nops--;
22992 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22993 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22994 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22995 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22997 rtx reg = convert_modes (mode, QImode, val, true);
22998 tmp = promote_duplicated_reg (mode, const1_rtx);
22999 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23000 OPTAB_DIRECT);
23002 else
23004 rtx reg = convert_modes (mode, QImode, val, true);
23006 if (!TARGET_PARTIAL_REG_STALL)
23007 if (mode == SImode)
23008 emit_insn (gen_movsi_insv_1 (reg, reg));
23009 else
23010 emit_insn (gen_movdi_insv_1 (reg, reg));
23011 else
23013 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23014 NULL, 1, OPTAB_DIRECT);
23015 reg =
23016 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23018 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23019 NULL, 1, OPTAB_DIRECT);
23020 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23021 if (mode == SImode)
23022 return reg;
23023 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23024 NULL, 1, OPTAB_DIRECT);
23025 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23026 return reg;
23030 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23031 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23032 alignment from ALIGN to DESIRED_ALIGN. */
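/* Example (added commentary): for a 64-bit memset with size_needed == 8
   the value is duplicated into a DImode register (0xXYXYXYXYXYXYXYXY);
   with size_needed == 4 and no extra alignment need, an SImode copy
   (0xXYXYXYXY) is sufficient for both the main loop and the epilogue.  */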
23033 static rtx
23034 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23036 rtx promoted_val;
23038 if (TARGET_64BIT
23039 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23040 promoted_val = promote_duplicated_reg (DImode, val);
23041 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23042 promoted_val = promote_duplicated_reg (SImode, val);
23043 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23044 promoted_val = promote_duplicated_reg (HImode, val);
23045 else
23046 promoted_val = val;
23048 return promoted_val;
23051 /* Expand string clear operation (bzero). Use i386 string operations when
23052 profitable. See expand_movmem comment for explanation of individual
23053 steps performed. */
23054 bool
23055 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23056 rtx expected_align_exp, rtx expected_size_exp)
23058 rtx destreg;
23059 rtx label = NULL;
23060 rtx tmp;
23061 rtx jump_around_label = NULL;
23062 HOST_WIDE_INT align = 1;
23063 unsigned HOST_WIDE_INT count = 0;
23064 HOST_WIDE_INT expected_size = -1;
23065 int size_needed = 0, epilogue_size_needed;
23066 int desired_align = 0, align_bytes = 0;
23067 enum stringop_alg alg;
23068 rtx promoted_val = NULL;
23069 bool force_loopy_epilogue = false;
23070 int dynamic_check;
23071 bool need_zero_guard = false;
23072 bool noalign;
23074 if (CONST_INT_P (align_exp))
23075 align = INTVAL (align_exp);
23076 /* i386 can do misaligned access at reasonably increased cost. */
23077 if (CONST_INT_P (expected_align_exp)
23078 && INTVAL (expected_align_exp) > align)
23079 align = INTVAL (expected_align_exp);
23080 if (CONST_INT_P (count_exp))
23081 count = expected_size = INTVAL (count_exp);
23082 if (CONST_INT_P (expected_size_exp) && count == 0)
23083 expected_size = INTVAL (expected_size_exp);
23085 /* Make sure we don't need to care about overflow later on. */
23086 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23087 return false;
23089 /* Step 0: Decide on preferred algorithm, desired alignment and
23090 size of chunks to be copied by main loop. */
23092 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23093 desired_align = decide_alignment (align, alg, expected_size);
23095 if (!TARGET_ALIGN_STRINGOPS || noalign)
23096 align = desired_align;
23098 if (alg == libcall)
23099 return false;
23100 gcc_assert (alg != no_stringop);
23101 if (!count)
23102 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23103 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23104 switch (alg)
23106 case libcall:
23107 case no_stringop:
23108 gcc_unreachable ();
23109 case loop:
23110 need_zero_guard = true;
23111 size_needed = GET_MODE_SIZE (word_mode);
23112 break;
23113 case unrolled_loop:
23114 need_zero_guard = true;
23115 size_needed = GET_MODE_SIZE (word_mode) * 4;
23116 break;
23117 case rep_prefix_8_byte:
23118 size_needed = 8;
23119 break;
23120 case rep_prefix_4_byte:
23121 size_needed = 4;
23122 break;
23123 case rep_prefix_1_byte:
23124 size_needed = 1;
23125 break;
23126 case loop_1_byte:
23127 need_zero_guard = true;
23128 size_needed = 1;
23129 break;
23131 epilogue_size_needed = size_needed;
23133 /* Step 1: Prologue guard. */
23135 /* Alignment code needs count to be in register. */
23136 if (CONST_INT_P (count_exp) && desired_align > align)
23138 if (INTVAL (count_exp) > desired_align
23139 && INTVAL (count_exp) > size_needed)
23141 align_bytes
23142 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23143 if (align_bytes <= 0)
23144 align_bytes = 0;
23145 else
23146 align_bytes = desired_align - align_bytes;
23148 if (align_bytes == 0)
23150 enum machine_mode mode = SImode;
23151 if (TARGET_64BIT && (count & ~0xffffffff))
23152 mode = DImode;
23153 count_exp = force_reg (mode, count_exp);
23156 /* Do the cheap promotion to allow better CSE across the
23157 main loop and epilogue (i.e. one load of the big constant in
23158 front of all the code). */
23159 if (CONST_INT_P (val_exp))
23160 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23161 desired_align, align);
23162 /* Ensure that alignment prologue won't copy past end of block. */
23163 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23165 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23166 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23167 Make sure it is a power of 2. */
23168 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23170 /* To improve performance of small blocks, we jump around the VAL
23171 promoting code. This means that if the promoted VAL is not constant,
23172 we might not use it in the epilogue and have to use the byte
23173 loop variant. */
23174 if (epilogue_size_needed > 2 && !promoted_val)
23175 force_loopy_epilogue = true;
23176 if (count)
23178 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23180 /* If main algorithm works on QImode, no epilogue is needed.
23181 For small sizes just don't align anything. */
23182 if (size_needed == 1)
23183 desired_align = align;
23184 else
23185 goto epilogue;
23188 else
23190 label = gen_label_rtx ();
23191 emit_cmp_and_jump_insns (count_exp,
23192 GEN_INT (epilogue_size_needed),
23193 LTU, 0, counter_mode (count_exp), 1, label);
23194 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23195 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23196 else
23197 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23200 if (dynamic_check != -1)
23202 rtx hot_label = gen_label_rtx ();
23203 jump_around_label = gen_label_rtx ();
23204 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23205 LEU, 0, counter_mode (count_exp), 1, hot_label);
23206 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23207 set_storage_via_libcall (dst, count_exp, val_exp, false);
23208 emit_jump (jump_around_label);
23209 emit_label (hot_label);
23212 /* Step 2: Alignment prologue. */
23214 /* Do the expensive promotion once we branched off the small blocks. */
23215 if (!promoted_val)
23216 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23217 desired_align, align);
23218 gcc_assert (desired_align >= 1 && align >= 1);
23220 if (desired_align > align)
23222 if (align_bytes == 0)
23224 /* Except for the first move in the epilogue, we no longer know
23225 the constant offset in the aliasing info. It does not seem worth
23226 the pain to maintain it for the first move, so throw away
23227 the info early. */
23228 dst = change_address (dst, BLKmode, destreg);
23229 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23230 desired_align);
23232 else
23234 /* If we know how many bytes need to be stored before dst is
23235 sufficiently aligned, maintain aliasing info accurately. */
23236 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23237 desired_align, align_bytes);
23238 count_exp = plus_constant (counter_mode (count_exp),
23239 count_exp, -align_bytes);
23240 count -= align_bytes;
23242 if (need_zero_guard
23243 && (count < (unsigned HOST_WIDE_INT) size_needed
23244 || (align_bytes == 0
23245 && count < ((unsigned HOST_WIDE_INT) size_needed
23246 + desired_align - align))))
23248 /* It is possible that we copied enough so the main loop will not
23249 execute. */
23250 gcc_assert (size_needed > 1);
23251 if (label == NULL_RTX)
23252 label = gen_label_rtx ();
23253 emit_cmp_and_jump_insns (count_exp,
23254 GEN_INT (size_needed),
23255 LTU, 0, counter_mode (count_exp), 1, label);
23256 if (expected_size == -1
23257 || expected_size < (desired_align - align) / 2 + size_needed)
23258 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23259 else
23260 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23263 if (label && size_needed == 1)
23265 emit_label (label);
23266 LABEL_NUSES (label) = 1;
23267 label = NULL;
23268 promoted_val = val_exp;
23269 epilogue_size_needed = 1;
23271 else if (label == NULL_RTX)
23272 epilogue_size_needed = size_needed;
23274 /* Step 3: Main loop. */
23276 switch (alg)
23278 case libcall:
23279 case no_stringop:
23280 gcc_unreachable ();
23281 case loop_1_byte:
23282 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23283 count_exp, QImode, 1, expected_size);
23284 break;
23285 case loop:
23286 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23287 count_exp, word_mode, 1, expected_size);
23288 break;
23289 case unrolled_loop:
23290 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23291 count_exp, word_mode, 4, expected_size);
23292 break;
23293 case rep_prefix_8_byte:
23294 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23295 DImode, val_exp);
23296 break;
23297 case rep_prefix_4_byte:
23298 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23299 SImode, val_exp);
23300 break;
23301 case rep_prefix_1_byte:
23302 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23303 QImode, val_exp);
23304 break;
23306 /* Properly adjust the offset of the dest memory for aliasing. */
23307 if (CONST_INT_P (count_exp))
23308 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23309 (count / size_needed) * size_needed);
23310 else
23311 dst = change_address (dst, BLKmode, destreg);
23313 /* Step 4: Epilogue to copy the remaining bytes. */
23315 if (label)
23317 /* When the main loop is done, COUNT_EXP might hold the original count,
23318 while we want to set only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23319 Epilogue code will actually set COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23320 bytes. Compensate if needed. */
23322 if (size_needed < epilogue_size_needed)
23324 tmp =
23325 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23326 GEN_INT (size_needed - 1), count_exp, 1,
23327 OPTAB_DIRECT);
23328 if (tmp != count_exp)
23329 emit_move_insn (count_exp, tmp);
23331 emit_label (label);
23332 LABEL_NUSES (label) = 1;
23334 epilogue:
23335 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23337 if (force_loopy_epilogue)
23338 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23339 epilogue_size_needed);
23340 else
23341 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23342 epilogue_size_needed);
23344 if (jump_around_label)
23345 emit_label (jump_around_label);
23346 return true;
23349 /* Expand the appropriate insns for doing strlen if not just doing
23350 repnz; scasb
23352 out = result, initialized with the start address
23353 align_rtx = alignment of the address.
23354 scratch = scratch register, initialized with the start address when
23355 not aligned, otherwise undefined
23357 This is just the body. It needs the initializations mentioned above and
23358 some address computing at the end. These things are done in i386.md. */
23360 static void
23361 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23363 int align;
23364 rtx tmp;
23365 rtx align_2_label = NULL_RTX;
23366 rtx align_3_label = NULL_RTX;
23367 rtx align_4_label = gen_label_rtx ();
23368 rtx end_0_label = gen_label_rtx ();
23369 rtx mem;
23370 rtx tmpreg = gen_reg_rtx (SImode);
23371 rtx scratch = gen_reg_rtx (SImode);
23372 rtx cmp;
23374 align = 0;
23375 if (CONST_INT_P (align_rtx))
23376 align = INTVAL (align_rtx);
23378 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23380 /* Is there a known alignment and is it less than 4? */
23381 if (align < 4)
23383 rtx scratch1 = gen_reg_rtx (Pmode);
23384 emit_move_insn (scratch1, out);
23385 /* Is there a known alignment and is it not 2? */
23386 if (align != 2)
23388 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23389 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23391 /* Leave just the 3 lower bits. */
23392 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23393 NULL_RTX, 0, OPTAB_WIDEN);
23395 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23396 Pmode, 1, align_4_label);
23397 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23398 Pmode, 1, align_2_label);
23399 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23400 Pmode, 1, align_3_label);
23402 else
23404 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23405 check if it is aligned to a 4-byte boundary. */
23407 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23408 NULL_RTX, 0, OPTAB_WIDEN);
23410 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23411 Pmode, 1, align_4_label);
23414 mem = change_address (src, QImode, out);
23416 /* Now compare the bytes. */
23418 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23419 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23420 QImode, 1, end_0_label);
23422 /* Increment the address. */
23423 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23425 /* Not needed with an alignment of 2 */
23426 if (align != 2)
23428 emit_label (align_2_label);
23430 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23431 end_0_label);
23433 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23435 emit_label (align_3_label);
23438 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23439 end_0_label);
23441 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23444 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23445 align this loop; it only makes the program larger and does not help
23446 to speed it up. */
23447 emit_label (align_4_label);
23449 mem = change_address (src, SImode, out);
23450 emit_move_insn (scratch, mem);
23451 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23453 /* This formula yields a nonzero result iff one of the bytes is zero.
23454 This saves three branches inside the loop and many cycles. */
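/* Worked example (added commentary): for x == 0x11002233,
   (x - 0x01010101) & ~x & 0x80808080 == 0x00800000, flagging the zero
   byte; for x == 0x11223344 the expression is 0 and the loop continues.  */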
23456 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23457 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23458 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23459 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23460 gen_int_mode (0x80808080, SImode)));
23461 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23462 align_4_label);
23464 if (TARGET_CMOVE)
23466 rtx reg = gen_reg_rtx (SImode);
23467 rtx reg2 = gen_reg_rtx (Pmode);
23468 emit_move_insn (reg, tmpreg);
23469 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23471 /* If zero is not in the first two bytes, move two bytes forward. */
23472 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23473 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23474 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23475 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23476 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23477 reg,
23478 tmpreg)));
23479 /* Emit lea manually to avoid clobbering of flags. */
23480 emit_insn (gen_rtx_SET (SImode, reg2,
23481 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23483 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23484 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23485 emit_insn (gen_rtx_SET (VOIDmode, out,
23486 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23487 reg2,
23488 out)));
23490 else
23492 rtx end_2_label = gen_label_rtx ();
23493 /* Is zero in the first two bytes? */
23495 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23496 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23497 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23498 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23499 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23500 pc_rtx);
23501 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23502 JUMP_LABEL (tmp) = end_2_label;
23504 /* Not in the first two. Move two bytes forward. */
23505 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23506 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23508 emit_label (end_2_label);
23512 /* Avoid branch in fixing the byte. */
23513 tmpreg = gen_lowpart (QImode, tmpreg);
23514 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23515 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23516 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23517 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23519 emit_label (end_0_label);
23522 /* Expand strlen. */
23524 bool
23525 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23527 rtx addr, scratch1, scratch2, scratch3, scratch4;
23529 /* The generic case of the strlen expander is long. Avoid expanding
23530 it unless TARGET_INLINE_ALL_STRINGOPS. */
23532 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23533 && !TARGET_INLINE_ALL_STRINGOPS
23534 && !optimize_insn_for_size_p ()
23535 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23536 return false;
23538 addr = force_reg (Pmode, XEXP (src, 0));
23539 scratch1 = gen_reg_rtx (Pmode);
23541 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23542 && !optimize_insn_for_size_p ())
23544 /* Well it seems that some optimizer does not combine a call like
23545 foo(strlen(bar), strlen(bar));
23546 when the move and the subtraction are done here. It does calculate
23547 the length just once when these instructions are done inside of
23548 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23549 often used and I use one fewer register for the lifetime of
23550 output_strlen_unroll() this is better. */
23552 emit_move_insn (out, addr);
23554 ix86_expand_strlensi_unroll_1 (out, src, align);
23556 /* strlensi_unroll_1 returns the address of the zero at the end of
23557 the string, like memchr(), so compute the length by subtracting
23558 the start address. */
23559 emit_insn (ix86_gen_sub3 (out, out, addr));
23561 else
23563 rtx unspec;
23565 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23566 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23567 return false;
23569 scratch2 = gen_reg_rtx (Pmode);
23570 scratch3 = gen_reg_rtx (Pmode);
23571 scratch4 = force_reg (Pmode, constm1_rtx);
23573 emit_move_insn (scratch3, addr);
23574 eoschar = force_reg (QImode, eoschar);
23576 src = replace_equiv_address_nv (src, scratch3);
23578 /* If .md starts supporting :P, this can be done in .md. */
23579 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23580 scratch4), UNSPEC_SCAS);
23581 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23582 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23583 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
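/* Illustrative arithmetic (added commentary), assuming strlenqi_1 leaves
   the decremented count in SCRATCH1: the count starts at -1 and len + 1
   bytes are scanned, so SCRATCH1 == -(len + 2); then ~SCRATCH1 == len + 1
   and adding -1 yields len.  E.g. for a 3-character string:
   ~(-5) == 4, 4 - 1 == 3.  */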
23585 return true;
23588 /* For a given symbol (function), construct code to compute the address of its
23589 PLT entry in the large x86-64 PIC model. */
23590 static rtx
23591 construct_plt_address (rtx symbol)
23593 rtx tmp, unspec;
23595 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23596 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23597 gcc_assert (Pmode == DImode);
23599 tmp = gen_reg_rtx (Pmode);
23600 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23602 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23603 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23604 return tmp;
23608 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23609 rtx callarg2,
23610 rtx pop, bool sibcall)
23612 /* We need to represent that SI and DI registers are clobbered
23613 by SYSV calls. */
23614 static int clobbered_registers[] = {
23615 XMM6_REG, XMM7_REG, XMM8_REG,
23616 XMM9_REG, XMM10_REG, XMM11_REG,
23617 XMM12_REG, XMM13_REG, XMM14_REG,
23618 XMM15_REG, SI_REG, DI_REG
23620 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23621 rtx use = NULL, call;
23622 unsigned int vec_len;
23624 if (pop == const0_rtx)
23625 pop = NULL;
23626 gcc_assert (!TARGET_64BIT || !pop);
23628 if (TARGET_MACHO && !TARGET_64BIT)
23630 #if TARGET_MACHO
23631 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23632 fnaddr = machopic_indirect_call_target (fnaddr);
23633 #endif
23635 else
23637 /* Static functions and indirect calls don't need the pic register. */
23638 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23639 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23640 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23641 use_reg (&use, pic_offset_table_rtx);
23644 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23646 rtx al = gen_rtx_REG (QImode, AX_REG);
23647 emit_move_insn (al, callarg2);
23648 use_reg (&use, al);
23651 if (ix86_cmodel == CM_LARGE_PIC
23652 && MEM_P (fnaddr)
23653 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23654 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23655 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23656 else if (sibcall
23657 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23658 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23660 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23661 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23664 vec_len = 0;
23665 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23666 if (retval)
23667 call = gen_rtx_SET (VOIDmode, retval, call);
23668 vec[vec_len++] = call;
23670 if (pop)
23672 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23673 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23674 vec[vec_len++] = pop;
23677 if (TARGET_64BIT_MS_ABI
23678 && (!callarg2 || INTVAL (callarg2) != -2))
23680 unsigned i;
23682 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23683 UNSPEC_MS_TO_SYSV_CALL);
23685 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23686 vec[vec_len++]
23687 = gen_rtx_CLOBBER (VOIDmode,
23688 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23689 ? TImode : DImode,
23690 clobbered_registers[i]));
23693 if (vec_len > 1)
23694 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23695 call = emit_call_insn (call);
23696 if (use)
23697 CALL_INSN_FUNCTION_USAGE (call) = use;
23699 return call;
23702 /* Output the assembly for a call instruction. */
23704 const char *
23705 ix86_output_call_insn (rtx insn, rtx call_op)
23707 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23708 bool seh_nop_p = false;
23709 const char *xasm;
23711 if (SIBLING_CALL_P (insn))
23713 if (direct_p)
23714 xasm = "jmp\t%P0";
23715 /* SEH epilogue detection requires the indirect branch case
23716 to include REX.W. */
23717 else if (TARGET_SEH)
23718 xasm = "rex.W jmp %A0";
23719 else
23720 xasm = "jmp\t%A0";
23722 output_asm_insn (xasm, &call_op);
23723 return "";
23726 /* SEH unwinding can require an extra nop to be emitted in several
23727 circumstances. Determine if we have one of those. */
23728 if (TARGET_SEH)
23730 rtx i;
23732 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23734 /* If we get to another real insn, we don't need the nop. */
23735 if (INSN_P (i))
23736 break;
23738 /* If we get to the epilogue note, prevent a catch region from
23739 being adjacent to the standard epilogue sequence. If non-
23740 call-exceptions, we'll have done this during epilogue emission. */
23741 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23742 && !flag_non_call_exceptions
23743 && !can_throw_internal (insn))
23745 seh_nop_p = true;
23746 break;
23750 /* If we didn't find a real insn following the call, prevent the
23751 unwinder from looking into the next function. */
23752 if (i == NULL)
23753 seh_nop_p = true;
23756 if (direct_p)
23757 xasm = "call\t%P0";
23758 else
23759 xasm = "call\t%A0";
23761 output_asm_insn (xasm, &call_op);
23763 if (seh_nop_p)
23764 return "nop";
23766 return "";
23769 /* Clear stack slot assignments remembered from previous functions.
23770 This is called from INIT_EXPANDERS once before RTL is emitted for each
23771 function. */
23773 static struct machine_function *
23774 ix86_init_machine_status (void)
23776 struct machine_function *f;
23778 f = ggc_alloc_cleared_machine_function ();
23779 f->use_fast_prologue_epilogue_nregs = -1;
23780 f->call_abi = ix86_abi;
23782 return f;
23785 /* Return a MEM corresponding to a stack slot with mode MODE.
23786 Allocate a new slot if necessary.
23788 The RTL for a function can have several slots available: N is
23789 which slot to use. */
23792 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23794 struct stack_local_entry *s;
23796 gcc_assert (n < MAX_386_STACK_LOCALS);
23798 for (s = ix86_stack_locals; s; s = s->next)
23799 if (s->mode == mode && s->n == n)
23800 return validize_mem (copy_rtx (s->rtl));
23802 s = ggc_alloc_stack_local_entry ();
23803 s->n = n;
23804 s->mode = mode;
23805 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23807 s->next = ix86_stack_locals;
23808 ix86_stack_locals = s;
23809 return validize_mem (s->rtl);
23812 static void
23813 ix86_instantiate_decls (void)
23815 struct stack_local_entry *s;
23817 for (s = ix86_stack_locals; s; s = s->next)
23818 if (s->rtl != NULL_RTX)
23819 instantiate_decl_rtl (s->rtl);
23822 /* Calculate the length of the memory address in the instruction encoding.
23823 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23824 or other prefixes. We never generate addr32 prefix for LEA insn. */
23827 memory_address_length (rtx addr, bool lea)
23829 struct ix86_address parts;
23830 rtx base, index, disp;
23831 int len;
23832 int ok;
23834 if (GET_CODE (addr) == PRE_DEC
23835 || GET_CODE (addr) == POST_INC
23836 || GET_CODE (addr) == PRE_MODIFY
23837 || GET_CODE (addr) == POST_MODIFY)
23838 return 0;
23840 ok = ix86_decompose_address (addr, &parts);
23841 gcc_assert (ok);
23843 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23845 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23846 if (TARGET_64BIT && !lea
23847 && (SImode_address_operand (addr, VOIDmode)
23848 || (parts.base && GET_MODE (parts.base) == SImode)
23849 || (parts.index && GET_MODE (parts.index) == SImode)))
23850 len++;
23852 base = parts.base;
23853 index = parts.index;
23854 disp = parts.disp;
23856 if (base && GET_CODE (base) == SUBREG)
23857 base = SUBREG_REG (base);
23858 if (index && GET_CODE (index) == SUBREG)
23859 index = SUBREG_REG (index);
23861 gcc_assert (base == NULL_RTX || REG_P (base));
23862 gcc_assert (index == NULL_RTX || REG_P (index));
23864 /* Rule of thumb:
23865 - esp as the base always wants an index,
23866 - ebp as the base always wants a displacement,
23867 - r12 as the base always wants an index,
23868 - r13 as the base always wants a displacement. */
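/* Illustrative encodings (added commentary): "movl (%ebx), %eax" needs no
   extra address bytes, "movl (%esp), %eax" needs an extra SIB byte and
   "movl (%ebp), %eax" an extra zero disp8 -- which is what the length
   bumps below for SP/R12 and BP/R13 based addresses account for.  */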
23870 /* Register Indirect. */
23871 if (base && !index && !disp)
23873 /* esp (for its index) and ebp (for its displacement) need
23874 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23875 code. */
23876 if (base == arg_pointer_rtx
23877 || base == frame_pointer_rtx
23878 || REGNO (base) == SP_REG
23879 || REGNO (base) == BP_REG
23880 || REGNO (base) == R12_REG
23881 || REGNO (base) == R13_REG)
23882 len++;
23885 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23886 is not disp32, but disp32(%rip), so for disp32
23887 SIB byte is needed, unless print_operand_address
23888 optimizes it into disp32(%rip) or (%rip) is implied
23889 by UNSPEC. */
23890 else if (disp && !base && !index)
23892 len += 4;
23893 if (TARGET_64BIT)
23895 rtx symbol = disp;
23897 if (GET_CODE (disp) == CONST)
23898 symbol = XEXP (disp, 0);
23899 if (GET_CODE (symbol) == PLUS
23900 && CONST_INT_P (XEXP (symbol, 1)))
23901 symbol = XEXP (symbol, 0);
23903 if (GET_CODE (symbol) != LABEL_REF
23904 && (GET_CODE (symbol) != SYMBOL_REF
23905 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23906 && (GET_CODE (symbol) != UNSPEC
23907 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23908 && XINT (symbol, 1) != UNSPEC_PCREL
23909 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23910 len++;
23913 else
23915 /* Find the length of the displacement constant. */
23916 if (disp)
23918 if (base && satisfies_constraint_K (disp))
23919 len += 1;
23920 else
23921 len += 4;
23923 /* ebp always wants a displacement. Similarly r13. */
23924 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23925 len++;
23927 /* An index requires the two-byte modrm form.... */
23928 if (index
23929 /* ...like esp (or r12), which always wants an index. */
23930 || base == arg_pointer_rtx
23931 || base == frame_pointer_rtx
23932 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23933 len++;
23936 return len;
23939 /* Compute the default value for the "length_immediate" attribute. When
23940 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
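/* Illustrative example (added commentary): "addl $100, %edx" fits the
   sign-extended 8-bit immediate form, so length_immediate is 1, while
   "addl $300, %edx" is outside -128..127 and needs a 32-bit immediate,
   giving length_immediate 4.  */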
23942 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23944 int len = 0;
23945 int i;
23946 extract_insn_cached (insn);
23947 for (i = recog_data.n_operands - 1; i >= 0; --i)
23948 if (CONSTANT_P (recog_data.operand[i]))
23950 enum attr_mode mode = get_attr_mode (insn);
23952 gcc_assert (!len);
23953 if (shortform && CONST_INT_P (recog_data.operand[i]))
23955 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23956 switch (mode)
23958 case MODE_QI:
23959 len = 1;
23960 continue;
23961 case MODE_HI:
23962 ival = trunc_int_for_mode (ival, HImode);
23963 break;
23964 case MODE_SI:
23965 ival = trunc_int_for_mode (ival, SImode);
23966 break;
23967 default:
23968 break;
23970 if (IN_RANGE (ival, -128, 127))
23972 len = 1;
23973 continue;
23976 switch (mode)
23978 case MODE_QI:
23979 len = 1;
23980 break;
23981 case MODE_HI:
23982 len = 2;
23983 break;
23984 case MODE_SI:
23985 len = 4;
23986 break;
23987 /* Immediates for DImode instructions are encoded
23988 as 32-bit sign-extended values. */
23989 case MODE_DI:
23990 len = 4;
23991 break;
23992 default:
23993 fatal_insn ("unknown insn mode", insn);
23996 return len;
23999 /* Compute default value for "length_address" attribute. */
24001 ix86_attr_length_address_default (rtx insn)
24003 int i;
24005 if (get_attr_type (insn) == TYPE_LEA)
24007 rtx set = PATTERN (insn), addr;
24009 if (GET_CODE (set) == PARALLEL)
24010 set = XVECEXP (set, 0, 0);
24012 gcc_assert (GET_CODE (set) == SET);
24014 addr = SET_SRC (set);
24016 return memory_address_length (addr, true);
24019 extract_insn_cached (insn);
24020 for (i = recog_data.n_operands - 1; i >= 0; --i)
24021 if (MEM_P (recog_data.operand[i]))
24023 constrain_operands_cached (reload_completed);
24024 if (which_alternative != -1)
24026 const char *constraints = recog_data.constraints[i];
24027 int alt = which_alternative;
24029 while (*constraints == '=' || *constraints == '+')
24030 constraints++;
24031 while (alt-- > 0)
24032 while (*constraints++ != ',')
24034 /* Skip ignored operands. */
24035 if (*constraints == 'X')
24036 continue;
24038 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24040 return 0;
24043 /* Compute the default value for the "length_vex" attribute. It includes
24044 the 2- or 3-byte VEX prefix and 1 opcode byte. */
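/* Illustrative example (added commentary): in 64-bit mode a VEX insn that
   needs none of REX.W/X/B -- e.g. a 0f-map insn using only xmm0-xmm7 and
   no extended registers in the memory address -- gets the 2-byte prefix
   (2 + 1); a DImode general register operand or an extended register in
   the address forces the 3-byte prefix (3 + 1), as computed below.  */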
24047 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24049 int i;
24051 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
24052 requires the 3-byte VEX prefix. */
24053 if (!has_0f_opcode || has_vex_w)
24054 return 3 + 1;
24056 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24057 if (!TARGET_64BIT)
24058 return 2 + 1;
24060 extract_insn_cached (insn);
24062 for (i = recog_data.n_operands - 1; i >= 0; --i)
24063 if (REG_P (recog_data.operand[i]))
24065 /* REX.W bit uses 3 byte VEX prefix. */
24066 if (GET_MODE (recog_data.operand[i]) == DImode
24067 && GENERAL_REG_P (recog_data.operand[i]))
24068 return 3 + 1;
24070 else
24072 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24073 if (MEM_P (recog_data.operand[i])
24074 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24075 return 3 + 1;
24078 return 2 + 1;
24081 /* Return the maximum number of instructions a cpu can issue. */
24083 static int
24084 ix86_issue_rate (void)
24086 switch (ix86_tune)
24088 case PROCESSOR_PENTIUM:
24089 case PROCESSOR_ATOM:
24090 case PROCESSOR_K6:
24091 case PROCESSOR_BTVER2:
24092 return 2;
24094 case PROCESSOR_PENTIUMPRO:
24095 case PROCESSOR_PENTIUM4:
24096 case PROCESSOR_CORE2:
24097 case PROCESSOR_COREI7:
24098 case PROCESSOR_HASWELL:
24099 case PROCESSOR_ATHLON:
24100 case PROCESSOR_K8:
24101 case PROCESSOR_AMDFAM10:
24102 case PROCESSOR_NOCONA:
24103 case PROCESSOR_GENERIC32:
24104 case PROCESSOR_GENERIC64:
24105 case PROCESSOR_BDVER1:
24106 case PROCESSOR_BDVER2:
24107 case PROCESSOR_BDVER3:
24108 case PROCESSOR_BTVER1:
24109 return 3;
24111 default:
24112 return 1;
24116 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
24117 by DEP_INSN and nothing else set by DEP_INSN. */
24119 static bool
24120 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24122 rtx set, set2;
24124 /* Simplify the test for uninteresting insns. */
24125 if (insn_type != TYPE_SETCC
24126 && insn_type != TYPE_ICMOV
24127 && insn_type != TYPE_FCMOV
24128 && insn_type != TYPE_IBR)
24129 return false;
24131 if ((set = single_set (dep_insn)) != 0)
24133 set = SET_DEST (set);
24134 set2 = NULL_RTX;
24136 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24137 && XVECLEN (PATTERN (dep_insn), 0) == 2
24138 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24139 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24141 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24142 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24144 else
24145 return false;
24147 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24148 return false;
24150 /* This test is true if the dependent insn reads the flags but
24151 not any other potentially set register. */
24152 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24153 return false;
24155 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24156 return false;
24158 return true;
24161 /* Return true iff USE_INSN has a memory address with operands set by
24162 SET_INSN. */
24164 bool
24165 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24167 int i;
24168 extract_insn_cached (use_insn);
24169 for (i = recog_data.n_operands - 1; i >= 0; --i)
24170 if (MEM_P (recog_data.operand[i]))
24172 rtx addr = XEXP (recog_data.operand[i], 0);
24173 return modified_in_p (addr, set_insn) != 0;
24175 return false;
24178 static int
24179 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24181 enum attr_type insn_type, dep_insn_type;
24182 enum attr_memory memory;
24183 rtx set, set2;
24184 int dep_insn_code_number;
24186 /* Anti and output dependencies have zero cost on all CPUs. */
24187 if (REG_NOTE_KIND (link) != 0)
24188 return 0;
24190 dep_insn_code_number = recog_memoized (dep_insn);
24192 /* If we can't recognize the insns, we can't really do anything. */
24193 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24194 return cost;
24196 insn_type = get_attr_type (insn);
24197 dep_insn_type = get_attr_type (dep_insn);
24199 switch (ix86_tune)
24201 case PROCESSOR_PENTIUM:
24202 /* Address Generation Interlock adds a cycle of latency. */
24203 if (insn_type == TYPE_LEA)
24205 rtx addr = PATTERN (insn);
24207 if (GET_CODE (addr) == PARALLEL)
24208 addr = XVECEXP (addr, 0, 0);
24210 gcc_assert (GET_CODE (addr) == SET);
24212 addr = SET_SRC (addr);
24213 if (modified_in_p (addr, dep_insn))
24214 cost += 1;
24216 else if (ix86_agi_dependent (dep_insn, insn))
24217 cost += 1;
24219 /* ??? Compares pair with jump/setcc. */
24220 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24221 cost = 0;
24223 /* Floating point stores require value to be ready one cycle earlier. */
24224 if (insn_type == TYPE_FMOV
24225 && get_attr_memory (insn) == MEMORY_STORE
24226 && !ix86_agi_dependent (dep_insn, insn))
24227 cost += 1;
24228 break;
24230 case PROCESSOR_PENTIUMPRO:
24231 memory = get_attr_memory (insn);
24233 /* INT->FP conversion is expensive. */
24234 if (get_attr_fp_int_src (dep_insn))
24235 cost += 5;
24237 /* There is one cycle extra latency between an FP op and a store. */
24238 if (insn_type == TYPE_FMOV
24239 && (set = single_set (dep_insn)) != NULL_RTX
24240 && (set2 = single_set (insn)) != NULL_RTX
24241 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24242 && MEM_P (SET_DEST (set2)))
24243 cost += 1;
24245 /* Show ability of reorder buffer to hide latency of load by executing
24246 in parallel with previous instruction in case
24247 previous instruction is not needed to compute the address. */
24248 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24249 && !ix86_agi_dependent (dep_insn, insn))
24251 /* Claim moves to take one cycle, as the core can issue one load
24252 at a time and the next load can start a cycle later. */
24253 if (dep_insn_type == TYPE_IMOV
24254 || dep_insn_type == TYPE_FMOV)
24255 cost = 1;
24256 else if (cost > 1)
24257 cost--;
24259 break;
24261 case PROCESSOR_K6:
24262 memory = get_attr_memory (insn);
24264 /* The esp dependency is resolved before the instruction is really
24265 finished. */
24266 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24267 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24268 return 1;
24270 /* INT->FP conversion is expensive. */
24271 if (get_attr_fp_int_src (dep_insn))
24272 cost += 5;
24274 /* Show ability of reorder buffer to hide latency of load by executing
24275 in parallel with previous instruction in case
24276 previous instruction is not needed to compute the address. */
24277 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24278 && !ix86_agi_dependent (dep_insn, insn))
24280 /* Claim moves to take one cycle, as the core can issue one load
24281 at a time and the next load can start a cycle later. */
24282 if (dep_insn_type == TYPE_IMOV
24283 || dep_insn_type == TYPE_FMOV)
24284 cost = 1;
24285 else if (cost > 2)
24286 cost -= 2;
24287 else
24288 cost = 1;
24290 break;
24292 case PROCESSOR_ATHLON:
24293 case PROCESSOR_K8:
24294 case PROCESSOR_AMDFAM10:
24295 case PROCESSOR_BDVER1:
24296 case PROCESSOR_BDVER2:
24297 case PROCESSOR_BDVER3:
24298 case PROCESSOR_BTVER1:
24299 case PROCESSOR_BTVER2:
24300 case PROCESSOR_ATOM:
24301 case PROCESSOR_GENERIC32:
24302 case PROCESSOR_GENERIC64:
24303 memory = get_attr_memory (insn);
24305 /* Show ability of reorder buffer to hide latency of load by executing
24306 in parallel with previous instruction in case
24307 previous instruction is not needed to compute the address. */
24308 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24309 && !ix86_agi_dependent (dep_insn, insn))
24311 enum attr_unit unit = get_attr_unit (insn);
24312 int loadcost = 3;
24314 /* Because of the difference between the length of integer and
24315 floating unit pipeline preparation stages, the memory operands
24316 for floating point are cheaper.
24318 ??? For Athlon the difference is most probably 2. */
24319 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24320 loadcost = 3;
24321 else
24322 loadcost = TARGET_ATHLON ? 2 : 0;
24324 if (cost >= loadcost)
24325 cost -= loadcost;
24326 else
24327 cost = 0;
24330 default:
24331 break;
24334 return cost;
24337 /* How many alternative schedules to try. This should be as wide as the
24338 scheduling freedom in the DFA, but no wider. Making this value too
24339 large results in extra work for the scheduler. */
24341 static int
24342 ia32_multipass_dfa_lookahead (void)
24344 switch (ix86_tune)
24346 case PROCESSOR_PENTIUM:
24347 return 2;
24349 case PROCESSOR_PENTIUMPRO:
24350 case PROCESSOR_K6:
24351 return 1;
24353 case PROCESSOR_CORE2:
24354 case PROCESSOR_COREI7:
24355 case PROCESSOR_HASWELL:
24356 case PROCESSOR_ATOM:
24357 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24358 as the number of instructions that can be executed in a cycle, i.e.,
24359 issue_rate. I wonder why tuning for many CPUs does not do this. */
24360 if (reload_completed)
24361 return ix86_issue_rate ();
24362 /* Don't use lookahead for pre-reload schedule to save compile time. */
24363 return 0;
24365 default:
24366 return 0;
24370 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24371 execution. It is applied if
24372 (1) an IMUL instruction is at the top of the list and
24373 (2) there is exactly one producer of an independent IMUL instruction
24374 in the ready list;
24375 if so, (3) the found producer is put at the top of the ready list.
24376 Returns the issue rate. */
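/* Illustrative example (added commentary): if the top of the ready list is
   an SImode IMUL and some earlier entry is the sole producer feeding a
   different, independent SImode IMUL, that producer is hoisted to the top
   so the second multiply can follow the first into the pipelined IMUL
   unit on the next cycle.  */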
24378 static int
24379 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24380 int clock_var ATTRIBUTE_UNUSED)
24382 static int issue_rate = -1;
24383 int n_ready = *pn_ready;
24384 rtx insn, insn1, insn2;
24385 int i;
24386 sd_iterator_def sd_it;
24387 dep_t dep;
24388 int index = -1;
24390 /* Set up issue rate. */
24391 issue_rate = ix86_issue_rate();
24393 /* Do the reordering for Atom only. */
24394 if (ix86_tune != PROCESSOR_ATOM)
24395 return issue_rate;
24396 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24397 if (!reload_completed)
24398 return issue_rate;
24399 /* Nothing to do if ready list contains only 1 instruction. */
24400 if (n_ready <= 1)
24401 return issue_rate;
24403 /* Check that IMUL instruction is on the top of ready list. */
24404 insn = ready[n_ready - 1];
24405 if (!NONDEBUG_INSN_P (insn))
24406 return issue_rate;
24407 insn = PATTERN (insn);
24408 if (GET_CODE (insn) == PARALLEL)
24409 insn = XVECEXP (insn, 0, 0);
24410 if (GET_CODE (insn) != SET)
24411 return issue_rate;
24412 if (!(GET_CODE (SET_SRC (insn)) == MULT
24413 && GET_MODE (SET_SRC (insn)) == SImode))
24414 return issue_rate;
24416 /* Search for producer of independent IMUL instruction. */
24417 for (i = n_ready - 2; i>= 0; i--)
24419 insn = ready[i];
24420 if (!NONDEBUG_INSN_P (insn))
24421 continue;
24422 /* Skip IMUL instruction. */
24423 insn2 = PATTERN (insn);
24424 if (GET_CODE (insn2) == PARALLEL)
24425 insn2 = XVECEXP (insn2, 0, 0);
24426 if (GET_CODE (insn2) == SET
24427 && GET_CODE (SET_SRC (insn2)) == MULT
24428 && GET_MODE (SET_SRC (insn2)) == SImode)
24429 continue;
24431 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24433 rtx con;
24434 con = DEP_CON (dep);
24435 if (!NONDEBUG_INSN_P (con))
24436 continue;
24437 insn1 = PATTERN (con);
24438 if (GET_CODE (insn1) == PARALLEL)
24439 insn1 = XVECEXP (insn1, 0, 0);
24441 if (GET_CODE (insn1) == SET
24442 && GET_CODE (SET_SRC (insn1)) == MULT
24443 && GET_MODE (SET_SRC (insn1)) == SImode)
24445 sd_iterator_def sd_it1;
24446 dep_t dep1;
24447 /* Check if there is no other dependee for IMUL. */
24448 index = i;
24449 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24451 rtx pro;
24452 pro = DEP_PRO (dep1);
24453 if (!NONDEBUG_INSN_P (pro))
24454 continue;
24455 if (pro != insn)
24456 index = -1;
24458 if (index >= 0)
24459 break;
24462 if (index >= 0)
24463 break;
24465 if (index < 0)
24466 return issue_rate; /* Didn't find IMUL producer. */
24468 if (sched_verbose > 1)
24469 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24470 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24472 /* Put IMUL producer (ready[index]) at the top of ready list. */
24473 insn1= ready[index];
24474 for (i = index; i < n_ready - 1; i++)
24475 ready[i] = ready[i + 1];
24476 ready[n_ready - 1] = insn1;
24478 return issue_rate;
24481 static bool
24482 ix86_class_likely_spilled_p (reg_class_t);
24484 /* Return true if the lhs of INSN is a HW function argument register; set
24485 IS_SPILLED to true if it is a likely-spilled HW register. */
24486 static bool
24487 insn_is_function_arg (rtx insn, bool* is_spilled)
24489 rtx dst;
24491 if (!NONDEBUG_INSN_P (insn))
24492 return false;
24493 /* Call instructions are not movable, ignore them. */
24494 if (CALL_P (insn))
24495 return false;
24496 insn = PATTERN (insn);
24497 if (GET_CODE (insn) == PARALLEL)
24498 insn = XVECEXP (insn, 0, 0);
24499 if (GET_CODE (insn) != SET)
24500 return false;
24501 dst = SET_DEST (insn);
24502 if (REG_P (dst) && HARD_REGISTER_P (dst)
24503 && ix86_function_arg_regno_p (REGNO (dst)))
24505 /* Is it likely spilled HW register? */
24506 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24507 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24508 *is_spilled = true;
24509 return true;
24511 return false;
24514 /* Add output dependencies for a chain of adjacent function arguments, but
24515 only if there is a move to a likely-spilled HW register. Return the first
24516 argument if at least one dependence was added, or NULL otherwise. */
24517 static rtx
24518 add_parameter_dependencies (rtx call, rtx head)
24520 rtx insn;
24521 rtx last = call;
24522 rtx first_arg = NULL;
24523 bool is_spilled = false;
24525 head = PREV_INSN (head);
24527 /* Find the argument-passing instruction nearest to the call. */
24528 while (true)
24530 last = PREV_INSN (last);
24531 if (last == head)
24532 return NULL;
24533 if (!NONDEBUG_INSN_P (last))
24534 continue;
24535 if (insn_is_function_arg (last, &is_spilled))
24536 break;
24537 return NULL;
24540 first_arg = last;
24541 while (true)
24543 insn = PREV_INSN (last);
24544 if (!INSN_P (insn))
24545 break;
24546 if (insn == head)
24547 break;
24548 if (!NONDEBUG_INSN_P (insn))
24550 last = insn;
24551 continue;
24553 if (insn_is_function_arg (insn, &is_spilled))
24555 /* Add an output dependence between two function arguments if the chain
24556 of output arguments contains likely-spilled HW registers. */
24557 if (is_spilled)
24558 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24559 first_arg = last = insn;
24561 else
24562 break;
24564 if (!is_spilled)
24565 return NULL;
24566 return first_arg;
24569 /* Add output or anti dependency from insn to first_arg to restrict its code
24570 motion. */
24571 static void
24572 avoid_func_arg_motion (rtx first_arg, rtx insn)
24574 rtx set;
24575 rtx tmp;
24577 set = single_set (insn);
24578 if (!set)
24579 return;
24580 tmp = SET_DEST (set);
24581 if (REG_P (tmp))
24583 /* Add output dependency to the first function argument. */
24584 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24585 return;
24587 /* Add anti dependency. */
24588 add_dependence (first_arg, insn, REG_DEP_ANTI);
24591 /* Avoid cross-block motion of a function argument by adding a dependency
24592 from the first non-jump instruction in the bb. */
24593 static void
24594 add_dependee_for_func_arg (rtx arg, basic_block bb)
24596 rtx insn = BB_END (bb);
24598 while (insn)
24600 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24602 rtx set = single_set (insn);
24603 if (set)
24605 avoid_func_arg_motion (arg, insn);
24606 return;
24609 if (insn == BB_HEAD (bb))
24610 return;
24611 insn = PREV_INSN (insn);
24615 /* Hook for pre-reload schedule - avoid motion of function arguments
24616 passed in likely spilled HW registers. */
24617 static void
24618 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24620 rtx insn;
24621 rtx first_arg = NULL;
24622 if (reload_completed)
24623 return;
24624 while (head != tail && DEBUG_INSN_P (head))
24625 head = NEXT_INSN (head);
24626 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24627 if (INSN_P (insn) && CALL_P (insn))
24629 first_arg = add_parameter_dependencies (insn, head);
24630 if (first_arg)
24632 /* Add a dependee for the first argument to predecessors, but only if the
24633 region contains more than one block. */
24634 basic_block bb = BLOCK_FOR_INSN (insn);
24635 int rgn = CONTAINING_RGN (bb->index);
24636 int nr_blks = RGN_NR_BLOCKS (rgn);
24637 /* Skip trivial regions and region head blocks that can have
24638 predecessors outside of region. */
24639 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24641 edge e;
24642 edge_iterator ei;
24643 /* Assume that region is SCC, i.e. all immediate predecessors
24644 of non-head block are in the same region. */
24645 FOR_EACH_EDGE (e, ei, bb->preds)
24647 /* Avoid creating loop-carried dependencies by using
24648 the topological ordering in the region. */
24649 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24650 add_dependee_for_func_arg (first_arg, e->src);
24653 insn = first_arg;
24654 if (insn == head)
24655 break;
24658 else if (first_arg)
24659 avoid_func_arg_motion (first_arg, insn);
24662 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
24663 HW registers to the maximum, to schedule them as soon as possible. These are
24664 moves from function argument registers at the top of the function entry
24665 and moves from function return value registers after call. */
24666 static int
24667 ix86_adjust_priority (rtx insn, int priority)
24669 rtx set;
24671 if (reload_completed)
24672 return priority;
24674 if (!NONDEBUG_INSN_P (insn))
24675 return priority;
24677 set = single_set (insn);
24678 if (set)
24680 rtx tmp = SET_SRC (set);
24681 if (REG_P (tmp)
24682 && HARD_REGISTER_P (tmp)
24683 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24684 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24685 return current_sched_info->sched_max_insns_priority;
24688 return priority;
24691 /* Model the decoder of Core 2/i7.
24692 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24693 track the instruction fetch block boundaries and make sure that long
24694 (9+ bytes) instructions are assigned to D0. */
24696 /* Maximum length of an insn that can be handled by
24697 a secondary decoder unit. '8' for Core 2/i7. */
24698 static int core2i7_secondary_decoder_max_insn_size;
24700 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24701 '16' for Core 2/i7. */
24702 static int core2i7_ifetch_block_size;
24704 /* Maximum number of instructions decoder can handle per cycle.
24705 '6' for Core 2/i7. */
24706 static int core2i7_ifetch_block_max_insns;
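/* Illustrative example (added commentary): with a 16-byte ifetch block, two
   already-issued 6-byte insns leave room for at most 4 more bytes, so a
   further 6-byte candidate is masked out of ready_try for this cycle;
   likewise an insn longer than 8 bytes is rejected unless it can go to
   the first (D0) decoder.  */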
24708 typedef struct ix86_first_cycle_multipass_data_ *
24709 ix86_first_cycle_multipass_data_t;
24710 typedef const struct ix86_first_cycle_multipass_data_ *
24711 const_ix86_first_cycle_multipass_data_t;
24713 /* A variable to store target state across calls to max_issue within
24714 one cycle. */
24715 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24716 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24718 /* Initialize DATA. */
24719 static void
24720 core2i7_first_cycle_multipass_init (void *_data)
24722 ix86_first_cycle_multipass_data_t data
24723 = (ix86_first_cycle_multipass_data_t) _data;
24725 data->ifetch_block_len = 0;
24726 data->ifetch_block_n_insns = 0;
24727 data->ready_try_change = NULL;
24728 data->ready_try_change_size = 0;
24731 /* Advancing the cycle; reset ifetch block counts. */
24732 static void
24733 core2i7_dfa_post_advance_cycle (void)
24735 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24737 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24739 data->ifetch_block_len = 0;
24740 data->ifetch_block_n_insns = 0;
24743 static int min_insn_size (rtx);
24745 /* Filter out insns from ready_try that the core will not be able to issue
24746 on current cycle due to decoder. */
24747 static void
24748 core2i7_first_cycle_multipass_filter_ready_try
24749 (const_ix86_first_cycle_multipass_data_t data,
24750 char *ready_try, int n_ready, bool first_cycle_insn_p)
24752 while (n_ready--)
24754 rtx insn;
24755 int insn_size;
24757 if (ready_try[n_ready])
24758 continue;
24760 insn = get_ready_element (n_ready);
24761 insn_size = min_insn_size (insn);
24763 if (/* If this is too long an insn for a secondary decoder ... */
24764 (!first_cycle_insn_p
24765 && insn_size > core2i7_secondary_decoder_max_insn_size)
24766 /* ... or it would not fit into the ifetch block ... */
24767 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24768 /* ... or the decoder is full already ... */
24769 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24770 /* ... mask the insn out. */
24772 ready_try[n_ready] = 1;
24774 if (data->ready_try_change)
24775 bitmap_set_bit (data->ready_try_change, n_ready);
24780 /* Prepare for a new round of multipass lookahead scheduling. */
24781 static void
24782 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24783 bool first_cycle_insn_p)
24785 ix86_first_cycle_multipass_data_t data
24786 = (ix86_first_cycle_multipass_data_t) _data;
24787 const_ix86_first_cycle_multipass_data_t prev_data
24788 = ix86_first_cycle_multipass_data;
24790 /* Restore the state from the end of the previous round. */
24791 data->ifetch_block_len = prev_data->ifetch_block_len;
24792 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24794 /* Filter instructions that cannot be issued on the current cycle due to
24795 decoder restrictions. */
24796 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24797 first_cycle_insn_p);
24800 /* INSN is being issued in current solution. Account for its impact on
24801 the decoder model. */
24802 static void
24803 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24804 rtx insn, const void *_prev_data)
24806 ix86_first_cycle_multipass_data_t data
24807 = (ix86_first_cycle_multipass_data_t) _data;
24808 const_ix86_first_cycle_multipass_data_t prev_data
24809 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24811 int insn_size = min_insn_size (insn);
24813 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24814 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24815 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24816 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24818 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24819 if (!data->ready_try_change)
24821 data->ready_try_change = sbitmap_alloc (n_ready);
24822 data->ready_try_change_size = n_ready;
24824 else if (data->ready_try_change_size < n_ready)
24826 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24827 n_ready, 0);
24828 data->ready_try_change_size = n_ready;
24830 bitmap_clear (data->ready_try_change);
24832 /* Filter out insns from ready_try that the core will not be able to issue
24833 on the current cycle due to decoder restrictions. */
24834 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24835 false);
24838 /* Revert the effect on ready_try. */
24839 static void
24840 core2i7_first_cycle_multipass_backtrack (const void *_data,
24841 char *ready_try,
24842 int n_ready ATTRIBUTE_UNUSED)
24844 const_ix86_first_cycle_multipass_data_t data
24845 = (const_ix86_first_cycle_multipass_data_t) _data;
24846 unsigned int i = 0;
24847 sbitmap_iterator sbi;
24849 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24850 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24852 ready_try[i] = 0;
24856 /* Save the result of multipass lookahead scheduling for the next round. */
24857 static void
24858 core2i7_first_cycle_multipass_end (const void *_data)
24860 const_ix86_first_cycle_multipass_data_t data
24861 = (const_ix86_first_cycle_multipass_data_t) _data;
24862 ix86_first_cycle_multipass_data_t next_data
24863 = ix86_first_cycle_multipass_data;
24865 if (data != NULL)
24867 next_data->ifetch_block_len = data->ifetch_block_len;
24868 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24872 /* Deallocate target data. */
24873 static void
24874 core2i7_first_cycle_multipass_fini (void *_data)
24876 ix86_first_cycle_multipass_data_t data
24877 = (ix86_first_cycle_multipass_data_t) _data;
24879 if (data->ready_try_change)
24881 sbitmap_free (data->ready_try_change);
24882 data->ready_try_change = NULL;
24883 data->ready_try_change_size = 0;
24887 /* Prepare for scheduling pass. */
24888 static void
24889 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24890 int verbose ATTRIBUTE_UNUSED,
24891 int max_uid ATTRIBUTE_UNUSED)
24893 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24894 in time-critical parts of the scheduler, so we only set them up when
24895 they are actually used. */
24896 switch (ix86_tune)
24898 case PROCESSOR_CORE2:
24899 case PROCESSOR_COREI7:
24900 case PROCESSOR_HASWELL:
24901 /* Do not perform multipass scheduling for pre-reload schedule
24902 to save compile time. */
24903 if (reload_completed)
24905 targetm.sched.dfa_post_advance_cycle
24906 = core2i7_dfa_post_advance_cycle;
24907 targetm.sched.first_cycle_multipass_init
24908 = core2i7_first_cycle_multipass_init;
24909 targetm.sched.first_cycle_multipass_begin
24910 = core2i7_first_cycle_multipass_begin;
24911 targetm.sched.first_cycle_multipass_issue
24912 = core2i7_first_cycle_multipass_issue;
24913 targetm.sched.first_cycle_multipass_backtrack
24914 = core2i7_first_cycle_multipass_backtrack;
24915 targetm.sched.first_cycle_multipass_end
24916 = core2i7_first_cycle_multipass_end;
24917 targetm.sched.first_cycle_multipass_fini
24918 = core2i7_first_cycle_multipass_fini;
24920 /* Set decoder parameters. */
24921 core2i7_secondary_decoder_max_insn_size = 8;
24922 core2i7_ifetch_block_size = 16;
24923 core2i7_ifetch_block_max_insns = 6;
24924 break;
24926 /* ... Fall through ... */
24927 default:
24928 targetm.sched.dfa_post_advance_cycle = NULL;
24929 targetm.sched.first_cycle_multipass_init = NULL;
24930 targetm.sched.first_cycle_multipass_begin = NULL;
24931 targetm.sched.first_cycle_multipass_issue = NULL;
24932 targetm.sched.first_cycle_multipass_backtrack = NULL;
24933 targetm.sched.first_cycle_multipass_end = NULL;
24934 targetm.sched.first_cycle_multipass_fini = NULL;
24935 break;
24940 /* Compute the alignment given to a constant that is being placed in memory.
24941 EXP is the constant and ALIGN is the alignment that the object would
24942 ordinarily have.
24943 The value of this function is used instead of that alignment to align
24944 the object. */
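/* For example (illustrative): on ia32 a DFmode constant would ordinarily
   get the 32-bit alignment of type double, but this hook raises that to
   64 bits so that loads of the constant are naturally aligned.  */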
24946 int
24947 ix86_constant_alignment (tree exp, int align)
24949 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24950 || TREE_CODE (exp) == INTEGER_CST)
24952 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24953 return 64;
24954 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24955 return 128;
24957 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24958 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24959 return BITS_PER_WORD;
24961 return align;
24964 /* Compute the alignment for a static variable.
24965 TYPE is the data type, and ALIGN is the alignment that
24966 the object would ordinarily have. The value of this function is used
24967 instead of that alignment to align the object. */
24969 int
24970 ix86_data_alignment (tree type, int align)
24972 int max_align
24973 = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24975 if (AGGREGATE_TYPE_P (type)
24976 && TYPE_SIZE (type)
24977 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24978 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24979 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24980 && align < max_align)
24981 align = max_align;
24983 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24984 to a 16-byte boundary. */
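/* E.g. (illustrative): with -m64 a file-scope "static char buf[16];" is
   given 128-bit alignment here, even though its element type only
   requires 8-bit alignment.  */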
24985 if (TARGET_64BIT)
24987 if (AGGREGATE_TYPE_P (type)
24988 && TYPE_SIZE (type)
24989 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24990 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24991 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24992 return 128;
24995 if (TREE_CODE (type) == ARRAY_TYPE)
24997 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24998 return 64;
24999 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25000 return 128;
25002 else if (TREE_CODE (type) == COMPLEX_TYPE)
25005 if (TYPE_MODE (type) == DCmode && align < 64)
25006 return 64;
25007 if ((TYPE_MODE (type) == XCmode
25008 || TYPE_MODE (type) == TCmode) && align < 128)
25009 return 128;
25011 else if ((TREE_CODE (type) == RECORD_TYPE
25012 || TREE_CODE (type) == UNION_TYPE
25013 || TREE_CODE (type) == QUAL_UNION_TYPE)
25014 && TYPE_FIELDS (type))
25016 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25017 return 64;
25018 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25019 return 128;
25021 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25022 || TREE_CODE (type) == INTEGER_TYPE)
25024 if (TYPE_MODE (type) == DFmode && align < 64)
25025 return 64;
25026 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25027 return 128;
25030 return align;
25033 /* Compute the alignment for a local variable or a stack slot. EXP is
25034 the data type or decl itself, MODE is the widest mode available and
25035 ALIGN is the alignment that the object would ordinarily have. The
25036 value of this function is used instead of that alignment to align the
25037 object. */
25039 unsigned int
25040 ix86_local_alignment (tree exp, enum machine_mode mode,
25041 unsigned int align)
25043 tree type, decl;
25045 if (exp && DECL_P (exp))
25047 type = TREE_TYPE (exp);
25048 decl = exp;
25050 else
25052 type = exp;
25053 decl = NULL;
25056 /* Don't do dynamic stack realignment for long long objects with
25057 -mpreferred-stack-boundary=2. */
25058 if (!TARGET_64BIT
25059 && align == 64
25060 && ix86_preferred_stack_boundary < 64
25061 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25062 && (!type || !TYPE_USER_ALIGN (type))
25063 && (!decl || !DECL_USER_ALIGN (decl)))
25064 align = 32;
25066 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25067 register in MODE. We will return the largest alignment of XFmode
25068 and DFmode. */
25069 if (!type)
25071 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25072 align = GET_MODE_ALIGNMENT (DFmode);
25073 return align;
25076 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
25077 to a 16-byte boundary. The exact wording is:
25079 An array uses the same alignment as its elements, except that a local or
25080 global array variable of length at least 16 bytes or
25081 a C99 variable-length array variable always has alignment of at least 16 bytes.
25083 This was added to allow use of aligned SSE instructions on arrays. The
25084 rule is meant for static storage (where the compiler cannot do the analysis
25085 by itself). We follow it for automatic variables only when convenient.
25086 We fully control everything in the function being compiled, and functions
25087 from other units cannot rely on the alignment.
25089 Exclude the va_list type. It is the common case of a local array where
25090 we cannot benefit from the alignment. */
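/* An illustrative consequence: in a function compiled with -m64 -O2 and
   SSE enabled, a local "char buf[32];" is given 16-byte alignment below,
   so aligned SSE accesses can be used on it, while a local va_list object
   is deliberately excluded.  */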
25091 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25092 && TARGET_SSE)
25094 if (AGGREGATE_TYPE_P (type)
25095 && (va_list_type_node == NULL_TREE
25096 || (TYPE_MAIN_VARIANT (type)
25097 != TYPE_MAIN_VARIANT (va_list_type_node)))
25098 && TYPE_SIZE (type)
25099 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25100 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25101 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25102 return 128;
25104 if (TREE_CODE (type) == ARRAY_TYPE)
25106 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25107 return 64;
25108 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25109 return 128;
25111 else if (TREE_CODE (type) == COMPLEX_TYPE)
25113 if (TYPE_MODE (type) == DCmode && align < 64)
25114 return 64;
25115 if ((TYPE_MODE (type) == XCmode
25116 || TYPE_MODE (type) == TCmode) && align < 128)
25117 return 128;
25119 else if ((TREE_CODE (type) == RECORD_TYPE
25120 || TREE_CODE (type) == UNION_TYPE
25121 || TREE_CODE (type) == QUAL_UNION_TYPE)
25122 && TYPE_FIELDS (type))
25124 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25125 return 64;
25126 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25127 return 128;
25129 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25130 || TREE_CODE (type) == INTEGER_TYPE)
25133 if (TYPE_MODE (type) == DFmode && align < 64)
25134 return 64;
25135 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25136 return 128;
25138 return align;
25141 /* Compute the minimum required alignment for dynamic stack realignment
25142 purposes for a local variable, parameter or a stack slot. EXP is
25143 the data type or decl itself, MODE is its mode and ALIGN is the
25144 alignment that the object would ordinarily have. */
25146 unsigned int
25147 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25148 unsigned int align)
25150 tree type, decl;
25152 if (exp && DECL_P (exp))
25154 type = TREE_TYPE (exp);
25155 decl = exp;
25157 else
25159 type = exp;
25160 decl = NULL;
25163 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25164 return align;
25166 /* Don't do dynamic stack realignment for long long objects with
25167 -mpreferred-stack-boundary=2. */
25168 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25169 && (!type || !TYPE_USER_ALIGN (type))
25170 && (!decl || !DECL_USER_ALIGN (decl)))
25171 return 32;
25173 return align;
25176 /* Find a location for the static chain incoming to a nested function.
25177 This is a register, unless all free registers are used by arguments. */
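/* For example (illustrative): a nested function that is itself declared
   with __attribute__((regparm(3))) has no free call-clobbered register
   left, so below the incoming chain is read from the stack at
   arg_pointer - 8, matching the push emitted by the trampoline.  */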
25179 static rtx
25180 ix86_static_chain (const_tree fndecl, bool incoming_p)
25182 unsigned regno;
25184 if (!DECL_STATIC_CHAIN (fndecl))
25185 return NULL;
25187 if (TARGET_64BIT)
25189 /* We always use R10 in 64-bit mode. */
25190 regno = R10_REG;
25192 else
25194 tree fntype;
25195 unsigned int ccvt;
25197 /* By default in 32-bit mode we use ECX to pass the static chain. */
25198 regno = CX_REG;
25200 fntype = TREE_TYPE (fndecl);
25201 ccvt = ix86_get_callcvt (fntype);
25202 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25204 /* Fastcall functions use ecx/edx for arguments, which leaves
25205 us with EAX for the static chain.
25206 Thiscall functions use ecx for arguments, which also
25207 leaves us with EAX for the static chain. */
25208 regno = AX_REG;
25210 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25212 /* Thiscall functions use ecx for arguments, which leaves
25213 us with EAX and EDX for the static chain.
25214 We use EAX for ABI compatibility. */
25215 regno = AX_REG;
25217 else if (ix86_function_regparm (fntype, fndecl) == 3)
25219 /* For regparm 3, we have no free call-clobbered registers in
25220 which to store the static chain. In order to implement this,
25221 we have the trampoline push the static chain to the stack.
25222 However, we can't push a value below the return address when
25223 we call the nested function directly, so we have to use an
25224 alternate entry point. For this we use ESI, and have the
25225 alternate entry point push ESI, so that things appear the
25226 same once we're executing the nested function. */
25227 if (incoming_p)
25229 if (fndecl == current_function_decl)
25230 ix86_static_chain_on_stack = true;
25231 return gen_frame_mem (SImode,
25232 plus_constant (Pmode,
25233 arg_pointer_rtx, -8));
25235 regno = SI_REG;
25239 return gen_rtx_REG (Pmode, regno);
25242 /* Emit RTL insns to initialize the variable parts of a trampoline.
25243 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25244 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25245 to be passed to the target function. */
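/* Byte-level sketch of the 64-bit trampoline emitted below (illustrative;
   the movl forms used when the address fits in 32 bits are 6 bytes
   instead of 10):

     49 bb <fnaddr, 8 bytes>   movabs $fnaddr, %r11
     49 ba <chain, 8 bytes>    movabs $chain,  %r10
     49 ff e3                  jmp    *%r11
     90                        nop  (pads the final 32-bit store)  */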
25247 static void
25248 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25250 rtx mem, fnaddr;
25251 int opcode;
25252 int offset = 0;
25254 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25256 if (TARGET_64BIT)
25258 int size;
25260 /* Load the function address into r11. Try to load the address using
25261 the shorter movl instead of movabs. We may want to support
25262 movq for kernel mode, but the kernel does not use trampolines at
25263 the moment. FNADDR is a 32-bit address and may not be in
25264 DImode when ptr_mode == SImode. Always use movl in this
25265 case. */
25266 if (ptr_mode == SImode
25267 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25269 fnaddr = copy_addr_to_reg (fnaddr);
25271 mem = adjust_address (m_tramp, HImode, offset);
25272 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25274 mem = adjust_address (m_tramp, SImode, offset + 2);
25275 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25276 offset += 6;
25278 else
25280 mem = adjust_address (m_tramp, HImode, offset);
25281 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25283 mem = adjust_address (m_tramp, DImode, offset + 2);
25284 emit_move_insn (mem, fnaddr);
25285 offset += 10;
25288 /* Load the static chain into r10 using movabs. Use the shorter movl
25289 instead of movabs when ptr_mode == SImode. */
25290 if (ptr_mode == SImode)
25292 opcode = 0xba41;
25293 size = 6;
25295 else
25297 opcode = 0xba49;
25298 size = 10;
25301 mem = adjust_address (m_tramp, HImode, offset);
25302 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25304 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25305 emit_move_insn (mem, chain_value);
25306 offset += size;
25308 /* Jump to r11; the last (unused) byte is a nop, only there to
25309 pad the write out to a single 32-bit store. */
25310 mem = adjust_address (m_tramp, SImode, offset);
25311 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25312 offset += 4;
25314 else
25316 rtx disp, chain;
25318 /* Depending on the static chain location, either load a register
25319 with a constant, or push the constant to the stack. All of the
25320 instructions are the same size. */
25321 chain = ix86_static_chain (fndecl, true);
25322 if (REG_P (chain))
25324 switch (REGNO (chain))
25326 case AX_REG:
25327 opcode = 0xb8; break;
25328 case CX_REG:
25329 opcode = 0xb9; break;
25330 default:
25331 gcc_unreachable ();
25334 else
25335 opcode = 0x68;
25337 mem = adjust_address (m_tramp, QImode, offset);
25338 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25340 mem = adjust_address (m_tramp, SImode, offset + 1);
25341 emit_move_insn (mem, chain_value);
25342 offset += 5;
25344 mem = adjust_address (m_tramp, QImode, offset);
25345 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25347 mem = adjust_address (m_tramp, SImode, offset + 1);
25349 /* Compute the offset from the end of the jmp to the target function.
25350 In the case in which the trampoline stores the static chain on
25351 the stack, we need to skip the first insn which pushes the
25352 (call-saved) register static chain; this push is 1 byte. */
25353 offset += 5;
25354 disp = expand_binop (SImode, sub_optab, fnaddr,
25355 plus_constant (Pmode, XEXP (m_tramp, 0),
25356 offset - (MEM_P (chain) ? 1 : 0)),
25357 NULL_RTX, 1, OPTAB_DIRECT);
25358 emit_move_insn (mem, disp);
25361 gcc_assert (offset <= TRAMPOLINE_SIZE);
25363 #ifdef HAVE_ENABLE_EXECUTE_STACK
25364 #ifdef CHECK_EXECUTE_STACK_ENABLED
25365 if (CHECK_EXECUTE_STACK_ENABLED)
25366 #endif
25367 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25368 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25369 #endif
25372 /* The following file contains several enumerations and data structures
25373 built from the definitions in i386-builtin-types.def. */
25375 #include "i386-builtin-types.inc"
25377 /* Table for the ix86 builtin non-function types. */
25378 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25380 /* Retrieve an element from the above table, building some of
25381 the types lazily. */
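/* For example (an illustrative walk-through): the first request for
   IX86_BT_V4SF builds the type from its scalar base and V4SFmode via
   build_vector_type_for_mode, caches it in ix86_builtin_type_tab, and
   later requests return the cached node.  */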
25383 static tree
25384 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25386 unsigned int index;
25387 tree type, itype;
25389 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25391 type = ix86_builtin_type_tab[(int) tcode];
25392 if (type != NULL)
25393 return type;
25395 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25396 if (tcode <= IX86_BT_LAST_VECT)
25398 enum machine_mode mode;
25400 index = tcode - IX86_BT_LAST_PRIM - 1;
25401 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25402 mode = ix86_builtin_type_vect_mode[index];
25404 type = build_vector_type_for_mode (itype, mode);
25406 else
25408 int quals;
25410 index = tcode - IX86_BT_LAST_VECT - 1;
25411 if (tcode <= IX86_BT_LAST_PTR)
25412 quals = TYPE_UNQUALIFIED;
25413 else
25414 quals = TYPE_QUAL_CONST;
25416 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25417 if (quals != TYPE_UNQUALIFIED)
25418 itype = build_qualified_type (itype, quals);
25420 type = build_pointer_type (itype);
25423 ix86_builtin_type_tab[(int) tcode] = type;
25424 return type;
25427 /* Table for the ix86 builtin function types. */
25428 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25430 /* Retrieve an element from the above table, building some of
25431 the types lazily. */
25433 static tree
25434 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25436 tree type;
25438 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25440 type = ix86_builtin_func_type_tab[(int) tcode];
25441 if (type != NULL)
25442 return type;
25444 if (tcode <= IX86_BT_LAST_FUNC)
25446 unsigned start = ix86_builtin_func_start[(int) tcode];
25447 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25448 tree rtype, atype, args = void_list_node;
25449 unsigned i;
25451 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25452 for (i = after - 1; i > start; --i)
25454 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25455 args = tree_cons (NULL, atype, args);
25458 type = build_function_type (rtype, args);
25460 else
25462 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25463 enum ix86_builtin_func_type icode;
25465 icode = ix86_builtin_func_alias_base[index];
25466 type = ix86_get_builtin_func_type (icode);
25469 ix86_builtin_func_type_tab[(int) tcode] = type;
25470 return type;
25474 /* Codes for all the SSE/MMX builtins. */
25475 enum ix86_builtins
25477 IX86_BUILTIN_ADDPS,
25478 IX86_BUILTIN_ADDSS,
25479 IX86_BUILTIN_DIVPS,
25480 IX86_BUILTIN_DIVSS,
25481 IX86_BUILTIN_MULPS,
25482 IX86_BUILTIN_MULSS,
25483 IX86_BUILTIN_SUBPS,
25484 IX86_BUILTIN_SUBSS,
25486 IX86_BUILTIN_CMPEQPS,
25487 IX86_BUILTIN_CMPLTPS,
25488 IX86_BUILTIN_CMPLEPS,
25489 IX86_BUILTIN_CMPGTPS,
25490 IX86_BUILTIN_CMPGEPS,
25491 IX86_BUILTIN_CMPNEQPS,
25492 IX86_BUILTIN_CMPNLTPS,
25493 IX86_BUILTIN_CMPNLEPS,
25494 IX86_BUILTIN_CMPNGTPS,
25495 IX86_BUILTIN_CMPNGEPS,
25496 IX86_BUILTIN_CMPORDPS,
25497 IX86_BUILTIN_CMPUNORDPS,
25498 IX86_BUILTIN_CMPEQSS,
25499 IX86_BUILTIN_CMPLTSS,
25500 IX86_BUILTIN_CMPLESS,
25501 IX86_BUILTIN_CMPNEQSS,
25502 IX86_BUILTIN_CMPNLTSS,
25503 IX86_BUILTIN_CMPNLESS,
25504 IX86_BUILTIN_CMPNGTSS,
25505 IX86_BUILTIN_CMPNGESS,
25506 IX86_BUILTIN_CMPORDSS,
25507 IX86_BUILTIN_CMPUNORDSS,
25509 IX86_BUILTIN_COMIEQSS,
25510 IX86_BUILTIN_COMILTSS,
25511 IX86_BUILTIN_COMILESS,
25512 IX86_BUILTIN_COMIGTSS,
25513 IX86_BUILTIN_COMIGESS,
25514 IX86_BUILTIN_COMINEQSS,
25515 IX86_BUILTIN_UCOMIEQSS,
25516 IX86_BUILTIN_UCOMILTSS,
25517 IX86_BUILTIN_UCOMILESS,
25518 IX86_BUILTIN_UCOMIGTSS,
25519 IX86_BUILTIN_UCOMIGESS,
25520 IX86_BUILTIN_UCOMINEQSS,
25522 IX86_BUILTIN_CVTPI2PS,
25523 IX86_BUILTIN_CVTPS2PI,
25524 IX86_BUILTIN_CVTSI2SS,
25525 IX86_BUILTIN_CVTSI642SS,
25526 IX86_BUILTIN_CVTSS2SI,
25527 IX86_BUILTIN_CVTSS2SI64,
25528 IX86_BUILTIN_CVTTPS2PI,
25529 IX86_BUILTIN_CVTTSS2SI,
25530 IX86_BUILTIN_CVTTSS2SI64,
25532 IX86_BUILTIN_MAXPS,
25533 IX86_BUILTIN_MAXSS,
25534 IX86_BUILTIN_MINPS,
25535 IX86_BUILTIN_MINSS,
25537 IX86_BUILTIN_LOADUPS,
25538 IX86_BUILTIN_STOREUPS,
25539 IX86_BUILTIN_MOVSS,
25541 IX86_BUILTIN_MOVHLPS,
25542 IX86_BUILTIN_MOVLHPS,
25543 IX86_BUILTIN_LOADHPS,
25544 IX86_BUILTIN_LOADLPS,
25545 IX86_BUILTIN_STOREHPS,
25546 IX86_BUILTIN_STORELPS,
25548 IX86_BUILTIN_MASKMOVQ,
25549 IX86_BUILTIN_MOVMSKPS,
25550 IX86_BUILTIN_PMOVMSKB,
25552 IX86_BUILTIN_MOVNTPS,
25553 IX86_BUILTIN_MOVNTQ,
25555 IX86_BUILTIN_LOADDQU,
25556 IX86_BUILTIN_STOREDQU,
25558 IX86_BUILTIN_PACKSSWB,
25559 IX86_BUILTIN_PACKSSDW,
25560 IX86_BUILTIN_PACKUSWB,
25562 IX86_BUILTIN_PADDB,
25563 IX86_BUILTIN_PADDW,
25564 IX86_BUILTIN_PADDD,
25565 IX86_BUILTIN_PADDQ,
25566 IX86_BUILTIN_PADDSB,
25567 IX86_BUILTIN_PADDSW,
25568 IX86_BUILTIN_PADDUSB,
25569 IX86_BUILTIN_PADDUSW,
25570 IX86_BUILTIN_PSUBB,
25571 IX86_BUILTIN_PSUBW,
25572 IX86_BUILTIN_PSUBD,
25573 IX86_BUILTIN_PSUBQ,
25574 IX86_BUILTIN_PSUBSB,
25575 IX86_BUILTIN_PSUBSW,
25576 IX86_BUILTIN_PSUBUSB,
25577 IX86_BUILTIN_PSUBUSW,
25579 IX86_BUILTIN_PAND,
25580 IX86_BUILTIN_PANDN,
25581 IX86_BUILTIN_POR,
25582 IX86_BUILTIN_PXOR,
25584 IX86_BUILTIN_PAVGB,
25585 IX86_BUILTIN_PAVGW,
25587 IX86_BUILTIN_PCMPEQB,
25588 IX86_BUILTIN_PCMPEQW,
25589 IX86_BUILTIN_PCMPEQD,
25590 IX86_BUILTIN_PCMPGTB,
25591 IX86_BUILTIN_PCMPGTW,
25592 IX86_BUILTIN_PCMPGTD,
25594 IX86_BUILTIN_PMADDWD,
25596 IX86_BUILTIN_PMAXSW,
25597 IX86_BUILTIN_PMAXUB,
25598 IX86_BUILTIN_PMINSW,
25599 IX86_BUILTIN_PMINUB,
25601 IX86_BUILTIN_PMULHUW,
25602 IX86_BUILTIN_PMULHW,
25603 IX86_BUILTIN_PMULLW,
25605 IX86_BUILTIN_PSADBW,
25606 IX86_BUILTIN_PSHUFW,
25608 IX86_BUILTIN_PSLLW,
25609 IX86_BUILTIN_PSLLD,
25610 IX86_BUILTIN_PSLLQ,
25611 IX86_BUILTIN_PSRAW,
25612 IX86_BUILTIN_PSRAD,
25613 IX86_BUILTIN_PSRLW,
25614 IX86_BUILTIN_PSRLD,
25615 IX86_BUILTIN_PSRLQ,
25616 IX86_BUILTIN_PSLLWI,
25617 IX86_BUILTIN_PSLLDI,
25618 IX86_BUILTIN_PSLLQI,
25619 IX86_BUILTIN_PSRAWI,
25620 IX86_BUILTIN_PSRADI,
25621 IX86_BUILTIN_PSRLWI,
25622 IX86_BUILTIN_PSRLDI,
25623 IX86_BUILTIN_PSRLQI,
25625 IX86_BUILTIN_PUNPCKHBW,
25626 IX86_BUILTIN_PUNPCKHWD,
25627 IX86_BUILTIN_PUNPCKHDQ,
25628 IX86_BUILTIN_PUNPCKLBW,
25629 IX86_BUILTIN_PUNPCKLWD,
25630 IX86_BUILTIN_PUNPCKLDQ,
25632 IX86_BUILTIN_SHUFPS,
25634 IX86_BUILTIN_RCPPS,
25635 IX86_BUILTIN_RCPSS,
25636 IX86_BUILTIN_RSQRTPS,
25637 IX86_BUILTIN_RSQRTPS_NR,
25638 IX86_BUILTIN_RSQRTSS,
25639 IX86_BUILTIN_RSQRTF,
25640 IX86_BUILTIN_SQRTPS,
25641 IX86_BUILTIN_SQRTPS_NR,
25642 IX86_BUILTIN_SQRTSS,
25644 IX86_BUILTIN_UNPCKHPS,
25645 IX86_BUILTIN_UNPCKLPS,
25647 IX86_BUILTIN_ANDPS,
25648 IX86_BUILTIN_ANDNPS,
25649 IX86_BUILTIN_ORPS,
25650 IX86_BUILTIN_XORPS,
25652 IX86_BUILTIN_EMMS,
25653 IX86_BUILTIN_LDMXCSR,
25654 IX86_BUILTIN_STMXCSR,
25655 IX86_BUILTIN_SFENCE,
25657 IX86_BUILTIN_FXSAVE,
25658 IX86_BUILTIN_FXRSTOR,
25659 IX86_BUILTIN_FXSAVE64,
25660 IX86_BUILTIN_FXRSTOR64,
25662 IX86_BUILTIN_XSAVE,
25663 IX86_BUILTIN_XRSTOR,
25664 IX86_BUILTIN_XSAVE64,
25665 IX86_BUILTIN_XRSTOR64,
25667 IX86_BUILTIN_XSAVEOPT,
25668 IX86_BUILTIN_XSAVEOPT64,
25670 /* 3DNow! Original */
25671 IX86_BUILTIN_FEMMS,
25672 IX86_BUILTIN_PAVGUSB,
25673 IX86_BUILTIN_PF2ID,
25674 IX86_BUILTIN_PFACC,
25675 IX86_BUILTIN_PFADD,
25676 IX86_BUILTIN_PFCMPEQ,
25677 IX86_BUILTIN_PFCMPGE,
25678 IX86_BUILTIN_PFCMPGT,
25679 IX86_BUILTIN_PFMAX,
25680 IX86_BUILTIN_PFMIN,
25681 IX86_BUILTIN_PFMUL,
25682 IX86_BUILTIN_PFRCP,
25683 IX86_BUILTIN_PFRCPIT1,
25684 IX86_BUILTIN_PFRCPIT2,
25685 IX86_BUILTIN_PFRSQIT1,
25686 IX86_BUILTIN_PFRSQRT,
25687 IX86_BUILTIN_PFSUB,
25688 IX86_BUILTIN_PFSUBR,
25689 IX86_BUILTIN_PI2FD,
25690 IX86_BUILTIN_PMULHRW,
25692 /* 3DNow! Athlon Extensions */
25693 IX86_BUILTIN_PF2IW,
25694 IX86_BUILTIN_PFNACC,
25695 IX86_BUILTIN_PFPNACC,
25696 IX86_BUILTIN_PI2FW,
25697 IX86_BUILTIN_PSWAPDSI,
25698 IX86_BUILTIN_PSWAPDSF,
25700 /* SSE2 */
25701 IX86_BUILTIN_ADDPD,
25702 IX86_BUILTIN_ADDSD,
25703 IX86_BUILTIN_DIVPD,
25704 IX86_BUILTIN_DIVSD,
25705 IX86_BUILTIN_MULPD,
25706 IX86_BUILTIN_MULSD,
25707 IX86_BUILTIN_SUBPD,
25708 IX86_BUILTIN_SUBSD,
25710 IX86_BUILTIN_CMPEQPD,
25711 IX86_BUILTIN_CMPLTPD,
25712 IX86_BUILTIN_CMPLEPD,
25713 IX86_BUILTIN_CMPGTPD,
25714 IX86_BUILTIN_CMPGEPD,
25715 IX86_BUILTIN_CMPNEQPD,
25716 IX86_BUILTIN_CMPNLTPD,
25717 IX86_BUILTIN_CMPNLEPD,
25718 IX86_BUILTIN_CMPNGTPD,
25719 IX86_BUILTIN_CMPNGEPD,
25720 IX86_BUILTIN_CMPORDPD,
25721 IX86_BUILTIN_CMPUNORDPD,
25722 IX86_BUILTIN_CMPEQSD,
25723 IX86_BUILTIN_CMPLTSD,
25724 IX86_BUILTIN_CMPLESD,
25725 IX86_BUILTIN_CMPNEQSD,
25726 IX86_BUILTIN_CMPNLTSD,
25727 IX86_BUILTIN_CMPNLESD,
25728 IX86_BUILTIN_CMPORDSD,
25729 IX86_BUILTIN_CMPUNORDSD,
25731 IX86_BUILTIN_COMIEQSD,
25732 IX86_BUILTIN_COMILTSD,
25733 IX86_BUILTIN_COMILESD,
25734 IX86_BUILTIN_COMIGTSD,
25735 IX86_BUILTIN_COMIGESD,
25736 IX86_BUILTIN_COMINEQSD,
25737 IX86_BUILTIN_UCOMIEQSD,
25738 IX86_BUILTIN_UCOMILTSD,
25739 IX86_BUILTIN_UCOMILESD,
25740 IX86_BUILTIN_UCOMIGTSD,
25741 IX86_BUILTIN_UCOMIGESD,
25742 IX86_BUILTIN_UCOMINEQSD,
25744 IX86_BUILTIN_MAXPD,
25745 IX86_BUILTIN_MAXSD,
25746 IX86_BUILTIN_MINPD,
25747 IX86_BUILTIN_MINSD,
25749 IX86_BUILTIN_ANDPD,
25750 IX86_BUILTIN_ANDNPD,
25751 IX86_BUILTIN_ORPD,
25752 IX86_BUILTIN_XORPD,
25754 IX86_BUILTIN_SQRTPD,
25755 IX86_BUILTIN_SQRTSD,
25757 IX86_BUILTIN_UNPCKHPD,
25758 IX86_BUILTIN_UNPCKLPD,
25760 IX86_BUILTIN_SHUFPD,
25762 IX86_BUILTIN_LOADUPD,
25763 IX86_BUILTIN_STOREUPD,
25764 IX86_BUILTIN_MOVSD,
25766 IX86_BUILTIN_LOADHPD,
25767 IX86_BUILTIN_LOADLPD,
25769 IX86_BUILTIN_CVTDQ2PD,
25770 IX86_BUILTIN_CVTDQ2PS,
25772 IX86_BUILTIN_CVTPD2DQ,
25773 IX86_BUILTIN_CVTPD2PI,
25774 IX86_BUILTIN_CVTPD2PS,
25775 IX86_BUILTIN_CVTTPD2DQ,
25776 IX86_BUILTIN_CVTTPD2PI,
25778 IX86_BUILTIN_CVTPI2PD,
25779 IX86_BUILTIN_CVTSI2SD,
25780 IX86_BUILTIN_CVTSI642SD,
25782 IX86_BUILTIN_CVTSD2SI,
25783 IX86_BUILTIN_CVTSD2SI64,
25784 IX86_BUILTIN_CVTSD2SS,
25785 IX86_BUILTIN_CVTSS2SD,
25786 IX86_BUILTIN_CVTTSD2SI,
25787 IX86_BUILTIN_CVTTSD2SI64,
25789 IX86_BUILTIN_CVTPS2DQ,
25790 IX86_BUILTIN_CVTPS2PD,
25791 IX86_BUILTIN_CVTTPS2DQ,
25793 IX86_BUILTIN_MOVNTI,
25794 IX86_BUILTIN_MOVNTI64,
25795 IX86_BUILTIN_MOVNTPD,
25796 IX86_BUILTIN_MOVNTDQ,
25798 IX86_BUILTIN_MOVQ128,
25800 /* SSE2 MMX */
25801 IX86_BUILTIN_MASKMOVDQU,
25802 IX86_BUILTIN_MOVMSKPD,
25803 IX86_BUILTIN_PMOVMSKB128,
25805 IX86_BUILTIN_PACKSSWB128,
25806 IX86_BUILTIN_PACKSSDW128,
25807 IX86_BUILTIN_PACKUSWB128,
25809 IX86_BUILTIN_PADDB128,
25810 IX86_BUILTIN_PADDW128,
25811 IX86_BUILTIN_PADDD128,
25812 IX86_BUILTIN_PADDQ128,
25813 IX86_BUILTIN_PADDSB128,
25814 IX86_BUILTIN_PADDSW128,
25815 IX86_BUILTIN_PADDUSB128,
25816 IX86_BUILTIN_PADDUSW128,
25817 IX86_BUILTIN_PSUBB128,
25818 IX86_BUILTIN_PSUBW128,
25819 IX86_BUILTIN_PSUBD128,
25820 IX86_BUILTIN_PSUBQ128,
25821 IX86_BUILTIN_PSUBSB128,
25822 IX86_BUILTIN_PSUBSW128,
25823 IX86_BUILTIN_PSUBUSB128,
25824 IX86_BUILTIN_PSUBUSW128,
25826 IX86_BUILTIN_PAND128,
25827 IX86_BUILTIN_PANDN128,
25828 IX86_BUILTIN_POR128,
25829 IX86_BUILTIN_PXOR128,
25831 IX86_BUILTIN_PAVGB128,
25832 IX86_BUILTIN_PAVGW128,
25834 IX86_BUILTIN_PCMPEQB128,
25835 IX86_BUILTIN_PCMPEQW128,
25836 IX86_BUILTIN_PCMPEQD128,
25837 IX86_BUILTIN_PCMPGTB128,
25838 IX86_BUILTIN_PCMPGTW128,
25839 IX86_BUILTIN_PCMPGTD128,
25841 IX86_BUILTIN_PMADDWD128,
25843 IX86_BUILTIN_PMAXSW128,
25844 IX86_BUILTIN_PMAXUB128,
25845 IX86_BUILTIN_PMINSW128,
25846 IX86_BUILTIN_PMINUB128,
25848 IX86_BUILTIN_PMULUDQ,
25849 IX86_BUILTIN_PMULUDQ128,
25850 IX86_BUILTIN_PMULHUW128,
25851 IX86_BUILTIN_PMULHW128,
25852 IX86_BUILTIN_PMULLW128,
25854 IX86_BUILTIN_PSADBW128,
25855 IX86_BUILTIN_PSHUFHW,
25856 IX86_BUILTIN_PSHUFLW,
25857 IX86_BUILTIN_PSHUFD,
25859 IX86_BUILTIN_PSLLDQI128,
25860 IX86_BUILTIN_PSLLWI128,
25861 IX86_BUILTIN_PSLLDI128,
25862 IX86_BUILTIN_PSLLQI128,
25863 IX86_BUILTIN_PSRAWI128,
25864 IX86_BUILTIN_PSRADI128,
25865 IX86_BUILTIN_PSRLDQI128,
25866 IX86_BUILTIN_PSRLWI128,
25867 IX86_BUILTIN_PSRLDI128,
25868 IX86_BUILTIN_PSRLQI128,
25870 IX86_BUILTIN_PSLLDQ128,
25871 IX86_BUILTIN_PSLLW128,
25872 IX86_BUILTIN_PSLLD128,
25873 IX86_BUILTIN_PSLLQ128,
25874 IX86_BUILTIN_PSRAW128,
25875 IX86_BUILTIN_PSRAD128,
25876 IX86_BUILTIN_PSRLW128,
25877 IX86_BUILTIN_PSRLD128,
25878 IX86_BUILTIN_PSRLQ128,
25880 IX86_BUILTIN_PUNPCKHBW128,
25881 IX86_BUILTIN_PUNPCKHWD128,
25882 IX86_BUILTIN_PUNPCKHDQ128,
25883 IX86_BUILTIN_PUNPCKHQDQ128,
25884 IX86_BUILTIN_PUNPCKLBW128,
25885 IX86_BUILTIN_PUNPCKLWD128,
25886 IX86_BUILTIN_PUNPCKLDQ128,
25887 IX86_BUILTIN_PUNPCKLQDQ128,
25889 IX86_BUILTIN_CLFLUSH,
25890 IX86_BUILTIN_MFENCE,
25891 IX86_BUILTIN_LFENCE,
25892 IX86_BUILTIN_PAUSE,
25894 IX86_BUILTIN_BSRSI,
25895 IX86_BUILTIN_BSRDI,
25896 IX86_BUILTIN_RDPMC,
25897 IX86_BUILTIN_RDTSC,
25898 IX86_BUILTIN_RDTSCP,
25899 IX86_BUILTIN_ROLQI,
25900 IX86_BUILTIN_ROLHI,
25901 IX86_BUILTIN_RORQI,
25902 IX86_BUILTIN_RORHI,
25904 /* SSE3. */
25905 IX86_BUILTIN_ADDSUBPS,
25906 IX86_BUILTIN_HADDPS,
25907 IX86_BUILTIN_HSUBPS,
25908 IX86_BUILTIN_MOVSHDUP,
25909 IX86_BUILTIN_MOVSLDUP,
25910 IX86_BUILTIN_ADDSUBPD,
25911 IX86_BUILTIN_HADDPD,
25912 IX86_BUILTIN_HSUBPD,
25913 IX86_BUILTIN_LDDQU,
25915 IX86_BUILTIN_MONITOR,
25916 IX86_BUILTIN_MWAIT,
25918 /* SSSE3. */
25919 IX86_BUILTIN_PHADDW,
25920 IX86_BUILTIN_PHADDD,
25921 IX86_BUILTIN_PHADDSW,
25922 IX86_BUILTIN_PHSUBW,
25923 IX86_BUILTIN_PHSUBD,
25924 IX86_BUILTIN_PHSUBSW,
25925 IX86_BUILTIN_PMADDUBSW,
25926 IX86_BUILTIN_PMULHRSW,
25927 IX86_BUILTIN_PSHUFB,
25928 IX86_BUILTIN_PSIGNB,
25929 IX86_BUILTIN_PSIGNW,
25930 IX86_BUILTIN_PSIGND,
25931 IX86_BUILTIN_PALIGNR,
25932 IX86_BUILTIN_PABSB,
25933 IX86_BUILTIN_PABSW,
25934 IX86_BUILTIN_PABSD,
25936 IX86_BUILTIN_PHADDW128,
25937 IX86_BUILTIN_PHADDD128,
25938 IX86_BUILTIN_PHADDSW128,
25939 IX86_BUILTIN_PHSUBW128,
25940 IX86_BUILTIN_PHSUBD128,
25941 IX86_BUILTIN_PHSUBSW128,
25942 IX86_BUILTIN_PMADDUBSW128,
25943 IX86_BUILTIN_PMULHRSW128,
25944 IX86_BUILTIN_PSHUFB128,
25945 IX86_BUILTIN_PSIGNB128,
25946 IX86_BUILTIN_PSIGNW128,
25947 IX86_BUILTIN_PSIGND128,
25948 IX86_BUILTIN_PALIGNR128,
25949 IX86_BUILTIN_PABSB128,
25950 IX86_BUILTIN_PABSW128,
25951 IX86_BUILTIN_PABSD128,
25953 /* AMDFAM10 - SSE4A New Instructions. */
25954 IX86_BUILTIN_MOVNTSD,
25955 IX86_BUILTIN_MOVNTSS,
25956 IX86_BUILTIN_EXTRQI,
25957 IX86_BUILTIN_EXTRQ,
25958 IX86_BUILTIN_INSERTQI,
25959 IX86_BUILTIN_INSERTQ,
25961 /* SSE4.1. */
25962 IX86_BUILTIN_BLENDPD,
25963 IX86_BUILTIN_BLENDPS,
25964 IX86_BUILTIN_BLENDVPD,
25965 IX86_BUILTIN_BLENDVPS,
25966 IX86_BUILTIN_PBLENDVB128,
25967 IX86_BUILTIN_PBLENDW128,
25969 IX86_BUILTIN_DPPD,
25970 IX86_BUILTIN_DPPS,
25972 IX86_BUILTIN_INSERTPS128,
25974 IX86_BUILTIN_MOVNTDQA,
25975 IX86_BUILTIN_MPSADBW128,
25976 IX86_BUILTIN_PACKUSDW128,
25977 IX86_BUILTIN_PCMPEQQ,
25978 IX86_BUILTIN_PHMINPOSUW128,
25980 IX86_BUILTIN_PMAXSB128,
25981 IX86_BUILTIN_PMAXSD128,
25982 IX86_BUILTIN_PMAXUD128,
25983 IX86_BUILTIN_PMAXUW128,
25985 IX86_BUILTIN_PMINSB128,
25986 IX86_BUILTIN_PMINSD128,
25987 IX86_BUILTIN_PMINUD128,
25988 IX86_BUILTIN_PMINUW128,
25990 IX86_BUILTIN_PMOVSXBW128,
25991 IX86_BUILTIN_PMOVSXBD128,
25992 IX86_BUILTIN_PMOVSXBQ128,
25993 IX86_BUILTIN_PMOVSXWD128,
25994 IX86_BUILTIN_PMOVSXWQ128,
25995 IX86_BUILTIN_PMOVSXDQ128,
25997 IX86_BUILTIN_PMOVZXBW128,
25998 IX86_BUILTIN_PMOVZXBD128,
25999 IX86_BUILTIN_PMOVZXBQ128,
26000 IX86_BUILTIN_PMOVZXWD128,
26001 IX86_BUILTIN_PMOVZXWQ128,
26002 IX86_BUILTIN_PMOVZXDQ128,
26004 IX86_BUILTIN_PMULDQ128,
26005 IX86_BUILTIN_PMULLD128,
26007 IX86_BUILTIN_ROUNDSD,
26008 IX86_BUILTIN_ROUNDSS,
26010 IX86_BUILTIN_ROUNDPD,
26011 IX86_BUILTIN_ROUNDPS,
26013 IX86_BUILTIN_FLOORPD,
26014 IX86_BUILTIN_CEILPD,
26015 IX86_BUILTIN_TRUNCPD,
26016 IX86_BUILTIN_RINTPD,
26017 IX86_BUILTIN_ROUNDPD_AZ,
26019 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26020 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26021 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26023 IX86_BUILTIN_FLOORPS,
26024 IX86_BUILTIN_CEILPS,
26025 IX86_BUILTIN_TRUNCPS,
26026 IX86_BUILTIN_RINTPS,
26027 IX86_BUILTIN_ROUNDPS_AZ,
26029 IX86_BUILTIN_FLOORPS_SFIX,
26030 IX86_BUILTIN_CEILPS_SFIX,
26031 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26033 IX86_BUILTIN_PTESTZ,
26034 IX86_BUILTIN_PTESTC,
26035 IX86_BUILTIN_PTESTNZC,
26037 IX86_BUILTIN_VEC_INIT_V2SI,
26038 IX86_BUILTIN_VEC_INIT_V4HI,
26039 IX86_BUILTIN_VEC_INIT_V8QI,
26040 IX86_BUILTIN_VEC_EXT_V2DF,
26041 IX86_BUILTIN_VEC_EXT_V2DI,
26042 IX86_BUILTIN_VEC_EXT_V4SF,
26043 IX86_BUILTIN_VEC_EXT_V4SI,
26044 IX86_BUILTIN_VEC_EXT_V8HI,
26045 IX86_BUILTIN_VEC_EXT_V2SI,
26046 IX86_BUILTIN_VEC_EXT_V4HI,
26047 IX86_BUILTIN_VEC_EXT_V16QI,
26048 IX86_BUILTIN_VEC_SET_V2DI,
26049 IX86_BUILTIN_VEC_SET_V4SF,
26050 IX86_BUILTIN_VEC_SET_V4SI,
26051 IX86_BUILTIN_VEC_SET_V8HI,
26052 IX86_BUILTIN_VEC_SET_V4HI,
26053 IX86_BUILTIN_VEC_SET_V16QI,
26055 IX86_BUILTIN_VEC_PACK_SFIX,
26056 IX86_BUILTIN_VEC_PACK_SFIX256,
26058 /* SSE4.2. */
26059 IX86_BUILTIN_CRC32QI,
26060 IX86_BUILTIN_CRC32HI,
26061 IX86_BUILTIN_CRC32SI,
26062 IX86_BUILTIN_CRC32DI,
26064 IX86_BUILTIN_PCMPESTRI128,
26065 IX86_BUILTIN_PCMPESTRM128,
26066 IX86_BUILTIN_PCMPESTRA128,
26067 IX86_BUILTIN_PCMPESTRC128,
26068 IX86_BUILTIN_PCMPESTRO128,
26069 IX86_BUILTIN_PCMPESTRS128,
26070 IX86_BUILTIN_PCMPESTRZ128,
26071 IX86_BUILTIN_PCMPISTRI128,
26072 IX86_BUILTIN_PCMPISTRM128,
26073 IX86_BUILTIN_PCMPISTRA128,
26074 IX86_BUILTIN_PCMPISTRC128,
26075 IX86_BUILTIN_PCMPISTRO128,
26076 IX86_BUILTIN_PCMPISTRS128,
26077 IX86_BUILTIN_PCMPISTRZ128,
26079 IX86_BUILTIN_PCMPGTQ,
26081 /* AES instructions */
26082 IX86_BUILTIN_AESENC128,
26083 IX86_BUILTIN_AESENCLAST128,
26084 IX86_BUILTIN_AESDEC128,
26085 IX86_BUILTIN_AESDECLAST128,
26086 IX86_BUILTIN_AESIMC128,
26087 IX86_BUILTIN_AESKEYGENASSIST128,
26089 /* PCLMUL instruction */
26090 IX86_BUILTIN_PCLMULQDQ128,
26092 /* AVX */
26093 IX86_BUILTIN_ADDPD256,
26094 IX86_BUILTIN_ADDPS256,
26095 IX86_BUILTIN_ADDSUBPD256,
26096 IX86_BUILTIN_ADDSUBPS256,
26097 IX86_BUILTIN_ANDPD256,
26098 IX86_BUILTIN_ANDPS256,
26099 IX86_BUILTIN_ANDNPD256,
26100 IX86_BUILTIN_ANDNPS256,
26101 IX86_BUILTIN_BLENDPD256,
26102 IX86_BUILTIN_BLENDPS256,
26103 IX86_BUILTIN_BLENDVPD256,
26104 IX86_BUILTIN_BLENDVPS256,
26105 IX86_BUILTIN_DIVPD256,
26106 IX86_BUILTIN_DIVPS256,
26107 IX86_BUILTIN_DPPS256,
26108 IX86_BUILTIN_HADDPD256,
26109 IX86_BUILTIN_HADDPS256,
26110 IX86_BUILTIN_HSUBPD256,
26111 IX86_BUILTIN_HSUBPS256,
26112 IX86_BUILTIN_MAXPD256,
26113 IX86_BUILTIN_MAXPS256,
26114 IX86_BUILTIN_MINPD256,
26115 IX86_BUILTIN_MINPS256,
26116 IX86_BUILTIN_MULPD256,
26117 IX86_BUILTIN_MULPS256,
26118 IX86_BUILTIN_ORPD256,
26119 IX86_BUILTIN_ORPS256,
26120 IX86_BUILTIN_SHUFPD256,
26121 IX86_BUILTIN_SHUFPS256,
26122 IX86_BUILTIN_SUBPD256,
26123 IX86_BUILTIN_SUBPS256,
26124 IX86_BUILTIN_XORPD256,
26125 IX86_BUILTIN_XORPS256,
26126 IX86_BUILTIN_CMPSD,
26127 IX86_BUILTIN_CMPSS,
26128 IX86_BUILTIN_CMPPD,
26129 IX86_BUILTIN_CMPPS,
26130 IX86_BUILTIN_CMPPD256,
26131 IX86_BUILTIN_CMPPS256,
26132 IX86_BUILTIN_CVTDQ2PD256,
26133 IX86_BUILTIN_CVTDQ2PS256,
26134 IX86_BUILTIN_CVTPD2PS256,
26135 IX86_BUILTIN_CVTPS2DQ256,
26136 IX86_BUILTIN_CVTPS2PD256,
26137 IX86_BUILTIN_CVTTPD2DQ256,
26138 IX86_BUILTIN_CVTPD2DQ256,
26139 IX86_BUILTIN_CVTTPS2DQ256,
26140 IX86_BUILTIN_EXTRACTF128PD256,
26141 IX86_BUILTIN_EXTRACTF128PS256,
26142 IX86_BUILTIN_EXTRACTF128SI256,
26143 IX86_BUILTIN_VZEROALL,
26144 IX86_BUILTIN_VZEROUPPER,
26145 IX86_BUILTIN_VPERMILVARPD,
26146 IX86_BUILTIN_VPERMILVARPS,
26147 IX86_BUILTIN_VPERMILVARPD256,
26148 IX86_BUILTIN_VPERMILVARPS256,
26149 IX86_BUILTIN_VPERMILPD,
26150 IX86_BUILTIN_VPERMILPS,
26151 IX86_BUILTIN_VPERMILPD256,
26152 IX86_BUILTIN_VPERMILPS256,
26153 IX86_BUILTIN_VPERMIL2PD,
26154 IX86_BUILTIN_VPERMIL2PS,
26155 IX86_BUILTIN_VPERMIL2PD256,
26156 IX86_BUILTIN_VPERMIL2PS256,
26157 IX86_BUILTIN_VPERM2F128PD256,
26158 IX86_BUILTIN_VPERM2F128PS256,
26159 IX86_BUILTIN_VPERM2F128SI256,
26160 IX86_BUILTIN_VBROADCASTSS,
26161 IX86_BUILTIN_VBROADCASTSD256,
26162 IX86_BUILTIN_VBROADCASTSS256,
26163 IX86_BUILTIN_VBROADCASTPD256,
26164 IX86_BUILTIN_VBROADCASTPS256,
26165 IX86_BUILTIN_VINSERTF128PD256,
26166 IX86_BUILTIN_VINSERTF128PS256,
26167 IX86_BUILTIN_VINSERTF128SI256,
26168 IX86_BUILTIN_LOADUPD256,
26169 IX86_BUILTIN_LOADUPS256,
26170 IX86_BUILTIN_STOREUPD256,
26171 IX86_BUILTIN_STOREUPS256,
26172 IX86_BUILTIN_LDDQU256,
26173 IX86_BUILTIN_MOVNTDQ256,
26174 IX86_BUILTIN_MOVNTPD256,
26175 IX86_BUILTIN_MOVNTPS256,
26176 IX86_BUILTIN_LOADDQU256,
26177 IX86_BUILTIN_STOREDQU256,
26178 IX86_BUILTIN_MASKLOADPD,
26179 IX86_BUILTIN_MASKLOADPS,
26180 IX86_BUILTIN_MASKSTOREPD,
26181 IX86_BUILTIN_MASKSTOREPS,
26182 IX86_BUILTIN_MASKLOADPD256,
26183 IX86_BUILTIN_MASKLOADPS256,
26184 IX86_BUILTIN_MASKSTOREPD256,
26185 IX86_BUILTIN_MASKSTOREPS256,
26186 IX86_BUILTIN_MOVSHDUP256,
26187 IX86_BUILTIN_MOVSLDUP256,
26188 IX86_BUILTIN_MOVDDUP256,
26190 IX86_BUILTIN_SQRTPD256,
26191 IX86_BUILTIN_SQRTPS256,
26192 IX86_BUILTIN_SQRTPS_NR256,
26193 IX86_BUILTIN_RSQRTPS256,
26194 IX86_BUILTIN_RSQRTPS_NR256,
26196 IX86_BUILTIN_RCPPS256,
26198 IX86_BUILTIN_ROUNDPD256,
26199 IX86_BUILTIN_ROUNDPS256,
26201 IX86_BUILTIN_FLOORPD256,
26202 IX86_BUILTIN_CEILPD256,
26203 IX86_BUILTIN_TRUNCPD256,
26204 IX86_BUILTIN_RINTPD256,
26205 IX86_BUILTIN_ROUNDPD_AZ256,
26207 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26208 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26209 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26211 IX86_BUILTIN_FLOORPS256,
26212 IX86_BUILTIN_CEILPS256,
26213 IX86_BUILTIN_TRUNCPS256,
26214 IX86_BUILTIN_RINTPS256,
26215 IX86_BUILTIN_ROUNDPS_AZ256,
26217 IX86_BUILTIN_FLOORPS_SFIX256,
26218 IX86_BUILTIN_CEILPS_SFIX256,
26219 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26221 IX86_BUILTIN_UNPCKHPD256,
26222 IX86_BUILTIN_UNPCKLPD256,
26223 IX86_BUILTIN_UNPCKHPS256,
26224 IX86_BUILTIN_UNPCKLPS256,
26226 IX86_BUILTIN_SI256_SI,
26227 IX86_BUILTIN_PS256_PS,
26228 IX86_BUILTIN_PD256_PD,
26229 IX86_BUILTIN_SI_SI256,
26230 IX86_BUILTIN_PS_PS256,
26231 IX86_BUILTIN_PD_PD256,
26233 IX86_BUILTIN_VTESTZPD,
26234 IX86_BUILTIN_VTESTCPD,
26235 IX86_BUILTIN_VTESTNZCPD,
26236 IX86_BUILTIN_VTESTZPS,
26237 IX86_BUILTIN_VTESTCPS,
26238 IX86_BUILTIN_VTESTNZCPS,
26239 IX86_BUILTIN_VTESTZPD256,
26240 IX86_BUILTIN_VTESTCPD256,
26241 IX86_BUILTIN_VTESTNZCPD256,
26242 IX86_BUILTIN_VTESTZPS256,
26243 IX86_BUILTIN_VTESTCPS256,
26244 IX86_BUILTIN_VTESTNZCPS256,
26245 IX86_BUILTIN_PTESTZ256,
26246 IX86_BUILTIN_PTESTC256,
26247 IX86_BUILTIN_PTESTNZC256,
26249 IX86_BUILTIN_MOVMSKPD256,
26250 IX86_BUILTIN_MOVMSKPS256,
26252 /* AVX2 */
26253 IX86_BUILTIN_MPSADBW256,
26254 IX86_BUILTIN_PABSB256,
26255 IX86_BUILTIN_PABSW256,
26256 IX86_BUILTIN_PABSD256,
26257 IX86_BUILTIN_PACKSSDW256,
26258 IX86_BUILTIN_PACKSSWB256,
26259 IX86_BUILTIN_PACKUSDW256,
26260 IX86_BUILTIN_PACKUSWB256,
26261 IX86_BUILTIN_PADDB256,
26262 IX86_BUILTIN_PADDW256,
26263 IX86_BUILTIN_PADDD256,
26264 IX86_BUILTIN_PADDQ256,
26265 IX86_BUILTIN_PADDSB256,
26266 IX86_BUILTIN_PADDSW256,
26267 IX86_BUILTIN_PADDUSB256,
26268 IX86_BUILTIN_PADDUSW256,
26269 IX86_BUILTIN_PALIGNR256,
26270 IX86_BUILTIN_AND256I,
26271 IX86_BUILTIN_ANDNOT256I,
26272 IX86_BUILTIN_PAVGB256,
26273 IX86_BUILTIN_PAVGW256,
26274 IX86_BUILTIN_PBLENDVB256,
26275 IX86_BUILTIN_PBLENDVW256,
26276 IX86_BUILTIN_PCMPEQB256,
26277 IX86_BUILTIN_PCMPEQW256,
26278 IX86_BUILTIN_PCMPEQD256,
26279 IX86_BUILTIN_PCMPEQQ256,
26280 IX86_BUILTIN_PCMPGTB256,
26281 IX86_BUILTIN_PCMPGTW256,
26282 IX86_BUILTIN_PCMPGTD256,
26283 IX86_BUILTIN_PCMPGTQ256,
26284 IX86_BUILTIN_PHADDW256,
26285 IX86_BUILTIN_PHADDD256,
26286 IX86_BUILTIN_PHADDSW256,
26287 IX86_BUILTIN_PHSUBW256,
26288 IX86_BUILTIN_PHSUBD256,
26289 IX86_BUILTIN_PHSUBSW256,
26290 IX86_BUILTIN_PMADDUBSW256,
26291 IX86_BUILTIN_PMADDWD256,
26292 IX86_BUILTIN_PMAXSB256,
26293 IX86_BUILTIN_PMAXSW256,
26294 IX86_BUILTIN_PMAXSD256,
26295 IX86_BUILTIN_PMAXUB256,
26296 IX86_BUILTIN_PMAXUW256,
26297 IX86_BUILTIN_PMAXUD256,
26298 IX86_BUILTIN_PMINSB256,
26299 IX86_BUILTIN_PMINSW256,
26300 IX86_BUILTIN_PMINSD256,
26301 IX86_BUILTIN_PMINUB256,
26302 IX86_BUILTIN_PMINUW256,
26303 IX86_BUILTIN_PMINUD256,
26304 IX86_BUILTIN_PMOVMSKB256,
26305 IX86_BUILTIN_PMOVSXBW256,
26306 IX86_BUILTIN_PMOVSXBD256,
26307 IX86_BUILTIN_PMOVSXBQ256,
26308 IX86_BUILTIN_PMOVSXWD256,
26309 IX86_BUILTIN_PMOVSXWQ256,
26310 IX86_BUILTIN_PMOVSXDQ256,
26311 IX86_BUILTIN_PMOVZXBW256,
26312 IX86_BUILTIN_PMOVZXBD256,
26313 IX86_BUILTIN_PMOVZXBQ256,
26314 IX86_BUILTIN_PMOVZXWD256,
26315 IX86_BUILTIN_PMOVZXWQ256,
26316 IX86_BUILTIN_PMOVZXDQ256,
26317 IX86_BUILTIN_PMULDQ256,
26318 IX86_BUILTIN_PMULHRSW256,
26319 IX86_BUILTIN_PMULHUW256,
26320 IX86_BUILTIN_PMULHW256,
26321 IX86_BUILTIN_PMULLW256,
26322 IX86_BUILTIN_PMULLD256,
26323 IX86_BUILTIN_PMULUDQ256,
26324 IX86_BUILTIN_POR256,
26325 IX86_BUILTIN_PSADBW256,
26326 IX86_BUILTIN_PSHUFB256,
26327 IX86_BUILTIN_PSHUFD256,
26328 IX86_BUILTIN_PSHUFHW256,
26329 IX86_BUILTIN_PSHUFLW256,
26330 IX86_BUILTIN_PSIGNB256,
26331 IX86_BUILTIN_PSIGNW256,
26332 IX86_BUILTIN_PSIGND256,
26333 IX86_BUILTIN_PSLLDQI256,
26334 IX86_BUILTIN_PSLLWI256,
26335 IX86_BUILTIN_PSLLW256,
26336 IX86_BUILTIN_PSLLDI256,
26337 IX86_BUILTIN_PSLLD256,
26338 IX86_BUILTIN_PSLLQI256,
26339 IX86_BUILTIN_PSLLQ256,
26340 IX86_BUILTIN_PSRAWI256,
26341 IX86_BUILTIN_PSRAW256,
26342 IX86_BUILTIN_PSRADI256,
26343 IX86_BUILTIN_PSRAD256,
26344 IX86_BUILTIN_PSRLDQI256,
26345 IX86_BUILTIN_PSRLWI256,
26346 IX86_BUILTIN_PSRLW256,
26347 IX86_BUILTIN_PSRLDI256,
26348 IX86_BUILTIN_PSRLD256,
26349 IX86_BUILTIN_PSRLQI256,
26350 IX86_BUILTIN_PSRLQ256,
26351 IX86_BUILTIN_PSUBB256,
26352 IX86_BUILTIN_PSUBW256,
26353 IX86_BUILTIN_PSUBD256,
26354 IX86_BUILTIN_PSUBQ256,
26355 IX86_BUILTIN_PSUBSB256,
26356 IX86_BUILTIN_PSUBSW256,
26357 IX86_BUILTIN_PSUBUSB256,
26358 IX86_BUILTIN_PSUBUSW256,
26359 IX86_BUILTIN_PUNPCKHBW256,
26360 IX86_BUILTIN_PUNPCKHWD256,
26361 IX86_BUILTIN_PUNPCKHDQ256,
26362 IX86_BUILTIN_PUNPCKHQDQ256,
26363 IX86_BUILTIN_PUNPCKLBW256,
26364 IX86_BUILTIN_PUNPCKLWD256,
26365 IX86_BUILTIN_PUNPCKLDQ256,
26366 IX86_BUILTIN_PUNPCKLQDQ256,
26367 IX86_BUILTIN_PXOR256,
26368 IX86_BUILTIN_MOVNTDQA256,
26369 IX86_BUILTIN_VBROADCASTSS_PS,
26370 IX86_BUILTIN_VBROADCASTSS_PS256,
26371 IX86_BUILTIN_VBROADCASTSD_PD256,
26372 IX86_BUILTIN_VBROADCASTSI256,
26373 IX86_BUILTIN_PBLENDD256,
26374 IX86_BUILTIN_PBLENDD128,
26375 IX86_BUILTIN_PBROADCASTB256,
26376 IX86_BUILTIN_PBROADCASTW256,
26377 IX86_BUILTIN_PBROADCASTD256,
26378 IX86_BUILTIN_PBROADCASTQ256,
26379 IX86_BUILTIN_PBROADCASTB128,
26380 IX86_BUILTIN_PBROADCASTW128,
26381 IX86_BUILTIN_PBROADCASTD128,
26382 IX86_BUILTIN_PBROADCASTQ128,
26383 IX86_BUILTIN_VPERMVARSI256,
26384 IX86_BUILTIN_VPERMDF256,
26385 IX86_BUILTIN_VPERMVARSF256,
26386 IX86_BUILTIN_VPERMDI256,
26387 IX86_BUILTIN_VPERMTI256,
26388 IX86_BUILTIN_VEXTRACT128I256,
26389 IX86_BUILTIN_VINSERT128I256,
26390 IX86_BUILTIN_MASKLOADD,
26391 IX86_BUILTIN_MASKLOADQ,
26392 IX86_BUILTIN_MASKLOADD256,
26393 IX86_BUILTIN_MASKLOADQ256,
26394 IX86_BUILTIN_MASKSTORED,
26395 IX86_BUILTIN_MASKSTOREQ,
26396 IX86_BUILTIN_MASKSTORED256,
26397 IX86_BUILTIN_MASKSTOREQ256,
26398 IX86_BUILTIN_PSLLVV4DI,
26399 IX86_BUILTIN_PSLLVV2DI,
26400 IX86_BUILTIN_PSLLVV8SI,
26401 IX86_BUILTIN_PSLLVV4SI,
26402 IX86_BUILTIN_PSRAVV8SI,
26403 IX86_BUILTIN_PSRAVV4SI,
26404 IX86_BUILTIN_PSRLVV4DI,
26405 IX86_BUILTIN_PSRLVV2DI,
26406 IX86_BUILTIN_PSRLVV8SI,
26407 IX86_BUILTIN_PSRLVV4SI,
26409 IX86_BUILTIN_GATHERSIV2DF,
26410 IX86_BUILTIN_GATHERSIV4DF,
26411 IX86_BUILTIN_GATHERDIV2DF,
26412 IX86_BUILTIN_GATHERDIV4DF,
26413 IX86_BUILTIN_GATHERSIV4SF,
26414 IX86_BUILTIN_GATHERSIV8SF,
26415 IX86_BUILTIN_GATHERDIV4SF,
26416 IX86_BUILTIN_GATHERDIV8SF,
26417 IX86_BUILTIN_GATHERSIV2DI,
26418 IX86_BUILTIN_GATHERSIV4DI,
26419 IX86_BUILTIN_GATHERDIV2DI,
26420 IX86_BUILTIN_GATHERDIV4DI,
26421 IX86_BUILTIN_GATHERSIV4SI,
26422 IX86_BUILTIN_GATHERSIV8SI,
26423 IX86_BUILTIN_GATHERDIV4SI,
26424 IX86_BUILTIN_GATHERDIV8SI,
26426 /* Alternate 4-element gather for the vectorizer, where
26427 all operands are 32 bytes wide. */
26428 IX86_BUILTIN_GATHERALTSIV4DF,
26429 IX86_BUILTIN_GATHERALTDIV8SF,
26430 IX86_BUILTIN_GATHERALTSIV4DI,
26431 IX86_BUILTIN_GATHERALTDIV8SI,
26433 /* TFmode support builtins. */
26434 IX86_BUILTIN_INFQ,
26435 IX86_BUILTIN_HUGE_VALQ,
26436 IX86_BUILTIN_FABSQ,
26437 IX86_BUILTIN_COPYSIGNQ,
26439 /* Vectorizer support builtins. */
26440 IX86_BUILTIN_CPYSGNPS,
26441 IX86_BUILTIN_CPYSGNPD,
26442 IX86_BUILTIN_CPYSGNPS256,
26443 IX86_BUILTIN_CPYSGNPD256,
26445 /* FMA4 instructions. */
26446 IX86_BUILTIN_VFMADDSS,
26447 IX86_BUILTIN_VFMADDSD,
26448 IX86_BUILTIN_VFMADDPS,
26449 IX86_BUILTIN_VFMADDPD,
26450 IX86_BUILTIN_VFMADDPS256,
26451 IX86_BUILTIN_VFMADDPD256,
26452 IX86_BUILTIN_VFMADDSUBPS,
26453 IX86_BUILTIN_VFMADDSUBPD,
26454 IX86_BUILTIN_VFMADDSUBPS256,
26455 IX86_BUILTIN_VFMADDSUBPD256,
26457 /* FMA3 instructions. */
26458 IX86_BUILTIN_VFMADDSS3,
26459 IX86_BUILTIN_VFMADDSD3,
26461 /* XOP instructions. */
26462 IX86_BUILTIN_VPCMOV,
26463 IX86_BUILTIN_VPCMOV_V2DI,
26464 IX86_BUILTIN_VPCMOV_V4SI,
26465 IX86_BUILTIN_VPCMOV_V8HI,
26466 IX86_BUILTIN_VPCMOV_V16QI,
26467 IX86_BUILTIN_VPCMOV_V4SF,
26468 IX86_BUILTIN_VPCMOV_V2DF,
26469 IX86_BUILTIN_VPCMOV256,
26470 IX86_BUILTIN_VPCMOV_V4DI256,
26471 IX86_BUILTIN_VPCMOV_V8SI256,
26472 IX86_BUILTIN_VPCMOV_V16HI256,
26473 IX86_BUILTIN_VPCMOV_V32QI256,
26474 IX86_BUILTIN_VPCMOV_V8SF256,
26475 IX86_BUILTIN_VPCMOV_V4DF256,
26477 IX86_BUILTIN_VPPERM,
26479 IX86_BUILTIN_VPMACSSWW,
26480 IX86_BUILTIN_VPMACSWW,
26481 IX86_BUILTIN_VPMACSSWD,
26482 IX86_BUILTIN_VPMACSWD,
26483 IX86_BUILTIN_VPMACSSDD,
26484 IX86_BUILTIN_VPMACSDD,
26485 IX86_BUILTIN_VPMACSSDQL,
26486 IX86_BUILTIN_VPMACSSDQH,
26487 IX86_BUILTIN_VPMACSDQL,
26488 IX86_BUILTIN_VPMACSDQH,
26489 IX86_BUILTIN_VPMADCSSWD,
26490 IX86_BUILTIN_VPMADCSWD,
26492 IX86_BUILTIN_VPHADDBW,
26493 IX86_BUILTIN_VPHADDBD,
26494 IX86_BUILTIN_VPHADDBQ,
26495 IX86_BUILTIN_VPHADDWD,
26496 IX86_BUILTIN_VPHADDWQ,
26497 IX86_BUILTIN_VPHADDDQ,
26498 IX86_BUILTIN_VPHADDUBW,
26499 IX86_BUILTIN_VPHADDUBD,
26500 IX86_BUILTIN_VPHADDUBQ,
26501 IX86_BUILTIN_VPHADDUWD,
26502 IX86_BUILTIN_VPHADDUWQ,
26503 IX86_BUILTIN_VPHADDUDQ,
26504 IX86_BUILTIN_VPHSUBBW,
26505 IX86_BUILTIN_VPHSUBWD,
26506 IX86_BUILTIN_VPHSUBDQ,
26508 IX86_BUILTIN_VPROTB,
26509 IX86_BUILTIN_VPROTW,
26510 IX86_BUILTIN_VPROTD,
26511 IX86_BUILTIN_VPROTQ,
26512 IX86_BUILTIN_VPROTB_IMM,
26513 IX86_BUILTIN_VPROTW_IMM,
26514 IX86_BUILTIN_VPROTD_IMM,
26515 IX86_BUILTIN_VPROTQ_IMM,
26517 IX86_BUILTIN_VPSHLB,
26518 IX86_BUILTIN_VPSHLW,
26519 IX86_BUILTIN_VPSHLD,
26520 IX86_BUILTIN_VPSHLQ,
26521 IX86_BUILTIN_VPSHAB,
26522 IX86_BUILTIN_VPSHAW,
26523 IX86_BUILTIN_VPSHAD,
26524 IX86_BUILTIN_VPSHAQ,
26526 IX86_BUILTIN_VFRCZSS,
26527 IX86_BUILTIN_VFRCZSD,
26528 IX86_BUILTIN_VFRCZPS,
26529 IX86_BUILTIN_VFRCZPD,
26530 IX86_BUILTIN_VFRCZPS256,
26531 IX86_BUILTIN_VFRCZPD256,
26533 IX86_BUILTIN_VPCOMEQUB,
26534 IX86_BUILTIN_VPCOMNEUB,
26535 IX86_BUILTIN_VPCOMLTUB,
26536 IX86_BUILTIN_VPCOMLEUB,
26537 IX86_BUILTIN_VPCOMGTUB,
26538 IX86_BUILTIN_VPCOMGEUB,
26539 IX86_BUILTIN_VPCOMFALSEUB,
26540 IX86_BUILTIN_VPCOMTRUEUB,
26542 IX86_BUILTIN_VPCOMEQUW,
26543 IX86_BUILTIN_VPCOMNEUW,
26544 IX86_BUILTIN_VPCOMLTUW,
26545 IX86_BUILTIN_VPCOMLEUW,
26546 IX86_BUILTIN_VPCOMGTUW,
26547 IX86_BUILTIN_VPCOMGEUW,
26548 IX86_BUILTIN_VPCOMFALSEUW,
26549 IX86_BUILTIN_VPCOMTRUEUW,
26551 IX86_BUILTIN_VPCOMEQUD,
26552 IX86_BUILTIN_VPCOMNEUD,
26553 IX86_BUILTIN_VPCOMLTUD,
26554 IX86_BUILTIN_VPCOMLEUD,
26555 IX86_BUILTIN_VPCOMGTUD,
26556 IX86_BUILTIN_VPCOMGEUD,
26557 IX86_BUILTIN_VPCOMFALSEUD,
26558 IX86_BUILTIN_VPCOMTRUEUD,
26560 IX86_BUILTIN_VPCOMEQUQ,
26561 IX86_BUILTIN_VPCOMNEUQ,
26562 IX86_BUILTIN_VPCOMLTUQ,
26563 IX86_BUILTIN_VPCOMLEUQ,
26564 IX86_BUILTIN_VPCOMGTUQ,
26565 IX86_BUILTIN_VPCOMGEUQ,
26566 IX86_BUILTIN_VPCOMFALSEUQ,
26567 IX86_BUILTIN_VPCOMTRUEUQ,
26569 IX86_BUILTIN_VPCOMEQB,
26570 IX86_BUILTIN_VPCOMNEB,
26571 IX86_BUILTIN_VPCOMLTB,
26572 IX86_BUILTIN_VPCOMLEB,
26573 IX86_BUILTIN_VPCOMGTB,
26574 IX86_BUILTIN_VPCOMGEB,
26575 IX86_BUILTIN_VPCOMFALSEB,
26576 IX86_BUILTIN_VPCOMTRUEB,
26578 IX86_BUILTIN_VPCOMEQW,
26579 IX86_BUILTIN_VPCOMNEW,
26580 IX86_BUILTIN_VPCOMLTW,
26581 IX86_BUILTIN_VPCOMLEW,
26582 IX86_BUILTIN_VPCOMGTW,
26583 IX86_BUILTIN_VPCOMGEW,
26584 IX86_BUILTIN_VPCOMFALSEW,
26585 IX86_BUILTIN_VPCOMTRUEW,
26587 IX86_BUILTIN_VPCOMEQD,
26588 IX86_BUILTIN_VPCOMNED,
26589 IX86_BUILTIN_VPCOMLTD,
26590 IX86_BUILTIN_VPCOMLED,
26591 IX86_BUILTIN_VPCOMGTD,
26592 IX86_BUILTIN_VPCOMGED,
26593 IX86_BUILTIN_VPCOMFALSED,
26594 IX86_BUILTIN_VPCOMTRUED,
26596 IX86_BUILTIN_VPCOMEQQ,
26597 IX86_BUILTIN_VPCOMNEQ,
26598 IX86_BUILTIN_VPCOMLTQ,
26599 IX86_BUILTIN_VPCOMLEQ,
26600 IX86_BUILTIN_VPCOMGTQ,
26601 IX86_BUILTIN_VPCOMGEQ,
26602 IX86_BUILTIN_VPCOMFALSEQ,
26603 IX86_BUILTIN_VPCOMTRUEQ,
26605 /* LWP instructions. */
26606 IX86_BUILTIN_LLWPCB,
26607 IX86_BUILTIN_SLWPCB,
26608 IX86_BUILTIN_LWPVAL32,
26609 IX86_BUILTIN_LWPVAL64,
26610 IX86_BUILTIN_LWPINS32,
26611 IX86_BUILTIN_LWPINS64,
26613 IX86_BUILTIN_CLZS,
26615 /* RTM */
26616 IX86_BUILTIN_XBEGIN,
26617 IX86_BUILTIN_XEND,
26618 IX86_BUILTIN_XABORT,
26619 IX86_BUILTIN_XTEST,
26621 /* BMI instructions. */
26622 IX86_BUILTIN_BEXTR32,
26623 IX86_BUILTIN_BEXTR64,
26624 IX86_BUILTIN_CTZS,
26626 /* TBM instructions. */
26627 IX86_BUILTIN_BEXTRI32,
26628 IX86_BUILTIN_BEXTRI64,
26630 /* BMI2 instructions. */
26631 IX86_BUILTIN_BZHI32,
26632 IX86_BUILTIN_BZHI64,
26633 IX86_BUILTIN_PDEP32,
26634 IX86_BUILTIN_PDEP64,
26635 IX86_BUILTIN_PEXT32,
26636 IX86_BUILTIN_PEXT64,
26638 /* ADX instructions. */
26639 IX86_BUILTIN_ADDCARRYX32,
26640 IX86_BUILTIN_ADDCARRYX64,
26642 /* FSGSBASE instructions. */
26643 IX86_BUILTIN_RDFSBASE32,
26644 IX86_BUILTIN_RDFSBASE64,
26645 IX86_BUILTIN_RDGSBASE32,
26646 IX86_BUILTIN_RDGSBASE64,
26647 IX86_BUILTIN_WRFSBASE32,
26648 IX86_BUILTIN_WRFSBASE64,
26649 IX86_BUILTIN_WRGSBASE32,
26650 IX86_BUILTIN_WRGSBASE64,
26652 /* RDRND instructions. */
26653 IX86_BUILTIN_RDRAND16_STEP,
26654 IX86_BUILTIN_RDRAND32_STEP,
26655 IX86_BUILTIN_RDRAND64_STEP,
26657 /* RDSEED instructions. */
26658 IX86_BUILTIN_RDSEED16_STEP,
26659 IX86_BUILTIN_RDSEED32_STEP,
26660 IX86_BUILTIN_RDSEED64_STEP,
26662 /* F16C instructions. */
26663 IX86_BUILTIN_CVTPH2PS,
26664 IX86_BUILTIN_CVTPH2PS256,
26665 IX86_BUILTIN_CVTPS2PH,
26666 IX86_BUILTIN_CVTPS2PH256,
26668 /* CFString built-in for darwin */
26669 IX86_BUILTIN_CFSTRING,
26671 /* Builtins to get CPU type and supported features. */
26672 IX86_BUILTIN_CPU_INIT,
26673 IX86_BUILTIN_CPU_IS,
26674 IX86_BUILTIN_CPU_SUPPORTS,
26676 IX86_BUILTIN_MAX
26679 /* Table for the ix86 builtin decls. */
26680 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26682 /* Table of all the builtin functions that are possible with different ISAs
26683 but are waiting to be built until a function is declared to use that
26684 ISA. */
26685 struct builtin_isa {
26686 const char *name; /* function name */
26687 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26688 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26689 bool const_p; /* true if the declaration is constant */
26690 bool set_and_not_built_p;
26693 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26696 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26697 of isa_flags to use in the ix86_builtins_isa array. Store the
26698 function decl in the ix86_builtins array. Return the function decl, or
26699 NULL_TREE if the builtin was not added.
26701 If the front end has a special hook for builtin functions, delay adding
26702 builtin functions that aren't in the current ISA until the ISA is changed
26703 with function-specific optimization. Doing so can save about 300K for the
26704 default compiler. When the builtin is expanded, check at that time whether
26705 it is valid.
26707 If the front end doesn't have a special hook, record all builtins, even if
26708 they aren't in the current ISA, in case the user uses function-specific
26709 options for a different ISA, so that we don't get scope errors if a builtin
26710 is added in the middle of a function scope. */
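/* Typical use (an illustrative sketch; the actual definitions are driven
   by the bdesc_* tables and the ix86_init_*_builtins routines below):

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_sqrtpd",
                  V2DF_FTYPE_V2DF, IX86_BUILTIN_SQRTPD);

   With SSE2 enabled the decl is created immediately; otherwise it is
   recorded in ix86_builtins_isa and materialized by ix86_add_new_builtins
   once the ISA is enabled via a target attribute or pragma.  */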
26712 static inline tree
26713 def_builtin (HOST_WIDE_INT mask, const char *name,
26714 enum ix86_builtin_func_type tcode,
26715 enum ix86_builtins code)
26717 tree decl = NULL_TREE;
26719 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26721 ix86_builtins_isa[(int) code].isa = mask;
26723 mask &= ~OPTION_MASK_ISA_64BIT;
26724 if (mask == 0
26725 || (mask & ix86_isa_flags) != 0
26726 || (lang_hooks.builtin_function
26727 == lang_hooks.builtin_function_ext_scope))
26730 tree type = ix86_get_builtin_func_type (tcode);
26731 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26732 NULL, NULL_TREE);
26733 ix86_builtins[(int) code] = decl;
26734 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26736 else
26738 ix86_builtins[(int) code] = NULL_TREE;
26739 ix86_builtins_isa[(int) code].tcode = tcode;
26740 ix86_builtins_isa[(int) code].name = name;
26741 ix86_builtins_isa[(int) code].const_p = false;
26742 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26743 }
26744 }
26746 return decl;
26747 }
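/* Illustrative sketch (not a call that appears verbatim here; the real
   registrations are driven by the bdesc_* tables below): a builtin such as
   __builtin_ia32_addpd would be created roughly as

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_addpd",
                        V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_ADDPD);

   If SSE2 is not enabled at that point, only the ix86_builtins_isa[] entry
   is recorded and the decl is built later by ix86_add_new_builtins.  */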
26749 /* Like def_builtin, but also marks the function decl "const". */
26751 static inline tree
26752 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26753 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26754 {
26755 tree decl = def_builtin (mask, name, tcode, code);
26756 if (decl)
26757 TREE_READONLY (decl) = 1;
26758 else
26759 ix86_builtins_isa[(int) code].const_p = true;
26761 return decl;
26762 }
26764 /* Add any new builtin functions for a given ISA that may not have been
26765 declared. This saves a bit of space compared to adding all of the
26766 declarations to the tree, even if we didn't use them. */
26768 static void
26769 ix86_add_new_builtins (HOST_WIDE_INT isa)
26770 {
26771 int i;
26773 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26774 {
26775 if ((ix86_builtins_isa[i].isa & isa) != 0
26776 && ix86_builtins_isa[i].set_and_not_built_p)
26777 {
26778 tree decl, type;
26780 /* Don't define the builtin again. */
26781 ix86_builtins_isa[i].set_and_not_built_p = false;
26783 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26784 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26785 type, i, BUILT_IN_MD, NULL,
26786 NULL_TREE);
26788 ix86_builtins[i] = decl;
26789 if (ix86_builtins_isa[i].const_p)
26790 TREE_READONLY (decl) = 1;
26791 }
26792 }
26793 }
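/* Presumably (not shown in this excerpt) ix86_add_new_builtins is invoked
   when ix86_isa_flags gains new bits, e.g. while processing a
   __attribute__((target("..."))) or the corresponding pragma, so that the
   builtins deferred by def_builtin become visible for the newly enabled
   ISA.  */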
26795 /* Bits for builtin_description.flag. */
26797 /* Set when we don't support the comparison natively, and should
26798 swap the comparison operands in order to support it. */
26799 #define BUILTIN_DESC_SWAP_OPERANDS 1
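/* For example, a "greater than" comparison for which only the "less than"
   pattern exists natively can be expanded as LT with the operands swapped;
   the *_SWAP function types used by entries such as __builtin_ia32_cmpgtps
   in bdesc_args below express the same idea.  */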
26801 struct builtin_description
26802 {
26803 const HOST_WIDE_INT mask;
26804 const enum insn_code icode;
26805 const char *const name;
26806 const enum ix86_builtins code;
26807 const enum rtx_code comparison;
26808 const int flag;
26809 };
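/* Reading one entry of the tables below against these fields (the example
   is copied from bdesc_args):

     { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd",
       IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }

   mask is the ISA the builtin requires, icode the insn pattern it expands
   to, name/code the builtin's name and enum value, comparison the rtx
   comparison code (UNKNOWN for non-comparison builtins), and flag holds
   either BUILTIN_DESC_* bits, a function-type enum, or a CC mode,
   depending on the table.  */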
26811 static const struct builtin_description bdesc_comi[] =
26812 {
26813 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26814 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26815 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26816 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26817 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26818 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26819 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26820 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26821 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26822 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26823 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26824 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26828 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26829 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26831 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26834 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26836 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26837 };
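/* Presumably the comparison codes above mirror what comiss/ucomiss leave in
   EFLAGS when a NaN is involved: e.g. "comieq" is described as UNEQ rather
   than EQ because ZF is also set for unordered operands.  */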
26839 static const struct builtin_description bdesc_pcmpestr[] =
26840 {
26841 /* SSE4.2 */
26842 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26843 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26844 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26845 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26846 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26847 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26848 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26849 };
26851 static const struct builtin_description bdesc_pcmpistr[] =
26852 {
26853 /* SSE4.2 */
26854 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26855 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26856 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26857 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26858 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26859 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26860 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26861 };
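/* Note that for the pcmpestr/pcmpistr entries above the flag field is not a
   function type but a CC mode (CCAmode, CCCmode, ...); presumably the
   expander uses it to select which flag bit the a/c/o/s/z variants of
   __builtin_ia32_pcmp[ei]stri128 should test.  */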
26863 /* Special builtins with variable number of arguments. */
26864 static const struct builtin_description bdesc_special_args[] =
26865 {
26866 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26867 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26868 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26870 /* MMX */
26871 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26873 /* 3DNow! */
26874 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26876 /* FXSR, XSAVE and XSAVEOPT */
26877 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26878 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26879 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26880 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26881 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26883 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26884 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26885 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26886 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26887 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26889 /* SSE */
26890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26896 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26897 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26899 /* SSE or 3DNow!A */
26900 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26901 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26903 /* SSE2 */
26904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26906 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26911 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26918 /* SSE3 */
26919 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26921 /* SSE4.1 */
26922 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26924 /* SSE4A */
26925 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26926 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26928 /* AVX */
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26930 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26932 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26935 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26940 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26941 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26942 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26943 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26944 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26946 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26947 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26948 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26950 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26951 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26952 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26953 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26954 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26955 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26956 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26957 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26959 /* AVX2 */
26960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26961 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26970 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26971 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26972 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26973 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26974 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26975 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26977 /* FSGSBASE */
26978 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26979 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26980 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26981 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26982 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26983 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26984 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26985 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26987 /* RTM */
26988 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26989 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26990 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26991 };
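/* Illustrative only: bdesc_special_args entries surface to users through the
   intrinsic headers.  Assuming the usual <xmmintrin.h> wrapper, the
   __builtin_ia32_storeups entry above is what backs code such as

     void store (float *p, __m128 v) { _mm_storeu_ps (p, v); }

   which ends up expanding through IX86_BUILTIN_STOREUPS and
   CODE_FOR_sse_storeups.  */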
26993 /* Builtins with variable number of arguments. */
26994 static const struct builtin_description bdesc_args[] =
26995 {
26996 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26997 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26998 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26999 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27000 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27001 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27002 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27004 /* MMX */
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27027 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27037 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27043 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27044 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27049 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27050 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27051 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27052 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27053 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27054 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27056 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27057 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27058 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27059 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27060 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27061 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27063 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27064 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27065 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27066 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27068 /* 3DNow! */
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27070 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27071 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27072 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27074 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27075 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27076 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27077 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27078 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27079 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27080 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27081 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27082 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27083 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27084 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27085 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27086 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27087 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27088 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27090 /* 3DNow!A */
27091 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27092 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27093 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27094 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27095 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27096 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27098 /* SSE */
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27107 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27110 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27131 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27137 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27140 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27142 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27144 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27147 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27148 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27149 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27151 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27152 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27154 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27158 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27159 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27161 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27162 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27164 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27165 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27166 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27168 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27170 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27171 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27172 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27174 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27175 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27177 /* SSE MMX or 3DNow!A */
27178 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27179 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27180 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27182 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27183 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27184 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27185 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27187 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27188 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27190 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27192 /* SSE2 */
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27211 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27212 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27310 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27314 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27317 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27329 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27333 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27335 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27338 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27343 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27344 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27346 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27347 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27349 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27350 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27351 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27352 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27360   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
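/* The SSE2 descriptors above back the <emmintrin.h> intrinsics; e.g.
   _mm_adds_epu8 typically expands to __builtin_ia32_paddusb128.  The
   *_SI_COUNT function types mark the scalar-count shift forms (psllwi128
   and friends), while the *_V*_COUNT types mark the forms that take the
   shift count in an XMM register (psllw128 and friends).  */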
27362 /* SSE2 MMX */
27363 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27364 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27366 /* SSE3 */
27367 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27368 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27370 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27371 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27372 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27373 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27374 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27375 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
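/* These SSE3 descriptors back the <pmmintrin.h> intrinsics; e.g.
   _mm_hadd_ps is typically implemented on top of __builtin_ia32_haddps and
   _mm_addsub_pd on top of __builtin_ia32_addsubpd.  */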
27377 /* SSSE3 */
27378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27379 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27391 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27394 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27395 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27396 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27397 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27398 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27399 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27400 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27401 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27402 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27403 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27404 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27405 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27406 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27407 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27408 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27410   /* SSSE3 palignr.  */
27411 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27412 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
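/* Each SSSE3 operation above appears twice: once for the 128-bit XMM form
   (e.g. __builtin_ia32_pshufb128, behind _mm_shuffle_epi8) and once for the
   64-bit MMX form (e.g. __builtin_ia32_pshufb, behind _mm_shuffle_pi8),
   both declared via <tmmintrin.h>.  */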
27414 /* SSE4.1 */
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27420 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27434 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27435 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27436 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27437 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27438 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27440 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27441 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27442 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27443 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27444 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27445 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27446 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27447 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27448 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27449 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27450 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27451 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
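/* The SSE4.1 descriptors above back <smmintrin.h>; e.g. the pmovsx/pmovzx
   entries are what the _mm_cvtepi8_epi16 / _mm_cvtepu8_epi16 family of
   sign/zero-extension intrinsics typically expand to.  */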
27453   /* SSE4.1 rounding and packed test.  */
27454 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27455 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27456 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27459 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27461 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27462 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27464 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27465 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27467 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27468 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27470 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27471 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27472 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27473 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27475 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27476 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27478 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27479 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27481 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27482 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27483 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
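/* The floor/ceil/trunc/rint variants above reuse the generic
   CODE_FOR_sse4_1_roundpd/roundps patterns; the rounding selector
   (ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC, ROUND_MXCSR) is carried in the
   comparison-code slot, cast to enum rtx_code.  The ptest entries use the
   same slot (EQ, LTU, GTU) to choose which condition the packed-test result
   is read through.  */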
27485 /* SSE4.2 */
27486 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27487 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27488 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27489 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27490 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
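/* The crc32 descriptors above back the _mm_crc32_u8/_u16/_u32 intrinsics
   from <smmintrin.h>, with the _u64 form additionally gated on 64-bit
   targets via OPTION_MASK_ISA_64BIT.  */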
27492 /* SSE4A */
27493 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27494 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27495 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27496 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27498 /* AES */
27499 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27500 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27502 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27503 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27504 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27505 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27507 /* PCLMUL */
27508 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27510 /* AVX */
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27606 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27623 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27641 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27646 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27647 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27649 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
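/* The 256-bit AVX descriptors above back the <immintrin.h>/<avxintrin.h>
   intrinsics; e.g. _mm256_movemask_pd typically expands to
   __builtin_ia32_movmskpd256 and _mm256_dp_ps to __builtin_ia32_dpps256.  */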
27651 /* AVX2 */
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
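/* Editorial aside (not in the original source, hedged): each row above is
   only a table entry; ix86_init_mmx_sse_builtins later walks bdesc_args and
   registers every row via def_builtin_const (d->mask, d->name, ftype,
   d->code).  As a rough illustration, the V4SI_FTYPE_V4SI_V4SI row for
   "__builtin_ia32_psrlv4si" permits user code such as the following once
   -mavx2 is in effect (the wrapper and variable names are hypothetical):

     typedef int v4si __attribute__ ((vector_size (16)));

     v4si
     shift_each_lane (v4si vals, v4si counts)
     {
       return __builtin_ia32_psrlv4si (vals, counts);
     }
*/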
27799 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27801 /* BMI */
27802 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27803 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27804 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27806 /* TBM */
27807 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27808 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27810 /* F16C */
27811 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27812 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27813 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27814 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27816 /* BMI2 */
27817 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27818 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27819 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27820 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27821 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27822 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27825 /* FMA4 and XOP. */
27826 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27827 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27828 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27829 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27830 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27831 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27832 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27833 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27834 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27835 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27836 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27837 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27838 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27839 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27840 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27841 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27842 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27843 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27844 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27845 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27846 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27847 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27848 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27849 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27850 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27851 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27852 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27853 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27854 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27855 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27856 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27857 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27858 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27859 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27860 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27861 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27862 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27863 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27864 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27865 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27866 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27867 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27868 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27869 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27870 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27871 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27872 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27873 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27874 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27875 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27876 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27877 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
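/* Editorial aside (hedged, not in the original source): the MULTI_ARG_*
   macros above are only readable aliases for ix86_builtin_func_type values
   used in bdesc_multi_arg below.  For example, MULTI_ARG_3_SF stands for
   V4SF_FTYPE_V4SF_V4SF_V4SF, so an entry tagged with it, such as
   "__builtin_ia32_vfmaddps", takes three 128-bit float vectors and returns
   one.  A sketch of a call, assuming -mfma4 or -mfma is in effect (the
   wrapper and variable names are hypothetical):

     typedef float v4sf __attribute__ ((vector_size (16)));

     v4sf
     fused_madd (v4sf a, v4sf b, v4sf c)
     {
       return __builtin_ia32_vfmaddps (a, b, c);
     }
*/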
27879 static const struct builtin_description bdesc_multi_arg[] =
27881 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27882 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27883 UNKNOWN, (int)MULTI_ARG_3_SF },
27884 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27885 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27886 UNKNOWN, (int)MULTI_ARG_3_DF },
27888 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27889 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27890 UNKNOWN, (int)MULTI_ARG_3_SF },
27891 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27892 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27893 UNKNOWN, (int)MULTI_ARG_3_DF },
27895 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27896 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27897 UNKNOWN, (int)MULTI_ARG_3_SF },
27898 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27899 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27900 UNKNOWN, (int)MULTI_ARG_3_DF },
27901 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27902 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27903 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27904 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27905 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27906 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27908 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27909 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27910 UNKNOWN, (int)MULTI_ARG_3_SF },
27911 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27912 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27913 UNKNOWN, (int)MULTI_ARG_3_DF },
27914 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27915 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27916 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27917 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27918 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27919 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_1_SF },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_1_DF },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28081 /* TM vector builtins. */
28083 /* Reuse the existing x86-specific `struct builtin_description' because
28084    we're lazy.  Add casts to make them fit.  */
28085 static const struct builtin_description bdesc_tm[] =
28087 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28088 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28089 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28090 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28091 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28092 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28093 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28095 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28096 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28097 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28098 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28099 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28100 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28101 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28103 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28104 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28105 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28106 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28107 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28108 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28109 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28111 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28112 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28113 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
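/* Editorial note (hedged): the codes stored in bdesc_tm are generic
   BUILT_IN_TM_* values squeezed into the x86 table by the casts above;
   ix86_init_tm_builtins below simply casts them back before registering
   the decls, roughly:

     enum built_in_function code = (enum built_in_function) d->code;
     ...
     set_builtin_decl (code, decl, false);
*/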
28116 /* TM callbacks. */
28118 /* Return the builtin decl needed to load a vector of TYPE. */
28120 static tree
28121 ix86_builtin_tm_load (tree type)
28123 if (TREE_CODE (type) == VECTOR_TYPE)
28125 switch (tree_low_cst (TYPE_SIZE (type), 1))
28127 case 64:
28128 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28129 case 128:
28130 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28131 case 256:
28132 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28135 return NULL_TREE;
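/* Editorial illustration (hedged; the typedef is hypothetical): for a
   vector type such as

     typedef float v4sf __attribute__ ((vector_size (16)));

   TYPE_SIZE is 128 bits, so the switch above yields the decl registered
   for BUILT_IN_TM_LOAD_M128, i.e. the "__builtin__ITM_RM128" entry in
   bdesc_tm.  Any other size falls through to NULL_TREE and the TM pass
   must instrument the access some other way.  */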
28138 /* Return the builtin decl needed to store a vector of TYPE. */
28140 static tree
28141 ix86_builtin_tm_store (tree type)
28143 if (TREE_CODE (type) == VECTOR_TYPE)
28145 switch (tree_low_cst (TYPE_SIZE (type), 1))
28147 case 64:
28148 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28149 case 128:
28150 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28151 case 256:
28152 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28155 return NULL_TREE;
28158 /* Initialize the transactional memory vector load/store builtins. */
28160 static void
28161 ix86_init_tm_builtins (void)
28163 enum ix86_builtin_func_type ftype;
28164 const struct builtin_description *d;
28165 size_t i;
28166 tree decl;
28167 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28168 tree attrs_log, attrs_type_log;
28170 if (!flag_tm)
28171 return;
28173 /* If there are no builtins defined, we must be compiling in a
28174 language without trans-mem support. */
28175 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28176 return;
28178 /* Use whatever attributes a normal TM load has. */
28179 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28180 attrs_load = DECL_ATTRIBUTES (decl);
28181 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28182 /* Use whatever attributes a normal TM store has. */
28183 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28184 attrs_store = DECL_ATTRIBUTES (decl);
28185 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28186 /* Use whatever attributes a normal TM log has. */
28187 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28188 attrs_log = DECL_ATTRIBUTES (decl);
28189 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28191 for (i = 0, d = bdesc_tm;
28192 i < ARRAY_SIZE (bdesc_tm);
28193 i++, d++)
28195 if ((d->mask & ix86_isa_flags) != 0
28196 || (lang_hooks.builtin_function
28197 == lang_hooks.builtin_function_ext_scope))
28199 tree type, attrs, attrs_type;
28200 enum built_in_function code = (enum built_in_function) d->code;
28202 ftype = (enum ix86_builtin_func_type) d->flag;
28203 type = ix86_get_builtin_func_type (ftype);
28205 if (BUILTIN_TM_LOAD_P (code))
28207 attrs = attrs_load;
28208 attrs_type = attrs_type_load;
28210 else if (BUILTIN_TM_STORE_P (code))
28212 attrs = attrs_store;
28213 attrs_type = attrs_type_store;
28215 else
28217 attrs = attrs_log;
28218 attrs_type = attrs_type_log;
28220 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28221 /* The builtin without the prefix for
28222 calling it directly. */
28223 d->name + strlen ("__builtin_"),
28224 attrs);
28225 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28226 set the TYPE_ATTRIBUTES. */
28227 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28229 set_builtin_decl (code, decl, false);
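/* Editorial note (hedged): for the "__builtin__ITM_WM128" entry, the
   expression d->name + strlen ("__builtin_") is "_ITM_WM128", so the call
   above amounts to

     decl = add_builtin_function ("__builtin__ITM_WM128", type,
                                  BUILT_IN_TM_STORE_M128, BUILT_IN_NORMAL,
                                  "_ITM_WM128", attrs);

   which also exposes the builtin under its libitm ABI name.  */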
28234 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28235    not in the current target ISA, so that the user can compile particular
28236    modules with target-specific options that differ from the command-line
28237    options.  */
28238 static void
28239 ix86_init_mmx_sse_builtins (void)
28241 const struct builtin_description * d;
28242 enum ix86_builtin_func_type ftype;
28243 size_t i;
28245 /* Add all special builtins with variable number of operands. */
28246 for (i = 0, d = bdesc_special_args;
28247 i < ARRAY_SIZE (bdesc_special_args);
28248 i++, d++)
28250 if (d->name == 0)
28251 continue;
28253 ftype = (enum ix86_builtin_func_type) d->flag;
28254 def_builtin (d->mask, d->name, ftype, d->code);
28257 /* Add all builtins with variable number of operands. */
28258 for (i = 0, d = bdesc_args;
28259 i < ARRAY_SIZE (bdesc_args);
28260 i++, d++)
28262 if (d->name == 0)
28263 continue;
28265 ftype = (enum ix86_builtin_func_type) d->flag;
28266 def_builtin_const (d->mask, d->name, ftype, d->code);
28269 /* pcmpestr[im] insns. */
28270 for (i = 0, d = bdesc_pcmpestr;
28271 i < ARRAY_SIZE (bdesc_pcmpestr);
28272 i++, d++)
28274 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28275 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28276 else
28277 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28278 def_builtin_const (d->mask, d->name, ftype, d->code);
28281 /* pcmpistr[im] insns. */
28282 for (i = 0, d = bdesc_pcmpistr;
28283 i < ARRAY_SIZE (bdesc_pcmpistr);
28284 i++, d++)
28286 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28287 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28288 else
28289 ftype = INT_FTYPE_V16QI_V16QI_INT;
28290 def_builtin_const (d->mask, d->name, ftype, d->code);
28293 /* comi/ucomi insns. */
28294 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28296 if (d->mask == OPTION_MASK_ISA_SSE2)
28297 ftype = INT_FTYPE_V2DF_V2DF;
28298 else
28299 ftype = INT_FTYPE_V4SF_V4SF;
28300 def_builtin_const (d->mask, d->name, ftype, d->code);
28303 /* SSE */
28304 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28305 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28306 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28307 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28309 /* SSE or 3DNow!A */
28310 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28311 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28312 IX86_BUILTIN_MASKMOVQ);
28314 /* SSE2 */
28315 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28316 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28318 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28319 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28320 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28321 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28323 /* SSE3. */
28324 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28325 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28326 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28327 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28329 /* AES */
28330 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28331 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28332 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28333 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28334 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28335 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28336 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28337 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28338 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28339 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28340 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28341 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28343 /* PCLMUL */
28344 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28345 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28347 /* RDRND */
28348 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28349 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28350 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28351 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28352 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28353 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28354 IX86_BUILTIN_RDRAND64_STEP);
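/* Editorial usage sketch (hedged; the variable and consume () are
   hypothetical): the *_step builtins return nonzero on success and write
   the random value through their pointer argument, matching
   INT_FTYPE_PUNSIGNED above:

     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       consume (r);
*/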
28356 /* AVX2 */
28357 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28358 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28359 IX86_BUILTIN_GATHERSIV2DF);
28361 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28362 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28363 IX86_BUILTIN_GATHERSIV4DF);
28365 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28366 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28367 IX86_BUILTIN_GATHERDIV2DF);
28369 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28370 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28371 IX86_BUILTIN_GATHERDIV4DF);
28373 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28374 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28375 IX86_BUILTIN_GATHERSIV4SF);
28377 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28378 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28379 IX86_BUILTIN_GATHERSIV8SF);
28381 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28382 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28383 IX86_BUILTIN_GATHERDIV4SF);
28385 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28386 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28387 IX86_BUILTIN_GATHERDIV8SF);
28389 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28390 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28391 IX86_BUILTIN_GATHERSIV2DI);
28393 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28394 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28395 IX86_BUILTIN_GATHERSIV4DI);
28397 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28398 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28399 IX86_BUILTIN_GATHERDIV2DI);
28401 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28402 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28403 IX86_BUILTIN_GATHERDIV4DI);
28405 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28406 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28407 IX86_BUILTIN_GATHERSIV4SI);
28409 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28410 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28411 IX86_BUILTIN_GATHERSIV8SI);
28413 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28414 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28415 IX86_BUILTIN_GATHERDIV4SI);
28417 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28418 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28419 IX86_BUILTIN_GATHERDIV8SI);
28421 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28422 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28423 IX86_BUILTIN_GATHERALTSIV4DF);
28425 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28426 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28427 IX86_BUILTIN_GATHERALTDIV8SF);
28429 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28430 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28431 IX86_BUILTIN_GATHERALTSIV4DI);
28433 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28434 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28435 IX86_BUILTIN_GATHERALTDIV8SI);
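/* Editorial note (hedged): the gather function types read in operand order
   as (source/merge vector, const element pointer, index vector, mask
   vector, scale immediate), which is how the avx2intrin.h gather
   intrinsics appear to pass their arguments down.  A sketch with
   hypothetical wrapper and variable names, assuming -mavx2:

     typedef double v2df __attribute__ ((vector_size (16)));
     typedef int v4si __attribute__ ((vector_size (16)));

     v2df
     gather_two (v2df src, const double *base, v4si idx, v2df mask)
     {
       return __builtin_ia32_gathersiv2df (src, base, idx, mask, 8);
     }
*/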
28437 /* RTM. */
28438 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28439 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28441 /* MMX access to the vec_init patterns. */
28442 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28443 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28445 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28446 V4HI_FTYPE_HI_HI_HI_HI,
28447 IX86_BUILTIN_VEC_INIT_V4HI);
28449 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28450 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28451 IX86_BUILTIN_VEC_INIT_V8QI);
28453 /* Access to the vec_extract patterns. */
28454 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28455 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28456 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28457 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28458 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28459 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28460 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28461 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28462 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28463 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28465 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28466 "__builtin_ia32_vec_ext_v4hi",
28467 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28469 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28470 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28472 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28473 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28475 /* Access to the vec_set patterns. */
28476 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28477 "__builtin_ia32_vec_set_v2di",
28478 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28480 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28481 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28483 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28484 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28486 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28487 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28489 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28490 "__builtin_ia32_vec_set_v4hi",
28491 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28493 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28494 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28496 /* RDSEED */
28497 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28498 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28499 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28500 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28501 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28502 "__builtin_ia32_rdseed_di_step",
28503 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28505 /* ADCX */
28506 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28507 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28508 def_builtin (OPTION_MASK_ISA_64BIT,
28509 "__builtin_ia32_addcarryx_u64",
28510 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28511 IX86_BUILTIN_ADDCARRYX64);
28513   /* Add the FMA4/FMA and XOP multi-arg builtins.  */
28514 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28516 if (d->name == 0)
28517 continue;
28519 ftype = (enum ix86_builtin_func_type) d->flag;
28520 def_builtin_const (d->mask, d->name, ftype, d->code);
28524 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28525 to return a pointer to VERSION_DECL if the outcome of the expression
28526 formed by PREDICATE_CHAIN is true. This function will be called during
28527 version dispatch to decide which function version to execute. It returns
28528 the basic block at the end, to which more conditions can be added. */
28530 static basic_block
28531 add_condition_to_bb (tree function_decl, tree version_decl,
28532 tree predicate_chain, basic_block new_bb)
28534 gimple return_stmt;
28535 tree convert_expr, result_var;
28536 gimple convert_stmt;
28537 gimple call_cond_stmt;
28538 gimple if_else_stmt;
28540 basic_block bb1, bb2, bb3;
28541 edge e12, e23;
28543 tree cond_var, and_expr_var = NULL_TREE;
28544 gimple_seq gseq;
28546 tree predicate_decl, predicate_arg;
28548 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28550 gcc_assert (new_bb != NULL);
28551 gseq = bb_seq (new_bb);
28554 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28555 build_fold_addr_expr (version_decl));
28556 result_var = create_tmp_var (ptr_type_node, NULL);
28557 convert_stmt = gimple_build_assign (result_var, convert_expr);
28558 return_stmt = gimple_build_return (result_var);
28560 if (predicate_chain == NULL_TREE)
28562 gimple_seq_add_stmt (&gseq, convert_stmt);
28563 gimple_seq_add_stmt (&gseq, return_stmt);
28564 set_bb_seq (new_bb, gseq);
28565 gimple_set_bb (convert_stmt, new_bb);
28566 gimple_set_bb (return_stmt, new_bb);
28567 pop_cfun ();
28568 return new_bb;
28571 while (predicate_chain != NULL)
28573 cond_var = create_tmp_var (integer_type_node, NULL);
28574 predicate_decl = TREE_PURPOSE (predicate_chain);
28575 predicate_arg = TREE_VALUE (predicate_chain);
28576 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28577 gimple_call_set_lhs (call_cond_stmt, cond_var);
28579 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28580 gimple_set_bb (call_cond_stmt, new_bb);
28581 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28583 predicate_chain = TREE_CHAIN (predicate_chain);
28585 if (and_expr_var == NULL)
28586 and_expr_var = cond_var;
28587 else
28589 gimple assign_stmt;
28590 	  /* Use MIN_EXPR to check whether any integer is zero:
28591 	     and_expr_var = min_expr <cond_var, and_expr_var>.  */
28592 assign_stmt = gimple_build_assign (and_expr_var,
28593 build2 (MIN_EXPR, integer_type_node,
28594 cond_var, and_expr_var));
28596 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28597 gimple_set_bb (assign_stmt, new_bb);
28598 gimple_seq_add_stmt (&gseq, assign_stmt);
28602 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28603 integer_zero_node,
28604 NULL_TREE, NULL_TREE);
28605 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28606 gimple_set_bb (if_else_stmt, new_bb);
28607 gimple_seq_add_stmt (&gseq, if_else_stmt);
28609 gimple_seq_add_stmt (&gseq, convert_stmt);
28610 gimple_seq_add_stmt (&gseq, return_stmt);
28611 set_bb_seq (new_bb, gseq);
28613 bb1 = new_bb;
28614 e12 = split_block (bb1, if_else_stmt);
28615 bb2 = e12->dest;
28616 e12->flags &= ~EDGE_FALLTHRU;
28617 e12->flags |= EDGE_TRUE_VALUE;
28619 e23 = split_block (bb2, return_stmt);
28621 gimple_set_bb (convert_stmt, bb2);
28622 gimple_set_bb (return_stmt, bb2);
28624 bb3 = e23->dest;
28625 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28627 remove_edge (e23);
28628 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28630 pop_cfun ();
28632 return bb3;
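/* Editorial sketch (hedged): for one (predicate, version) pair the code
   built above has roughly this shape (pseudo-GIMPLE, not a literal dump;
   when several predicates are chained, each result is folded in with
   MIN_EXPR as shown):

     cond_1 = predicate_decl (predicate_arg);
     and_2 = MIN_EXPR <cond_1, and_prev>;
     if (and_2 > 0)
       return (void *) &version_decl;

   and the false edge leads to bb3, where the next condition is added.  */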
28635 /* This parses the attribute arguments to target in DECL and determines
28636 the right builtin to use to match the platform specification.
28637 It returns the priority value for this version decl. If PREDICATE_LIST
28638 is not NULL, it stores the list of cpu features that need to be checked
28639 before dispatching this function. */
28641 static unsigned int
28642 get_builtin_code_for_version (tree decl, tree *predicate_list)
28644 tree attrs;
28645 struct cl_target_option cur_target;
28646 tree target_node;
28647 struct cl_target_option *new_target;
28648 const char *arg_str = NULL;
28649 const char *attrs_str = NULL;
28650 char *tok_str = NULL;
28651 char *token;
28653   /* Priority of i386 features; a greater value means a higher priority.  This is
28654 used to decide the order in which function dispatch must happen. For
28655 instance, a version specialized for SSE4.2 should be checked for dispatch
28656 before a version for SSE3, as SSE4.2 implies SSE3. */
28657 enum feature_priority
28659 P_ZERO = 0,
28660 P_MMX,
28661 P_SSE,
28662 P_SSE2,
28663 P_SSE3,
28664 P_SSSE3,
28665 P_PROC_SSSE3,
28666 P_SSE4_a,
28667 P_PROC_SSE4_a,
28668 P_SSE4_1,
28669 P_SSE4_2,
28670 P_PROC_SSE4_2,
28671 P_POPCNT,
28672 P_AVX,
28673 P_AVX2,
28674 P_FMA,
28675 P_PROC_FMA
28678 enum feature_priority priority = P_ZERO;
28680 /* These are the target attribute strings for which a dispatcher is
28681 available, from fold_builtin_cpu. */
28683 static struct _feature_list
28685 const char *const name;
28686 const enum feature_priority priority;
28688 const feature_list[] =
28690 {"mmx", P_MMX},
28691 {"sse", P_SSE},
28692 {"sse2", P_SSE2},
28693 {"sse3", P_SSE3},
28694 {"ssse3", P_SSSE3},
28695 {"sse4.1", P_SSE4_1},
28696 {"sse4.2", P_SSE4_2},
28697 {"popcnt", P_POPCNT},
28698 {"avx", P_AVX},
28699 {"avx2", P_AVX2}
28703 static unsigned int NUM_FEATURES
28704 = sizeof (feature_list) / sizeof (struct _feature_list);
28706 unsigned int i;
28708 tree predicate_chain = NULL_TREE;
28709 tree predicate_decl, predicate_arg;
28711 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28712 gcc_assert (attrs != NULL);
28714 attrs = TREE_VALUE (TREE_VALUE (attrs));
28716 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28717 attrs_str = TREE_STRING_POINTER (attrs);
28719 /* Return priority zero for default function. */
28720 if (strcmp (attrs_str, "default") == 0)
28721 return 0;
28723 /* Handle arch= if specified. For priority, set it to be 1 more than
28724 the best instruction set the processor can handle. For instance, if
28725 there is a version for atom and a version for ssse3 (the highest ISA
28726 priority for atom), the atom version must be checked for dispatch
28727 before the ssse3 version. */
28728 if (strstr (attrs_str, "arch=") != NULL)
28730 cl_target_option_save (&cur_target, &global_options);
28731 target_node = ix86_valid_target_attribute_tree (attrs);
28733 gcc_assert (target_node);
28734 new_target = TREE_TARGET_OPTION (target_node);
28735 gcc_assert (new_target);
28737 if (new_target->arch_specified && new_target->arch > 0)
28739 switch (new_target->arch)
28741 case PROCESSOR_CORE2:
28742 arg_str = "core2";
28743 priority = P_PROC_SSSE3;
28744 break;
28745 case PROCESSOR_COREI7:
28746 arg_str = "corei7";
28747 priority = P_PROC_SSE4_2;
28748 break;
28749 case PROCESSOR_ATOM:
28750 arg_str = "atom";
28751 priority = P_PROC_SSSE3;
28752 break;
28753 case PROCESSOR_AMDFAM10:
28754 arg_str = "amdfam10h";
28755 priority = P_PROC_SSE4_a;
28756 break;
28757 case PROCESSOR_BDVER1:
28758 arg_str = "bdver1";
28759 priority = P_PROC_FMA;
28760 break;
28761 case PROCESSOR_BDVER2:
28762 arg_str = "bdver2";
28763 priority = P_PROC_FMA;
28764 break;
28768 cl_target_option_restore (&global_options, &cur_target);
28770 if (predicate_list && arg_str == NULL)
28772 error_at (DECL_SOURCE_LOCATION (decl),
28773 "No dispatcher found for the versioning attributes");
28774 return 0;
28777 if (predicate_list)
28779 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28780 /* For a C string literal the length includes the trailing NULL. */
28781 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28782 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28783 predicate_chain);
28787 /* Process feature name. */
28788 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28789 strcpy (tok_str, attrs_str);
28790 token = strtok (tok_str, ",");
28791 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28793 while (token != NULL)
28795 /* Do not process "arch=" */
28796 if (strncmp (token, "arch=", 5) == 0)
28798 token = strtok (NULL, ",");
28799 continue;
28801 for (i = 0; i < NUM_FEATURES; ++i)
28803 if (strcmp (token, feature_list[i].name) == 0)
28805 if (predicate_list)
28807 predicate_arg = build_string_literal (
28808 strlen (feature_list[i].name) + 1,
28809 feature_list[i].name);
28810 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28811 predicate_chain);
28813 /* Find the maximum priority feature. */
28814 if (feature_list[i].priority > priority)
28815 priority = feature_list[i].priority;
28817 break;
28820 if (predicate_list && i == NUM_FEATURES)
28822 error_at (DECL_SOURCE_LOCATION (decl),
28823 "No dispatcher found for %s", token);
28824 return 0;
28826 token = strtok (NULL, ",");
28828 free (tok_str);
28830 if (predicate_list && predicate_chain == NULL_TREE)
28832 error_at (DECL_SOURCE_LOCATION (decl),
28833 "No dispatcher found for the versioning attributes : %s",
28834 attrs_str);
28835 return 0;
28837 else if (predicate_list)
28839 predicate_chain = nreverse (predicate_chain);
28840 *predicate_list = predicate_chain;
28843 return priority;
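/* Hedged examples of the mapping implemented above (the declarations are
   illustrative only):

     int foo (void) __attribute__ ((target ("arch=atom")));

   yields the predicate __builtin_cpu_is ("atom") and priority
   P_PROC_SSSE3, whereas target ("sse4.2") yields
   __builtin_cpu_supports ("sse4.2") and priority P_SSE4_2.  */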
28846 /* This compares the priority of target features in function DECL1
28847 and DECL2. It returns positive value if DECL1 is higher priority,
28848 negative value if DECL2 is higher priority and 0 if they are the
28849 same. */
28851 static int
28852 ix86_compare_version_priority (tree decl1, tree decl2)
28854 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
28855 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
28857 return (int)priority1 - (int)priority2;
28860 /* V1 and V2 point to function versions with different priorities
28861 based on the target ISA. This function compares their priorities. */
28863 static int
28864 feature_compare (const void *v1, const void *v2)
28866 typedef struct _function_version_info
28868 tree version_decl;
28869 tree predicate_chain;
28870 unsigned int dispatch_priority;
28871 } function_version_info;
28873 const function_version_info c1 = *(const function_version_info *)v1;
28874 const function_version_info c2 = *(const function_version_info *)v2;
28875 return (c2.dispatch_priority - c1.dispatch_priority);
28878 /* This function generates the dispatch function for
28879 multi-versioned functions. DISPATCH_DECL is the function which will
28880 contain the dispatch logic. FNDECLS are the function choices for
28881 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28882 in DISPATCH_DECL in which the dispatch code is generated. */
28884 static int
28885 dispatch_function_versions (tree dispatch_decl,
28886 void *fndecls_p,
28887 basic_block *empty_bb)
28889 tree default_decl;
28890 gimple ifunc_cpu_init_stmt;
28891 gimple_seq gseq;
28892 int ix;
28893 tree ele;
28894 vec<tree> *fndecls;
28895 unsigned int num_versions = 0;
28896 unsigned int actual_versions = 0;
28897 unsigned int i;
28899 struct _function_version_info
28901 tree version_decl;
28902 tree predicate_chain;
28903 unsigned int dispatch_priority;
28904 }*function_version_info;
28906 gcc_assert (dispatch_decl != NULL
28907 && fndecls_p != NULL
28908 && empty_bb != NULL);
28910 /* fndecls_p is actually a vector.  */
28911 fndecls = static_cast<vec<tree> *> (fndecls_p);
28913 /* At least one more version other than the default. */
28914 num_versions = fndecls->length ();
28915 gcc_assert (num_versions >= 2);
28917 function_version_info = (struct _function_version_info *)
28918 XNEWVEC (struct _function_version_info, (num_versions - 1));
28920 /* The first version in the vector is the default decl. */
28921 default_decl = (*fndecls)[0];
28923 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28925 gseq = bb_seq (*empty_bb);
28926 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28927 constructors, so explicitly call __builtin_cpu_init here.  */
28928 ifunc_cpu_init_stmt = gimple_build_call_vec (
28929 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28930 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28931 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28932 set_bb_seq (*empty_bb, gseq);
28934 pop_cfun ();
28937 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28939 tree version_decl = ele;
28940 tree predicate_chain = NULL_TREE;
28941 unsigned int priority;
28942 /* Get attribute string, parse it and find the right predicate decl.
28943 The predicate function could be a lengthy combination of many
28944 features, like arch-type and various isa-variants. */
28945 priority = get_builtin_code_for_version (version_decl,
28946 &predicate_chain);
28948 if (predicate_chain == NULL_TREE)
28949 continue;
28951 function_version_info [actual_versions].version_decl = version_decl;
28952 function_version_info [actual_versions].predicate_chain
28953 = predicate_chain;
28954 function_version_info [actual_versions].dispatch_priority = priority;
28955 actual_versions++;
28958 /* Sort the versions according to descending order of dispatch priority. The
28959 priority is based on the ISA. This is not a perfect solution. There
28960 could still be ambiguity. If more than one function version is suitable
28961 to execute, which one should be dispatched?  In the future, allow the user
28962 to specify a dispatch priority next to the version. */
28963 qsort (function_version_info, actual_versions,
28964 sizeof (struct _function_version_info), feature_compare);
28966 for (i = 0; i < actual_versions; ++i)
28967 *empty_bb = add_condition_to_bb (dispatch_decl,
28968 function_version_info[i].version_decl,
28969 function_version_info[i].predicate_chain,
28970 *empty_bb);
28972 /* dispatch default version at the end. */
28973 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28974 NULL, *empty_bb);
28976 free (function_version_info);
28977 return 0;
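/* Taken together with add_condition_to_bb, the resolver body generated
   here is conceptually equivalent to the following sketch (the version
   names foo.avx2 and foo.sse4_2 are assumptions; the real assembler
   names come from the mangling code below):

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         return &foo.avx2;
       if (__builtin_cpu_supports ("sse4.2"))
         return &foo.sse4_2;
       return &foo;
     }

   The default version is always dispatched last.  */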
28980 /* Comparator function to be used in qsort routine to sort attribute
28981 specification strings to "target". */
28983 static int
28984 attr_strcmp (const void *v1, const void *v2)
28986 const char *c1 = *(char *const*)v1;
28987 const char *c2 = *(char *const*)v2;
28988 return strcmp (c1, c2);
28991 /* ARGLIST is the argument to target attribute. This function tokenizes
28992 the comma separated arguments, sorts them and returns a string which
28993 is a unique identifier for the comma separated arguments. It also
28994 replaces non-identifier characters "=,-" with "_". */
28996 static char *
28997 sorted_attr_string (tree arglist)
28999 tree arg;
29000 size_t str_len_sum = 0;
29001 char **args = NULL;
29002 char *attr_str, *ret_str;
29003 char *attr = NULL;
29004 unsigned int argnum = 1;
29005 unsigned int i;
29007 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29009 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29010 size_t len = strlen (str);
29011 str_len_sum += len + 1;
29012 if (arg != arglist)
29013 argnum++;
29014 for (i = 0; i < strlen (str); i++)
29015 if (str[i] == ',')
29016 argnum++;
29019 attr_str = XNEWVEC (char, str_len_sum);
29020 str_len_sum = 0;
29021 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29023 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29024 size_t len = strlen (str);
29025 memcpy (attr_str + str_len_sum, str, len);
29026 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29027 str_len_sum += len + 1;
29030 /* Replace "=,-" with "_". */
29031 for (i = 0; i < strlen (attr_str); i++)
29032 if (attr_str[i] == '=' || attr_str[i]== '-')
29033 attr_str[i] = '_';
29035 if (argnum == 1)
29036 return attr_str;
29038 args = XNEWVEC (char *, argnum);
29040 i = 0;
29041 attr = strtok (attr_str, ",");
29042 while (attr != NULL)
29044 args[i] = attr;
29045 i++;
29046 attr = strtok (NULL, ",");
29049 qsort (args, argnum, sizeof (char *), attr_strcmp);
29051 ret_str = XNEWVEC (char, str_len_sum);
29052 str_len_sum = 0;
29053 for (i = 0; i < argnum; i++)
29055 size_t len = strlen (args[i]);
29056 memcpy (ret_str + str_len_sum, args[i], len);
29057 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29058 str_len_sum += len + 1;
29061 XDELETEVEC (args);
29062 XDELETEVEC (attr_str);
29063 return ret_str;
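/* For example (a hedged illustration): the arguments of
   target ("avx,arch=core2") tokenize to "avx" and "arch_core2", which
   sort and join to the identifier "arch_core2_avx".  */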
29066 /* This function changes the assembler name for functions that are
29067 versions. If DECL is a function version and has a "target"
29068 attribute, it appends the attribute string to its assembler name. */
29070 static tree
29071 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29073 tree version_attr;
29074 const char *orig_name, *version_string;
29075 char *attr_str, *assembler_name;
29077 if (DECL_DECLARED_INLINE_P (decl)
29078 && lookup_attribute ("gnu_inline",
29079 DECL_ATTRIBUTES (decl)))
29080 error_at (DECL_SOURCE_LOCATION (decl),
29081 "Function versions cannot be marked as gnu_inline,"
29082 " bodies have to be generated");
29084 if (DECL_VIRTUAL_P (decl)
29085 || DECL_VINDEX (decl))
29086 sorry ("Virtual function multiversioning not supported");
29088 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29090 /* target attribute string cannot be NULL. */
29091 gcc_assert (version_attr != NULL_TREE);
29093 orig_name = IDENTIFIER_POINTER (id);
29094 version_string
29095 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29097 if (strcmp (version_string, "default") == 0)
29098 return id;
29100 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29101 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29103 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29105 /* Allow assembler name to be modified if already set. */
29106 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29107 SET_DECL_RTL (decl, NULL);
29109 tree ret = get_identifier (assembler_name);
29110 XDELETEVEC (attr_str);
29111 XDELETEVEC (assembler_name);
29112 return ret;
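/* Continuing the example above (illustrative only): a version of foo
   declared with target ("avx,arch=core2") is emitted under the
   assembler name "foo.arch_core2_avx", while the "default" version
   keeps its original assembler name.  */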
29115 /* This function returns true if FN1 and FN2 are versions of the same function,
29116 that is, the target strings of the function decls are different. This assumes
29117 that FN1 and FN2 have the same signature. */
29119 static bool
29120 ix86_function_versions (tree fn1, tree fn2)
29122 tree attr1, attr2;
29123 char *target1, *target2;
29124 bool result;
29126 if (TREE_CODE (fn1) != FUNCTION_DECL
29127 || TREE_CODE (fn2) != FUNCTION_DECL)
29128 return false;
29130 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29131 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29133 /* At least one function decl should have the target attribute specified. */
29134 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29135 return false;
29137 /* Diagnose missing target attribute if one of the decls is already
29138 multi-versioned. */
29139 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29141 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29143 if (attr2 != NULL_TREE)
29145 tree tem = fn1;
29146 fn1 = fn2;
29147 fn2 = tem;
29148 attr1 = attr2;
29150 error_at (DECL_SOURCE_LOCATION (fn2),
29151 "missing %<target%> attribute for multi-versioned %D",
29152 fn2);
29153 error_at (DECL_SOURCE_LOCATION (fn1),
29154 "previous declaration of %D", fn1);
29155 /* Prevent diagnosing of the same error multiple times. */
29156 DECL_ATTRIBUTES (fn2)
29157 = tree_cons (get_identifier ("target"),
29158 copy_node (TREE_VALUE (attr1)),
29159 DECL_ATTRIBUTES (fn2));
29161 return false;
29164 target1 = sorted_attr_string (TREE_VALUE (attr1));
29165 target2 = sorted_attr_string (TREE_VALUE (attr2));
29167 /* The sorted target strings must be different for fn1 and fn2
29168 to be versions. */
29169 if (strcmp (target1, target2) == 0)
29170 result = false;
29171 else
29172 result = true;
29174 XDELETEVEC (target1);
29175 XDELETEVEC (target2);
29177 return result;
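/* For example (illustrative declarations): the following two decls are
   recognized as versions of each other because their sorted target
   strings differ:

     int foo (void) __attribute__ ((target ("default")));
     int foo (void) __attribute__ ((target ("avx")));  */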
29180 static tree
29181 ix86_mangle_decl_assembler_name (tree decl, tree id)
29183 /* For function version, add the target suffix to the assembler name. */
29184 if (TREE_CODE (decl) == FUNCTION_DECL
29185 && DECL_FUNCTION_VERSIONED (decl))
29186 id = ix86_mangle_function_version_assembler_name (decl, id);
29187 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29188 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29189 #endif
29191 return id;
29194 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29195 is true, append the full path name of the source file. */
29197 static char *
29198 make_name (tree decl, const char *suffix, bool make_unique)
29200 char *global_var_name;
29201 int name_len;
29202 const char *name;
29203 const char *unique_name = NULL;
29205 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29207 /* Get a unique name that can be used globally without any chances
29208 of collision at link time. */
29209 if (make_unique)
29210 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29212 name_len = strlen (name) + strlen (suffix) + 2;
29214 if (make_unique)
29215 name_len += strlen (unique_name) + 1;
29216 global_var_name = XNEWVEC (char, name_len);
29218 /* Use '.' to concatenate names as it is demangler friendly. */
29219 if (make_unique)
29220 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29221 suffix);
29222 else
29223 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29225 return global_var_name;
29228 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29230 /* Make a dispatcher declaration for the multi-versioned function DECL.
29231 Calls to the DECL function will be replaced with calls to the dispatcher
29232 by the front-end. Return the decl created. */
29234 static tree
29235 make_dispatcher_decl (const tree decl)
29237 tree func_decl;
29238 char *func_name;
29239 tree fn_type, func_type;
29240 bool is_uniq = false;
29242 if (TREE_PUBLIC (decl) == 0)
29243 is_uniq = true;
29245 func_name = make_name (decl, "ifunc", is_uniq);
29247 fn_type = TREE_TYPE (decl);
29248 func_type = build_function_type (TREE_TYPE (fn_type),
29249 TYPE_ARG_TYPES (fn_type));
29251 func_decl = build_fn_decl (func_name, func_type);
29252 XDELETEVEC (func_name);
29253 TREE_USED (func_decl) = 1;
29254 DECL_CONTEXT (func_decl) = NULL_TREE;
29255 DECL_INITIAL (func_decl) = error_mark_node;
29256 DECL_ARTIFICIAL (func_decl) = 1;
29257 /* Mark this func as external; the resolver will flip it again if
29258 it gets generated. */
29259 DECL_EXTERNAL (func_decl) = 1;
29260 /* IFUNCs have to be externally visible, so make this decl public.  */
29261 TREE_PUBLIC (func_decl) = 1;
29263 return func_decl;
29266 #endif
29268 /* Returns true if decl is multi-versioned and DECL is the default function,
29269 that is, it is not tagged with target-specific optimization.  */
29271 static bool
29272 is_function_default_version (const tree decl)
29274 if (TREE_CODE (decl) != FUNCTION_DECL
29275 || !DECL_FUNCTION_VERSIONED (decl))
29276 return false;
29277 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29278 gcc_assert (attr);
29279 attr = TREE_VALUE (TREE_VALUE (attr));
29280 return (TREE_CODE (attr) == STRING_CST
29281 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29284 /* Make a dispatcher declaration for the multi-versioned function DECL.
29285 Calls to the DECL function will be replaced with calls to the dispatcher
29286 by the front-end. Returns the decl of the dispatcher function. */
29288 static tree
29289 ix86_get_function_versions_dispatcher (void *decl)
29291 tree fn = (tree) decl;
29292 struct cgraph_node *node = NULL;
29293 struct cgraph_node *default_node = NULL;
29294 struct cgraph_function_version_info *node_v = NULL;
29295 struct cgraph_function_version_info *first_v = NULL;
29297 tree dispatch_decl = NULL;
29299 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29300 struct cgraph_function_version_info *it_v = NULL;
29301 struct cgraph_node *dispatcher_node = NULL;
29302 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29303 #endif
29305 struct cgraph_function_version_info *default_version_info = NULL;
29307 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29309 node = cgraph_get_node (fn);
29310 gcc_assert (node != NULL);
29312 node_v = get_cgraph_node_version (node);
29313 gcc_assert (node_v != NULL);
29315 if (node_v->dispatcher_resolver != NULL)
29316 return node_v->dispatcher_resolver;
29318 /* Find the default version and make it the first node. */
29319 first_v = node_v;
29320 /* Go to the beginning of the chain.  */
29321 while (first_v->prev != NULL)
29322 first_v = first_v->prev;
29323 default_version_info = first_v;
29324 while (default_version_info != NULL)
29326 if (is_function_default_version
29327 (default_version_info->this_node->symbol.decl))
29328 break;
29329 default_version_info = default_version_info->next;
29332 /* If there is no default node, just return NULL. */
29333 if (default_version_info == NULL)
29334 return NULL;
29336 /* Make default info the first node. */
29337 if (first_v != default_version_info)
29339 default_version_info->prev->next = default_version_info->next;
29340 if (default_version_info->next)
29341 default_version_info->next->prev = default_version_info->prev;
29342 first_v->prev = default_version_info;
29343 default_version_info->next = first_v;
29344 default_version_info->prev = NULL;
29347 default_node = default_version_info->this_node;
29349 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29350 /* Right now, the dispatching is done via ifunc. */
29351 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29353 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29354 gcc_assert (dispatcher_node != NULL);
29355 dispatcher_node->dispatcher_function = 1;
29356 dispatcher_version_info
29357 = insert_new_cgraph_node_version (dispatcher_node);
29358 dispatcher_version_info->next = default_version_info;
29359 dispatcher_node->local.finalized = 1;
29361 /* Set the dispatcher for all the versions. */
29362 it_v = default_version_info;
29363 while (it_v != NULL)
29365 it_v->dispatcher_resolver = dispatch_decl;
29366 it_v = it_v->next;
29368 #else
29369 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29370 "multiversioning needs ifunc which is not supported "
29371 "in this configuration");
29372 #endif
29373 return dispatch_decl;
29376 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29377 it to CHAIN. */
29379 static tree
29380 make_attribute (const char *name, const char *arg_name, tree chain)
29382 tree attr_name;
29383 tree attr_arg_name;
29384 tree attr_args;
29385 tree attr;
29387 attr_name = get_identifier (name);
29388 attr_arg_name = build_string (strlen (arg_name), arg_name);
29389 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29390 attr = tree_cons (attr_name, attr_args, chain);
29391 return attr;
29394 /* Make the resolver function decl to dispatch the versions of
29395 a multi-versioned function, DEFAULT_DECL. Create an
29396 empty basic block in the resolver and store the pointer in
29397 EMPTY_BB. Return the decl of the resolver function. */
29399 static tree
29400 make_resolver_func (const tree default_decl,
29401 const tree dispatch_decl,
29402 basic_block *empty_bb)
29404 char *resolver_name;
29405 tree decl, type, decl_name, t;
29406 bool is_uniq = false;
29408 /* IFUNCs have to be globally visible.  So, if the default_decl is
29409 not, then the name of the IFUNC should be made unique. */
29410 if (TREE_PUBLIC (default_decl) == 0)
29411 is_uniq = true;
29413 /* Append the filename to the resolver function if the versions are
29414 not externally visible. This is because the resolver function has
29415 to be externally visible for the loader to find it. So, appending
29416 the filename will prevent conflicts with a resolver function from
29417 another module which is based on the same version name. */
29418 resolver_name = make_name (default_decl, "resolver", is_uniq);
29420 /* The resolver function should return a (void *). */
29421 type = build_function_type_list (ptr_type_node, NULL_TREE);
29423 decl = build_fn_decl (resolver_name, type);
29424 decl_name = get_identifier (resolver_name);
29425 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29427 DECL_NAME (decl) = decl_name;
29428 TREE_USED (decl) = 1;
29429 DECL_ARTIFICIAL (decl) = 1;
29430 DECL_IGNORED_P (decl) = 0;
29431 /* IFUNC resolvers have to be externally visible. */
29432 TREE_PUBLIC (decl) = 1;
29433 DECL_UNINLINABLE (decl) = 0;
29435 /* Resolver is not external, body is generated. */
29436 DECL_EXTERNAL (decl) = 0;
29437 DECL_EXTERNAL (dispatch_decl) = 0;
29439 DECL_CONTEXT (decl) = NULL_TREE;
29440 DECL_INITIAL (decl) = make_node (BLOCK);
29441 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29443 if (DECL_COMDAT_GROUP (default_decl)
29444 || TREE_PUBLIC (default_decl))
29446 /* In this case, each translation unit with a call to this
29447 versioned function will put out a resolver. Ensure it
29448 is comdat to keep just one copy. */
29449 DECL_COMDAT (decl) = 1;
29450 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29452 /* Build result decl and add to function_decl. */
29453 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29454 DECL_ARTIFICIAL (t) = 1;
29455 DECL_IGNORED_P (t) = 1;
29456 DECL_RESULT (decl) = t;
29458 gimplify_function_tree (decl);
29459 push_cfun (DECL_STRUCT_FUNCTION (decl));
29460 *empty_bb = init_lowered_empty_function (decl, false);
29462 cgraph_add_new_function (decl, true);
29463 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29465 pop_cfun ();
29467 gcc_assert (dispatch_decl != NULL);
29468 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29469 DECL_ATTRIBUTES (dispatch_decl)
29470 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29472 /* Create the alias for dispatch to resolver here. */
29473 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29474 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29475 XDELETEVEC (resolver_name);
29476 return decl;
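/* Net effect, as a hedged sketch for a versioned function foo returning
   int (the exact names depend on make_name above): the dispatcher decl
   ends up equivalent to

     extern int foo (void) __attribute__ ((ifunc ("foo.resolver")));

   so the dynamic loader resolves calls to foo through the resolver body
   built by dispatch_function_versions.  A file-unique component is
   inserted into the resolver name when the versions are not externally
   visible.  */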
29479 /* Generate the dispatching code body to dispatch multi-versioned function
29480 DECL. The target hook is called to process the "target" attributes and
29481 provide the code to dispatch the right function at run-time. NODE points
29482 to the dispatcher decl whose body will be created. */
29484 static tree
29485 ix86_generate_version_dispatcher_body (void *node_p)
29487 tree resolver_decl;
29488 basic_block empty_bb;
29489 vec<tree> fn_ver_vec = vNULL;
29490 tree default_ver_decl;
29491 struct cgraph_node *versn;
29492 struct cgraph_node *node;
29494 struct cgraph_function_version_info *node_version_info = NULL;
29495 struct cgraph_function_version_info *versn_info = NULL;
29497 node = (cgraph_node *)node_p;
29499 node_version_info = get_cgraph_node_version (node);
29500 gcc_assert (node->dispatcher_function
29501 && node_version_info != NULL);
29503 if (node_version_info->dispatcher_resolver)
29504 return node_version_info->dispatcher_resolver;
29506 /* The first version in the chain corresponds to the default version. */
29507 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29509 /* node is going to be an alias, so remove the finalized bit. */
29510 node->local.finalized = false;
29512 resolver_decl = make_resolver_func (default_ver_decl,
29513 node->symbol.decl, &empty_bb);
29515 node_version_info->dispatcher_resolver = resolver_decl;
29517 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29519 fn_ver_vec.create (2);
29521 for (versn_info = node_version_info->next; versn_info;
29522 versn_info = versn_info->next)
29524 versn = versn_info->this_node;
29525 /* Check for virtual functions here again, as by this time it should
29526 have been determined if this function needs a vtable index or
29527 not. This happens for methods in derived classes that override
29528 virtual methods in base classes but are not explicitly marked as
29529 virtual. */
29530 if (DECL_VINDEX (versn->symbol.decl))
29531 sorry ("Virtual function multiversioning not supported");
29533 fn_ver_vec.safe_push (versn->symbol.decl);
29536 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29537 fn_ver_vec.release ();
29538 rebuild_cgraph_edges ();
29539 pop_cfun ();
29540 return resolver_decl;
29542 /* This builds the processor_model struct type defined in
29543 libgcc/config/i386/cpuinfo.c */
29545 static tree
29546 build_processor_model_struct (void)
29548 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29549 "__cpu_features"};
29550 tree field = NULL_TREE, field_chain = NULL_TREE;
29551 int i;
29552 tree type = make_node (RECORD_TYPE);
29554 /* The first 3 fields are unsigned int. */
29555 for (i = 0; i < 3; ++i)
29557 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29558 get_identifier (field_name[i]), unsigned_type_node);
29559 if (field_chain != NULL_TREE)
29560 DECL_CHAIN (field) = field_chain;
29561 field_chain = field;
29564 /* The last field is an array of unsigned integers of size one. */
29565 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29566 get_identifier (field_name[3]),
29567 build_array_type (unsigned_type_node,
29568 build_index_type (size_one_node)));
29569 if (field_chain != NULL_TREE)
29570 DECL_CHAIN (field) = field_chain;
29571 field_chain = field;
29573 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29574 return type;
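/* A sketch of the layout built above, mirroring the definition in
   libgcc/config/i386/cpuinfo.c (the array bound is per that file and is
   an assumption here):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */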
29577 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
29579 static tree
29580 make_var_decl (tree type, const char *name)
29582 tree new_decl;
29584 new_decl = build_decl (UNKNOWN_LOCATION,
29585 VAR_DECL,
29586 get_identifier(name),
29587 type);
29589 DECL_EXTERNAL (new_decl) = 1;
29590 TREE_STATIC (new_decl) = 1;
29591 TREE_PUBLIC (new_decl) = 1;
29592 DECL_INITIAL (new_decl) = 0;
29593 DECL_ARTIFICIAL (new_decl) = 0;
29594 DECL_PRESERVE_P (new_decl) = 1;
29596 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29597 assemble_variable (new_decl, 0, 0, 0);
29599 return new_decl;
29602 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29603 into a test of the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c.  */
29605 static tree
29606 fold_builtin_cpu (tree fndecl, tree *args)
29608 unsigned int i;
29609 enum ix86_builtins fn_code = (enum ix86_builtins)
29610 DECL_FUNCTION_CODE (fndecl);
29611 tree param_string_cst = NULL;
29613 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29614 enum processor_features
29616 F_CMOV = 0,
29617 F_MMX,
29618 F_POPCNT,
29619 F_SSE,
29620 F_SSE2,
29621 F_SSE3,
29622 F_SSSE3,
29623 F_SSE4_1,
29624 F_SSE4_2,
29625 F_AVX,
29626 F_AVX2,
29627 F_MAX
29630 /* These are the values for vendor types and cpu types and subtypes
29631 in cpuinfo.c.  Cpu types and subtypes should have the corresponding
29632 start value subtracted from them.  */
29633 enum processor_model
29635 M_INTEL = 1,
29636 M_AMD,
29637 M_CPU_TYPE_START,
29638 M_INTEL_ATOM,
29639 M_INTEL_CORE2,
29640 M_INTEL_COREI7,
29641 M_AMDFAM10H,
29642 M_AMDFAM15H,
29643 M_CPU_SUBTYPE_START,
29644 M_INTEL_COREI7_NEHALEM,
29645 M_INTEL_COREI7_WESTMERE,
29646 M_INTEL_COREI7_SANDYBRIDGE,
29647 M_AMDFAM10H_BARCELONA,
29648 M_AMDFAM10H_SHANGHAI,
29649 M_AMDFAM10H_ISTANBUL,
29650 M_AMDFAM15H_BDVER1,
29651 M_AMDFAM15H_BDVER2,
29652 M_AMDFAM15H_BDVER3
29655 static struct _arch_names_table
29657 const char *const name;
29658 const enum processor_model model;
29660 const arch_names_table[] =
29662 {"amd", M_AMD},
29663 {"intel", M_INTEL},
29664 {"atom", M_INTEL_ATOM},
29665 {"core2", M_INTEL_CORE2},
29666 {"corei7", M_INTEL_COREI7},
29667 {"nehalem", M_INTEL_COREI7_NEHALEM},
29668 {"westmere", M_INTEL_COREI7_WESTMERE},
29669 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29670 {"amdfam10h", M_AMDFAM10H},
29671 {"barcelona", M_AMDFAM10H_BARCELONA},
29672 {"shanghai", M_AMDFAM10H_SHANGHAI},
29673 {"istanbul", M_AMDFAM10H_ISTANBUL},
29674 {"amdfam15h", M_AMDFAM15H},
29675 {"bdver1", M_AMDFAM15H_BDVER1},
29676 {"bdver2", M_AMDFAM15H_BDVER2},
29677 {"bdver3", M_AMDFAM15H_BDVER3},
29680 static struct _isa_names_table
29682 const char *const name;
29683 const enum processor_features feature;
29685 const isa_names_table[] =
29687 {"cmov", F_CMOV},
29688 {"mmx", F_MMX},
29689 {"popcnt", F_POPCNT},
29690 {"sse", F_SSE},
29691 {"sse2", F_SSE2},
29692 {"sse3", F_SSE3},
29693 {"ssse3", F_SSSE3},
29694 {"sse4.1", F_SSE4_1},
29695 {"sse4.2", F_SSE4_2},
29696 {"avx", F_AVX},
29697 {"avx2", F_AVX2}
29700 tree __processor_model_type = build_processor_model_struct ();
29701 tree __cpu_model_var = make_var_decl (__processor_model_type,
29702 "__cpu_model");
29704 gcc_assert ((args != NULL) && (*args != NULL));
29706 param_string_cst = *args;
29707 while (param_string_cst
29708 && TREE_CODE (param_string_cst) != STRING_CST)
29710 /* *args must be an expr that can contain other EXPRs leading to a
29711 STRING_CST. */
29712 if (!EXPR_P (param_string_cst))
29714 error ("Parameter to builtin must be a string constant or literal");
29715 return integer_zero_node;
29717 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29720 gcc_assert (param_string_cst);
29722 if (fn_code == IX86_BUILTIN_CPU_IS)
29724 tree ref;
29725 tree field;
29726 tree final;
29728 unsigned int field_val = 0;
29729 unsigned int NUM_ARCH_NAMES
29730 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29732 for (i = 0; i < NUM_ARCH_NAMES; i++)
29733 if (strcmp (arch_names_table[i].name,
29734 TREE_STRING_POINTER (param_string_cst)) == 0)
29735 break;
29737 if (i == NUM_ARCH_NAMES)
29739 error ("Parameter to builtin not valid: %s",
29740 TREE_STRING_POINTER (param_string_cst));
29741 return integer_zero_node;
29744 field = TYPE_FIELDS (__processor_model_type);
29745 field_val = arch_names_table[i].model;
29747 /* CPU types are stored in the next field. */
29748 if (field_val > M_CPU_TYPE_START
29749 && field_val < M_CPU_SUBTYPE_START)
29751 field = DECL_CHAIN (field);
29752 field_val -= M_CPU_TYPE_START;
29755 /* CPU subtypes are stored in the next field. */
29756 if (field_val > M_CPU_SUBTYPE_START)
29758 field = DECL_CHAIN ( DECL_CHAIN (field));
29759 field_val -= M_CPU_SUBTYPE_START;
29762 /* Get the appropriate field in __cpu_model. */
29763 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29764 field, NULL_TREE);
29766 /* Check the value. */
29767 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29768 build_int_cstu (unsigned_type_node, field_val));
29769 return build1 (CONVERT_EXPR, integer_type_node, final);
29771 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29773 tree ref;
29774 tree array_elt;
29775 tree field;
29776 tree final;
29778 unsigned int field_val = 0;
29779 unsigned int NUM_ISA_NAMES
29780 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29782 for (i = 0; i < NUM_ISA_NAMES; i++)
29783 if (strcmp (isa_names_table[i].name,
29784 TREE_STRING_POINTER (param_string_cst)) == 0)
29785 break;
29787 if (i == NUM_ISA_NAMES)
29789 error ("Parameter to builtin not valid: %s",
29790 TREE_STRING_POINTER (param_string_cst));
29791 return integer_zero_node;
29794 field = TYPE_FIELDS (__processor_model_type);
29795 /* Get the last field, which is __cpu_features. */
29796 while (DECL_CHAIN (field))
29797 field = DECL_CHAIN (field);
29799 /* Get the appropriate field: __cpu_model.__cpu_features */
29800 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29801 field, NULL_TREE);
29803 /* Access the 0th element of __cpu_features array. */
29804 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29805 integer_zero_node, NULL_TREE, NULL_TREE);
29807 field_val = (1 << isa_names_table[i].feature);
29808 /* Return __cpu_model.__cpu_features[0] & field_val */
29809 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29810 build_int_cstu (unsigned_type_node, field_val));
29811 return build1 (CONVERT_EXPR, integer_type_node, final);
29813 gcc_unreachable ();
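/* Illustrative result of the folding above, written in source terms (the
   actual result is a GENERIC tree):

     __builtin_cpu_is ("amd")        =>  (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_supports ("avx")  =>  (int) (__cpu_model.__cpu_features[0]
                                                & (1 << F_AVX))  */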
29816 static tree
29817 ix86_fold_builtin (tree fndecl, int n_args,
29818 tree *args, bool ignore ATTRIBUTE_UNUSED)
29820 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29822 enum ix86_builtins fn_code = (enum ix86_builtins)
29823 DECL_FUNCTION_CODE (fndecl);
29824 if (fn_code == IX86_BUILTIN_CPU_IS
29825 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29827 gcc_assert (n_args == 1);
29828 return fold_builtin_cpu (fndecl, args);
29832 #ifdef SUBTARGET_FOLD_BUILTIN
29833 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29834 #endif
29836 return NULL_TREE;
29839 /* Make builtins to detect cpu type and features supported. NAME is
29840 the builtin name, CODE is the builtin code, and FTYPE is the function
29841 type of the builtin. */
29843 static void
29844 make_cpu_type_builtin (const char* name, int code,
29845 enum ix86_builtin_func_type ftype, bool is_const)
29847 tree decl;
29848 tree type;
29850 type = ix86_get_builtin_func_type (ftype);
29851 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29852 NULL, NULL_TREE);
29853 gcc_assert (decl != NULL_TREE);
29854 ix86_builtins[(int) code] = decl;
29855 TREE_READONLY (decl) = is_const;
29858 /* Make builtins to get CPU type and features supported. The created
29859 builtins are:
29861 __builtin_cpu_init (), to detect cpu type and features,
29862 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29863 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29866 static void
29867 ix86_init_platform_type_builtins (void)
29869 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29870 INT_FTYPE_VOID, false);
29871 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29872 INT_FTYPE_PCCHAR, true);
29873 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29874 INT_FTYPE_PCCHAR, true);
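/* User-level example of the builtins registered above (illustrative;
   run_sse42_path is a placeholder):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("sse4.2"))
       run_sse42_path ();  */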
29877 /* Internal method for ix86_init_builtins. */
29879 static void
29880 ix86_init_builtins_va_builtins_abi (void)
29882 tree ms_va_ref, sysv_va_ref;
29883 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29884 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29885 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29886 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29888 if (!TARGET_64BIT)
29889 return;
29890 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29891 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29892 ms_va_ref = build_reference_type (ms_va_list_type_node);
29893 sysv_va_ref =
29894 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29896 fnvoid_va_end_ms =
29897 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29898 fnvoid_va_start_ms =
29899 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29900 fnvoid_va_end_sysv =
29901 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29902 fnvoid_va_start_sysv =
29903 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29904 NULL_TREE);
29905 fnvoid_va_copy_ms =
29906 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29907 NULL_TREE);
29908 fnvoid_va_copy_sysv =
29909 build_function_type_list (void_type_node, sysv_va_ref,
29910 sysv_va_ref, NULL_TREE);
29912 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29913 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29914 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29915 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29916 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29917 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29918 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29919 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29920 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29921 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29922 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29923 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
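/* Hedged usage sketch: with these builtins, 64-bit code can handle
   varargs in a function declared with the other ABI, for example

     void __attribute__ ((ms_abi)) log_ints (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...
       __builtin_ms_va_end (ap);
     }  */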
29926 static void
29927 ix86_init_builtin_types (void)
29929 tree float128_type_node, float80_type_node;
29931 /* The __float80 type. */
29932 float80_type_node = long_double_type_node;
29933 if (TYPE_MODE (float80_type_node) != XFmode)
29935 /* The __float80 type. */
29936 float80_type_node = make_node (REAL_TYPE);
29938 TYPE_PRECISION (float80_type_node) = 80;
29939 layout_type (float80_type_node);
29941 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29943 /* The __float128 type. */
29944 float128_type_node = make_node (REAL_TYPE);
29945 TYPE_PRECISION (float128_type_node) = 128;
29946 layout_type (float128_type_node);
29947 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29949 /* This macro is built by i386-builtin-types.awk. */
29950 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29953 static void
29954 ix86_init_builtins (void)
29956 tree t;
29958 ix86_init_builtin_types ();
29960 /* Builtins to get CPU type and features. */
29961 ix86_init_platform_type_builtins ();
29963 /* TFmode support builtins. */
29964 def_builtin_const (0, "__builtin_infq",
29965 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29966 def_builtin_const (0, "__builtin_huge_valq",
29967 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29969 /* We will expand them to normal call if SSE isn't available since
29970 they are used by libgcc. */
29971 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29972 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29973 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29974 TREE_READONLY (t) = 1;
29975 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29977 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29978 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29979 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29980 TREE_READONLY (t) = 1;
29981 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29983 ix86_init_tm_builtins ();
29984 ix86_init_mmx_sse_builtins ();
29986 if (TARGET_LP64)
29987 ix86_init_builtins_va_builtins_abi ();
29989 #ifdef SUBTARGET_INIT_BUILTINS
29990 SUBTARGET_INIT_BUILTINS;
29991 #endif
29994 /* Return the ix86 builtin for CODE. */
29996 static tree
29997 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29999 if (code >= IX86_BUILTIN_MAX)
30000 return error_mark_node;
30002 return ix86_builtins[code];
30005 /* Errors in the source file can cause expand_expr to return const0_rtx
30006 where we expect a vector. To avoid crashing, use one of the vector
30007 clear instructions. */
30008 static rtx
30009 safe_vector_operand (rtx x, enum machine_mode mode)
30011 if (x == const0_rtx)
30012 x = CONST0_RTX (mode);
30013 return x;
30016 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30018 static rtx
30019 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30021 rtx pat;
30022 tree arg0 = CALL_EXPR_ARG (exp, 0);
30023 tree arg1 = CALL_EXPR_ARG (exp, 1);
30024 rtx op0 = expand_normal (arg0);
30025 rtx op1 = expand_normal (arg1);
30026 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30027 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30028 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30030 if (VECTOR_MODE_P (mode0))
30031 op0 = safe_vector_operand (op0, mode0);
30032 if (VECTOR_MODE_P (mode1))
30033 op1 = safe_vector_operand (op1, mode1);
30035 if (optimize || !target
30036 || GET_MODE (target) != tmode
30037 || !insn_data[icode].operand[0].predicate (target, tmode))
30038 target = gen_reg_rtx (tmode);
30040 if (GET_MODE (op1) == SImode && mode1 == TImode)
30042 rtx x = gen_reg_rtx (V4SImode);
30043 emit_insn (gen_sse2_loadd (x, op1));
30044 op1 = gen_lowpart (TImode, x);
30047 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30048 op0 = copy_to_mode_reg (mode0, op0);
30049 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30050 op1 = copy_to_mode_reg (mode1, op1);
30052 pat = GEN_FCN (icode) (target, op0, op1);
30053 if (! pat)
30054 return 0;
30056 emit_insn (pat);
30058 return target;
30061 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30063 static rtx
30064 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30065 enum ix86_builtin_func_type m_type,
30066 enum rtx_code sub_code)
30068 rtx pat;
30069 int i;
30070 int nargs;
30071 bool comparison_p = false;
30072 bool tf_p = false;
30073 bool last_arg_constant = false;
30074 int num_memory = 0;
30075 struct {
30076 rtx op;
30077 enum machine_mode mode;
30078 } args[4];
30080 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30082 switch (m_type)
30084 case MULTI_ARG_4_DF2_DI_I:
30085 case MULTI_ARG_4_DF2_DI_I1:
30086 case MULTI_ARG_4_SF2_SI_I:
30087 case MULTI_ARG_4_SF2_SI_I1:
30088 nargs = 4;
30089 last_arg_constant = true;
30090 break;
30092 case MULTI_ARG_3_SF:
30093 case MULTI_ARG_3_DF:
30094 case MULTI_ARG_3_SF2:
30095 case MULTI_ARG_3_DF2:
30096 case MULTI_ARG_3_DI:
30097 case MULTI_ARG_3_SI:
30098 case MULTI_ARG_3_SI_DI:
30099 case MULTI_ARG_3_HI:
30100 case MULTI_ARG_3_HI_SI:
30101 case MULTI_ARG_3_QI:
30102 case MULTI_ARG_3_DI2:
30103 case MULTI_ARG_3_SI2:
30104 case MULTI_ARG_3_HI2:
30105 case MULTI_ARG_3_QI2:
30106 nargs = 3;
30107 break;
30109 case MULTI_ARG_2_SF:
30110 case MULTI_ARG_2_DF:
30111 case MULTI_ARG_2_DI:
30112 case MULTI_ARG_2_SI:
30113 case MULTI_ARG_2_HI:
30114 case MULTI_ARG_2_QI:
30115 nargs = 2;
30116 break;
30118 case MULTI_ARG_2_DI_IMM:
30119 case MULTI_ARG_2_SI_IMM:
30120 case MULTI_ARG_2_HI_IMM:
30121 case MULTI_ARG_2_QI_IMM:
30122 nargs = 2;
30123 last_arg_constant = true;
30124 break;
30126 case MULTI_ARG_1_SF:
30127 case MULTI_ARG_1_DF:
30128 case MULTI_ARG_1_SF2:
30129 case MULTI_ARG_1_DF2:
30130 case MULTI_ARG_1_DI:
30131 case MULTI_ARG_1_SI:
30132 case MULTI_ARG_1_HI:
30133 case MULTI_ARG_1_QI:
30134 case MULTI_ARG_1_SI_DI:
30135 case MULTI_ARG_1_HI_DI:
30136 case MULTI_ARG_1_HI_SI:
30137 case MULTI_ARG_1_QI_DI:
30138 case MULTI_ARG_1_QI_SI:
30139 case MULTI_ARG_1_QI_HI:
30140 nargs = 1;
30141 break;
30143 case MULTI_ARG_2_DI_CMP:
30144 case MULTI_ARG_2_SI_CMP:
30145 case MULTI_ARG_2_HI_CMP:
30146 case MULTI_ARG_2_QI_CMP:
30147 nargs = 2;
30148 comparison_p = true;
30149 break;
30151 case MULTI_ARG_2_SF_TF:
30152 case MULTI_ARG_2_DF_TF:
30153 case MULTI_ARG_2_DI_TF:
30154 case MULTI_ARG_2_SI_TF:
30155 case MULTI_ARG_2_HI_TF:
30156 case MULTI_ARG_2_QI_TF:
30157 nargs = 2;
30158 tf_p = true;
30159 break;
30161 default:
30162 gcc_unreachable ();
30165 if (optimize || !target
30166 || GET_MODE (target) != tmode
30167 || !insn_data[icode].operand[0].predicate (target, tmode))
30168 target = gen_reg_rtx (tmode);
30170 gcc_assert (nargs <= 4);
30172 for (i = 0; i < nargs; i++)
30174 tree arg = CALL_EXPR_ARG (exp, i);
30175 rtx op = expand_normal (arg);
30176 int adjust = (comparison_p) ? 1 : 0;
30177 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30179 if (last_arg_constant && i == nargs - 1)
30181 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30183 enum insn_code new_icode = icode;
30184 switch (icode)
30186 case CODE_FOR_xop_vpermil2v2df3:
30187 case CODE_FOR_xop_vpermil2v4sf3:
30188 case CODE_FOR_xop_vpermil2v4df3:
30189 case CODE_FOR_xop_vpermil2v8sf3:
30190 error ("the last argument must be a 2-bit immediate");
30191 return gen_reg_rtx (tmode);
30192 case CODE_FOR_xop_rotlv2di3:
30193 new_icode = CODE_FOR_rotlv2di3;
30194 goto xop_rotl;
30195 case CODE_FOR_xop_rotlv4si3:
30196 new_icode = CODE_FOR_rotlv4si3;
30197 goto xop_rotl;
30198 case CODE_FOR_xop_rotlv8hi3:
30199 new_icode = CODE_FOR_rotlv8hi3;
30200 goto xop_rotl;
30201 case CODE_FOR_xop_rotlv16qi3:
30202 new_icode = CODE_FOR_rotlv16qi3;
30203 xop_rotl:
30204 if (CONST_INT_P (op))
30206 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30207 op = GEN_INT (INTVAL (op) & mask);
30208 gcc_checking_assert
30209 (insn_data[icode].operand[i + 1].predicate (op, mode));
30211 else
30213 gcc_checking_assert
30214 (nargs == 2
30215 && insn_data[new_icode].operand[0].mode == tmode
30216 && insn_data[new_icode].operand[1].mode == tmode
30217 && insn_data[new_icode].operand[2].mode == mode
30218 && insn_data[new_icode].operand[0].predicate
30219 == insn_data[icode].operand[0].predicate
30220 && insn_data[new_icode].operand[1].predicate
30221 == insn_data[icode].operand[1].predicate);
30222 icode = new_icode;
30223 goto non_constant;
30225 break;
30226 default:
30227 gcc_unreachable ();
30231 else
30233 non_constant:
30234 if (VECTOR_MODE_P (mode))
30235 op = safe_vector_operand (op, mode);
30237 /* If we aren't optimizing, only allow one memory operand to be
30238 generated. */
30239 if (memory_operand (op, mode))
30240 num_memory++;
30242 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30244 if (optimize
30245 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30246 || num_memory > 1)
30247 op = force_reg (mode, op);
30250 args[i].op = op;
30251 args[i].mode = mode;
30254 switch (nargs)
30256 case 1:
30257 pat = GEN_FCN (icode) (target, args[0].op);
30258 break;
30260 case 2:
30261 if (tf_p)
30262 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30263 GEN_INT ((int)sub_code));
30264 else if (! comparison_p)
30265 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30266 else
30268 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30269 args[0].op,
30270 args[1].op);
30272 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30274 break;
30276 case 3:
30277 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30278 break;
30280 case 4:
30281 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30282 break;
30284 default:
30285 gcc_unreachable ();
30288 if (! pat)
30289 return 0;
30291 emit_insn (pat);
30292 return target;
30295 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30296 insns with vec_merge. */
30298 static rtx
30299 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30300 rtx target)
30302 rtx pat;
30303 tree arg0 = CALL_EXPR_ARG (exp, 0);
30304 rtx op1, op0 = expand_normal (arg0);
30305 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30306 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30308 if (optimize || !target
30309 || GET_MODE (target) != tmode
30310 || !insn_data[icode].operand[0].predicate (target, tmode))
30311 target = gen_reg_rtx (tmode);
30313 if (VECTOR_MODE_P (mode0))
30314 op0 = safe_vector_operand (op0, mode0);
30316 if ((optimize && !register_operand (op0, mode0))
30317 || !insn_data[icode].operand[1].predicate (op0, mode0))
30318 op0 = copy_to_mode_reg (mode0, op0);
30320 op1 = op0;
30321 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30322 op1 = copy_to_mode_reg (mode0, op1);
30324 pat = GEN_FCN (icode) (target, op0, op1);
30325 if (! pat)
30326 return 0;
30327 emit_insn (pat);
30328 return target;
30331 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30333 static rtx
30334 ix86_expand_sse_compare (const struct builtin_description *d,
30335 tree exp, rtx target, bool swap)
30337 rtx pat;
30338 tree arg0 = CALL_EXPR_ARG (exp, 0);
30339 tree arg1 = CALL_EXPR_ARG (exp, 1);
30340 rtx op0 = expand_normal (arg0);
30341 rtx op1 = expand_normal (arg1);
30342 rtx op2;
30343 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30344 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30345 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30346 enum rtx_code comparison = d->comparison;
30348 if (VECTOR_MODE_P (mode0))
30349 op0 = safe_vector_operand (op0, mode0);
30350 if (VECTOR_MODE_P (mode1))
30351 op1 = safe_vector_operand (op1, mode1);
30353 /* Swap operands if we have a comparison that isn't available in
30354 hardware. */
30355 if (swap)
30357 rtx tmp = gen_reg_rtx (mode1);
30358 emit_move_insn (tmp, op1);
30359 op1 = op0;
30360 op0 = tmp;
30363 if (optimize || !target
30364 || GET_MODE (target) != tmode
30365 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30366 target = gen_reg_rtx (tmode);
30368 if ((optimize && !register_operand (op0, mode0))
30369 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30370 op0 = copy_to_mode_reg (mode0, op0);
30371 if ((optimize && !register_operand (op1, mode1))
30372 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30373 op1 = copy_to_mode_reg (mode1, op1);
30375 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30376 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30377 if (! pat)
30378 return 0;
30379 emit_insn (pat);
30380 return target;
30383 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30385 static rtx
30386 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30387 rtx target)
30389 rtx pat;
30390 tree arg0 = CALL_EXPR_ARG (exp, 0);
30391 tree arg1 = CALL_EXPR_ARG (exp, 1);
30392 rtx op0 = expand_normal (arg0);
30393 rtx op1 = expand_normal (arg1);
30394 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30395 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30396 enum rtx_code comparison = d->comparison;
30398 if (VECTOR_MODE_P (mode0))
30399 op0 = safe_vector_operand (op0, mode0);
30400 if (VECTOR_MODE_P (mode1))
30401 op1 = safe_vector_operand (op1, mode1);
30403 /* Swap operands if we have a comparison that isn't available in
30404 hardware. */
30405 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30407 rtx tmp = op1;
30408 op1 = op0;
30409 op0 = tmp;
30412 target = gen_reg_rtx (SImode);
30413 emit_move_insn (target, const0_rtx);
30414 target = gen_rtx_SUBREG (QImode, target, 0);
30416 if ((optimize && !register_operand (op0, mode0))
30417 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30418 op0 = copy_to_mode_reg (mode0, op0);
30419 if ((optimize && !register_operand (op1, mode1))
30420 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30421 op1 = copy_to_mode_reg (mode1, op1);
30423 pat = GEN_FCN (d->icode) (op0, op1);
30424 if (! pat)
30425 return 0;
30426 emit_insn (pat);
30427 emit_insn (gen_rtx_SET (VOIDmode,
30428 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30429 gen_rtx_fmt_ee (comparison, QImode,
30430 SET_DEST (pat),
30431 const0_rtx)));
30433 return SUBREG_REG (target);
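/* Illustrative sketch: a comi builtin such as __builtin_ia32_comieq
   (assuming it is routed here through the bdesc_comi table) expands to
   the sse_comi pattern on the two operands, followed by setting the
   QImode low part of the SImode result prepared above from the
   comparison code recorded for that builtin.  */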
30436 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30438 static rtx
30439 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30440 rtx target)
30442 rtx pat;
30443 tree arg0 = CALL_EXPR_ARG (exp, 0);
30444 rtx op1, op0 = expand_normal (arg0);
30445 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30446 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30448 if (optimize || target == 0
30449 || GET_MODE (target) != tmode
30450 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30451 target = gen_reg_rtx (tmode);
30453 if (VECTOR_MODE_P (mode0))
30454 op0 = safe_vector_operand (op0, mode0);
30456 if ((optimize && !register_operand (op0, mode0))
30457 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30458 op0 = copy_to_mode_reg (mode0, op0);
30460 op1 = GEN_INT (d->comparison);
30462 pat = GEN_FCN (d->icode) (target, op0, op1);
30463 if (! pat)
30464 return 0;
30465 emit_insn (pat);
30466 return target;
30469 static rtx
30470 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30471 tree exp, rtx target)
30473 rtx pat;
30474 tree arg0 = CALL_EXPR_ARG (exp, 0);
30475 tree arg1 = CALL_EXPR_ARG (exp, 1);
30476 rtx op0 = expand_normal (arg0);
30477 rtx op1 = expand_normal (arg1);
30478 rtx op2;
30479 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30480 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30481 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30483 if (optimize || target == 0
30484 || GET_MODE (target) != tmode
30485 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30486 target = gen_reg_rtx (tmode);
30488 op0 = safe_vector_operand (op0, mode0);
30489 op1 = safe_vector_operand (op1, mode1);
30491 if ((optimize && !register_operand (op0, mode0))
30492 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30493 op0 = copy_to_mode_reg (mode0, op0);
30494 if ((optimize && !register_operand (op1, mode1))
30495 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30496 op1 = copy_to_mode_reg (mode1, op1);
30498 op2 = GEN_INT (d->comparison);
30500 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30501 if (! pat)
30502 return 0;
30503 emit_insn (pat);
30504 return target;
30507 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30509 static rtx
30510 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30511 rtx target)
30513 rtx pat;
30514 tree arg0 = CALL_EXPR_ARG (exp, 0);
30515 tree arg1 = CALL_EXPR_ARG (exp, 1);
30516 rtx op0 = expand_normal (arg0);
30517 rtx op1 = expand_normal (arg1);
30518 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30519 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30520 enum rtx_code comparison = d->comparison;
30522 if (VECTOR_MODE_P (mode0))
30523 op0 = safe_vector_operand (op0, mode0);
30524 if (VECTOR_MODE_P (mode1))
30525 op1 = safe_vector_operand (op1, mode1);
30527 target = gen_reg_rtx (SImode);
30528 emit_move_insn (target, const0_rtx);
30529 target = gen_rtx_SUBREG (QImode, target, 0);
30531 if ((optimize && !register_operand (op0, mode0))
30532 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30533 op0 = copy_to_mode_reg (mode0, op0);
30534 if ((optimize && !register_operand (op1, mode1))
30535 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30536 op1 = copy_to_mode_reg (mode1, op1);
30538 pat = GEN_FCN (d->icode) (op0, op1);
30539 if (! pat)
30540 return 0;
30541 emit_insn (pat);
30542 emit_insn (gen_rtx_SET (VOIDmode,
30543 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30544 gen_rtx_fmt_ee (comparison, QImode,
30545 SET_DEST (pat),
30546 const0_rtx)));
30548 return SUBREG_REG (target);
30551 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30553 static rtx
30554 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30555 tree exp, rtx target)
30557 rtx pat;
30558 tree arg0 = CALL_EXPR_ARG (exp, 0);
30559 tree arg1 = CALL_EXPR_ARG (exp, 1);
30560 tree arg2 = CALL_EXPR_ARG (exp, 2);
30561 tree arg3 = CALL_EXPR_ARG (exp, 3);
30562 tree arg4 = CALL_EXPR_ARG (exp, 4);
30563 rtx scratch0, scratch1;
30564 rtx op0 = expand_normal (arg0);
30565 rtx op1 = expand_normal (arg1);
30566 rtx op2 = expand_normal (arg2);
30567 rtx op3 = expand_normal (arg3);
30568 rtx op4 = expand_normal (arg4);
30569 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30571 tmode0 = insn_data[d->icode].operand[0].mode;
30572 tmode1 = insn_data[d->icode].operand[1].mode;
30573 modev2 = insn_data[d->icode].operand[2].mode;
30574 modei3 = insn_data[d->icode].operand[3].mode;
30575 modev4 = insn_data[d->icode].operand[4].mode;
30576 modei5 = insn_data[d->icode].operand[5].mode;
30577 modeimm = insn_data[d->icode].operand[6].mode;
30579 if (VECTOR_MODE_P (modev2))
30580 op0 = safe_vector_operand (op0, modev2);
30581 if (VECTOR_MODE_P (modev4))
30582 op2 = safe_vector_operand (op2, modev4);
30584 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30585 op0 = copy_to_mode_reg (modev2, op0);
30586 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30587 op1 = copy_to_mode_reg (modei3, op1);
30588 if ((optimize && !register_operand (op2, modev4))
30589 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30590 op2 = copy_to_mode_reg (modev4, op2);
30591 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30592 op3 = copy_to_mode_reg (modei5, op3);
30594 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30596 error ("the fifth argument must be an 8-bit immediate");
30597 return const0_rtx;
30600 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30602 if (optimize || !target
30603 || GET_MODE (target) != tmode0
30604 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30605 target = gen_reg_rtx (tmode0);
30607 scratch1 = gen_reg_rtx (tmode1);
30609 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30611 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30613 if (optimize || !target
30614 || GET_MODE (target) != tmode1
30615 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30616 target = gen_reg_rtx (tmode1);
30618 scratch0 = gen_reg_rtx (tmode0);
30620 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30622 else
30624 gcc_assert (d->flag);
30626 scratch0 = gen_reg_rtx (tmode0);
30627 scratch1 = gen_reg_rtx (tmode1);
30629 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30632 if (! pat)
30633 return 0;
30635 emit_insn (pat);
30637 if (d->flag)
30639 target = gen_reg_rtx (SImode);
30640 emit_move_insn (target, const0_rtx);
30641 target = gen_rtx_SUBREG (QImode, target, 0);
30643 emit_insn
30644 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30645 gen_rtx_fmt_ee (EQ, QImode,
30646 gen_rtx_REG ((enum machine_mode) d->flag,
30647 FLAGS_REG),
30648 const0_rtx)));
30649 return SUBREG_REG (target);
30651 else
30652 return target;
30656 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30658 static rtx
30659 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30660 tree exp, rtx target)
30662 rtx pat;
30663 tree arg0 = CALL_EXPR_ARG (exp, 0);
30664 tree arg1 = CALL_EXPR_ARG (exp, 1);
30665 tree arg2 = CALL_EXPR_ARG (exp, 2);
30666 rtx scratch0, scratch1;
30667 rtx op0 = expand_normal (arg0);
30668 rtx op1 = expand_normal (arg1);
30669 rtx op2 = expand_normal (arg2);
30670 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30672 tmode0 = insn_data[d->icode].operand[0].mode;
30673 tmode1 = insn_data[d->icode].operand[1].mode;
30674 modev2 = insn_data[d->icode].operand[2].mode;
30675 modev3 = insn_data[d->icode].operand[3].mode;
30676 modeimm = insn_data[d->icode].operand[4].mode;
30678 if (VECTOR_MODE_P (modev2))
30679 op0 = safe_vector_operand (op0, modev2);
30680 if (VECTOR_MODE_P (modev3))
30681 op1 = safe_vector_operand (op1, modev3);
30683 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30684 op0 = copy_to_mode_reg (modev2, op0);
30685 if ((optimize && !register_operand (op1, modev3))
30686 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30687 op1 = copy_to_mode_reg (modev3, op1);
30689 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30691 error ("the third argument must be an 8-bit immediate");
30692 return const0_rtx;
30695 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30697 if (optimize || !target
30698 || GET_MODE (target) != tmode0
30699 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30700 target = gen_reg_rtx (tmode0);
30702 scratch1 = gen_reg_rtx (tmode1);
30704 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30706 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30708 if (optimize || !target
30709 || GET_MODE (target) != tmode1
30710 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30711 target = gen_reg_rtx (tmode1);
30713 scratch0 = gen_reg_rtx (tmode0);
30715 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30717 else
30719 gcc_assert (d->flag);
30721 scratch0 = gen_reg_rtx (tmode0);
30722 scratch1 = gen_reg_rtx (tmode1);
30724 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30727 if (! pat)
30728 return 0;
30730 emit_insn (pat);
30732 if (d->flag)
30734 target = gen_reg_rtx (SImode);
30735 emit_move_insn (target, const0_rtx);
30736 target = gen_rtx_SUBREG (QImode, target, 0);
30738 emit_insn
30739 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30740 gen_rtx_fmt_ee (EQ, QImode,
30741 gen_rtx_REG ((enum machine_mode) d->flag,
30742 FLAGS_REG),
30743 const0_rtx)));
30744 return SUBREG_REG (target);
30746 else
30747 return target;
30750 /* Subroutine of ix86_expand_builtin to take care of insns with
30751 a variable number of operands. */
30753 static rtx
30754 ix86_expand_args_builtin (const struct builtin_description *d,
30755 tree exp, rtx target)
30757 rtx pat, real_target;
30758 unsigned int i, nargs;
30759 unsigned int nargs_constant = 0;
30760 int num_memory = 0;
30761 struct
30763 rtx op;
30764 enum machine_mode mode;
30765 } args[4];
30766 bool last_arg_count = false;
30767 enum insn_code icode = d->icode;
30768 const struct insn_data_d *insn_p = &insn_data[icode];
30769 enum machine_mode tmode = insn_p->operand[0].mode;
30770 enum machine_mode rmode = VOIDmode;
30771 bool swap = false;
30772 enum rtx_code comparison = d->comparison;
30774 switch ((enum ix86_builtin_func_type) d->flag)
30776 case V2DF_FTYPE_V2DF_ROUND:
30777 case V4DF_FTYPE_V4DF_ROUND:
30778 case V4SF_FTYPE_V4SF_ROUND:
30779 case V8SF_FTYPE_V8SF_ROUND:
30780 case V4SI_FTYPE_V4SF_ROUND:
30781 case V8SI_FTYPE_V8SF_ROUND:
30782 return ix86_expand_sse_round (d, exp, target);
30783 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30784 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30785 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30786 case INT_FTYPE_V8SF_V8SF_PTEST:
30787 case INT_FTYPE_V4DI_V4DI_PTEST:
30788 case INT_FTYPE_V4DF_V4DF_PTEST:
30789 case INT_FTYPE_V4SF_V4SF_PTEST:
30790 case INT_FTYPE_V2DI_V2DI_PTEST:
30791 case INT_FTYPE_V2DF_V2DF_PTEST:
30792 return ix86_expand_sse_ptest (d, exp, target);
30793 case FLOAT128_FTYPE_FLOAT128:
30794 case FLOAT_FTYPE_FLOAT:
30795 case INT_FTYPE_INT:
30796 case UINT64_FTYPE_INT:
30797 case UINT16_FTYPE_UINT16:
30798 case INT64_FTYPE_INT64:
30799 case INT64_FTYPE_V4SF:
30800 case INT64_FTYPE_V2DF:
30801 case INT_FTYPE_V16QI:
30802 case INT_FTYPE_V8QI:
30803 case INT_FTYPE_V8SF:
30804 case INT_FTYPE_V4DF:
30805 case INT_FTYPE_V4SF:
30806 case INT_FTYPE_V2DF:
30807 case INT_FTYPE_V32QI:
30808 case V16QI_FTYPE_V16QI:
30809 case V8SI_FTYPE_V8SF:
30810 case V8SI_FTYPE_V4SI:
30811 case V8HI_FTYPE_V8HI:
30812 case V8HI_FTYPE_V16QI:
30813 case V8QI_FTYPE_V8QI:
30814 case V8SF_FTYPE_V8SF:
30815 case V8SF_FTYPE_V8SI:
30816 case V8SF_FTYPE_V4SF:
30817 case V8SF_FTYPE_V8HI:
30818 case V4SI_FTYPE_V4SI:
30819 case V4SI_FTYPE_V16QI:
30820 case V4SI_FTYPE_V4SF:
30821 case V4SI_FTYPE_V8SI:
30822 case V4SI_FTYPE_V8HI:
30823 case V4SI_FTYPE_V4DF:
30824 case V4SI_FTYPE_V2DF:
30825 case V4HI_FTYPE_V4HI:
30826 case V4DF_FTYPE_V4DF:
30827 case V4DF_FTYPE_V4SI:
30828 case V4DF_FTYPE_V4SF:
30829 case V4DF_FTYPE_V2DF:
30830 case V4SF_FTYPE_V4SF:
30831 case V4SF_FTYPE_V4SI:
30832 case V4SF_FTYPE_V8SF:
30833 case V4SF_FTYPE_V4DF:
30834 case V4SF_FTYPE_V8HI:
30835 case V4SF_FTYPE_V2DF:
30836 case V2DI_FTYPE_V2DI:
30837 case V2DI_FTYPE_V16QI:
30838 case V2DI_FTYPE_V8HI:
30839 case V2DI_FTYPE_V4SI:
30840 case V2DF_FTYPE_V2DF:
30841 case V2DF_FTYPE_V4SI:
30842 case V2DF_FTYPE_V4DF:
30843 case V2DF_FTYPE_V4SF:
30844 case V2DF_FTYPE_V2SI:
30845 case V2SI_FTYPE_V2SI:
30846 case V2SI_FTYPE_V4SF:
30847 case V2SI_FTYPE_V2SF:
30848 case V2SI_FTYPE_V2DF:
30849 case V2SF_FTYPE_V2SF:
30850 case V2SF_FTYPE_V2SI:
30851 case V32QI_FTYPE_V32QI:
30852 case V32QI_FTYPE_V16QI:
30853 case V16HI_FTYPE_V16HI:
30854 case V16HI_FTYPE_V8HI:
30855 case V8SI_FTYPE_V8SI:
30856 case V16HI_FTYPE_V16QI:
30857 case V8SI_FTYPE_V16QI:
30858 case V4DI_FTYPE_V16QI:
30859 case V8SI_FTYPE_V8HI:
30860 case V4DI_FTYPE_V8HI:
30861 case V4DI_FTYPE_V4SI:
30862 case V4DI_FTYPE_V2DI:
30863 nargs = 1;
30864 break;
30865 case V4SF_FTYPE_V4SF_VEC_MERGE:
30866 case V2DF_FTYPE_V2DF_VEC_MERGE:
30867 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30868 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30869 case V16QI_FTYPE_V16QI_V16QI:
30870 case V16QI_FTYPE_V8HI_V8HI:
30871 case V8QI_FTYPE_V8QI_V8QI:
30872 case V8QI_FTYPE_V4HI_V4HI:
30873 case V8HI_FTYPE_V8HI_V8HI:
30874 case V8HI_FTYPE_V16QI_V16QI:
30875 case V8HI_FTYPE_V4SI_V4SI:
30876 case V8SF_FTYPE_V8SF_V8SF:
30877 case V8SF_FTYPE_V8SF_V8SI:
30878 case V4SI_FTYPE_V4SI_V4SI:
30879 case V4SI_FTYPE_V8HI_V8HI:
30880 case V4SI_FTYPE_V4SF_V4SF:
30881 case V4SI_FTYPE_V2DF_V2DF:
30882 case V4HI_FTYPE_V4HI_V4HI:
30883 case V4HI_FTYPE_V8QI_V8QI:
30884 case V4HI_FTYPE_V2SI_V2SI:
30885 case V4DF_FTYPE_V4DF_V4DF:
30886 case V4DF_FTYPE_V4DF_V4DI:
30887 case V4SF_FTYPE_V4SF_V4SF:
30888 case V4SF_FTYPE_V4SF_V4SI:
30889 case V4SF_FTYPE_V4SF_V2SI:
30890 case V4SF_FTYPE_V4SF_V2DF:
30891 case V4SF_FTYPE_V4SF_DI:
30892 case V4SF_FTYPE_V4SF_SI:
30893 case V2DI_FTYPE_V2DI_V2DI:
30894 case V2DI_FTYPE_V16QI_V16QI:
30895 case V2DI_FTYPE_V4SI_V4SI:
30896 case V2UDI_FTYPE_V4USI_V4USI:
30897 case V2DI_FTYPE_V2DI_V16QI:
30898 case V2DI_FTYPE_V2DF_V2DF:
30899 case V2SI_FTYPE_V2SI_V2SI:
30900 case V2SI_FTYPE_V4HI_V4HI:
30901 case V2SI_FTYPE_V2SF_V2SF:
30902 case V2DF_FTYPE_V2DF_V2DF:
30903 case V2DF_FTYPE_V2DF_V4SF:
30904 case V2DF_FTYPE_V2DF_V2DI:
30905 case V2DF_FTYPE_V2DF_DI:
30906 case V2DF_FTYPE_V2DF_SI:
30907 case V2SF_FTYPE_V2SF_V2SF:
30908 case V1DI_FTYPE_V1DI_V1DI:
30909 case V1DI_FTYPE_V8QI_V8QI:
30910 case V1DI_FTYPE_V2SI_V2SI:
30911 case V32QI_FTYPE_V16HI_V16HI:
30912 case V16HI_FTYPE_V8SI_V8SI:
30913 case V32QI_FTYPE_V32QI_V32QI:
30914 case V16HI_FTYPE_V32QI_V32QI:
30915 case V16HI_FTYPE_V16HI_V16HI:
30916 case V8SI_FTYPE_V4DF_V4DF:
30917 case V8SI_FTYPE_V8SI_V8SI:
30918 case V8SI_FTYPE_V16HI_V16HI:
30919 case V4DI_FTYPE_V4DI_V4DI:
30920 case V4DI_FTYPE_V8SI_V8SI:
30921 case V4UDI_FTYPE_V8USI_V8USI:
30922 if (comparison == UNKNOWN)
30923 return ix86_expand_binop_builtin (icode, exp, target);
30924 nargs = 2;
30925 break;
30926 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30927 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30928 gcc_assert (comparison != UNKNOWN);
30929 nargs = 2;
30930 swap = true;
30931 break;
30932 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30933 case V16HI_FTYPE_V16HI_SI_COUNT:
30934 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30935 case V8SI_FTYPE_V8SI_SI_COUNT:
30936 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30937 case V4DI_FTYPE_V4DI_INT_COUNT:
30938 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30939 case V8HI_FTYPE_V8HI_SI_COUNT:
30940 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30941 case V4SI_FTYPE_V4SI_SI_COUNT:
30942 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30943 case V4HI_FTYPE_V4HI_SI_COUNT:
30944 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30945 case V2DI_FTYPE_V2DI_SI_COUNT:
30946 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30947 case V2SI_FTYPE_V2SI_SI_COUNT:
30948 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30949 case V1DI_FTYPE_V1DI_SI_COUNT:
30950 nargs = 2;
30951 last_arg_count = true;
30952 break;
30953 case UINT64_FTYPE_UINT64_UINT64:
30954 case UINT_FTYPE_UINT_UINT:
30955 case UINT_FTYPE_UINT_USHORT:
30956 case UINT_FTYPE_UINT_UCHAR:
30957 case UINT16_FTYPE_UINT16_INT:
30958 case UINT8_FTYPE_UINT8_INT:
30959 nargs = 2;
30960 break;
30961 case V2DI_FTYPE_V2DI_INT_CONVERT:
30962 nargs = 2;
30963 rmode = V1TImode;
30964 nargs_constant = 1;
30965 break;
30966 case V4DI_FTYPE_V4DI_INT_CONVERT:
30967 nargs = 2;
30968 rmode = V2TImode;
30969 nargs_constant = 1;
30970 break;
30971 case V8HI_FTYPE_V8HI_INT:
30972 case V8HI_FTYPE_V8SF_INT:
30973 case V8HI_FTYPE_V4SF_INT:
30974 case V8SF_FTYPE_V8SF_INT:
30975 case V4SI_FTYPE_V4SI_INT:
30976 case V4SI_FTYPE_V8SI_INT:
30977 case V4HI_FTYPE_V4HI_INT:
30978 case V4DF_FTYPE_V4DF_INT:
30979 case V4SF_FTYPE_V4SF_INT:
30980 case V4SF_FTYPE_V8SF_INT:
30981 case V2DI_FTYPE_V2DI_INT:
30982 case V2DF_FTYPE_V2DF_INT:
30983 case V2DF_FTYPE_V4DF_INT:
30984 case V16HI_FTYPE_V16HI_INT:
30985 case V8SI_FTYPE_V8SI_INT:
30986 case V4DI_FTYPE_V4DI_INT:
30987 case V2DI_FTYPE_V4DI_INT:
30988 nargs = 2;
30989 nargs_constant = 1;
30990 break;
30991 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30992 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30993 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30994 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30995 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30996 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30997 nargs = 3;
30998 break;
30999 case V32QI_FTYPE_V32QI_V32QI_INT:
31000 case V16HI_FTYPE_V16HI_V16HI_INT:
31001 case V16QI_FTYPE_V16QI_V16QI_INT:
31002 case V4DI_FTYPE_V4DI_V4DI_INT:
31003 case V8HI_FTYPE_V8HI_V8HI_INT:
31004 case V8SI_FTYPE_V8SI_V8SI_INT:
31005 case V8SI_FTYPE_V8SI_V4SI_INT:
31006 case V8SF_FTYPE_V8SF_V8SF_INT:
31007 case V8SF_FTYPE_V8SF_V4SF_INT:
31008 case V4SI_FTYPE_V4SI_V4SI_INT:
31009 case V4DF_FTYPE_V4DF_V4DF_INT:
31010 case V4DF_FTYPE_V4DF_V2DF_INT:
31011 case V4SF_FTYPE_V4SF_V4SF_INT:
31012 case V2DI_FTYPE_V2DI_V2DI_INT:
31013 case V4DI_FTYPE_V4DI_V2DI_INT:
31014 case V2DF_FTYPE_V2DF_V2DF_INT:
31015 nargs = 3;
31016 nargs_constant = 1;
31017 break;
31018 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31019 nargs = 3;
31020 rmode = V4DImode;
31021 nargs_constant = 1;
31022 break;
31023 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31024 nargs = 3;
31025 rmode = V2DImode;
31026 nargs_constant = 1;
31027 break;
31028 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31029 nargs = 3;
31030 rmode = DImode;
31031 nargs_constant = 1;
31032 break;
31033 case V2DI_FTYPE_V2DI_UINT_UINT:
31034 nargs = 3;
31035 nargs_constant = 2;
31036 break;
31037 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31038 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31039 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31040 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31041 nargs = 4;
31042 nargs_constant = 1;
31043 break;
31044 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31045 nargs = 4;
31046 nargs_constant = 2;
31047 break;
31048 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31049 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31050 nargs = 4;
31051 break;
31052 default:
31053 gcc_unreachable ();
31056 gcc_assert (nargs <= ARRAY_SIZE (args));
31058 if (comparison != UNKNOWN)
31060 gcc_assert (nargs == 2);
31061 return ix86_expand_sse_compare (d, exp, target, swap);
31064 if (rmode == VOIDmode || rmode == tmode)
31066 if (optimize
31067 || target == 0
31068 || GET_MODE (target) != tmode
31069 || !insn_p->operand[0].predicate (target, tmode))
31070 target = gen_reg_rtx (tmode);
31071 real_target = target;
31073 else
31075 target = gen_reg_rtx (rmode);
31076 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31079 for (i = 0; i < nargs; i++)
31081 tree arg = CALL_EXPR_ARG (exp, i);
31082 rtx op = expand_normal (arg);
31083 enum machine_mode mode = insn_p->operand[i + 1].mode;
31084 bool match = insn_p->operand[i + 1].predicate (op, mode);
31086 if (last_arg_count && (i + 1) == nargs)
31088 /* SIMD shift insns take either an 8-bit immediate or a
31089 register as the count, but the builtin functions take an int as
31090 the count. If the count doesn't match, we put it in a register. */
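/* A sketch for illustration only (not part of this file; it assumes the
   __builtin_ia32_psllwi128 builtin registered elsewhere in this file):
       __v8hi v;
       v = __builtin_ia32_psllwi128 (v, 3);   constant count; satisfies
                                              the immediate predicate
       int n = ...;
       v = __builtin_ia32_psllwi128 (v, n);   variable count; copied to a
                                              register by the code below  */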
31091 if (!match)
31093 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31094 if (!insn_p->operand[i + 1].predicate (op, mode))
31095 op = copy_to_reg (op);
31098 else if ((nargs - i) <= nargs_constant)
31100 if (!match)
31101 switch (icode)
31103 case CODE_FOR_avx2_inserti128:
31104 case CODE_FOR_avx2_extracti128:
31105 error ("the last argument must be a 1-bit immediate");
31106 return const0_rtx;
31108 case CODE_FOR_sse4_1_roundsd:
31109 case CODE_FOR_sse4_1_roundss:
31111 case CODE_FOR_sse4_1_roundpd:
31112 case CODE_FOR_sse4_1_roundps:
31113 case CODE_FOR_avx_roundpd256:
31114 case CODE_FOR_avx_roundps256:
31116 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31117 case CODE_FOR_sse4_1_roundps_sfix:
31118 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31119 case CODE_FOR_avx_roundps_sfix256:
31121 case CODE_FOR_sse4_1_blendps:
31122 case CODE_FOR_avx_blendpd256:
31123 case CODE_FOR_avx_vpermilv4df:
31124 error ("the last argument must be a 4-bit immediate");
31125 return const0_rtx;
31127 case CODE_FOR_sse4_1_blendpd:
31128 case CODE_FOR_avx_vpermilv2df:
31129 case CODE_FOR_xop_vpermil2v2df3:
31130 case CODE_FOR_xop_vpermil2v4sf3:
31131 case CODE_FOR_xop_vpermil2v4df3:
31132 case CODE_FOR_xop_vpermil2v8sf3:
31133 error ("the last argument must be a 2-bit immediate");
31134 return const0_rtx;
31136 case CODE_FOR_avx_vextractf128v4df:
31137 case CODE_FOR_avx_vextractf128v8sf:
31138 case CODE_FOR_avx_vextractf128v8si:
31139 case CODE_FOR_avx_vinsertf128v4df:
31140 case CODE_FOR_avx_vinsertf128v8sf:
31141 case CODE_FOR_avx_vinsertf128v8si:
31142 error ("the last argument must be a 1-bit immediate");
31143 return const0_rtx;
31145 case CODE_FOR_avx_vmcmpv2df3:
31146 case CODE_FOR_avx_vmcmpv4sf3:
31147 case CODE_FOR_avx_cmpv2df3:
31148 case CODE_FOR_avx_cmpv4sf3:
31149 case CODE_FOR_avx_cmpv4df3:
31150 case CODE_FOR_avx_cmpv8sf3:
31151 error ("the last argument must be a 5-bit immediate");
31152 return const0_rtx;
31154 default:
31155 switch (nargs_constant)
31157 case 2:
31158 if ((nargs - i) == nargs_constant)
31160 error ("the next to last argument must be an 8-bit immediate");
31161 break;
31163 case 1:
31164 error ("the last argument must be an 8-bit immediate");
31165 break;
31166 default:
31167 gcc_unreachable ();
31169 return const0_rtx;
31172 else
31174 if (VECTOR_MODE_P (mode))
31175 op = safe_vector_operand (op, mode);
31177 /* If we aren't optimizing, only allow one memory operand to
31178 be generated. */
31179 if (memory_operand (op, mode))
31180 num_memory++;
31182 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31184 if (optimize || !match || num_memory > 1)
31185 op = copy_to_mode_reg (mode, op);
31187 else
31189 op = copy_to_reg (op);
31190 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31194 args[i].op = op;
31195 args[i].mode = mode;
31198 switch (nargs)
31200 case 1:
31201 pat = GEN_FCN (icode) (real_target, args[0].op);
31202 break;
31203 case 2:
31204 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31205 break;
31206 case 3:
31207 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31208 args[2].op);
31209 break;
31210 case 4:
31211 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31212 args[2].op, args[3].op);
31213 break;
31214 default:
31215 gcc_unreachable ();
31218 if (! pat)
31219 return 0;
31221 emit_insn (pat);
31222 return target;
31225 /* Subroutine of ix86_expand_builtin to take care of special insns
31226 with a variable number of operands. */
31228 static rtx
31229 ix86_expand_special_args_builtin (const struct builtin_description *d,
31230 tree exp, rtx target)
31232 tree arg;
31233 rtx pat, op;
31234 unsigned int i, nargs, arg_adjust, memory;
31235 bool aligned_mem = false;
31236 struct
31238 rtx op;
31239 enum machine_mode mode;
31240 } args[3];
31241 enum insn_code icode = d->icode;
31242 bool last_arg_constant = false;
31243 const struct insn_data_d *insn_p = &insn_data[icode];
31244 enum machine_mode tmode = insn_p->operand[0].mode;
31245 enum { load, store } klass;
31247 switch ((enum ix86_builtin_func_type) d->flag)
31249 case VOID_FTYPE_VOID:
31250 emit_insn (GEN_FCN (icode) (target));
31251 return 0;
31252 case VOID_FTYPE_UINT64:
31253 case VOID_FTYPE_UNSIGNED:
31254 nargs = 0;
31255 klass = store;
31256 memory = 0;
31257 break;
31259 case INT_FTYPE_VOID:
31260 case UINT64_FTYPE_VOID:
31261 case UNSIGNED_FTYPE_VOID:
31262 nargs = 0;
31263 klass = load;
31264 memory = 0;
31265 break;
31266 case UINT64_FTYPE_PUNSIGNED:
31267 case V2DI_FTYPE_PV2DI:
31268 case V4DI_FTYPE_PV4DI:
31269 case V32QI_FTYPE_PCCHAR:
31270 case V16QI_FTYPE_PCCHAR:
31271 case V8SF_FTYPE_PCV4SF:
31272 case V8SF_FTYPE_PCFLOAT:
31273 case V4SF_FTYPE_PCFLOAT:
31274 case V4DF_FTYPE_PCV2DF:
31275 case V4DF_FTYPE_PCDOUBLE:
31276 case V2DF_FTYPE_PCDOUBLE:
31277 case VOID_FTYPE_PVOID:
31278 nargs = 1;
31279 klass = load;
31280 memory = 0;
31281 switch (icode)
31283 case CODE_FOR_sse4_1_movntdqa:
31284 case CODE_FOR_avx2_movntdqa:
31285 aligned_mem = true;
31286 break;
31287 default:
31288 break;
31290 break;
31291 case VOID_FTYPE_PV2SF_V4SF:
31292 case VOID_FTYPE_PV4DI_V4DI:
31293 case VOID_FTYPE_PV2DI_V2DI:
31294 case VOID_FTYPE_PCHAR_V32QI:
31295 case VOID_FTYPE_PCHAR_V16QI:
31296 case VOID_FTYPE_PFLOAT_V8SF:
31297 case VOID_FTYPE_PFLOAT_V4SF:
31298 case VOID_FTYPE_PDOUBLE_V4DF:
31299 case VOID_FTYPE_PDOUBLE_V2DF:
31300 case VOID_FTYPE_PLONGLONG_LONGLONG:
31301 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31302 case VOID_FTYPE_PINT_INT:
31303 nargs = 1;
31304 klass = store;
31305 /* Reserve memory operand for target. */
31306 memory = ARRAY_SIZE (args);
31307 switch (icode)
31309 /* These builtins and instructions require the memory
31310 to be properly aligned. */
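/* For example (illustration, not taken from this file): the 128-bit
   non-temporal store intrinsics built on these patterns, such as
   _mm_stream_si128 and _mm_stream_pd, require a 16-byte aligned
   destination, and the 256-bit AVX forms require 32-byte alignment.  */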
31311 case CODE_FOR_avx_movntv4di:
31312 case CODE_FOR_sse2_movntv2di:
31313 case CODE_FOR_avx_movntv8sf:
31314 case CODE_FOR_sse_movntv4sf:
31315 case CODE_FOR_sse4a_vmmovntv4sf:
31316 case CODE_FOR_avx_movntv4df:
31317 case CODE_FOR_sse2_movntv2df:
31318 case CODE_FOR_sse4a_vmmovntv2df:
31319 case CODE_FOR_sse2_movntidi:
31320 case CODE_FOR_sse_movntq:
31321 case CODE_FOR_sse2_movntisi:
31322 aligned_mem = true;
31323 break;
31324 default:
31325 break;
31327 break;
31328 case V4SF_FTYPE_V4SF_PCV2SF:
31329 case V2DF_FTYPE_V2DF_PCDOUBLE:
31330 nargs = 2;
31331 klass = load;
31332 memory = 1;
31333 break;
31334 case V8SF_FTYPE_PCV8SF_V8SI:
31335 case V4DF_FTYPE_PCV4DF_V4DI:
31336 case V4SF_FTYPE_PCV4SF_V4SI:
31337 case V2DF_FTYPE_PCV2DF_V2DI:
31338 case V8SI_FTYPE_PCV8SI_V8SI:
31339 case V4DI_FTYPE_PCV4DI_V4DI:
31340 case V4SI_FTYPE_PCV4SI_V4SI:
31341 case V2DI_FTYPE_PCV2DI_V2DI:
31342 nargs = 2;
31343 klass = load;
31344 memory = 0;
31345 break;
31346 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31347 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31348 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31349 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31350 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31351 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31352 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31353 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31354 nargs = 2;
31355 klass = store;
31356 /* Reserve memory operand for target. */
31357 memory = ARRAY_SIZE (args);
31358 break;
31359 case VOID_FTYPE_UINT_UINT_UINT:
31360 case VOID_FTYPE_UINT64_UINT_UINT:
31361 case UCHAR_FTYPE_UINT_UINT_UINT:
31362 case UCHAR_FTYPE_UINT64_UINT_UINT:
31363 nargs = 3;
31364 klass = load;
31365 memory = ARRAY_SIZE (args);
31366 last_arg_constant = true;
31367 break;
31368 default:
31369 gcc_unreachable ();
31372 gcc_assert (nargs <= ARRAY_SIZE (args));
31374 if (klass == store)
31376 arg = CALL_EXPR_ARG (exp, 0);
31377 op = expand_normal (arg);
31378 gcc_assert (target == 0);
31379 if (memory)
31381 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31382 target = gen_rtx_MEM (tmode, op);
31383 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
31384 on it. Try to improve it using get_pointer_alignment,
31385 and if the special builtin is one that requires strict
31386 mode alignment, also from its GET_MODE_ALIGNMENT.
31387 Failure to do so could lead to ix86_legitimate_combined_insn
31388 rejecting all changes to such insns. */
31389 unsigned int align = get_pointer_alignment (arg);
31390 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
31391 align = GET_MODE_ALIGNMENT (tmode);
31392 if (MEM_ALIGN (target) < align)
31393 set_mem_align (target, align);
31395 else
31396 target = force_reg (tmode, op);
31397 arg_adjust = 1;
31399 else
31401 arg_adjust = 0;
31402 if (optimize
31403 || target == 0
31404 || !register_operand (target, tmode)
31405 || GET_MODE (target) != tmode)
31406 target = gen_reg_rtx (tmode);
31409 for (i = 0; i < nargs; i++)
31411 enum machine_mode mode = insn_p->operand[i + 1].mode;
31412 bool match;
31414 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31415 op = expand_normal (arg);
31416 match = insn_p->operand[i + 1].predicate (op, mode);
31418 if (last_arg_constant && (i + 1) == nargs)
31420 if (!match)
31422 if (icode == CODE_FOR_lwp_lwpvalsi3
31423 || icode == CODE_FOR_lwp_lwpinssi3
31424 || icode == CODE_FOR_lwp_lwpvaldi3
31425 || icode == CODE_FOR_lwp_lwpinsdi3)
31426 error ("the last argument must be a 32-bit immediate");
31427 else
31428 error ("the last argument must be an 8-bit immediate");
31429 return const0_rtx;
31432 else
31434 if (i == memory)
31436 /* This must be the memory operand. */
31437 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31438 op = gen_rtx_MEM (mode, op);
31439 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
31440 on it. Try to improve it using get_pointer_alignment,
31441 and if the special builtin is one that requires strict
31442 mode alignment, also from its GET_MODE_ALIGNMENT.
31443 Failure to do so could lead to ix86_legitimate_combined_insn
31444 rejecting all changes to such insns. */
31445 unsigned int align = get_pointer_alignment (arg);
31446 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
31447 align = GET_MODE_ALIGNMENT (mode);
31448 if (MEM_ALIGN (op) < align)
31449 set_mem_align (op, align);
31451 else
31453 /* This must be a register. */
31454 if (VECTOR_MODE_P (mode))
31455 op = safe_vector_operand (op, mode);
31457 gcc_assert (GET_MODE (op) == mode
31458 || GET_MODE (op) == VOIDmode);
31459 op = copy_to_mode_reg (mode, op);
31463 args[i].op = op;
31464 args[i].mode = mode;
31467 switch (nargs)
31469 case 0:
31470 pat = GEN_FCN (icode) (target);
31471 break;
31472 case 1:
31473 pat = GEN_FCN (icode) (target, args[0].op);
31474 break;
31475 case 2:
31476 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31477 break;
31478 case 3:
31479 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31480 break;
31481 default:
31482 gcc_unreachable ();
31485 if (! pat)
31486 return 0;
31487 emit_insn (pat);
31488 return klass == store ? 0 : target;
31491 /* Return the integer constant in ARG. Constrain it to be in the range
31492 of the subparts of VEC_TYPE; issue an error if not. */
31494 static int
31495 get_element_number (tree vec_type, tree arg)
31497 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31499 if (!host_integerp (arg, 1)
31500 || (elt = tree_low_cst (arg, 1), elt > max))
31502 error ("selector must be an integer constant in the range 0..%wi", max);
31503 return 0;
31506 return elt;
31509 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31510 ix86_expand_vector_init. We DO have language-level syntax for this, in
31511 the form of (type){ init-list }. Except that since we can't place emms
31512 instructions from inside the compiler, we can't allow the use of MMX
31513 registers unless the user explicitly asks for it. So we do *not* define
31514 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31515 we have builtins invoked by mmintrin.h that give us license to emit
31516 these sorts of instructions. */
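/* Usage sketch (not from this file; assumes the mmintrin.h wrappers):
     __m64 x = _mm_set_pi16 (a, b, c, d);
   expands through __builtin_ia32_vec_init_v4hi instead of a
   (type){ ... } initializer, so MMX registers are only used because the
   user explicitly pulled in the MMX intrinsics.  */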
31518 static rtx
31519 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31521 enum machine_mode tmode = TYPE_MODE (type);
31522 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31523 int i, n_elt = GET_MODE_NUNITS (tmode);
31524 rtvec v = rtvec_alloc (n_elt);
31526 gcc_assert (VECTOR_MODE_P (tmode));
31527 gcc_assert (call_expr_nargs (exp) == n_elt);
31529 for (i = 0; i < n_elt; ++i)
31531 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31532 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31535 if (!target || !register_operand (target, tmode))
31536 target = gen_reg_rtx (tmode);
31538 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31539 return target;
31542 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31543 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31544 had a language-level syntax for referencing vector elements. */
31546 static rtx
31547 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31549 enum machine_mode tmode, mode0;
31550 tree arg0, arg1;
31551 int elt;
31552 rtx op0;
31554 arg0 = CALL_EXPR_ARG (exp, 0);
31555 arg1 = CALL_EXPR_ARG (exp, 1);
31557 op0 = expand_normal (arg0);
31558 elt = get_element_number (TREE_TYPE (arg0), arg1);
31560 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31561 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31562 gcc_assert (VECTOR_MODE_P (mode0));
31564 op0 = force_reg (mode0, op0);
31566 if (optimize || !target || !register_operand (target, tmode))
31567 target = gen_reg_rtx (tmode);
31569 ix86_expand_vector_extract (true, target, op0, elt);
31571 return target;
31574 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31575 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31576 a language-level syntax for referencing vector elements. */
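/* Usage sketch (not from this file; assumes the xmmintrin.h wrapper):
     __m64 y = _mm_insert_pi16 (x, val, 2);
   reaches this expander via __builtin_ia32_vec_set_v4hi.  */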
31578 static rtx
31579 ix86_expand_vec_set_builtin (tree exp)
31581 enum machine_mode tmode, mode1;
31582 tree arg0, arg1, arg2;
31583 int elt;
31584 rtx op0, op1, target;
31586 arg0 = CALL_EXPR_ARG (exp, 0);
31587 arg1 = CALL_EXPR_ARG (exp, 1);
31588 arg2 = CALL_EXPR_ARG (exp, 2);
31590 tmode = TYPE_MODE (TREE_TYPE (arg0));
31591 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31592 gcc_assert (VECTOR_MODE_P (tmode));
31594 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31595 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31596 elt = get_element_number (TREE_TYPE (arg0), arg2);
31598 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31599 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31601 op0 = force_reg (tmode, op0);
31602 op1 = force_reg (mode1, op1);
31604 /* OP0 is the source of these builtin functions and shouldn't be
31605 modified. Create a copy, use it and return it as target. */
31606 target = gen_reg_rtx (tmode);
31607 emit_move_insn (target, op0);
31608 ix86_expand_vector_set (true, target, op1, elt);
31610 return target;
31613 /* Expand an expression EXP that calls a built-in function,
31614 with result going to TARGET if that's convenient
31615 (and in mode MODE if that's convenient).
31616 SUBTARGET may be used as the target for computing one of EXP's operands.
31617 IGNORE is nonzero if the value is to be ignored. */
31619 static rtx
31620 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31621 enum machine_mode mode ATTRIBUTE_UNUSED,
31622 int ignore ATTRIBUTE_UNUSED)
31624 const struct builtin_description *d;
31625 size_t i;
31626 enum insn_code icode;
31627 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31628 tree arg0, arg1, arg2, arg3, arg4;
31629 rtx op0, op1, op2, op3, op4, pat, insn;
31630 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31631 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31633 /* For CPU builtins that can be folded, fold first and expand the fold. */
31634 switch (fcode)
31636 case IX86_BUILTIN_CPU_INIT:
31638 /* Make it call __cpu_indicator_init in libgcc. */
31639 tree call_expr, fndecl, type;
31640 type = build_function_type_list (integer_type_node, NULL_TREE);
31641 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31642 call_expr = build_call_expr (fndecl, 0);
31643 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31645 case IX86_BUILTIN_CPU_IS:
31646 case IX86_BUILTIN_CPU_SUPPORTS:
31648 tree arg0 = CALL_EXPR_ARG (exp, 0);
31649 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31650 gcc_assert (fold_expr != NULL_TREE);
31651 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
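/* Usage sketch (not from this file): the documented interface is
     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       ...
   where the first call expands to __cpu_indicator_init in libgcc and the
   second folds into a test of the __cpu_model data it fills in.  */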
31655 /* Determine whether the builtin function is available under the current ISA.
31656 Originally the builtin was not created if it wasn't applicable to the
31657 current ISA based on the command line switches. With function specific
31658 options, we need to check in the context of the function making the call
31659 whether it is supported. */
31660 if (ix86_builtins_isa[fcode].isa
31661 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31663 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31664 NULL, (enum fpmath_unit) 0, false);
31666 if (!opts)
31667 error ("%qE needs unknown isa option", fndecl);
31668 else
31670 gcc_assert (opts != NULL);
31671 error ("%qE needs isa option %s", fndecl, opts);
31672 free (opts);
31674 return const0_rtx;
31677 switch (fcode)
31679 case IX86_BUILTIN_MASKMOVQ:
31680 case IX86_BUILTIN_MASKMOVDQU:
31681 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31682 ? CODE_FOR_mmx_maskmovq
31683 : CODE_FOR_sse2_maskmovdqu);
31684 /* Note the arg order is different from the operand order. */
31685 arg1 = CALL_EXPR_ARG (exp, 0);
31686 arg2 = CALL_EXPR_ARG (exp, 1);
31687 arg0 = CALL_EXPR_ARG (exp, 2);
31688 op0 = expand_normal (arg0);
31689 op1 = expand_normal (arg1);
31690 op2 = expand_normal (arg2);
31691 mode0 = insn_data[icode].operand[0].mode;
31692 mode1 = insn_data[icode].operand[1].mode;
31693 mode2 = insn_data[icode].operand[2].mode;
31695 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31696 op0 = gen_rtx_MEM (mode1, op0);
31698 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31699 op0 = copy_to_mode_reg (mode0, op0);
31700 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31701 op1 = copy_to_mode_reg (mode1, op1);
31702 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31703 op2 = copy_to_mode_reg (mode2, op2);
31704 pat = GEN_FCN (icode) (op0, op1, op2);
31705 if (! pat)
31706 return 0;
31707 emit_insn (pat);
31708 return 0;
31710 case IX86_BUILTIN_LDMXCSR:
31711 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31712 target = assign_386_stack_local (SImode, SLOT_TEMP);
31713 emit_move_insn (target, op0);
31714 emit_insn (gen_sse_ldmxcsr (target));
31715 return 0;
31717 case IX86_BUILTIN_STMXCSR:
31718 target = assign_386_stack_local (SImode, SLOT_TEMP);
31719 emit_insn (gen_sse_stmxcsr (target));
31720 return copy_to_mode_reg (SImode, target);
31722 case IX86_BUILTIN_CLFLUSH:
31723 arg0 = CALL_EXPR_ARG (exp, 0);
31724 op0 = expand_normal (arg0);
31725 icode = CODE_FOR_sse2_clflush;
31726 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31727 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31729 emit_insn (gen_sse2_clflush (op0));
31730 return 0;
31732 case IX86_BUILTIN_MONITOR:
31733 arg0 = CALL_EXPR_ARG (exp, 0);
31734 arg1 = CALL_EXPR_ARG (exp, 1);
31735 arg2 = CALL_EXPR_ARG (exp, 2);
31736 op0 = expand_normal (arg0);
31737 op1 = expand_normal (arg1);
31738 op2 = expand_normal (arg2);
31739 if (!REG_P (op0))
31740 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31741 if (!REG_P (op1))
31742 op1 = copy_to_mode_reg (SImode, op1);
31743 if (!REG_P (op2))
31744 op2 = copy_to_mode_reg (SImode, op2);
31745 emit_insn (ix86_gen_monitor (op0, op1, op2));
31746 return 0;
31748 case IX86_BUILTIN_MWAIT:
31749 arg0 = CALL_EXPR_ARG (exp, 0);
31750 arg1 = CALL_EXPR_ARG (exp, 1);
31751 op0 = expand_normal (arg0);
31752 op1 = expand_normal (arg1);
31753 if (!REG_P (op0))
31754 op0 = copy_to_mode_reg (SImode, op0);
31755 if (!REG_P (op1))
31756 op1 = copy_to_mode_reg (SImode, op1);
31757 emit_insn (gen_sse3_mwait (op0, op1));
31758 return 0;
31760 case IX86_BUILTIN_VEC_INIT_V2SI:
31761 case IX86_BUILTIN_VEC_INIT_V4HI:
31762 case IX86_BUILTIN_VEC_INIT_V8QI:
31763 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31765 case IX86_BUILTIN_VEC_EXT_V2DF:
31766 case IX86_BUILTIN_VEC_EXT_V2DI:
31767 case IX86_BUILTIN_VEC_EXT_V4SF:
31768 case IX86_BUILTIN_VEC_EXT_V4SI:
31769 case IX86_BUILTIN_VEC_EXT_V8HI:
31770 case IX86_BUILTIN_VEC_EXT_V2SI:
31771 case IX86_BUILTIN_VEC_EXT_V4HI:
31772 case IX86_BUILTIN_VEC_EXT_V16QI:
31773 return ix86_expand_vec_ext_builtin (exp, target);
31775 case IX86_BUILTIN_VEC_SET_V2DI:
31776 case IX86_BUILTIN_VEC_SET_V4SF:
31777 case IX86_BUILTIN_VEC_SET_V4SI:
31778 case IX86_BUILTIN_VEC_SET_V8HI:
31779 case IX86_BUILTIN_VEC_SET_V4HI:
31780 case IX86_BUILTIN_VEC_SET_V16QI:
31781 return ix86_expand_vec_set_builtin (exp);
31783 case IX86_BUILTIN_INFQ:
31784 case IX86_BUILTIN_HUGE_VALQ:
31786 REAL_VALUE_TYPE inf;
31787 rtx tmp;
31789 real_inf (&inf);
31790 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31792 tmp = validize_mem (force_const_mem (mode, tmp));
31794 if (target == 0)
31795 target = gen_reg_rtx (mode);
31797 emit_move_insn (target, tmp);
31798 return target;
31801 case IX86_BUILTIN_RDPMC:
31802 case IX86_BUILTIN_RDTSC:
31803 case IX86_BUILTIN_RDTSCP:
31805 op0 = gen_reg_rtx (DImode);
31806 op1 = gen_reg_rtx (DImode);
31808 if (fcode == IX86_BUILTIN_RDPMC)
31810 arg0 = CALL_EXPR_ARG (exp, 0);
31811 op2 = expand_normal (arg0);
31812 if (!register_operand (op2, SImode))
31813 op2 = copy_to_mode_reg (SImode, op2);
31815 insn = (TARGET_64BIT
31816 ? gen_rdpmc_rex64 (op0, op1, op2)
31817 : gen_rdpmc (op0, op2));
31818 emit_insn (insn);
31820 else if (fcode == IX86_BUILTIN_RDTSC)
31822 insn = (TARGET_64BIT
31823 ? gen_rdtsc_rex64 (op0, op1)
31824 : gen_rdtsc (op0));
31825 emit_insn (insn);
31827 else
31829 op2 = gen_reg_rtx (SImode);
31831 insn = (TARGET_64BIT
31832 ? gen_rdtscp_rex64 (op0, op1, op2)
31833 : gen_rdtscp (op0, op2));
31834 emit_insn (insn);
31836 arg0 = CALL_EXPR_ARG (exp, 0);
31837 op4 = expand_normal (arg0);
31838 if (!address_operand (op4, VOIDmode))
31840 op4 = convert_memory_address (Pmode, op4);
31841 op4 = copy_addr_to_reg (op4);
31843 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31846 if (target == 0)
31848 /* mode is VOIDmode if __builtin_rd* has been called
31849 without lhs. */
31850 if (mode == VOIDmode)
31851 return target;
31852 target = gen_reg_rtx (mode);
31855 if (TARGET_64BIT)
31857 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31858 op1, 1, OPTAB_DIRECT);
31859 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31860 op0, 1, OPTAB_DIRECT);
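/* In effect, for the 64-bit case (illustration only):
     result = (uint64_t) lo | ((uint64_t) hi << 32);
   where lo and hi are the two DImode halves written by the rdtsc/rdtscp/
   rdpmc patterns above.  */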
31863 emit_move_insn (target, op0);
31864 return target;
31866 case IX86_BUILTIN_FXSAVE:
31867 case IX86_BUILTIN_FXRSTOR:
31868 case IX86_BUILTIN_FXSAVE64:
31869 case IX86_BUILTIN_FXRSTOR64:
31870 switch (fcode)
31872 case IX86_BUILTIN_FXSAVE:
31873 icode = CODE_FOR_fxsave;
31874 break;
31875 case IX86_BUILTIN_FXRSTOR:
31876 icode = CODE_FOR_fxrstor;
31877 break;
31878 case IX86_BUILTIN_FXSAVE64:
31879 icode = CODE_FOR_fxsave64;
31880 break;
31881 case IX86_BUILTIN_FXRSTOR64:
31882 icode = CODE_FOR_fxrstor64;
31883 break;
31884 default:
31885 gcc_unreachable ();
31888 arg0 = CALL_EXPR_ARG (exp, 0);
31889 op0 = expand_normal (arg0);
31891 if (!address_operand (op0, VOIDmode))
31893 op0 = convert_memory_address (Pmode, op0);
31894 op0 = copy_addr_to_reg (op0);
31896 op0 = gen_rtx_MEM (BLKmode, op0);
31898 pat = GEN_FCN (icode) (op0);
31899 if (pat)
31900 emit_insn (pat);
31901 return 0;
31903 case IX86_BUILTIN_XSAVE:
31904 case IX86_BUILTIN_XRSTOR:
31905 case IX86_BUILTIN_XSAVE64:
31906 case IX86_BUILTIN_XRSTOR64:
31907 case IX86_BUILTIN_XSAVEOPT:
31908 case IX86_BUILTIN_XSAVEOPT64:
31909 arg0 = CALL_EXPR_ARG (exp, 0);
31910 arg1 = CALL_EXPR_ARG (exp, 1);
31911 op0 = expand_normal (arg0);
31912 op1 = expand_normal (arg1);
31914 if (!address_operand (op0, VOIDmode))
31916 op0 = convert_memory_address (Pmode, op0);
31917 op0 = copy_addr_to_reg (op0);
31919 op0 = gen_rtx_MEM (BLKmode, op0);
31921 op1 = force_reg (DImode, op1);
31923 if (TARGET_64BIT)
31925 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31926 NULL, 1, OPTAB_DIRECT);
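/* xsave/xrstor/xsaveopt take the requested-feature mask in EDX:EAX, so
   the 64-bit mask is split here (illustration only):
     edx = mask >> 32;          high half, computed just above
     eax = mask & 0xffffffff;   low half, taken via gen_lowpart below  */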
31927 switch (fcode)
31929 case IX86_BUILTIN_XSAVE:
31930 icode = CODE_FOR_xsave_rex64;
31931 break;
31932 case IX86_BUILTIN_XRSTOR:
31933 icode = CODE_FOR_xrstor_rex64;
31934 break;
31935 case IX86_BUILTIN_XSAVE64:
31936 icode = CODE_FOR_xsave64;
31937 break;
31938 case IX86_BUILTIN_XRSTOR64:
31939 icode = CODE_FOR_xrstor64;
31940 break;
31941 case IX86_BUILTIN_XSAVEOPT:
31942 icode = CODE_FOR_xsaveopt_rex64;
31943 break;
31944 case IX86_BUILTIN_XSAVEOPT64:
31945 icode = CODE_FOR_xsaveopt64;
31946 break;
31947 default:
31948 gcc_unreachable ();
31951 op2 = gen_lowpart (SImode, op2);
31952 op1 = gen_lowpart (SImode, op1);
31953 pat = GEN_FCN (icode) (op0, op1, op2);
31955 else
31957 switch (fcode)
31959 case IX86_BUILTIN_XSAVE:
31960 icode = CODE_FOR_xsave;
31961 break;
31962 case IX86_BUILTIN_XRSTOR:
31963 icode = CODE_FOR_xrstor;
31964 break;
31965 case IX86_BUILTIN_XSAVEOPT:
31966 icode = CODE_FOR_xsaveopt;
31967 break;
31968 default:
31969 gcc_unreachable ();
31971 pat = GEN_FCN (icode) (op0, op1);
31974 if (pat)
31975 emit_insn (pat);
31976 return 0;
31978 case IX86_BUILTIN_LLWPCB:
31979 arg0 = CALL_EXPR_ARG (exp, 0);
31980 op0 = expand_normal (arg0);
31981 icode = CODE_FOR_lwp_llwpcb;
31982 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31983 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31984 emit_insn (gen_lwp_llwpcb (op0));
31985 return 0;
31987 case IX86_BUILTIN_SLWPCB:
31988 icode = CODE_FOR_lwp_slwpcb;
31989 if (!target
31990 || !insn_data[icode].operand[0].predicate (target, Pmode))
31991 target = gen_reg_rtx (Pmode);
31992 emit_insn (gen_lwp_slwpcb (target));
31993 return target;
31995 case IX86_BUILTIN_BEXTRI32:
31996 case IX86_BUILTIN_BEXTRI64:
31997 arg0 = CALL_EXPR_ARG (exp, 0);
31998 arg1 = CALL_EXPR_ARG (exp, 1);
31999 op0 = expand_normal (arg0);
32000 op1 = expand_normal (arg1);
32001 icode = (fcode == IX86_BUILTIN_BEXTRI32
32002 ? CODE_FOR_tbm_bextri_si
32003 : CODE_FOR_tbm_bextri_di);
32004 if (!CONST_INT_P (op1))
32006 error ("last argument must be an immediate");
32007 return const0_rtx;
32009 else
32011 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32012 unsigned char lsb_index = INTVAL (op1) & 0xFF;
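/* Worked example (not from this file): a control word of 0x0504 passed
   to __builtin_ia32_bextri_u32 decodes to length = 5, lsb_index = 4,
   i.e. extract bits <8:4> of the first operand into the low bits of the
   result.  */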
32013 op1 = GEN_INT (length);
32014 op2 = GEN_INT (lsb_index);
32015 pat = GEN_FCN (icode) (target, op0, op1, op2);
32016 if (pat)
32017 emit_insn (pat);
32018 return target;
32021 case IX86_BUILTIN_RDRAND16_STEP:
32022 icode = CODE_FOR_rdrandhi_1;
32023 mode0 = HImode;
32024 goto rdrand_step;
32026 case IX86_BUILTIN_RDRAND32_STEP:
32027 icode = CODE_FOR_rdrandsi_1;
32028 mode0 = SImode;
32029 goto rdrand_step;
32031 case IX86_BUILTIN_RDRAND64_STEP:
32032 icode = CODE_FOR_rdranddi_1;
32033 mode0 = DImode;
32035 rdrand_step:
32036 op0 = gen_reg_rtx (mode0);
32037 emit_insn (GEN_FCN (icode) (op0));
32039 arg0 = CALL_EXPR_ARG (exp, 0);
32040 op1 = expand_normal (arg0);
32041 if (!address_operand (op1, VOIDmode))
32043 op1 = convert_memory_address (Pmode, op1);
32044 op1 = copy_addr_to_reg (op1);
32046 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32048 op1 = gen_reg_rtx (SImode);
32049 emit_move_insn (op1, CONST1_RTX (SImode));
32051 /* Emit SImode conditional move. */
32052 if (mode0 == HImode)
32054 op2 = gen_reg_rtx (SImode);
32055 emit_insn (gen_zero_extendhisi2 (op2, op0));
32057 else if (mode0 == SImode)
32058 op2 = op0;
32059 else
32060 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32062 if (target == 0)
32063 target = gen_reg_rtx (SImode);
32065 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32066 const0_rtx);
32067 emit_insn (gen_rtx_SET (VOIDmode, target,
32068 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
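/* Net effect (a sketch, assuming the usual CCCmode mapping where GEU
   means "carry clear" and rdrand's documented zeroing of its destination
   on failure): the builtin returns 1 when CF was set (a random value was
   delivered) and 0 otherwise, as the *_step intrinsics expect.  */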
32069 return target;
32071 case IX86_BUILTIN_RDSEED16_STEP:
32072 icode = CODE_FOR_rdseedhi_1;
32073 mode0 = HImode;
32074 goto rdseed_step;
32076 case IX86_BUILTIN_RDSEED32_STEP:
32077 icode = CODE_FOR_rdseedsi_1;
32078 mode0 = SImode;
32079 goto rdseed_step;
32081 case IX86_BUILTIN_RDSEED64_STEP:
32082 icode = CODE_FOR_rdseeddi_1;
32083 mode0 = DImode;
32085 rdseed_step:
32086 op0 = gen_reg_rtx (mode0);
32087 emit_insn (GEN_FCN (icode) (op0));
32089 arg0 = CALL_EXPR_ARG (exp, 0);
32090 op1 = expand_normal (arg0);
32091 if (!address_operand (op1, VOIDmode))
32093 op1 = convert_memory_address (Pmode, op1);
32094 op1 = copy_addr_to_reg (op1);
32096 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32098 op2 = gen_reg_rtx (QImode);
32100 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32101 const0_rtx);
32102 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32104 if (target == 0)
32105 target = gen_reg_rtx (SImode);
32107 emit_insn (gen_zero_extendqisi2 (target, op2));
32108 return target;
32110 case IX86_BUILTIN_ADDCARRYX32:
32111 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32112 mode0 = SImode;
32113 goto addcarryx;
32115 case IX86_BUILTIN_ADDCARRYX64:
32116 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32117 mode0 = DImode;
32119 addcarryx:
32120 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32121 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32122 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32123 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32125 op0 = gen_reg_rtx (QImode);
32127 /* Generate CF from input operand. */
32128 op1 = expand_normal (arg0);
32129 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32130 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
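/* Illustration of the trick above: op1 holds the caller's carry-in as a
   QImode 0/1 value, and "op1 + 0xff" is emitted purely for its flags
   effect:
     c_in = 1:  1 + 0xff = 0x100  ->  CF set
     c_in = 0:  0 + 0xff = 0x0ff  ->  CF clear
   so the hardware carry flag now matches the carry-in consumed by the
   adcx/adc emitted below.  */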
32132 /* Generate an ADCX (or ADC) instruction to compute X + Y + CF. */
32133 op2 = expand_normal (arg1);
32134 op3 = expand_normal (arg2);
32136 if (!REG_P (op2))
32137 op2 = copy_to_mode_reg (mode0, op2);
32138 if (!REG_P (op3))
32139 op3 = copy_to_mode_reg (mode0, op3);
32141 op0 = gen_reg_rtx (mode0);
32143 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32144 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32145 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32147 /* Store the result. */
32148 op4 = expand_normal (arg3);
32149 if (!address_operand (op4, VOIDmode))
32151 op4 = convert_memory_address (Pmode, op4);
32152 op4 = copy_addr_to_reg (op4);
32154 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32156 /* Return current CF value. */
32157 if (target == 0)
32158 target = gen_reg_rtx (QImode);
32160 PUT_MODE (pat, QImode);
32161 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32162 return target;
32164 case IX86_BUILTIN_GATHERSIV2DF:
32165 icode = CODE_FOR_avx2_gathersiv2df;
32166 goto gather_gen;
32167 case IX86_BUILTIN_GATHERSIV4DF:
32168 icode = CODE_FOR_avx2_gathersiv4df;
32169 goto gather_gen;
32170 case IX86_BUILTIN_GATHERDIV2DF:
32171 icode = CODE_FOR_avx2_gatherdiv2df;
32172 goto gather_gen;
32173 case IX86_BUILTIN_GATHERDIV4DF:
32174 icode = CODE_FOR_avx2_gatherdiv4df;
32175 goto gather_gen;
32176 case IX86_BUILTIN_GATHERSIV4SF:
32177 icode = CODE_FOR_avx2_gathersiv4sf;
32178 goto gather_gen;
32179 case IX86_BUILTIN_GATHERSIV8SF:
32180 icode = CODE_FOR_avx2_gathersiv8sf;
32181 goto gather_gen;
32182 case IX86_BUILTIN_GATHERDIV4SF:
32183 icode = CODE_FOR_avx2_gatherdiv4sf;
32184 goto gather_gen;
32185 case IX86_BUILTIN_GATHERDIV8SF:
32186 icode = CODE_FOR_avx2_gatherdiv8sf;
32187 goto gather_gen;
32188 case IX86_BUILTIN_GATHERSIV2DI:
32189 icode = CODE_FOR_avx2_gathersiv2di;
32190 goto gather_gen;
32191 case IX86_BUILTIN_GATHERSIV4DI:
32192 icode = CODE_FOR_avx2_gathersiv4di;
32193 goto gather_gen;
32194 case IX86_BUILTIN_GATHERDIV2DI:
32195 icode = CODE_FOR_avx2_gatherdiv2di;
32196 goto gather_gen;
32197 case IX86_BUILTIN_GATHERDIV4DI:
32198 icode = CODE_FOR_avx2_gatherdiv4di;
32199 goto gather_gen;
32200 case IX86_BUILTIN_GATHERSIV4SI:
32201 icode = CODE_FOR_avx2_gathersiv4si;
32202 goto gather_gen;
32203 case IX86_BUILTIN_GATHERSIV8SI:
32204 icode = CODE_FOR_avx2_gathersiv8si;
32205 goto gather_gen;
32206 case IX86_BUILTIN_GATHERDIV4SI:
32207 icode = CODE_FOR_avx2_gatherdiv4si;
32208 goto gather_gen;
32209 case IX86_BUILTIN_GATHERDIV8SI:
32210 icode = CODE_FOR_avx2_gatherdiv8si;
32211 goto gather_gen;
32212 case IX86_BUILTIN_GATHERALTSIV4DF:
32213 icode = CODE_FOR_avx2_gathersiv4df;
32214 goto gather_gen;
32215 case IX86_BUILTIN_GATHERALTDIV8SF:
32216 icode = CODE_FOR_avx2_gatherdiv8sf;
32217 goto gather_gen;
32218 case IX86_BUILTIN_GATHERALTSIV4DI:
32219 icode = CODE_FOR_avx2_gathersiv4di;
32220 goto gather_gen;
32221 case IX86_BUILTIN_GATHERALTDIV8SI:
32222 icode = CODE_FOR_avx2_gatherdiv8si;
32223 goto gather_gen;
32225 gather_gen:
32226 arg0 = CALL_EXPR_ARG (exp, 0);
32227 arg1 = CALL_EXPR_ARG (exp, 1);
32228 arg2 = CALL_EXPR_ARG (exp, 2);
32229 arg3 = CALL_EXPR_ARG (exp, 3);
32230 arg4 = CALL_EXPR_ARG (exp, 4);
32231 op0 = expand_normal (arg0);
32232 op1 = expand_normal (arg1);
32233 op2 = expand_normal (arg2);
32234 op3 = expand_normal (arg3);
32235 op4 = expand_normal (arg4);
32236 /* Note the arg order is different from the operand order. */
32237 mode0 = insn_data[icode].operand[1].mode;
32238 mode2 = insn_data[icode].operand[3].mode;
32239 mode3 = insn_data[icode].operand[4].mode;
32240 mode4 = insn_data[icode].operand[5].mode;
32242 if (target == NULL_RTX
32243 || GET_MODE (target) != insn_data[icode].operand[0].mode
32244 || !insn_data[icode].operand[0].predicate (target,
32245 GET_MODE (target)))
32246 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32247 else
32248 subtarget = target;
32250 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32251 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32253 rtx half = gen_reg_rtx (V4SImode);
32254 if (!nonimmediate_operand (op2, V8SImode))
32255 op2 = copy_to_mode_reg (V8SImode, op2);
32256 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32257 op2 = half;
32259 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32260 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32262 rtx (*gen) (rtx, rtx);
32263 rtx half = gen_reg_rtx (mode0);
32264 if (mode0 == V4SFmode)
32265 gen = gen_vec_extract_lo_v8sf;
32266 else
32267 gen = gen_vec_extract_lo_v8si;
32268 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32269 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32270 emit_insn (gen (half, op0));
32271 op0 = half;
32272 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32273 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32274 emit_insn (gen (half, op3));
32275 op3 = half;
32278 /* Force the memory operand to use only a base register here. We
32279 don't want to do this to the memory operand of other builtin
32280 functions. */
32281 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32283 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32284 op0 = copy_to_mode_reg (mode0, op0);
32285 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32286 op1 = copy_to_mode_reg (Pmode, op1);
32287 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32288 op2 = copy_to_mode_reg (mode2, op2);
32289 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32290 op3 = copy_to_mode_reg (mode3, op3);
32291 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32293 error ("the last argument must be a scale of 1, 2, 4 or 8");
32294 return const0_rtx;
32297 /* Optimize. If mask is known to have all high bits set,
32298 replace op0 with pc_rtx to signal that the instruction
32299 overwrites the whole destination and doesn't use its
32300 previous contents. */
32301 if (optimize)
32303 if (TREE_CODE (arg3) == VECTOR_CST)
32305 unsigned int negative = 0;
32306 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32308 tree cst = VECTOR_CST_ELT (arg3, i);
32309 if (TREE_CODE (cst) == INTEGER_CST
32310 && tree_int_cst_sign_bit (cst))
32311 negative++;
32312 else if (TREE_CODE (cst) == REAL_CST
32313 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32314 negative++;
32316 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32317 op0 = pc_rtx;
32319 else if (TREE_CODE (arg3) == SSA_NAME)
32321 /* Recognize also when mask is like:
32322 __v2df src = _mm_setzero_pd ();
32323 __v2df mask = _mm_cmpeq_pd (src, src);
32325 __v8sf src = _mm256_setzero_ps ();
32326 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32327 as that is a cheaper way to load all ones into
32328 a register than having to load a constant from
32329 memory. */
32330 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32331 if (is_gimple_call (def_stmt))
32333 tree fndecl = gimple_call_fndecl (def_stmt);
32334 if (fndecl
32335 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32336 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32338 case IX86_BUILTIN_CMPPD:
32339 case IX86_BUILTIN_CMPPS:
32340 case IX86_BUILTIN_CMPPD256:
32341 case IX86_BUILTIN_CMPPS256:
32342 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32343 break;
32344 /* FALLTHRU */
32345 case IX86_BUILTIN_CMPEQPD:
32346 case IX86_BUILTIN_CMPEQPS:
32347 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32348 && initializer_zerop (gimple_call_arg (def_stmt,
32349 1)))
32350 op0 = pc_rtx;
32351 break;
32352 default:
32353 break;
32359 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32360 if (! pat)
32361 return const0_rtx;
32362 emit_insn (pat);
32364 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32365 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32367 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32368 ? V4SFmode : V4SImode;
32369 if (target == NULL_RTX)
32370 target = gen_reg_rtx (tmode);
32371 if (tmode == V4SFmode)
32372 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32373 else
32374 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32376 else
32377 target = subtarget;
32379 return target;
32381 case IX86_BUILTIN_XABORT:
32382 icode = CODE_FOR_xabort;
32383 arg0 = CALL_EXPR_ARG (exp, 0);
32384 op0 = expand_normal (arg0);
32385 mode0 = insn_data[icode].operand[0].mode;
32386 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32388 error ("the argument to xabort must be an 8-bit immediate");
32389 return const0_rtx;
32391 emit_insn (gen_xabort (op0));
32392 return 0;
32394 default:
32395 break;
32398 for (i = 0, d = bdesc_special_args;
32399 i < ARRAY_SIZE (bdesc_special_args);
32400 i++, d++)
32401 if (d->code == fcode)
32402 return ix86_expand_special_args_builtin (d, exp, target);
32404 for (i = 0, d = bdesc_args;
32405 i < ARRAY_SIZE (bdesc_args);
32406 i++, d++)
32407 if (d->code == fcode)
32408 switch (fcode)
32410 case IX86_BUILTIN_FABSQ:
32411 case IX86_BUILTIN_COPYSIGNQ:
32412 if (!TARGET_SSE)
32413 /* Emit a normal call if SSE isn't available. */
32414 return expand_call (exp, target, ignore);
32415 default:
32416 return ix86_expand_args_builtin (d, exp, target);
32419 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32420 if (d->code == fcode)
32421 return ix86_expand_sse_comi (d, exp, target);
32423 for (i = 0, d = bdesc_pcmpestr;
32424 i < ARRAY_SIZE (bdesc_pcmpestr);
32425 i++, d++)
32426 if (d->code == fcode)
32427 return ix86_expand_sse_pcmpestr (d, exp, target);
32429 for (i = 0, d = bdesc_pcmpistr;
32430 i < ARRAY_SIZE (bdesc_pcmpistr);
32431 i++, d++)
32432 if (d->code == fcode)
32433 return ix86_expand_sse_pcmpistr (d, exp, target);
32435 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32436 if (d->code == fcode)
32437 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32438 (enum ix86_builtin_func_type)
32439 d->flag, d->comparison);
32441 gcc_unreachable ();
32444 /* Returns a function decl for a vectorized version of the builtin function
32445 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32446 if it is not available. */
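/* Example (a sketch of one case handled below): when the vectorizer asks
   about sqrt with V2DF as both the input and output vector type, this
   hook returns the decl for IX86_BUILTIN_SQRTPD, so
     for (i = 0; i < n; i++) b[i] = sqrt (a[i]);
   can be vectorized with sqrtpd.  */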
32448 static tree
32449 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32450 tree type_in)
32452 enum machine_mode in_mode, out_mode;
32453 int in_n, out_n;
32454 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32456 if (TREE_CODE (type_out) != VECTOR_TYPE
32457 || TREE_CODE (type_in) != VECTOR_TYPE
32458 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32459 return NULL_TREE;
32461 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32462 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32463 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32464 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32466 switch (fn)
32468 case BUILT_IN_SQRT:
32469 if (out_mode == DFmode && in_mode == DFmode)
32471 if (out_n == 2 && in_n == 2)
32472 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32473 else if (out_n == 4 && in_n == 4)
32474 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32476 break;
32478 case BUILT_IN_SQRTF:
32479 if (out_mode == SFmode && in_mode == SFmode)
32481 if (out_n == 4 && in_n == 4)
32482 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32483 else if (out_n == 8 && in_n == 8)
32484 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32486 break;
32488 case BUILT_IN_IFLOOR:
32489 case BUILT_IN_LFLOOR:
32490 case BUILT_IN_LLFLOOR:
32491 /* The round insn does not trap on denormals. */
32492 if (flag_trapping_math || !TARGET_ROUND)
32493 break;
32495 if (out_mode == SImode && in_mode == DFmode)
32497 if (out_n == 4 && in_n == 2)
32498 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32499 else if (out_n == 8 && in_n == 4)
32500 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32502 break;
32504 case BUILT_IN_IFLOORF:
32505 case BUILT_IN_LFLOORF:
32506 case BUILT_IN_LLFLOORF:
32507 /* The round insn does not trap on denormals. */
32508 if (flag_trapping_math || !TARGET_ROUND)
32509 break;
32511 if (out_mode == SImode && in_mode == SFmode)
32513 if (out_n == 4 && in_n == 4)
32514 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32515 else if (out_n == 8 && in_n == 8)
32516 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32518 break;
32520 case BUILT_IN_ICEIL:
32521 case BUILT_IN_LCEIL:
32522 case BUILT_IN_LLCEIL:
32523 /* The round insn does not trap on denormals. */
32524 if (flag_trapping_math || !TARGET_ROUND)
32525 break;
32527 if (out_mode == SImode && in_mode == DFmode)
32529 if (out_n == 4 && in_n == 2)
32530 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32531 else if (out_n == 8 && in_n == 4)
32532 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32534 break;
32536 case BUILT_IN_ICEILF:
32537 case BUILT_IN_LCEILF:
32538 case BUILT_IN_LLCEILF:
32539 /* The round insn does not trap on denormals. */
32540 if (flag_trapping_math || !TARGET_ROUND)
32541 break;
32543 if (out_mode == SImode && in_mode == SFmode)
32545 if (out_n == 4 && in_n == 4)
32546 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32547 else if (out_n == 8 && in_n == 8)
32548 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32550 break;
32552 case BUILT_IN_IRINT:
32553 case BUILT_IN_LRINT:
32554 case BUILT_IN_LLRINT:
32555 if (out_mode == SImode && in_mode == DFmode)
32557 if (out_n == 4 && in_n == 2)
32558 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32559 else if (out_n == 8 && in_n == 4)
32560 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32562 break;
32564 case BUILT_IN_IRINTF:
32565 case BUILT_IN_LRINTF:
32566 case BUILT_IN_LLRINTF:
32567 if (out_mode == SImode && in_mode == SFmode)
32569 if (out_n == 4 && in_n == 4)
32570 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32571 else if (out_n == 8 && in_n == 8)
32572 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32574 break;
32576 case BUILT_IN_IROUND:
32577 case BUILT_IN_LROUND:
32578 case BUILT_IN_LLROUND:
32579 /* The round insn does not trap on denormals. */
32580 if (flag_trapping_math || !TARGET_ROUND)
32581 break;
32583 if (out_mode == SImode && in_mode == DFmode)
32585 if (out_n == 4 && in_n == 2)
32586 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32587 else if (out_n == 8 && in_n == 4)
32588 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32590 break;
32592 case BUILT_IN_IROUNDF:
32593 case BUILT_IN_LROUNDF:
32594 case BUILT_IN_LLROUNDF:
32595 /* The round insn does not trap on denormals. */
32596 if (flag_trapping_math || !TARGET_ROUND)
32597 break;
32599 if (out_mode == SImode && in_mode == SFmode)
32601 if (out_n == 4 && in_n == 4)
32602 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32603 else if (out_n == 8 && in_n == 8)
32604 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32606 break;
32608 case BUILT_IN_COPYSIGN:
32609 if (out_mode == DFmode && in_mode == DFmode)
32611 if (out_n == 2 && in_n == 2)
32612 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32613 else if (out_n == 4 && in_n == 4)
32614 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32616 break;
32618 case BUILT_IN_COPYSIGNF:
32619 if (out_mode == SFmode && in_mode == SFmode)
32621 if (out_n == 4 && in_n == 4)
32622 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32623 else if (out_n == 8 && in_n == 8)
32624 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32626 break;
32628 case BUILT_IN_FLOOR:
32629 /* The round insn does not trap on denormals. */
32630 if (flag_trapping_math || !TARGET_ROUND)
32631 break;
32633 if (out_mode == DFmode && in_mode == DFmode)
32635 if (out_n == 2 && in_n == 2)
32636 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32637 else if (out_n == 4 && in_n == 4)
32638 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32640 break;
32642 case BUILT_IN_FLOORF:
32643 /* The round insn does not trap on denormals. */
32644 if (flag_trapping_math || !TARGET_ROUND)
32645 break;
32647 if (out_mode == SFmode && in_mode == SFmode)
32649 if (out_n == 4 && in_n == 4)
32650 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32651 else if (out_n == 8 && in_n == 8)
32652 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32654 break;
32656 case BUILT_IN_CEIL:
32657 /* The round insn does not trap on denormals. */
32658 if (flag_trapping_math || !TARGET_ROUND)
32659 break;
32661 if (out_mode == DFmode && in_mode == DFmode)
32663 if (out_n == 2 && in_n == 2)
32664 return ix86_builtins[IX86_BUILTIN_CEILPD];
32665 else if (out_n == 4 && in_n == 4)
32666 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32668 break;
32670 case BUILT_IN_CEILF:
32671 /* The round insn does not trap on denormals. */
32672 if (flag_trapping_math || !TARGET_ROUND)
32673 break;
32675 if (out_mode == SFmode && in_mode == SFmode)
32677 if (out_n == 4 && in_n == 4)
32678 return ix86_builtins[IX86_BUILTIN_CEILPS];
32679 else if (out_n == 8 && in_n == 8)
32680 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32682 break;
32684 case BUILT_IN_TRUNC:
32685 /* The round insn does not trap on denormals. */
32686 if (flag_trapping_math || !TARGET_ROUND)
32687 break;
32689 if (out_mode == DFmode && in_mode == DFmode)
32691 if (out_n == 2 && in_n == 2)
32692 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32693 else if (out_n == 4 && in_n == 4)
32694 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32696 break;
32698 case BUILT_IN_TRUNCF:
32699 /* The round insn does not trap on denormals. */
32700 if (flag_trapping_math || !TARGET_ROUND)
32701 break;
32703 if (out_mode == SFmode && in_mode == SFmode)
32705 if (out_n == 4 && in_n == 4)
32706 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32707 else if (out_n == 8 && in_n == 8)
32708 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32710 break;
32712 case BUILT_IN_RINT:
32713 /* The round insn does not trap on denormals. */
32714 if (flag_trapping_math || !TARGET_ROUND)
32715 break;
32717 if (out_mode == DFmode && in_mode == DFmode)
32719 if (out_n == 2 && in_n == 2)
32720 return ix86_builtins[IX86_BUILTIN_RINTPD];
32721 else if (out_n == 4 && in_n == 4)
32722 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32724 break;
32726 case BUILT_IN_RINTF:
32727 /* The round insn does not trap on denormals. */
32728 if (flag_trapping_math || !TARGET_ROUND)
32729 break;
32731 if (out_mode == SFmode && in_mode == SFmode)
32733 if (out_n == 4 && in_n == 4)
32734 return ix86_builtins[IX86_BUILTIN_RINTPS];
32735 else if (out_n == 8 && in_n == 8)
32736 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32738 break;
32740 case BUILT_IN_ROUND:
32741 /* The round insn does not trap on denormals. */
32742 if (flag_trapping_math || !TARGET_ROUND)
32743 break;
32745 if (out_mode == DFmode && in_mode == DFmode)
32747 if (out_n == 2 && in_n == 2)
32748 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32749 else if (out_n == 4 && in_n == 4)
32750 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32752 break;
32754 case BUILT_IN_ROUNDF:
32755 /* The round insn does not trap on denormals. */
32756 if (flag_trapping_math || !TARGET_ROUND)
32757 break;
32759 if (out_mode == SFmode && in_mode == SFmode)
32761 if (out_n == 4 && in_n == 4)
32762 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32763 else if (out_n == 8 && in_n == 8)
32764 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32766 break;
32768 case BUILT_IN_FMA:
32769 if (out_mode == DFmode && in_mode == DFmode)
32771 if (out_n == 2 && in_n == 2)
32772 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32773 if (out_n == 4 && in_n == 4)
32774 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32776 break;
32778 case BUILT_IN_FMAF:
32779 if (out_mode == SFmode && in_mode == SFmode)
32781 if (out_n == 4 && in_n == 4)
32782 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32783 if (out_n == 8 && in_n == 8)
32784 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32786 break;
32788 default:
32789 break;
32792 /* Dispatch to a handler for a vectorization library. */
32793 if (ix86_veclib_handler)
32794 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32795 type_in);
32797 return NULL_TREE;
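/* A concrete reading of the table above: a vectorizer query for
   BUILT_IN_SQRT with both TYPE_OUT and TYPE_IN being a V2DF vector type
   resolves to IX86_BUILTIN_SQRTPD, and the V4DF variant to
   IX86_BUILTIN_SQRTPD256; anything not listed falls through to the
   veclib handler below or to NULL_TREE.  */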
32800 /* Handler for an SVML-style interface to
32801 a library with vectorized intrinsics. */
32803 static tree
32804 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32806 char name[20];
32807 tree fntype, new_fndecl, args;
32808 unsigned arity;
32809 const char *bname;
32810 enum machine_mode el_mode, in_mode;
32811 int n, in_n;
32813 /* The SVML is suitable for unsafe math only. */
32814 if (!flag_unsafe_math_optimizations)
32815 return NULL_TREE;
32817 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32818 n = TYPE_VECTOR_SUBPARTS (type_out);
32819 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32820 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32821 if (el_mode != in_mode
32822 || n != in_n)
32823 return NULL_TREE;
32825 switch (fn)
32827 case BUILT_IN_EXP:
32828 case BUILT_IN_LOG:
32829 case BUILT_IN_LOG10:
32830 case BUILT_IN_POW:
32831 case BUILT_IN_TANH:
32832 case BUILT_IN_TAN:
32833 case BUILT_IN_ATAN:
32834 case BUILT_IN_ATAN2:
32835 case BUILT_IN_ATANH:
32836 case BUILT_IN_CBRT:
32837 case BUILT_IN_SINH:
32838 case BUILT_IN_SIN:
32839 case BUILT_IN_ASINH:
32840 case BUILT_IN_ASIN:
32841 case BUILT_IN_COSH:
32842 case BUILT_IN_COS:
32843 case BUILT_IN_ACOSH:
32844 case BUILT_IN_ACOS:
32845 if (el_mode != DFmode || n != 2)
32846 return NULL_TREE;
32847 break;
32849 case BUILT_IN_EXPF:
32850 case BUILT_IN_LOGF:
32851 case BUILT_IN_LOG10F:
32852 case BUILT_IN_POWF:
32853 case BUILT_IN_TANHF:
32854 case BUILT_IN_TANF:
32855 case BUILT_IN_ATANF:
32856 case BUILT_IN_ATAN2F:
32857 case BUILT_IN_ATANHF:
32858 case BUILT_IN_CBRTF:
32859 case BUILT_IN_SINHF:
32860 case BUILT_IN_SINF:
32861 case BUILT_IN_ASINHF:
32862 case BUILT_IN_ASINF:
32863 case BUILT_IN_COSHF:
32864 case BUILT_IN_COSF:
32865 case BUILT_IN_ACOSHF:
32866 case BUILT_IN_ACOSF:
32867 if (el_mode != SFmode || n != 4)
32868 return NULL_TREE;
32869 break;
32871 default:
32872 return NULL_TREE;
32875 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32877 if (fn == BUILT_IN_LOGF)
32878 strcpy (name, "vmlsLn4");
32879 else if (fn == BUILT_IN_LOG)
32880 strcpy (name, "vmldLn2");
32881 else if (n == 4)
32883 sprintf (name, "vmls%s", bname+10);
32884 name[strlen (name)-1] = '4';
32886 else
32887 sprintf (name, "vmld%s2", bname+10);
32889 /* Convert to uppercase. */
32890 name[4] &= ~0x20;
32892 arity = 0;
32893 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32894 args;
32895 args = TREE_CHAIN (args))
32896 arity++;
32898 if (arity == 1)
32899 fntype = build_function_type_list (type_out, type_in, NULL);
32900 else
32901 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32903 /* Build a function declaration for the vectorized function. */
32904 new_fndecl = build_decl (BUILTINS_LOCATION,
32905 FUNCTION_DECL, get_identifier (name), fntype);
32906 TREE_PUBLIC (new_fndecl) = 1;
32907 DECL_EXTERNAL (new_fndecl) = 1;
32908 DECL_IS_NOVOPS (new_fndecl) = 1;
32909 TREE_READONLY (new_fndecl) = 1;
32911 return new_fndecl;
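/* A minimal, standalone sketch of the SVML name mangling performed
   above.  It assumes only that scalar builtin names begin with
   "__builtin_" (the reason for the "bname + 10" above); BUILT_IN_LOG and
   BUILT_IN_LOGF are special-cased to the "Ln" spelling and are not
   reproduced here.  The helper name and driver values are illustrative.  */
#include <stdio.h>
#include <string.h>

static void
svml_mangle_sketch (const char *bname, int n, char *out)
{
  const char *base = bname + 10;	/* Skip "__builtin_".  */

  if (n == 4)				/* Four SFmode lanes.  */
    {
      sprintf (out, "vmls%s", base);
      out[strlen (out) - 1] = '4';	/* Replace the trailing 'f'.  */
    }
  else					/* Two DFmode lanes.  */
    sprintf (out, "vmld%s2", base);

  out[4] &= ~0x20;			/* Upper-case the first letter.  */
}

int
main (void)
{
  char name[20];

  svml_mangle_sketch ("__builtin_sinf", 4, name);
  printf ("%s\n", name);		/* Prints "vmlsSin4".  */
  svml_mangle_sketch ("__builtin_pow", 2, name);
  printf ("%s\n", name);		/* Prints "vmldPow2".  */
  return 0;
}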
32914 /* Handler for an ACML-style interface to
32915 a library with vectorized intrinsics. */
32917 static tree
32918 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32920 char name[20] = "__vr.._";
32921 tree fntype, new_fndecl, args;
32922 unsigned arity;
32923 const char *bname;
32924 enum machine_mode el_mode, in_mode;
32925 int n, in_n;
32927 /* The ACML is 64bits only and suitable for unsafe math only as
32928 it does not correctly support parts of IEEE with the required
32929 precision such as denormals. */
32930 if (!TARGET_64BIT
32931 || !flag_unsafe_math_optimizations)
32932 return NULL_TREE;
32934 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32935 n = TYPE_VECTOR_SUBPARTS (type_out);
32936 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32937 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32938 if (el_mode != in_mode
32939 || n != in_n)
32940 return NULL_TREE;
32942 switch (fn)
32944 case BUILT_IN_SIN:
32945 case BUILT_IN_COS:
32946 case BUILT_IN_EXP:
32947 case BUILT_IN_LOG:
32948 case BUILT_IN_LOG2:
32949 case BUILT_IN_LOG10:
32950 name[4] = 'd';
32951 name[5] = '2';
32952 if (el_mode != DFmode
32953 || n != 2)
32954 return NULL_TREE;
32955 break;
32957 case BUILT_IN_SINF:
32958 case BUILT_IN_COSF:
32959 case BUILT_IN_EXPF:
32960 case BUILT_IN_POWF:
32961 case BUILT_IN_LOGF:
32962 case BUILT_IN_LOG2F:
32963 case BUILT_IN_LOG10F:
32964 name[4] = 's';
32965 name[5] = '4';
32966 if (el_mode != SFmode
32967 || n != 4)
32968 return NULL_TREE;
32969 break;
32971 default:
32972 return NULL_TREE;
32975 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32976 sprintf (name + 7, "%s", bname+10);
32978 arity = 0;
32979 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32980 args;
32981 args = TREE_CHAIN (args))
32982 arity++;
32984 if (arity == 1)
32985 fntype = build_function_type_list (type_out, type_in, NULL);
32986 else
32987 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32989 /* Build a function declaration for the vectorized function. */
32990 new_fndecl = build_decl (BUILTINS_LOCATION,
32991 FUNCTION_DECL, get_identifier (name), fntype);
32992 TREE_PUBLIC (new_fndecl) = 1;
32993 DECL_EXTERNAL (new_fndecl) = 1;
32994 DECL_IS_NOVOPS (new_fndecl) = 1;
32995 TREE_READONLY (new_fndecl) = 1;
32997 return new_fndecl;
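/* Likewise a small standalone sketch of the ACML spelling built above:
   the fixed "__vr.._" template gets 'd'/'2' or 's'/'4' patched in, and
   the scalar builtin name minus its "__builtin_" prefix is appended.
   The helper name is illustrative only.  */
#include <stdio.h>

static void
acml_mangle_sketch (const char *bname, int dfmode, char *out)
{
  sprintf (out, "__vr%c%c_%s",
	   dfmode ? 'd' : 's', dfmode ? '2' : '4',
	   bname + 10);			/* Skip "__builtin_".  */
}

/* acml_mangle_sketch ("__builtin_sin", 1, buf)  yields "__vrd2_sin";
   acml_mangle_sketch ("__builtin_cosf", 0, buf) yields "__vrs4_cosf".  */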
33000 /* Returns a decl of a function that implements gather load with
33001 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
33002 Return NULL_TREE if it is not available. */
33004 static tree
33005 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33006 const_tree index_type, int scale)
33008 bool si;
33009 enum ix86_builtins code;
33011 if (! TARGET_AVX2)
33012 return NULL_TREE;
33014 if ((TREE_CODE (index_type) != INTEGER_TYPE
33015 && !POINTER_TYPE_P (index_type))
33016 || (TYPE_MODE (index_type) != SImode
33017 && TYPE_MODE (index_type) != DImode))
33018 return NULL_TREE;
33020 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33021 return NULL_TREE;
33023 /* The v*gather* insns sign-extend the index to pointer mode. */
33024 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33025 && TYPE_UNSIGNED (index_type))
33026 return NULL_TREE;
33028 if (scale <= 0
33029 || scale > 8
33030 || (scale & (scale - 1)) != 0)
33031 return NULL_TREE;
33033 si = TYPE_MODE (index_type) == SImode;
33034 switch (TYPE_MODE (mem_vectype))
33036 case V2DFmode:
33037 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33038 break;
33039 case V4DFmode:
33040 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33041 break;
33042 case V2DImode:
33043 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33044 break;
33045 case V4DImode:
33046 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33047 break;
33048 case V4SFmode:
33049 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33050 break;
33051 case V8SFmode:
33052 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33053 break;
33054 case V4SImode:
33055 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33056 break;
33057 case V8SImode:
33058 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33059 break;
33060 default:
33061 return NULL_TREE;
33064 return ix86_builtins[code];
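/* The SCALE accepted above is exactly 1, 2, 4 or 8; the standalone helper
   below restates that power-of-two test for clarity and is not called
   from anywhere in this file.  */
static int
gather_scale_ok_sketch (int scale)
{
  return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
}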
33067 /* Returns a code for a target-specific builtin that implements
33068 reciprocal of the function, or NULL_TREE if not available. */
33070 static tree
33071 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33072 bool sqrt ATTRIBUTE_UNUSED)
33074 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33075 && flag_finite_math_only && !flag_trapping_math
33076 && flag_unsafe_math_optimizations))
33077 return NULL_TREE;
33079 if (md_fn)
33080 /* Machine dependent builtins. */
33081 switch (fn)
33083 /* Vectorized version of sqrt to rsqrt conversion. */
33084 case IX86_BUILTIN_SQRTPS_NR:
33085 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33087 case IX86_BUILTIN_SQRTPS_NR256:
33088 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33090 default:
33091 return NULL_TREE;
33093 else
33094 /* Normal builtins. */
33095 switch (fn)
33097 /* Sqrt to rsqrt conversion. */
33098 case BUILT_IN_SQRTF:
33099 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33101 default:
33102 return NULL_TREE;
33106 /* Helper for avx_vpermilps256_operand et al. This is also used by
33107 the expansion functions to turn the parallel back into a mask.
33108 The return value is 0 for no match and the imm8+1 for a match. */
33111 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33113 unsigned i, nelt = GET_MODE_NUNITS (mode);
33114 unsigned mask = 0;
33115 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33117 if (XVECLEN (par, 0) != (int) nelt)
33118 return 0;
33120 /* Validate that all of the elements are constants, and not totally
33121 out of range. Copy the data into an integral array to make the
33122 subsequent checks easier. */
33123 for (i = 0; i < nelt; ++i)
33125 rtx er = XVECEXP (par, 0, i);
33126 unsigned HOST_WIDE_INT ei;
33128 if (!CONST_INT_P (er))
33129 return 0;
33130 ei = INTVAL (er);
33131 if (ei >= nelt)
33132 return 0;
33133 ipar[i] = ei;
33136 switch (mode)
33138 case V4DFmode:
33139 /* In the 256-bit DFmode case, we can only move elements within
33140 a 128-bit lane. */
33141 for (i = 0; i < 2; ++i)
33143 if (ipar[i] >= 2)
33144 return 0;
33145 mask |= ipar[i] << i;
33147 for (i = 2; i < 4; ++i)
33149 if (ipar[i] < 2)
33150 return 0;
33151 mask |= (ipar[i] - 2) << i;
33153 break;
33155 case V8SFmode:
33156 /* In the 256-bit SFmode case, we have full freedom of movement
33157 within the low 128-bit lane, but the high 128-bit lane must
33158 mirror the exact same pattern. */
33159 for (i = 0; i < 4; ++i)
33160 if (ipar[i] + 4 != ipar[i + 4])
33161 return 0;
33162 nelt = 4;
33163 /* FALLTHRU */
33165 case V2DFmode:
33166 case V4SFmode:
33167 /* In the 128-bit case, we've full freedom in the placement of
33168 the elements from the source operand. */
33169 for (i = 0; i < nelt; ++i)
33170 mask |= ipar[i] << (i * (nelt / 2));
33171 break;
33173 default:
33174 gcc_unreachable ();
33177 /* Make sure success has a non-zero value by adding one. */
33178 return mask + 1;
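/* A standalone sketch of the V8SFmode case above: the high 128-bit lane
   must repeat the low-lane pattern offset by 4, and the resulting imm8
   packs the four low-lane selectors two bits each.  SEL holds the eight
   element indices; the return value mirrors the function above
   (imm8 + 1, or 0 for no match).  Illustrative only.  */
static int
vpermilps256_imm8_sketch (const unsigned char sel[8])
{
  unsigned mask = 0;
  int i;

  for (i = 0; i < 4; ++i)
    if (sel[i] > 3 || sel[i] + 4 != sel[i + 4])
      return 0;

  for (i = 0; i < 4; ++i)
    mask |= sel[i] << (i * 2);

  return mask + 1;
}

/* E.g. sel = {1,0,3,2, 5,4,7,6} (swap adjacent elements) gives imm8 0xb1,
   so the sketch returns 0xb2.  */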
33181 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33182 the expansion functions to turn the parallel back into a mask.
33183 The return value is 0 for no match and the imm8+1 for a match. */
33186 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33188 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33189 unsigned mask = 0;
33190 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33192 if (XVECLEN (par, 0) != (int) nelt)
33193 return 0;
33195 /* Validate that all of the elements are constants, and not totally
33196 out of range. Copy the data into an integral array to make the
33197 subsequent checks easier. */
33198 for (i = 0; i < nelt; ++i)
33200 rtx er = XVECEXP (par, 0, i);
33201 unsigned HOST_WIDE_INT ei;
33203 if (!CONST_INT_P (er))
33204 return 0;
33205 ei = INTVAL (er);
33206 if (ei >= 2 * nelt)
33207 return 0;
33208 ipar[i] = ei;
33211 /* Validate that each half of the permute selects consecutive elements, i.e. one whole half of a source operand. */
33212 for (i = 0; i < nelt2 - 1; ++i)
33213 if (ipar[i] + 1 != ipar[i + 1])
33214 return 0;
33215 for (i = nelt2; i < nelt - 1; ++i)
33216 if (ipar[i] + 1 != ipar[i + 1])
33217 return 0;
33219 /* Reconstruct the mask. */
33220 for (i = 0; i < 2; ++i)
33222 unsigned e = ipar[i * nelt2];
33223 if (e % nelt2)
33224 return 0;
33225 e /= nelt2;
33226 mask |= e << (i * 4);
33229 /* Make sure success has a non-zero value by adding one. */
33230 return mask + 1;
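/* A standalone sketch of the V4DFmode case above: each output half must
   be one whole 128-bit source lane (an aligned, consecutive pair of the
   eight double elements across both operands), and the chosen lane
   numbers form the low and high nibbles of the imm8.  Returns imm8 + 1
   as above, or 0.  Illustrative only.  */
static int
vperm2f128_v4df_imm8_sketch (const unsigned char sel[4])
{
  unsigned mask = 0;
  int half;

  for (half = 0; half < 2; ++half)
    {
      unsigned e = sel[half * 2];

      if (e >= 8 || (e & 1) || sel[half * 2 + 1] != e + 1)
	return 0;
      mask |= (e / 2) << (half * 4);
    }
  return mask + 1;
}

/* E.g. sel = {4,5,2,3} selects the low lane of the second operand and the
   high lane of the first, i.e. imm8 0x12; the sketch returns 0x13.  */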
33233 /* Store OPERAND to the memory after reload is completed. This means
33234 that we can't easily use assign_stack_local. */
33236 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33238 rtx result;
33240 gcc_assert (reload_completed);
33241 if (ix86_using_red_zone ())
33243 result = gen_rtx_MEM (mode,
33244 gen_rtx_PLUS (Pmode,
33245 stack_pointer_rtx,
33246 GEN_INT (-RED_ZONE_SIZE)));
33247 emit_move_insn (result, operand);
33249 else if (TARGET_64BIT)
33251 switch (mode)
33253 case HImode:
33254 case SImode:
33255 operand = gen_lowpart (DImode, operand);
33256 /* FALLTHRU */
33257 case DImode:
33258 emit_insn (
33259 gen_rtx_SET (VOIDmode,
33260 gen_rtx_MEM (DImode,
33261 gen_rtx_PRE_DEC (DImode,
33262 stack_pointer_rtx)),
33263 operand));
33264 break;
33265 default:
33266 gcc_unreachable ();
33268 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33270 else
33272 switch (mode)
33274 case DImode:
33276 rtx operands[2];
33277 split_double_mode (mode, &operand, 1, operands, operands + 1);
33278 emit_insn (
33279 gen_rtx_SET (VOIDmode,
33280 gen_rtx_MEM (SImode,
33281 gen_rtx_PRE_DEC (Pmode,
33282 stack_pointer_rtx)),
33283 operands[1]));
33284 emit_insn (
33285 gen_rtx_SET (VOIDmode,
33286 gen_rtx_MEM (SImode,
33287 gen_rtx_PRE_DEC (Pmode,
33288 stack_pointer_rtx)),
33289 operands[0]));
33291 break;
33292 case HImode:
33293 /* Store HImodes as SImodes. */
33294 operand = gen_lowpart (SImode, operand);
33295 /* FALLTHRU */
33296 case SImode:
33297 emit_insn (
33298 gen_rtx_SET (VOIDmode,
33299 gen_rtx_MEM (GET_MODE (operand),
33300 gen_rtx_PRE_DEC (SImode,
33301 stack_pointer_rtx)),
33302 operand));
33303 break;
33304 default:
33305 gcc_unreachable ();
33307 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33309 return result;
33312 /* Free operand from the memory. */
33313 void
33314 ix86_free_from_memory (enum machine_mode mode)
33316 if (!ix86_using_red_zone ())
33318 int size;
33320 if (mode == DImode || TARGET_64BIT)
33321 size = 8;
33322 else
33323 size = 4;
33324 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33325 to a pop or add instruction if registers are available. */
33326 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33327 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33328 GEN_INT (size))));
33332 /* Return a register priority for hard reg REGNO. */
33333 static int
33334 ix86_register_priority (int hard_regno)
33336 /* ebp and r13 as the base always want a displacement, while r12 as the
33337 base always wants an index. So discourage their usage in an
33338 address. */
33339 if (hard_regno == R12_REG || hard_regno == R13_REG)
33340 return 0;
33341 if (hard_regno == BP_REG)
33342 return 1;
33343 /* New x86-64 int registers result in bigger code size. Discourage
33344 them. */
33345 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33346 return 2;
33347 /* New x86-64 SSE registers result in bigger code size. Discourage
33348 them. */
33349 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33350 return 2;
33351 /* Usage of AX register results in smaller code. Prefer it. */
33352 if (hard_regno == 0)
33353 return 4;
33354 return 3;
33357 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33359 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33360 QImode must go into class Q_REGS.
33361 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33362 movdf to do mem-to-mem moves through integer regs. */
33364 static reg_class_t
33365 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33367 enum machine_mode mode = GET_MODE (x);
33369 /* We're only allowed to return a subclass of CLASS. Many of the
33370 following checks fail for NO_REGS, so eliminate that early. */
33371 if (regclass == NO_REGS)
33372 return NO_REGS;
33374 /* All classes can load zeros. */
33375 if (x == CONST0_RTX (mode))
33376 return regclass;
33378 /* Force constants into memory if we are loading a (nonzero) constant into
33379 an MMX or SSE register. This is because there are no MMX/SSE instructions
33380 to load from a constant. */
33381 if (CONSTANT_P (x)
33382 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33383 return NO_REGS;
33385 /* Prefer SSE regs only, if we can use them for math. */
33386 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33387 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33389 /* Floating-point constants need more complex checks. */
33390 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33392 /* General regs can load everything. */
33393 if (reg_class_subset_p (regclass, GENERAL_REGS))
33394 return regclass;
33396 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33397 zero above. We only want to wind up preferring 80387 registers if
33398 we plan on doing computation with them. */
33399 if (TARGET_80387
33400 && standard_80387_constant_p (x) > 0)
33402 /* Limit class to non-sse. */
33403 if (regclass == FLOAT_SSE_REGS)
33404 return FLOAT_REGS;
33405 if (regclass == FP_TOP_SSE_REGS)
33406 return FP_TOP_REG;
33407 if (regclass == FP_SECOND_SSE_REGS)
33408 return FP_SECOND_REG;
33409 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33410 return regclass;
33413 return NO_REGS;
33416 /* Generally when we see PLUS here, it's the function invariant
33417 (plus soft-fp const_int). Which can only be computed into general
33418 regs. */
33419 if (GET_CODE (x) == PLUS)
33420 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33422 /* QImode constants are easy to load, but non-constant QImode data
33423 must go into Q_REGS. */
33424 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33426 if (reg_class_subset_p (regclass, Q_REGS))
33427 return regclass;
33428 if (reg_class_subset_p (Q_REGS, regclass))
33429 return Q_REGS;
33430 return NO_REGS;
33433 return regclass;
33436 /* Discourage putting floating-point values in SSE registers unless
33437 SSE math is being used, and likewise for the 387 registers. */
33438 static reg_class_t
33439 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33441 enum machine_mode mode = GET_MODE (x);
33443 /* Restrict the output reload class to the register bank that we are doing
33444 math on. If we would like not to return a subset of CLASS, reject this
33445 alternative: if reload cannot do this, it will still use its choice. */
33446 mode = GET_MODE (x);
33447 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33448 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33450 if (X87_FLOAT_MODE_P (mode))
33452 if (regclass == FP_TOP_SSE_REGS)
33453 return FP_TOP_REG;
33454 else if (regclass == FP_SECOND_SSE_REGS)
33455 return FP_SECOND_REG;
33456 else
33457 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33460 return regclass;
33463 static reg_class_t
33464 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33465 enum machine_mode mode, secondary_reload_info *sri)
33467 /* Double-word spills from general registers to non-offsettable memory
33468 references (zero-extended addresses) require special handling. */
33469 if (TARGET_64BIT
33470 && MEM_P (x)
33471 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33472 && rclass == GENERAL_REGS
33473 && !offsettable_memref_p (x))
33475 sri->icode = (in_p
33476 ? CODE_FOR_reload_noff_load
33477 : CODE_FOR_reload_noff_store);
33478 /* Add the cost of moving address to a temporary. */
33479 sri->extra_cost = 1;
33481 return NO_REGS;
33484 /* QImode spills from non-QI registers require
33485 intermediate register on 32bit targets. */
33486 if (!TARGET_64BIT
33487 && !in_p && mode == QImode
33488 && (rclass == GENERAL_REGS
33489 || rclass == LEGACY_REGS
33490 || rclass == NON_Q_REGS
33491 || rclass == SIREG
33492 || rclass == DIREG
33493 || rclass == INDEX_REGS))
33495 int regno;
33497 if (REG_P (x))
33498 regno = REGNO (x);
33499 else
33500 regno = -1;
33502 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33503 regno = true_regnum (x);
33505 /* Return Q_REGS if the operand is in memory. */
33506 if (regno == -1)
33507 return Q_REGS;
33510 /* This condition handles the corner case where an expression involving
33511 pointers gets vectorized. We're trying to use the address of a
33512 stack slot as a vector initializer.
33514 (set (reg:V2DI 74 [ vect_cst_.2 ])
33515 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33517 Eventually frame gets turned into sp+offset like this:
33519 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33520 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33521 (const_int 392 [0x188]))))
33523 That later gets turned into:
33525 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33526 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33527 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33529 We'll have the following reload recorded:
33531 Reload 0: reload_in (DI) =
33532 (plus:DI (reg/f:DI 7 sp)
33533 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33534 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33535 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33536 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33537 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33538 reload_reg_rtx: (reg:V2DI 22 xmm1)
33540 Which isn't going to work since SSE instructions can't handle scalar
33541 additions. Returning GENERAL_REGS forces the addition into integer
33542 register and reload can handle subsequent reloads without problems. */
33544 if (in_p && GET_CODE (x) == PLUS
33545 && SSE_CLASS_P (rclass)
33546 && SCALAR_INT_MODE_P (mode))
33547 return GENERAL_REGS;
33549 return NO_REGS;
33552 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33554 static bool
33555 ix86_class_likely_spilled_p (reg_class_t rclass)
33557 switch (rclass)
33559 case AREG:
33560 case DREG:
33561 case CREG:
33562 case BREG:
33563 case AD_REGS:
33564 case SIREG:
33565 case DIREG:
33566 case SSE_FIRST_REG:
33567 case FP_TOP_REG:
33568 case FP_SECOND_REG:
33569 return true;
33571 default:
33572 break;
33575 return false;
33578 /* If we are copying between general and FP registers, we need a memory
33579 location. The same is true for SSE and MMX registers.
33581 To optimize register_move_cost performance, allow inline variant.
33583 The macro can't work reliably when one of the CLASSES is a class containing
33584 registers from multiple units (SSE, MMX, integer). We avoid this by never
33585 combining those units in single alternative in the machine description.
33586 Ensure that this constraint holds to avoid unexpected surprises.
33588 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33589 enforce these sanity checks. */
33591 static inline bool
33592 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33593 enum machine_mode mode, int strict)
33595 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33596 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33597 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33598 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33599 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33600 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33602 gcc_assert (!strict || lra_in_progress);
33603 return true;
33606 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33607 return true;
33609 /* ??? This is a lie. We do have moves between mmx/general, and between
33610 mmx/sse2. But by saying we need secondary memory we discourage the
33611 register allocator from using the mmx registers unless needed. */
33612 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33613 return true;
33615 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33617 /* SSE1 doesn't have any direct moves from other classes. */
33618 if (!TARGET_SSE2)
33619 return true;
33621 /* If the target says that inter-unit moves are more expensive
33622 than moving through memory, then don't generate them. */
33623 if (!TARGET_INTER_UNIT_MOVES)
33624 return true;
33626 /* Between SSE and general, we have moves no larger than word size. */
33627 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33628 return true;
33631 return false;
33634 bool
33635 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33636 enum machine_mode mode, int strict)
33638 return inline_secondary_memory_needed (class1, class2, mode, strict);
33641 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33643 On the 80386, this is the size of MODE in words,
33644 except in the FP regs, where a single reg is always enough. */
33646 static unsigned char
33647 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33649 if (MAYBE_INTEGER_CLASS_P (rclass))
33651 if (mode == XFmode)
33652 return (TARGET_64BIT ? 2 : 3);
33653 else if (mode == XCmode)
33654 return (TARGET_64BIT ? 4 : 6);
33655 else
33656 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33658 else
33660 if (COMPLEX_MODE_P (mode))
33661 return 2;
33662 else
33663 return 1;
33667 /* Return true if the registers in CLASS cannot represent the change from
33668 modes FROM to TO. */
33670 bool
33671 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33672 enum reg_class regclass)
33674 if (from == to)
33675 return false;
33677 /* x87 registers can't do subreg at all, as all values are reformatted
33678 to extended precision. */
33679 if (MAYBE_FLOAT_CLASS_P (regclass))
33680 return true;
33682 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33684 /* Vector registers do not support QI or HImode loads. If we don't
33685 disallow a change to these modes, reload will assume it's ok to
33686 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33687 the vec_dupv4hi pattern. */
33688 if (GET_MODE_SIZE (from) < 4)
33689 return true;
33691 /* Vector registers do not support subreg with nonzero offsets, which
33692 are otherwise valid for integer registers. Since we can't see
33693 whether we have a nonzero offset from here, prohibit all
33694 nonparadoxical subregs changing size. */
33695 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33696 return true;
33699 return false;
33702 /* Return the cost of moving data of mode M between a
33703 register and memory. A value of 2 is the default; this cost is
33704 relative to those in `REGISTER_MOVE_COST'.
33706 This function is used extensively by register_move_cost, which is used to
33707 build tables at startup. Make it inline in this case.
33708 When IN is 2, return the maximum of the in and out move costs.
33710 If moving between registers and memory is more expensive than
33711 between two registers, you should define this macro to express the
33712 relative cost.
33714 Also model the increased cost of moving QImode registers in non-Q_REGS
33715 classes.
33717 static inline int
33718 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33719 int in)
33721 int cost;
33722 if (FLOAT_CLASS_P (regclass))
33724 int index;
33725 switch (mode)
33727 case SFmode:
33728 index = 0;
33729 break;
33730 case DFmode:
33731 index = 1;
33732 break;
33733 case XFmode:
33734 index = 2;
33735 break;
33736 default:
33737 return 100;
33739 if (in == 2)
33740 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33741 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33743 if (SSE_CLASS_P (regclass))
33745 int index;
33746 switch (GET_MODE_SIZE (mode))
33748 case 4:
33749 index = 0;
33750 break;
33751 case 8:
33752 index = 1;
33753 break;
33754 case 16:
33755 index = 2;
33756 break;
33757 default:
33758 return 100;
33760 if (in == 2)
33761 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33762 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33764 if (MMX_CLASS_P (regclass))
33766 int index;
33767 switch (GET_MODE_SIZE (mode))
33769 case 4:
33770 index = 0;
33771 break;
33772 case 8:
33773 index = 1;
33774 break;
33775 default:
33776 return 100;
33778 if (in == 2)
33779 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33780 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33782 switch (GET_MODE_SIZE (mode))
33784 case 1:
33785 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33787 if (!in)
33788 return ix86_cost->int_store[0];
33789 if (TARGET_PARTIAL_REG_DEPENDENCY
33790 && optimize_function_for_speed_p (cfun))
33791 cost = ix86_cost->movzbl_load;
33792 else
33793 cost = ix86_cost->int_load[0];
33794 if (in == 2)
33795 return MAX (cost, ix86_cost->int_store[0]);
33796 return cost;
33798 else
33800 if (in == 2)
33801 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33802 if (in)
33803 return ix86_cost->movzbl_load;
33804 else
33805 return ix86_cost->int_store[0] + 4;
33807 break;
33808 case 2:
33809 if (in == 2)
33810 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33811 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33812 default:
33813 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33814 if (mode == TFmode)
33815 mode = XFmode;
33816 if (in == 2)
33817 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33818 else if (in)
33819 cost = ix86_cost->int_load[2];
33820 else
33821 cost = ix86_cost->int_store[2];
33822 return (cost * (((int) GET_MODE_SIZE (mode)
33823 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33827 static int
33828 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33829 bool in)
33831 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33835 /* Return the cost of moving data from a register in class CLASS1 to
33836 one in class CLASS2.
33838 It is not required that the cost always equal 2 when FROM is the same as TO;
33839 on some machines it is expensive to move between registers if they are not
33840 general registers. */
33842 static int
33843 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33844 reg_class_t class2_i)
33846 enum reg_class class1 = (enum reg_class) class1_i;
33847 enum reg_class class2 = (enum reg_class) class2_i;
33849 /* In case we require secondary memory, compute cost of the store followed
33850 by load. In order to avoid bad register allocation choices, we need
33851 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33853 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33855 int cost = 1;
33857 cost += inline_memory_move_cost (mode, class1, 2);
33858 cost += inline_memory_move_cost (mode, class2, 2);
33860 /* When copying from a general purpose register we may emit multiple
33861 stores followed by a single load, causing a memory size mismatch stall.
33862 Count this as an arbitrarily high cost of 20. */
33863 if (targetm.class_max_nregs (class1, mode)
33864 > targetm.class_max_nregs (class2, mode))
33865 cost += 20;
33867 /* In the case of FP/MMX moves, the registers actually overlap, and we
33868 have to switch modes in order to treat them differently. */
33869 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33870 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33871 cost += 20;
33873 return cost;
33876 /* Moves between SSE/MMX and integer unit are expensive. */
33877 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33878 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33880 /* ??? By keeping returned value relatively high, we limit the number
33881 of moves between integer and MMX/SSE registers for all targets.
33882 Additionally, high value prevents problem with x86_modes_tieable_p(),
33883 where integer modes in MMX/SSE registers are not tieable
33884 because of missing QImode and HImode moves to, from or between
33885 MMX/SSE registers. */
33886 return MAX (8, ix86_cost->mmxsse_to_integer);
33888 if (MAYBE_FLOAT_CLASS_P (class1))
33889 return ix86_cost->fp_move;
33890 if (MAYBE_SSE_CLASS_P (class1))
33891 return ix86_cost->sse_move;
33892 if (MAYBE_MMX_CLASS_P (class1))
33893 return ix86_cost->mmx_move;
33894 return 2;
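/* Reading the secondary-memory branch above with made-up table values:
   if MAX (int_load, int_store) were 4 and MAX (sse_load, sse_store) were
   6, a copy that must go through memory would cost 1 + 4 + 6 = 11, plus
   20 when the source class needs more hard registers than the
   destination (several stores feeding one load), and another 20 for the
   overlapping FP/MMX register case.  The numbers here are purely
   illustrative, not taken from any cost table in this file.  */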
33897 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33898 MODE. */
33900 bool
33901 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33903 /* Flags and only flags can only hold CCmode values. */
33904 if (CC_REGNO_P (regno))
33905 return GET_MODE_CLASS (mode) == MODE_CC;
33906 if (GET_MODE_CLASS (mode) == MODE_CC
33907 || GET_MODE_CLASS (mode) == MODE_RANDOM
33908 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33909 return false;
33910 if (STACK_REGNO_P (regno))
33911 return VALID_FP_MODE_P (mode);
33912 if (SSE_REGNO_P (regno))
33914 /* We implement the move patterns for all vector modes into and
33915 out of SSE registers, even when no operation instructions
33916 are available. OImode and AVX modes are available only when
33917 AVX is enabled. */
33918 return ((TARGET_AVX
33919 && VALID_AVX256_REG_OR_OI_MODE (mode))
33920 || VALID_SSE_REG_MODE (mode)
33921 || VALID_SSE2_REG_MODE (mode)
33922 || VALID_MMX_REG_MODE (mode)
33923 || VALID_MMX_REG_MODE_3DNOW (mode));
33925 if (MMX_REGNO_P (regno))
33927 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33928 so if the register is available at all, then we can move data of
33929 the given mode into or out of it. */
33930 return (VALID_MMX_REG_MODE (mode)
33931 || VALID_MMX_REG_MODE_3DNOW (mode));
33934 if (mode == QImode)
33936 /* Take care for QImode values - they can be in non-QI regs,
33937 but then they do cause partial register stalls. */
33938 if (TARGET_64BIT || QI_REGNO_P (regno))
33939 return true;
33940 if (!TARGET_PARTIAL_REG_STALL)
33941 return true;
33942 /* LRA checks if the hard register is OK for the given mode.
33943 QImode values can live in non-QI regs, so we allow all
33944 registers here. */
33945 if (lra_in_progress)
33946 return true;
33947 return !can_create_pseudo_p ();
33949 /* We handle both integer and floats in the general purpose registers. */
33950 else if (VALID_INT_MODE_P (mode))
33951 return true;
33952 else if (VALID_FP_MODE_P (mode))
33953 return true;
33954 else if (VALID_DFP_MODE_P (mode))
33955 return true;
33956 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33957 on to use that value in smaller contexts, this can easily force a
33958 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33959 supporting DImode, allow it. */
33960 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33961 return true;
33963 return false;
33966 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33967 tieable integer mode. */
33969 static bool
33970 ix86_tieable_integer_mode_p (enum machine_mode mode)
33972 switch (mode)
33974 case HImode:
33975 case SImode:
33976 return true;
33978 case QImode:
33979 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33981 case DImode:
33982 return TARGET_64BIT;
33984 default:
33985 return false;
33989 /* Return true if MODE1 is accessible in a register that can hold MODE2
33990 without copying. That is, all register classes that can hold MODE2
33991 can also hold MODE1. */
33993 bool
33994 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33996 if (mode1 == mode2)
33997 return true;
33999 if (ix86_tieable_integer_mode_p (mode1)
34000 && ix86_tieable_integer_mode_p (mode2))
34001 return true;
34003 /* MODE2 being XFmode implies fp stack or general regs, which means we
34004 can tie any smaller floating point modes to it. Note that we do not
34005 tie this with TFmode. */
34006 if (mode2 == XFmode)
34007 return mode1 == SFmode || mode1 == DFmode;
34009 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34010 that we can tie it with SFmode. */
34011 if (mode2 == DFmode)
34012 return mode1 == SFmode;
34014 /* If MODE2 is only appropriate for an SSE register, then tie with
34015 any other mode acceptable to SSE registers. */
34016 if (GET_MODE_SIZE (mode2) == 32
34017 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34018 return (GET_MODE_SIZE (mode1) == 32
34019 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34020 if (GET_MODE_SIZE (mode2) == 16
34021 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34022 return (GET_MODE_SIZE (mode1) == 16
34023 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34025 /* If MODE2 is appropriate for an MMX register, then tie
34026 with any other mode acceptable to MMX registers. */
34027 if (GET_MODE_SIZE (mode2) == 8
34028 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34029 return (GET_MODE_SIZE (mode1) == 8
34030 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34032 return false;
34035 /* Return the cost of moving between two registers of mode MODE. */
34037 static int
34038 ix86_set_reg_reg_cost (enum machine_mode mode)
34040 unsigned int units = UNITS_PER_WORD;
34042 switch (GET_MODE_CLASS (mode))
34044 default:
34045 break;
34047 case MODE_CC:
34048 units = GET_MODE_SIZE (CCmode);
34049 break;
34051 case MODE_FLOAT:
34052 if ((TARGET_SSE && mode == TFmode)
34053 || (TARGET_80387 && mode == XFmode)
34054 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34055 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34056 units = GET_MODE_SIZE (mode);
34057 break;
34059 case MODE_COMPLEX_FLOAT:
34060 if ((TARGET_SSE && mode == TCmode)
34061 || (TARGET_80387 && mode == XCmode)
34062 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34063 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34064 units = GET_MODE_SIZE (mode);
34065 break;
34067 case MODE_VECTOR_INT:
34068 case MODE_VECTOR_FLOAT:
34069 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34070 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34071 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34072 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34073 units = GET_MODE_SIZE (mode);
34076 /* Return the cost of moving between two registers of mode MODE,
34077 assuming that the move will be in pieces of at most UNITS bytes. */
34078 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
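/* Worked example of the division above: a TImode integer copy on a
   64-bit target has GET_MODE_SIZE == 16 and UNITS == UNITS_PER_WORD == 8,
   giving COSTS_N_INSNS (2); a V8SFmode copy with AVX enabled takes the
   vector branch, so UNITS == 32 and the cost is COSTS_N_INSNS (1).  */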
34081 /* Compute a (partial) cost for rtx X. Return true if the complete
34082 cost has been computed, and false if subexpressions should be
34083 scanned. In either case, *TOTAL contains the cost result. */
34085 static bool
34086 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34087 bool speed)
34089 enum rtx_code code = (enum rtx_code) code_i;
34090 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34091 enum machine_mode mode = GET_MODE (x);
34092 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34094 switch (code)
34096 case SET:
34097 if (register_operand (SET_DEST (x), VOIDmode)
34098 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34100 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34101 return true;
34103 return false;
34105 case CONST_INT:
34106 case CONST:
34107 case LABEL_REF:
34108 case SYMBOL_REF:
34109 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34110 *total = 3;
34111 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34112 *total = 2;
34113 else if (flag_pic && SYMBOLIC_CONST (x)
34114 && (!TARGET_64BIT
34115 || (GET_CODE (x) != LABEL_REF
34116 && (GET_CODE (x) != SYMBOL_REF
34117 || !SYMBOL_REF_LOCAL_P (x)))))
34118 *total = 1;
34119 else
34120 *total = 0;
34121 return true;
34123 case CONST_DOUBLE:
34124 if (mode == VOIDmode)
34126 *total = 0;
34127 return true;
34129 switch (standard_80387_constant_p (x))
34131 case 1: /* 0.0 */
34132 *total = 1;
34133 return true;
34134 default: /* Other constants */
34135 *total = 2;
34136 return true;
34137 case 0:
34138 case -1:
34139 break;
34141 if (SSE_FLOAT_MODE_P (mode))
34143 case CONST_VECTOR:
34144 switch (standard_sse_constant_p (x))
34146 case 0:
34147 break;
34148 case 1: /* 0: xor eliminates false dependency */
34149 *total = 0;
34150 return true;
34151 default: /* -1: cmp contains false dependency */
34152 *total = 1;
34153 return true;
34156 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34157 it'll probably end up. Add a penalty for size. */
34158 *total = (COSTS_N_INSNS (1)
34159 + (flag_pic != 0 && !TARGET_64BIT)
34160 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34161 return true;
34163 case ZERO_EXTEND:
34164 /* The zero extension is often completely free on x86_64, so make
34165 it as cheap as possible. */
34166 if (TARGET_64BIT && mode == DImode
34167 && GET_MODE (XEXP (x, 0)) == SImode)
34168 *total = 1;
34169 else if (TARGET_ZERO_EXTEND_WITH_AND)
34170 *total = cost->add;
34171 else
34172 *total = cost->movzx;
34173 return false;
34175 case SIGN_EXTEND:
34176 *total = cost->movsx;
34177 return false;
34179 case ASHIFT:
34180 if (SCALAR_INT_MODE_P (mode)
34181 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34182 && CONST_INT_P (XEXP (x, 1)))
34184 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34185 if (value == 1)
34187 *total = cost->add;
34188 return false;
34190 if ((value == 2 || value == 3)
34191 && cost->lea <= cost->shift_const)
34193 *total = cost->lea;
34194 return false;
34197 /* FALLTHRU */
34199 case ROTATE:
34200 case ASHIFTRT:
34201 case LSHIFTRT:
34202 case ROTATERT:
34203 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34205 /* ??? Should be SSE vector operation cost. */
34206 /* At least for published AMD latencies, this really is the same
34207 as the latency for a simple fpu operation like fabs. */
34208 /* V*QImode is emulated with 1-11 insns. */
34209 if (mode == V16QImode || mode == V32QImode)
34211 int count = 11;
34212 if (TARGET_XOP && mode == V16QImode)
34214 /* For XOP we use vpshab, which requires a broadcast of the
34215 value to the variable shift insn. For constants this
34216 means a V16Q const in mem; even when we can perform the
34217 shift with one insn set the cost to prefer paddb. */
34218 if (CONSTANT_P (XEXP (x, 1)))
34220 *total = (cost->fabs
34221 + rtx_cost (XEXP (x, 0), code, 0, speed)
34222 + (speed ? 2 : COSTS_N_BYTES (16)));
34223 return true;
34225 count = 3;
34227 else if (TARGET_SSSE3)
34228 count = 7;
34229 *total = cost->fabs * count;
34231 else
34232 *total = cost->fabs;
34234 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34236 if (CONST_INT_P (XEXP (x, 1)))
34238 if (INTVAL (XEXP (x, 1)) > 32)
34239 *total = cost->shift_const + COSTS_N_INSNS (2);
34240 else
34241 *total = cost->shift_const * 2;
34243 else
34245 if (GET_CODE (XEXP (x, 1)) == AND)
34246 *total = cost->shift_var * 2;
34247 else
34248 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34251 else
34253 if (CONST_INT_P (XEXP (x, 1)))
34254 *total = cost->shift_const;
34255 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34256 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34258 /* Return the cost after shift-and truncation. */
34259 *total = cost->shift_var;
34260 return true;
34262 else
34263 *total = cost->shift_var;
34265 return false;
34267 case FMA:
34269 rtx sub;
34271 gcc_assert (FLOAT_MODE_P (mode));
34272 gcc_assert (TARGET_FMA || TARGET_FMA4);
34274 /* ??? SSE scalar/vector cost should be used here. */
34275 /* ??? Bald assumption that fma has the same cost as fmul. */
34276 *total = cost->fmul;
34277 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34279 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34280 sub = XEXP (x, 0);
34281 if (GET_CODE (sub) == NEG)
34282 sub = XEXP (sub, 0);
34283 *total += rtx_cost (sub, FMA, 0, speed);
34285 sub = XEXP (x, 2);
34286 if (GET_CODE (sub) == NEG)
34287 sub = XEXP (sub, 0);
34288 *total += rtx_cost (sub, FMA, 2, speed);
34289 return true;
34292 case MULT:
34293 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34295 /* ??? SSE scalar cost should be used here. */
34296 *total = cost->fmul;
34297 return false;
34299 else if (X87_FLOAT_MODE_P (mode))
34301 *total = cost->fmul;
34302 return false;
34304 else if (FLOAT_MODE_P (mode))
34306 /* ??? SSE vector cost should be used here. */
34307 *total = cost->fmul;
34308 return false;
34310 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34312 /* V*QImode is emulated with 7-13 insns. */
34313 if (mode == V16QImode || mode == V32QImode)
34315 int extra = 11;
34316 if (TARGET_XOP && mode == V16QImode)
34317 extra = 5;
34318 else if (TARGET_SSSE3)
34319 extra = 6;
34320 *total = cost->fmul * 2 + cost->fabs * extra;
34322 /* V*DImode is emulated with 5-8 insns. */
34323 else if (mode == V2DImode || mode == V4DImode)
34325 if (TARGET_XOP && mode == V2DImode)
34326 *total = cost->fmul * 2 + cost->fabs * 3;
34327 else
34328 *total = cost->fmul * 3 + cost->fabs * 5;
34330 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34331 insns, including two PMULUDQ. */
34332 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34333 *total = cost->fmul * 2 + cost->fabs * 5;
34334 else
34335 *total = cost->fmul;
34336 return false;
34338 else
34340 rtx op0 = XEXP (x, 0);
34341 rtx op1 = XEXP (x, 1);
34342 int nbits;
34343 if (CONST_INT_P (XEXP (x, 1)))
34345 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34346 for (nbits = 0; value != 0; value &= value - 1)
34347 nbits++;
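/* The loop above is Kernighan's population count: clearing the lowest
   set bit each iteration leaves NBITS equal to the number of 1 bits in
   the constant multiplier, which scales the per-bit multiply cost.  */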
34349 else
34350 /* This is arbitrary. */
34351 nbits = 7;
34353 /* Compute costs correctly for widening multiplication. */
34354 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34355 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34356 == GET_MODE_SIZE (mode))
34358 int is_mulwiden = 0;
34359 enum machine_mode inner_mode = GET_MODE (op0);
34361 if (GET_CODE (op0) == GET_CODE (op1))
34362 is_mulwiden = 1, op1 = XEXP (op1, 0);
34363 else if (CONST_INT_P (op1))
34365 if (GET_CODE (op0) == SIGN_EXTEND)
34366 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34367 == INTVAL (op1);
34368 else
34369 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34372 if (is_mulwiden)
34373 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34376 *total = (cost->mult_init[MODE_INDEX (mode)]
34377 + nbits * cost->mult_bit
34378 + rtx_cost (op0, outer_code, opno, speed)
34379 + rtx_cost (op1, outer_code, opno, speed));
34381 return true;
34384 case DIV:
34385 case UDIV:
34386 case MOD:
34387 case UMOD:
34388 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34389 /* ??? SSE cost should be used here. */
34390 *total = cost->fdiv;
34391 else if (X87_FLOAT_MODE_P (mode))
34392 *total = cost->fdiv;
34393 else if (FLOAT_MODE_P (mode))
34394 /* ??? SSE vector cost should be used here. */
34395 *total = cost->fdiv;
34396 else
34397 *total = cost->divide[MODE_INDEX (mode)];
34398 return false;
34400 case PLUS:
34401 if (GET_MODE_CLASS (mode) == MODE_INT
34402 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34404 if (GET_CODE (XEXP (x, 0)) == PLUS
34405 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34406 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34407 && CONSTANT_P (XEXP (x, 1)))
34409 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34410 if (val == 2 || val == 4 || val == 8)
34412 *total = cost->lea;
34413 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34414 outer_code, opno, speed);
34415 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34416 outer_code, opno, speed);
34417 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34418 return true;
34421 else if (GET_CODE (XEXP (x, 0)) == MULT
34422 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34424 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34425 if (val == 2 || val == 4 || val == 8)
34427 *total = cost->lea;
34428 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34429 outer_code, opno, speed);
34430 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34431 return true;
34434 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34436 *total = cost->lea;
34437 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34438 outer_code, opno, speed);
34439 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34440 outer_code, opno, speed);
34441 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34442 return true;
34445 /* FALLTHRU */
34447 case MINUS:
34448 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34450 /* ??? SSE cost should be used here. */
34451 *total = cost->fadd;
34452 return false;
34454 else if (X87_FLOAT_MODE_P (mode))
34456 *total = cost->fadd;
34457 return false;
34459 else if (FLOAT_MODE_P (mode))
34461 /* ??? SSE vector cost should be used here. */
34462 *total = cost->fadd;
34463 return false;
34465 /* FALLTHRU */
34467 case AND:
34468 case IOR:
34469 case XOR:
34470 if (GET_MODE_CLASS (mode) == MODE_INT
34471 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34473 *total = (cost->add * 2
34474 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34475 << (GET_MODE (XEXP (x, 0)) != DImode))
34476 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34477 << (GET_MODE (XEXP (x, 1)) != DImode)));
34478 return true;
34480 /* FALLTHRU */
34482 case NEG:
34483 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34485 /* ??? SSE cost should be used here. */
34486 *total = cost->fchs;
34487 return false;
34489 else if (X87_FLOAT_MODE_P (mode))
34491 *total = cost->fchs;
34492 return false;
34494 else if (FLOAT_MODE_P (mode))
34496 /* ??? SSE vector cost should be used here. */
34497 *total = cost->fchs;
34498 return false;
34500 /* FALLTHRU */
34502 case NOT:
34503 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34505 /* ??? Should be SSE vector operation cost. */
34506 /* At least for published AMD latencies, this really is the same
34507 as the latency for a simple fpu operation like fabs. */
34508 *total = cost->fabs;
34510 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34511 *total = cost->add * 2;
34512 else
34513 *total = cost->add;
34514 return false;
34516 case COMPARE:
34517 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34518 && XEXP (XEXP (x, 0), 1) == const1_rtx
34519 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34520 && XEXP (x, 1) == const0_rtx)
34522 /* This kind of construct is implemented using test[bwl].
34523 Treat it as if we had an AND. */
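	  /* E.g. a single-bit test (compare (zero_extract X 1 5) 0) is
	     emitted as `testl $0x20, X'.  */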
34524 *total = (cost->add
34525 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34526 + rtx_cost (const1_rtx, outer_code, opno, speed));
34527 return true;
34529 return false;
34531 case FLOAT_EXTEND:
34532 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34533 *total = 0;
34534 return false;
34536 case ABS:
34537 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34538 /* ??? SSE cost should be used here. */
34539 *total = cost->fabs;
34540 else if (X87_FLOAT_MODE_P (mode))
34541 *total = cost->fabs;
34542 else if (FLOAT_MODE_P (mode))
34543 /* ??? SSE vector cost should be used here. */
34544 *total = cost->fabs;
34545 return false;
34547 case SQRT:
34548 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34549 /* ??? SSE cost should be used here. */
34550 *total = cost->fsqrt;
34551 else if (X87_FLOAT_MODE_P (mode))
34552 *total = cost->fsqrt;
34553 else if (FLOAT_MODE_P (mode))
34554 /* ??? SSE vector cost should be used here. */
34555 *total = cost->fsqrt;
34556 return false;
34558 case UNSPEC:
34559 if (XINT (x, 1) == UNSPEC_TP)
34560 *total = 0;
34561 return false;
34563 case VEC_SELECT:
34564 case VEC_CONCAT:
34565 case VEC_MERGE:
34566 case VEC_DUPLICATE:
34567 /* ??? Assume all of these vector manipulation patterns are
34568 	 recognizable, in which case they all pretty much have the
34569 	 same cost.  */
34570 *total = cost->fabs;
34571 return true;
34573 default:
34574 return false;
34578 #if TARGET_MACHO
34580 static int current_machopic_label_num;
34582 /* Given a symbol name and its associated stub, write out the
34583 definition of the stub. */
34585 void
34586 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34588 unsigned int length;
34589 char *binder_name, *symbol_name, lazy_ptr_name[32];
34590 int label = ++current_machopic_label_num;
34592 /* For 64-bit we shouldn't get here. */
34593 gcc_assert (!TARGET_64BIT);
34595 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34596 symb = targetm.strip_name_encoding (symb);
34598 length = strlen (stub);
34599 binder_name = XALLOCAVEC (char, length + 32);
34600 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34602 length = strlen (symb);
34603 symbol_name = XALLOCAVEC (char, length + 32);
34604 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34606 sprintf (lazy_ptr_name, "L%d$lz", label);
34608 if (MACHOPIC_ATT_STUB)
34609 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34610 else if (MACHOPIC_PURE)
34611 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34612 else
34613 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34615 fprintf (file, "%s:\n", stub);
34616 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34618 if (MACHOPIC_ATT_STUB)
34620 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34622 else if (MACHOPIC_PURE)
34624 /* PIC stub. */
34625 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34626 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34627 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34628 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34629 label, lazy_ptr_name, label);
34630 fprintf (file, "\tjmp\t*%%ecx\n");
34632 else
34633 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34635 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34636 it needs no stub-binding-helper. */
34637 if (MACHOPIC_ATT_STUB)
34638 return;
34640 fprintf (file, "%s:\n", binder_name);
34642 if (MACHOPIC_PURE)
34644 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34645 fprintf (file, "\tpushl\t%%ecx\n");
34647 else
34648 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34650 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34652 /* N.B. Keep the correspondence of these
34653 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34654 old-pic/new-pic/non-pic stubs; altering this will break
34655 compatibility with existing dylibs. */
34656 if (MACHOPIC_PURE)
34658 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34659 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34661 else
34662 /* 16-byte -mdynamic-no-pic stub. */
34663 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34665 fprintf (file, "%s:\n", lazy_ptr_name);
34666 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34667 fprintf (file, ASM_LONG "%s\n", binder_name);
34669 #endif /* TARGET_MACHO */
34671 /* Order the registers for register allocator. */
34673 void
34674 x86_order_regs_for_local_alloc (void)
34676 int pos = 0;
34677 int i;
34679 /* First allocate the local general purpose registers. */
34680 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34681 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34682 reg_alloc_order [pos++] = i;
34684 /* Global general purpose registers. */
34685 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34686 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34687 reg_alloc_order [pos++] = i;
34689 /* x87 registers come first in case we are doing FP math
34690 using them. */
34691 if (!TARGET_SSE_MATH)
34692 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34693 reg_alloc_order [pos++] = i;
34695 /* SSE registers. */
34696 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34697 reg_alloc_order [pos++] = i;
34698 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34699 reg_alloc_order [pos++] = i;
34701 /* x87 registers. */
34702 if (TARGET_SSE_MATH)
34703 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34704 reg_alloc_order [pos++] = i;
34706 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34707 reg_alloc_order [pos++] = i;
34709    /* Initialize the rest of the array, as we do not allocate some
34710       registers at all.  */
34711 while (pos < FIRST_PSEUDO_REGISTER)
34712 reg_alloc_order [pos++] = 0;
34715 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34716    in struct attribute_spec.handler.  */
34717 static tree
34718 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34719 tree args,
34720 int flags ATTRIBUTE_UNUSED,
34721 bool *no_add_attrs)
34723 if (TREE_CODE (*node) != FUNCTION_TYPE
34724 && TREE_CODE (*node) != METHOD_TYPE
34725 && TREE_CODE (*node) != FIELD_DECL
34726 && TREE_CODE (*node) != TYPE_DECL)
34728 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34729 name);
34730 *no_add_attrs = true;
34731 return NULL_TREE;
34733 if (TARGET_64BIT)
34735 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34736 name);
34737 *no_add_attrs = true;
34738 return NULL_TREE;
34740 if (is_attribute_p ("callee_pop_aggregate_return", name))
34742 tree cst;
34744 cst = TREE_VALUE (args);
34745 if (TREE_CODE (cst) != INTEGER_CST)
34747 warning (OPT_Wattributes,
34748 "%qE attribute requires an integer constant argument",
34749 name);
34750 *no_add_attrs = true;
34752 else if (compare_tree_int (cst, 0) != 0
34753 && compare_tree_int (cst, 1) != 0)
34755 warning (OPT_Wattributes,
34756 "argument to %qE attribute is neither zero, nor one",
34757 name);
34758 *no_add_attrs = true;
34761 return NULL_TREE;
34764 return NULL_TREE;
34767 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34768 struct attribute_spec.handler. */
34769 static tree
34770 ix86_handle_abi_attribute (tree *node, tree name,
34771 tree args ATTRIBUTE_UNUSED,
34772 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34774 if (TREE_CODE (*node) != FUNCTION_TYPE
34775 && TREE_CODE (*node) != METHOD_TYPE
34776 && TREE_CODE (*node) != FIELD_DECL
34777 && TREE_CODE (*node) != TYPE_DECL)
34779 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34780 name);
34781 *no_add_attrs = true;
34782 return NULL_TREE;
34785 /* Can combine regparm with all attributes but fastcall. */
34786 if (is_attribute_p ("ms_abi", name))
34788 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34790 error ("ms_abi and sysv_abi attributes are not compatible");
34793 return NULL_TREE;
34795 else if (is_attribute_p ("sysv_abi", name))
34797 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34799 error ("ms_abi and sysv_abi attributes are not compatible");
34802 return NULL_TREE;
34805 return NULL_TREE;
34808 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34809 struct attribute_spec.handler. */
34810 static tree
34811 ix86_handle_struct_attribute (tree *node, tree name,
34812 tree args ATTRIBUTE_UNUSED,
34813 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34815 tree *type = NULL;
34816 if (DECL_P (*node))
34818 if (TREE_CODE (*node) == TYPE_DECL)
34819 type = &TREE_TYPE (*node);
34821 else
34822 type = node;
34824 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34826 warning (OPT_Wattributes, "%qE attribute ignored",
34827 name);
34828 *no_add_attrs = true;
34831 else if ((is_attribute_p ("ms_struct", name)
34832 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34833 || ((is_attribute_p ("gcc_struct", name)
34834 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34836 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34837 name);
34838 *no_add_attrs = true;
34841 return NULL_TREE;
34844 static tree
34845 ix86_handle_fndecl_attribute (tree *node, tree name,
34846 tree args ATTRIBUTE_UNUSED,
34847 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34849 if (TREE_CODE (*node) != FUNCTION_DECL)
34851 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34852 name);
34853 *no_add_attrs = true;
34855 return NULL_TREE;
34858 static bool
34859 ix86_ms_bitfield_layout_p (const_tree record_type)
34861 return ((TARGET_MS_BITFIELD_LAYOUT
34862 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34863 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34866 /* Returns an expression indicating where the this parameter is
34867 located on entry to the FUNCTION. */
34869 static rtx
34870 x86_this_parameter (tree function)
34872 tree type = TREE_TYPE (function);
34873 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34874 int nregs;
34876 if (TARGET_64BIT)
34878 const int *parm_regs;
34880 if (ix86_function_type_abi (type) == MS_ABI)
34881 parm_regs = x86_64_ms_abi_int_parameter_registers;
34882 else
34883 parm_regs = x86_64_int_parameter_registers;
34884 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34887 nregs = ix86_function_regparm (type, function);
34889 if (nregs > 0 && !stdarg_p (type))
34891 int regno;
34892 unsigned int ccvt = ix86_get_callcvt (type);
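      /* If the function returns an aggregate, the hidden return-pointer
	 argument takes the first register: with fastcall `this' then moves
	 from %ecx to %edx, and with thiscall it is passed on the stack at
	 4(%esp) instead.  */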
34894 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34895 regno = aggr ? DX_REG : CX_REG;
34896 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34898 regno = CX_REG;
34899 if (aggr)
34900 return gen_rtx_MEM (SImode,
34901 plus_constant (Pmode, stack_pointer_rtx, 4));
34903 else
34905 regno = AX_REG;
34906 if (aggr)
34908 regno = DX_REG;
34909 if (nregs == 1)
34910 return gen_rtx_MEM (SImode,
34911 plus_constant (Pmode,
34912 stack_pointer_rtx, 4));
34915 return gen_rtx_REG (SImode, regno);
34918 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34919 aggr ? 8 : 4));
34922 /* Determine whether x86_output_mi_thunk can succeed. */
34924 static bool
34925 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34926 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34927 HOST_WIDE_INT vcall_offset, const_tree function)
34929 /* 64-bit can handle anything. */
34930 if (TARGET_64BIT)
34931 return true;
34933 /* For 32-bit, everything's fine if we have one free register. */
34934 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34935 return true;
34937 /* Need a free register for vcall_offset. */
34938 if (vcall_offset)
34939 return false;
34941 /* Need a free register for GOT references. */
34942 if (flag_pic && !targetm.binds_local_p (function))
34943 return false;
34945 /* Otherwise ok. */
34946 return true;
34949 /* Output the assembler code for a thunk function. THUNK_DECL is the
34950 declaration for the thunk function itself, FUNCTION is the decl for
34951 the target function. DELTA is an immediate constant offset to be
34952 added to THIS. If VCALL_OFFSET is nonzero, the word at
34953 *(*this + vcall_offset) should be added to THIS. */
34955 static void
34956 x86_output_mi_thunk (FILE *file,
34957 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34958 HOST_WIDE_INT vcall_offset, tree function)
34960 rtx this_param = x86_this_parameter (function);
34961 rtx this_reg, tmp, fnaddr;
34962 unsigned int tmp_regno;
34964 if (TARGET_64BIT)
34965 tmp_regno = R10_REG;
34966 else
34968 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34969 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34970 tmp_regno = AX_REG;
34971 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34972 tmp_regno = DX_REG;
34973 else
34974 tmp_regno = CX_REG;
34977 emit_note (NOTE_INSN_PROLOGUE_END);
34979 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34980 pull it in now and let DELTA benefit. */
34981 if (REG_P (this_param))
34982 this_reg = this_param;
34983 else if (vcall_offset)
34985 /* Put the this parameter into %eax. */
34986 this_reg = gen_rtx_REG (Pmode, AX_REG);
34987 emit_move_insn (this_reg, this_param);
34989 else
34990 this_reg = NULL_RTX;
34992 /* Adjust the this parameter by a fixed constant. */
34993 if (delta)
34995 rtx delta_rtx = GEN_INT (delta);
34996 rtx delta_dst = this_reg ? this_reg : this_param;
34998 if (TARGET_64BIT)
35000 if (!x86_64_general_operand (delta_rtx, Pmode))
35002 tmp = gen_rtx_REG (Pmode, tmp_regno);
35003 emit_move_insn (tmp, delta_rtx);
35004 delta_rtx = tmp;
35008 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35011 /* Adjust the this parameter by a value stored in the vtable. */
35012 if (vcall_offset)
35014 rtx vcall_addr, vcall_mem, this_mem;
35016 tmp = gen_rtx_REG (Pmode, tmp_regno);
35018 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35019 if (Pmode != ptr_mode)
35020 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35021 emit_move_insn (tmp, this_mem);
35023 /* Adjust the this parameter. */
35024 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35025 if (TARGET_64BIT
35026 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35028 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35029 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35030 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35033 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35034 if (Pmode != ptr_mode)
35035 emit_insn (gen_addsi_1_zext (this_reg,
35036 gen_rtx_REG (ptr_mode,
35037 REGNO (this_reg)),
35038 vcall_mem));
35039 else
35040 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35043 /* If necessary, drop THIS back to its stack slot. */
35044 if (this_reg && this_reg != this_param)
35045 emit_move_insn (this_param, this_reg);
35047 fnaddr = XEXP (DECL_RTL (function), 0);
35048 if (TARGET_64BIT)
35050 if (!flag_pic || targetm.binds_local_p (function)
35051 || cfun->machine->call_abi == MS_ABI)
35053 else
35055 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35056 tmp = gen_rtx_CONST (Pmode, tmp);
35057 fnaddr = gen_rtx_MEM (Pmode, tmp);
35060 else
35062 if (!flag_pic || targetm.binds_local_p (function))
35064 #if TARGET_MACHO
35065 else if (TARGET_MACHO)
35067 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35068 fnaddr = XEXP (fnaddr, 0);
35070 #endif /* TARGET_MACHO */
35071 else
35073 tmp = gen_rtx_REG (Pmode, CX_REG);
35074 output_set_got (tmp, NULL_RTX);
35076 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35077 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35078 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35082 /* Our sibling call patterns do not allow memories, because we have no
35083 predicate that can distinguish between frame and non-frame memory.
35084 For our purposes here, we can get away with (ab)using a jump pattern,
35085 because we're going to do no optimization. */
35086 if (MEM_P (fnaddr))
35087 emit_jump_insn (gen_indirect_jump (fnaddr));
35088 else
35090 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35091 fnaddr = legitimize_pic_address (fnaddr,
35092 gen_rtx_REG (Pmode, tmp_regno));
35094 if (!sibcall_insn_operand (fnaddr, word_mode))
35096 tmp = gen_rtx_REG (word_mode, tmp_regno);
35097 if (GET_MODE (fnaddr) != word_mode)
35098 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35099 emit_move_insn (tmp, fnaddr);
35100 fnaddr = tmp;
35103 tmp = gen_rtx_MEM (QImode, fnaddr);
35104 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35105 tmp = emit_call_insn (tmp);
35106 SIBLING_CALL_P (tmp) = 1;
35108 emit_barrier ();
35110 /* Emit just enough of rest_of_compilation to get the insns emitted.
35111 Note that use_thunk calls assemble_start_function et al. */
35112 tmp = get_insns ();
35113 shorten_branches (tmp);
35114 final_start_function (tmp, file, 1);
35115 final (tmp, file, 1);
35116 final_end_function ();
35119 static void
35120 x86_file_start (void)
35122 default_file_start ();
35123 #if TARGET_MACHO
35124 darwin_file_start ();
35125 #endif
35126 if (X86_FILE_START_VERSION_DIRECTIVE)
35127 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35128 if (X86_FILE_START_FLTUSED)
35129 fputs ("\t.global\t__fltused\n", asm_out_file);
35130 if (ix86_asm_dialect == ASM_INTEL)
35131 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35135 x86_field_alignment (tree field, int computed)
35137 enum machine_mode mode;
35138 tree type = TREE_TYPE (field);
35140 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35141 return computed;
35142 mode = TYPE_MODE (strip_array_types (type));
35143 if (mode == DFmode || mode == DCmode
35144 || GET_MODE_CLASS (mode) == MODE_INT
35145 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35146 return MIN (32, computed);
35147 return computed;
35150 /* Output assembler code to FILE to increment profiler label # LABELNO
35151 for profiling a function entry. */
35152 void
35153 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35155 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35156 : MCOUNT_NAME);
35158 if (TARGET_64BIT)
35160 #ifndef NO_PROFILE_COUNTERS
35161 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35162 #endif
35164 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35165 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35166 else
35167 fprintf (file, "\tcall\t%s\n", mcount_name);
35169 else if (flag_pic)
35171 #ifndef NO_PROFILE_COUNTERS
35172 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35173 LPREFIX, labelno);
35174 #endif
35175 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35177 else
35179 #ifndef NO_PROFILE_COUNTERS
35180 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35181 LPREFIX, labelno);
35182 #endif
35183 fprintf (file, "\tcall\t%s\n", mcount_name);
35187 /* We don't have exact information about the insn sizes, but we may assume
35188 quite safely that we are informed about all 1 byte insns and memory
35189 address sizes. This is enough to eliminate unnecessary padding in
35190 99% of cases. */
35192 static int
35193 min_insn_size (rtx insn)
35195 int l = 0, len;
35197 if (!INSN_P (insn) || !active_insn_p (insn))
35198 return 0;
35200   /* Discard alignments we've emitted and jump table data.  */
35201 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35202 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35203 return 0;
35204 if (JUMP_TABLE_DATA_P (insn))
35205 return 0;
35207 /* Important case - calls are always 5 bytes.
35208      It is common to have many calls in a row.  */
35209 if (CALL_P (insn)
35210 && symbolic_reference_mentioned_p (PATTERN (insn))
35211 && !SIBLING_CALL_P (insn))
35212 return 5;
35213 len = get_attr_length (insn);
35214 if (len <= 1)
35215 return 1;
35217 /* For normal instructions we rely on get_attr_length being exact,
35218 with a few exceptions. */
35219 if (!JUMP_P (insn))
35221 enum attr_type type = get_attr_type (insn);
35223 switch (type)
35225 case TYPE_MULTI:
35226 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35227 || asm_noperands (PATTERN (insn)) >= 0)
35228 return 0;
35229 break;
35230 case TYPE_OTHER:
35231 case TYPE_FCMP:
35232 break;
35233 default:
35234 /* Otherwise trust get_attr_length. */
35235 return len;
35238 l = get_attr_length_address (insn);
35239 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35240 l = 4;
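  /* One opcode byte plus the address bytes when an address is present;
     otherwise assume a 2-byte instruction.  */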
35242 if (l)
35243 return 1+l;
35244 else
35245 return 2;
35248 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35250 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16-byte
35251    window.  */
35253 static void
35254 ix86_avoid_jump_mispredicts (void)
35256 rtx insn, start = get_insns ();
35257 int nbytes = 0, njumps = 0;
35258 int isjump = 0;
35260 /* Look for all minimal intervals of instructions containing 4 jumps.
35261 The intervals are bounded by START and INSN. NBYTES is the total
35262 size of instructions in the interval including INSN and not including
35263    START.  When NBYTES is smaller than 16 bytes, it is possible
35264    that the end of START and INSN end up in the same 16-byte window.
35266    The smallest offset in the window at which INSN can start is when START
35267    ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
35268    We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
35270    Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
35271    have to, control transfer to its label(s) can be performed through other
35272    means, and we also estimate the minimum length of all asm stmts as 0.  */
35273 for (insn = start; insn; insn = NEXT_INSN (insn))
35275 int min_size;
35277 if (LABEL_P (insn))
35279 int align = label_to_alignment (insn);
35280 int max_skip = label_to_max_skip (insn);
35282 if (max_skip > 15)
35283 max_skip = 15;
35284 	  /* If align > 3, only up to 16 - max_skip - 1 bytes can already
35285 	     be in the current 16-byte window, because otherwise
35286 	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35287 	     bytes to reach a 16-byte boundary.  */
35288 if (align <= 0
35289 || (align <= 3 && max_skip != (1 << align) - 1))
35290 max_skip = 0;
35291 if (dump_file)
35292 fprintf (dump_file, "Label %i with max_skip %i\n",
35293 INSN_UID (insn), max_skip);
35294 if (max_skip)
35296 while (nbytes + max_skip >= 16)
35298 start = NEXT_INSN (start);
35299 if ((JUMP_P (start)
35300 && asm_noperands (PATTERN (start)) < 0
35301 && GET_CODE (PATTERN (start)) != ADDR_VEC
35302 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35303 || CALL_P (start))
35304 njumps--, isjump = 1;
35305 else
35306 isjump = 0;
35307 nbytes -= min_insn_size (start);
35310 continue;
35313 min_size = min_insn_size (insn);
35314 nbytes += min_size;
35315 if (dump_file)
35316 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35317 INSN_UID (insn), min_size);
35318 if ((JUMP_P (insn)
35319 && asm_noperands (PATTERN (insn)) < 0
35320 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35321 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35322 || CALL_P (insn))
35323 njumps++;
35324 else
35325 continue;
35327 while (njumps > 3)
35329 start = NEXT_INSN (start);
35330 if ((JUMP_P (start)
35331 && asm_noperands (PATTERN (start)) < 0
35332 && GET_CODE (PATTERN (start)) != ADDR_VEC
35333 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35334 || CALL_P (start))
35335 njumps--, isjump = 1;
35336 else
35337 isjump = 0;
35338 nbytes -= min_insn_size (start);
35340 gcc_assert (njumps >= 0);
35341 if (dump_file)
35342 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35343 INSN_UID (start), INSN_UID (insn), nbytes);
35345 if (njumps == 3 && isjump && nbytes < 16)
35347 int padsize = 15 - nbytes + min_insn_size (insn);
35349 if (dump_file)
35350 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35351 INSN_UID (insn), padsize);
35352 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35356 #endif
35358 /* AMD Athlon works faster
35359    when RET is not the destination of a conditional jump or directly preceded
35360    by another jump instruction.  We avoid the penalty by inserting a NOP just
35361    before the RET instructions in such cases.  */
35362 static void
35363 ix86_pad_returns (void)
35365 edge e;
35366 edge_iterator ei;
35368 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35370 basic_block bb = e->src;
35371 rtx ret = BB_END (bb);
35372 rtx prev;
35373 bool replace = false;
35375 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35376 || optimize_bb_for_size_p (bb))
35377 continue;
35378 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35379 if (active_insn_p (prev) || LABEL_P (prev))
35380 break;
35381 if (prev && LABEL_P (prev))
35383 edge e;
35384 edge_iterator ei;
35386 FOR_EACH_EDGE (e, ei, bb->preds)
35387 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35388 && !(e->flags & EDGE_FALLTHRU))
35389 replace = true;
35391 if (!replace)
35393 prev = prev_active_insn (ret);
35394 if (prev
35395 && ((JUMP_P (prev) && any_condjump_p (prev))
35396 || CALL_P (prev)))
35397 replace = true;
35398 /* Empty functions get branch mispredict even when
35399 the jump destination is not visible to us. */
35400 if (!prev && !optimize_function_for_size_p (cfun))
35401 replace = true;
35403 if (replace)
35405 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35406 delete_insn (ret);
35411 /* Count the minimum number of instructions in BB. Return 4 if the
35412 number of instructions >= 4. */
35414 static int
35415 ix86_count_insn_bb (basic_block bb)
35417 rtx insn;
35418 int insn_count = 0;
35420 /* Count number of instructions in this block. Return 4 if the number
35421 of instructions >= 4. */
35422 FOR_BB_INSNS (bb, insn)
35424 /* Only happen in exit blocks. */
35425 if (JUMP_P (insn)
35426 && ANY_RETURN_P (PATTERN (insn)))
35427 break;
35429 if (NONDEBUG_INSN_P (insn)
35430 && GET_CODE (PATTERN (insn)) != USE
35431 && GET_CODE (PATTERN (insn)) != CLOBBER)
35433 insn_count++;
35434 if (insn_count >= 4)
35435 return insn_count;
35439 return insn_count;
35443 /* Count the minimum number of instructions in code path in BB.
35444 Return 4 if the number of instructions >= 4. */
35446 static int
35447 ix86_count_insn (basic_block bb)
35449 edge e;
35450 edge_iterator ei;
35451 int min_prev_count;
35453 /* Only bother counting instructions along paths with no
35454 more than 2 basic blocks between entry and exit. Given
35455 that BB has an edge to exit, determine if a predecessor
35456 of BB has an edge from entry. If so, compute the number
35457 of instructions in the predecessor block. If there
35458 happen to be multiple such blocks, compute the minimum. */
35459 min_prev_count = 4;
35460 FOR_EACH_EDGE (e, ei, bb->preds)
35462 edge prev_e;
35463 edge_iterator prev_ei;
35465 if (e->src == ENTRY_BLOCK_PTR)
35467 min_prev_count = 0;
35468 break;
35470 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35472 if (prev_e->src == ENTRY_BLOCK_PTR)
35474 int count = ix86_count_insn_bb (e->src);
35475 if (count < min_prev_count)
35476 min_prev_count = count;
35477 break;
35482 if (min_prev_count < 4)
35483 min_prev_count += ix86_count_insn_bb (bb);
35485 return min_prev_count;
35488 /* Pad short functions to 4 instructions.   */
35490 static void
35491 ix86_pad_short_function (void)
35493 edge e;
35494 edge_iterator ei;
35496 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35498 rtx ret = BB_END (e->src);
35499 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35501 int insn_count = ix86_count_insn (e->src);
35503 /* Pad short function. */
35504 if (insn_count < 4)
35506 rtx insn = ret;
35508 /* Find epilogue. */
35509 while (insn
35510 && (!NOTE_P (insn)
35511 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35512 insn = PREV_INSN (insn);
35514 if (!insn)
35515 insn = ret;
35517 /* Two NOPs count as one instruction. */
35518 insn_count = 2 * (4 - insn_count);
35519 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35525 /* Fix up a Windows system unwinder issue. If an EH region falls thru into
35526 the epilogue, the Windows system unwinder will apply epilogue logic and
35527 produce incorrect offsets. This can be avoided by adding a nop between
35528 the last insn that can throw and the first insn of the epilogue. */
35530 static void
35531 ix86_seh_fixup_eh_fallthru (void)
35533 edge e;
35534 edge_iterator ei;
35536 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35538 rtx insn, next;
35540 /* Find the beginning of the epilogue. */
35541 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
35542 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
35543 break;
35544 if (insn == NULL)
35545 continue;
35547       /* We only care about preceding insns that can throw.  */
35548 insn = prev_active_insn (insn);
35549 if (insn == NULL || !can_throw_internal (insn))
35550 continue;
35552 /* Do not separate calls from their debug information. */
35553 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
35554 if (NOTE_P (next)
35555 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
35556 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
35557 insn = next;
35558 else
35559 break;
35561 emit_insn_after (gen_nops (const1_rtx), insn);
35565 /* Implement machine specific optimizations. We implement padding of returns
35566    for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
35567 static void
35568 ix86_reorg (void)
35570 /* We are freeing block_for_insn in the toplev to keep compatibility
35571 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35572 compute_bb_for_insn ();
35574 if (TARGET_SEH && current_function_has_exception_handlers ())
35575 ix86_seh_fixup_eh_fallthru ();
35577 if (optimize && optimize_function_for_speed_p (cfun))
35579 if (TARGET_PAD_SHORT_FUNCTION)
35580 ix86_pad_short_function ();
35581 else if (TARGET_PAD_RETURNS)
35582 ix86_pad_returns ();
35583 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35584 if (TARGET_FOUR_JUMP_LIMIT)
35585 ix86_avoid_jump_mispredicts ();
35586 #endif
35590 /* Return nonzero when a QImode register that must be represented via a REX prefix
35591 is used. */
35592 bool
35593 x86_extended_QIreg_mentioned_p (rtx insn)
35595 int i;
35596 extract_insn_cached (insn);
35597 for (i = 0; i < recog_data.n_operands; i++)
35598 if (GENERAL_REG_P (recog_data.operand[i])
35599 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35600 return true;
35601 return false;
35604 /* Return nonzero when P points to register encoded via REX prefix.
35605 Called via for_each_rtx. */
35606 static int
35607 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35609 unsigned int regno;
35610 if (!REG_P (*p))
35611 return 0;
35612 regno = REGNO (*p);
35613 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35616 /* Return true when INSN mentions register that must be encoded using REX
35617 prefix. */
35618 bool
35619 x86_extended_reg_mentioned_p (rtx insn)
35621 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35622 extended_reg_mentioned_1, NULL);
35625 /* If profitable, negate (without causing overflow) integer constant
35626 of mode MODE at location LOC. Return true in this case. */
35627 bool
35628 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35630 HOST_WIDE_INT val;
35632 if (!CONST_INT_P (*loc))
35633 return false;
35635 switch (mode)
35637 case DImode:
35638 /* DImode x86_64 constants must fit in 32 bits. */
35639 gcc_assert (x86_64_immediate_operand (*loc, mode));
35641 mode = SImode;
35642 break;
35644 case SImode:
35645 case HImode:
35646 case QImode:
35647 break;
35649 default:
35650 gcc_unreachable ();
35653 /* Avoid overflows. */
35654 if (mode_signbit_p (mode, *loc))
35655 return false;
35657 val = INTVAL (*loc);
35659 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35660 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35661 if ((val < 0 && val != -128)
35662 || val == 128)
35664 *loc = GEN_INT (-val);
35665 return true;
35668 return false;
35671 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35672 optabs would emit if we didn't have TFmode patterns. */
35674 void
35675 x86_emit_floatuns (rtx operands[2])
35677 rtx neglab, donelab, i0, i1, f0, in, out;
35678 enum machine_mode mode, inmode;
35680 inmode = GET_MODE (operands[1]);
35681 gcc_assert (inmode == SImode || inmode == DImode);
35683 out = operands[0];
35684 in = force_reg (inmode, operands[1]);
35685 mode = GET_MODE (out);
35686 neglab = gen_label_rtx ();
35687 donelab = gen_label_rtx ();
35688 f0 = gen_reg_rtx (mode);
35690 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35692 expand_float (out, in, 0);
35694 emit_jump_insn (gen_jump (donelab));
35695 emit_barrier ();
35697 emit_label (neglab);
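  /* The value has its sign bit set, so a signed conversion would be wrong.
     Halve it, OR the discarded low bit back in so rounding is unaffected,
     convert, and then double the result.  */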
35699 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35700 1, OPTAB_DIRECT);
35701 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35702 1, OPTAB_DIRECT);
35703 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35705 expand_float (f0, i0, 0);
35707 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35709 emit_label (donelab);
35712 /* AVX2 does support 32-byte integer vector operations,
35713 thus the longest vector we are faced with is V32QImode. */
35714 #define MAX_VECT_LEN 32
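/* Describes one vector permutation to be expanded: PERM holds the NELT
   selector indices into the concatenation of OP0 and OP1, ONE_OPERAND_P
   is set when the permutation reads only one input vector, and TESTING_P
   requests a dry run that only checks whether the permutation can be
   expanded, without emitting insns.  */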
35716 struct expand_vec_perm_d
35718 rtx target, op0, op1;
35719 unsigned char perm[MAX_VECT_LEN];
35720 enum machine_mode vmode;
35721 unsigned char nelt;
35722 bool one_operand_p;
35723 bool testing_p;
35726 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35727 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35728 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35730 /* Get a vector mode of the same size as the original but with elements
35731 twice as wide. This is only guaranteed to apply to integral vectors. */
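/* For example, V16QImode yields V8HImode, and V8HImode yields V4SImode.  */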
35733 static inline enum machine_mode
35734 get_mode_wider_vector (enum machine_mode o)
35736 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35737 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35738 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35739 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35740 return n;
35743 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35744 with all elements equal to VAR. Return true if successful. */
35746 static bool
35747 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35748 rtx target, rtx val)
35750 bool ok;
35752 switch (mode)
35754 case V2SImode:
35755 case V2SFmode:
35756 if (!mmx_ok)
35757 return false;
35758 /* FALLTHRU */
35760 case V4DFmode:
35761 case V4DImode:
35762 case V8SFmode:
35763 case V8SImode:
35764 case V2DFmode:
35765 case V2DImode:
35766 case V4SFmode:
35767 case V4SImode:
35769 rtx insn, dup;
35771 /* First attempt to recognize VAL as-is. */
35772 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35773 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35774 if (recog_memoized (insn) < 0)
35776 rtx seq;
35777 /* If that fails, force VAL into a register. */
35779 start_sequence ();
35780 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35781 seq = get_insns ();
35782 end_sequence ();
35783 if (seq)
35784 emit_insn_before (seq, insn);
35786 ok = recog_memoized (insn) >= 0;
35787 gcc_assert (ok);
35790 return true;
35792 case V4HImode:
35793 if (!mmx_ok)
35794 return false;
35795 if (TARGET_SSE || TARGET_3DNOW_A)
35797 rtx x;
35799 val = gen_lowpart (SImode, val);
35800 x = gen_rtx_TRUNCATE (HImode, val);
35801 x = gen_rtx_VEC_DUPLICATE (mode, x);
35802 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35803 return true;
35805 goto widen;
35807 case V8QImode:
35808 if (!mmx_ok)
35809 return false;
35810 goto widen;
35812 case V8HImode:
35813 if (TARGET_SSE2)
35815 struct expand_vec_perm_d dperm;
35816 rtx tmp1, tmp2;
35818 permute:
35819 memset (&dperm, 0, sizeof (dperm));
35820 dperm.target = target;
35821 dperm.vmode = mode;
35822 dperm.nelt = GET_MODE_NUNITS (mode);
35823 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35824 dperm.one_operand_p = true;
35826 /* Extend to SImode using a paradoxical SUBREG. */
35827 tmp1 = gen_reg_rtx (SImode);
35828 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35830 /* Insert the SImode value as low element of a V4SImode vector. */
35831 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35832 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35834 ok = (expand_vec_perm_1 (&dperm)
35835 || expand_vec_perm_broadcast_1 (&dperm));
35836 gcc_assert (ok);
35837 return ok;
35839 goto widen;
35841 case V16QImode:
35842 if (TARGET_SSE2)
35843 goto permute;
35844 goto widen;
35846 widen:
35847 /* Replicate the value once into the next wider mode and recurse. */
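      /* E.g. for V16QImode, zero-extend the QImode value to HImode, form
	 (val << 8) | val, and broadcast that as a V8HImode vector.  */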
35849 enum machine_mode smode, wsmode, wvmode;
35850 rtx x;
35852 smode = GET_MODE_INNER (mode);
35853 wvmode = get_mode_wider_vector (mode);
35854 wsmode = GET_MODE_INNER (wvmode);
35856 val = convert_modes (wsmode, smode, val, true);
35857 x = expand_simple_binop (wsmode, ASHIFT, val,
35858 GEN_INT (GET_MODE_BITSIZE (smode)),
35859 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35860 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35862 x = gen_lowpart (wvmode, target);
35863 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35864 gcc_assert (ok);
35865 return ok;
35868 case V16HImode:
35869 case V32QImode:
35871 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35872 rtx x = gen_reg_rtx (hvmode);
35874 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35875 gcc_assert (ok);
35877 x = gen_rtx_VEC_CONCAT (mode, x, x);
35878 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35880 return true;
35882 default:
35883 return false;
35887 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35888 whose ONE_VAR element is VAR, and other elements are zero. Return true
35889 if successful. */
35891 static bool
35892 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35893 rtx target, rtx var, int one_var)
35895 enum machine_mode vsimode;
35896 rtx new_target;
35897 rtx x, tmp;
35898 bool use_vector_set = false;
35900 switch (mode)
35902 case V2DImode:
35903 /* For SSE4.1, we normally use vector set. But if the second
35904 element is zero and inter-unit moves are OK, we use movq
35905 instead. */
35906 use_vector_set = (TARGET_64BIT
35907 && TARGET_SSE4_1
35908 && !(TARGET_INTER_UNIT_MOVES
35909 && one_var == 0));
35910 break;
35911 case V16QImode:
35912 case V4SImode:
35913 case V4SFmode:
35914 use_vector_set = TARGET_SSE4_1;
35915 break;
35916 case V8HImode:
35917 use_vector_set = TARGET_SSE2;
35918 break;
35919 case V4HImode:
35920 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35921 break;
35922 case V32QImode:
35923 case V16HImode:
35924 case V8SImode:
35925 case V8SFmode:
35926 case V4DFmode:
35927 use_vector_set = TARGET_AVX;
35928 break;
35929 case V4DImode:
35930 /* Use ix86_expand_vector_set in 64bit mode only. */
35931 use_vector_set = TARGET_AVX && TARGET_64BIT;
35932 break;
35933 default:
35934 break;
35937 if (use_vector_set)
35939 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35940 var = force_reg (GET_MODE_INNER (mode), var);
35941 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35942 return true;
35945 switch (mode)
35947 case V2SFmode:
35948 case V2SImode:
35949 if (!mmx_ok)
35950 return false;
35951 /* FALLTHRU */
35953 case V2DFmode:
35954 case V2DImode:
35955 if (one_var != 0)
35956 return false;
35957 var = force_reg (GET_MODE_INNER (mode), var);
35958 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35959 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35960 return true;
35962 case V4SFmode:
35963 case V4SImode:
35964 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35965 new_target = gen_reg_rtx (mode);
35966 else
35967 new_target = target;
35968 var = force_reg (GET_MODE_INNER (mode), var);
35969 x = gen_rtx_VEC_DUPLICATE (mode, var);
35970 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35971 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35972 if (one_var != 0)
35974 /* We need to shuffle the value to the correct position, so
35975 create a new pseudo to store the intermediate result. */
35977 /* With SSE2, we can use the integer shuffle insns. */
35978 if (mode != V4SFmode && TARGET_SSE2)
35980 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35981 const1_rtx,
35982 GEN_INT (one_var == 1 ? 0 : 1),
35983 GEN_INT (one_var == 2 ? 0 : 1),
35984 GEN_INT (one_var == 3 ? 0 : 1)));
35985 if (target != new_target)
35986 emit_move_insn (target, new_target);
35987 return true;
35990 /* Otherwise convert the intermediate result to V4SFmode and
35991 use the SSE1 shuffle instructions. */
35992 if (mode != V4SFmode)
35994 tmp = gen_reg_rtx (V4SFmode);
35995 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35997 else
35998 tmp = new_target;
36000 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36001 const1_rtx,
36002 GEN_INT (one_var == 1 ? 0 : 1),
36003 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36004 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36006 if (mode != V4SFmode)
36007 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36008 else if (tmp != target)
36009 emit_move_insn (target, tmp);
36011 else if (target != new_target)
36012 emit_move_insn (target, new_target);
36013 return true;
36015 case V8HImode:
36016 case V16QImode:
36017 vsimode = V4SImode;
36018 goto widen;
36019 case V4HImode:
36020 case V8QImode:
36021 if (!mmx_ok)
36022 return false;
36023 vsimode = V2SImode;
36024 goto widen;
36025 widen:
36026 if (one_var != 0)
36027 return false;
36029 /* Zero extend the variable element to SImode and recurse. */
36030 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36032 x = gen_reg_rtx (vsimode);
36033 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36034 var, one_var))
36035 gcc_unreachable ();
36037 emit_move_insn (target, gen_lowpart (mode, x));
36038 return true;
36040 default:
36041 return false;
36045 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36046 consisting of the values in VALS. It is known that all elements
36047 except ONE_VAR are constants. Return true if successful. */
36049 static bool
36050 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36051 rtx target, rtx vals, int one_var)
36053 rtx var = XVECEXP (vals, 0, one_var);
36054 enum machine_mode wmode;
36055 rtx const_vec, x;
36057 const_vec = copy_rtx (vals);
36058 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36059 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36061 switch (mode)
36063 case V2DFmode:
36064 case V2DImode:
36065 case V2SFmode:
36066 case V2SImode:
36067 /* For the two element vectors, it's just as easy to use
36068 the general case. */
36069 return false;
36071 case V4DImode:
36072 /* Use ix86_expand_vector_set in 64bit mode only. */
36073 if (!TARGET_64BIT)
36074 return false;
36075 case V4DFmode:
36076 case V8SFmode:
36077 case V8SImode:
36078 case V16HImode:
36079 case V32QImode:
36080 case V4SFmode:
36081 case V4SImode:
36082 case V8HImode:
36083 case V4HImode:
36084 break;
36086 case V16QImode:
36087 if (TARGET_SSE4_1)
36088 break;
36089 wmode = V8HImode;
36090 goto widen;
36091 case V8QImode:
36092 wmode = V4HImode;
36093 goto widen;
36094 widen:
36095 /* There's no way to set one QImode entry easily. Combine
36096 the variable value with its adjacent constant value, and
36097 promote to an HImode set. */
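      /* E.g. to set byte 3 of a V16QImode vector we build the HImode value
	 (VAR << 8) | <constant byte 2> and set HImode element 1 instead.  */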
36098 x = XVECEXP (vals, 0, one_var ^ 1);
36099 if (one_var & 1)
36101 var = convert_modes (HImode, QImode, var, true);
36102 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36103 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36104 x = GEN_INT (INTVAL (x) & 0xff);
36106 else
36108 var = convert_modes (HImode, QImode, var, true);
36109 x = gen_int_mode (INTVAL (x) << 8, HImode);
36111 if (x != const0_rtx)
36112 var = expand_simple_binop (HImode, IOR, var, x, var,
36113 1, OPTAB_LIB_WIDEN);
36115 x = gen_reg_rtx (wmode);
36116 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36117 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36119 emit_move_insn (target, gen_lowpart (mode, x));
36120 return true;
36122 default:
36123 return false;
36126 emit_move_insn (target, const_vec);
36127 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36128 return true;
36131 /* A subroutine of ix86_expand_vector_init_general. Use vector
36132 concatenate to handle the most general case: all values variable,
36133 and none identical. */
36135 static void
36136 ix86_expand_vector_init_concat (enum machine_mode mode,
36137 rtx target, rtx *ops, int n)
36139 enum machine_mode cmode, hmode = VOIDmode;
36140 rtx first[8], second[4];
36141 rtvec v;
36142 int i, j;
36144 switch (n)
36146 case 2:
36147 switch (mode)
36149 case V8SImode:
36150 cmode = V4SImode;
36151 break;
36152 case V8SFmode:
36153 cmode = V4SFmode;
36154 break;
36155 case V4DImode:
36156 cmode = V2DImode;
36157 break;
36158 case V4DFmode:
36159 cmode = V2DFmode;
36160 break;
36161 case V4SImode:
36162 cmode = V2SImode;
36163 break;
36164 case V4SFmode:
36165 cmode = V2SFmode;
36166 break;
36167 case V2DImode:
36168 cmode = DImode;
36169 break;
36170 case V2SImode:
36171 cmode = SImode;
36172 break;
36173 case V2DFmode:
36174 cmode = DFmode;
36175 break;
36176 case V2SFmode:
36177 cmode = SFmode;
36178 break;
36179 default:
36180 gcc_unreachable ();
36183 if (!register_operand (ops[1], cmode))
36184 ops[1] = force_reg (cmode, ops[1]);
36185 if (!register_operand (ops[0], cmode))
36186 ops[0] = force_reg (cmode, ops[0]);
36187 emit_insn (gen_rtx_SET (VOIDmode, target,
36188 gen_rtx_VEC_CONCAT (mode, ops[0],
36189 ops[1])));
36190 break;
36192 case 4:
36193 switch (mode)
36195 case V4DImode:
36196 cmode = V2DImode;
36197 break;
36198 case V4DFmode:
36199 cmode = V2DFmode;
36200 break;
36201 case V4SImode:
36202 cmode = V2SImode;
36203 break;
36204 case V4SFmode:
36205 cmode = V2SFmode;
36206 break;
36207 default:
36208 gcc_unreachable ();
36210 goto half;
36212 case 8:
36213 switch (mode)
36215 case V8SImode:
36216 cmode = V2SImode;
36217 hmode = V4SImode;
36218 break;
36219 case V8SFmode:
36220 cmode = V2SFmode;
36221 hmode = V4SFmode;
36222 break;
36223 default:
36224 gcc_unreachable ();
36226 goto half;
36228 half:
36229 /* FIXME: We process inputs backward to help RA. PR 36222. */
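      /* E.g. a V8SFmode init from 8 scalars first builds four V2SFmode
	 pairs, then two V4SFmode halves, and finally concatenates those
	 into the target.  */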
36230 i = n - 1;
36231 j = (n >> 1) - 1;
36232 for (; i > 0; i -= 2, j--)
36234 first[j] = gen_reg_rtx (cmode);
36235 v = gen_rtvec (2, ops[i - 1], ops[i]);
36236 ix86_expand_vector_init (false, first[j],
36237 gen_rtx_PARALLEL (cmode, v));
36240 n >>= 1;
36241 if (n > 2)
36243 gcc_assert (hmode != VOIDmode);
36244 for (i = j = 0; i < n; i += 2, j++)
36246 second[j] = gen_reg_rtx (hmode);
36247 ix86_expand_vector_init_concat (hmode, second [j],
36248 &first [i], 2);
36250 n >>= 1;
36251 ix86_expand_vector_init_concat (mode, target, second, n);
36253 else
36254 ix86_expand_vector_init_concat (mode, target, first, n);
36255 break;
36257 default:
36258 gcc_unreachable ();
36262 /* A subroutine of ix86_expand_vector_init_general. Use vector
36263 interleave to handle the most general case: all values variable,
36264 and none identical. */
36266 static void
36267 ix86_expand_vector_init_interleave (enum machine_mode mode,
36268 rtx target, rtx *ops, int n)
36270 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36271 int i, j;
36272 rtx op0, op1;
36273 rtx (*gen_load_even) (rtx, rtx, rtx);
36274 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36275 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36277 switch (mode)
36279 case V8HImode:
36280 gen_load_even = gen_vec_setv8hi;
36281 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36282 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36283 inner_mode = HImode;
36284 first_imode = V4SImode;
36285 second_imode = V2DImode;
36286 third_imode = VOIDmode;
36287 break;
36288 case V16QImode:
36289 gen_load_even = gen_vec_setv16qi;
36290 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36291 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36292 inner_mode = QImode;
36293 first_imode = V8HImode;
36294 second_imode = V4SImode;
36295 third_imode = V2DImode;
36296 break;
36297 default:
36298 gcc_unreachable ();
36301 for (i = 0; i < n; i++)
36303       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
36304 op0 = gen_reg_rtx (SImode);
36305 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36307 /* Insert the SImode value as low element of V4SImode vector. */
36308 op1 = gen_reg_rtx (V4SImode);
36309 op0 = gen_rtx_VEC_MERGE (V4SImode,
36310 gen_rtx_VEC_DUPLICATE (V4SImode,
36311 op0),
36312 CONST0_RTX (V4SImode),
36313 const1_rtx);
36314 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36316       /* Cast the V4SImode vector back to a vector in the original mode.  */
36317 op0 = gen_reg_rtx (mode);
36318 emit_move_insn (op0, gen_lowpart (mode, op1));
36320       /* Load even elements into the second position.  */
36321 emit_insn (gen_load_even (op0,
36322 force_reg (inner_mode,
36323 ops [i + i + 1]),
36324 const1_rtx));
36326 /* Cast vector to FIRST_IMODE vector. */
36327 ops[i] = gen_reg_rtx (first_imode);
36328 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36331 /* Interleave low FIRST_IMODE vectors. */
36332 for (i = j = 0; i < n; i += 2, j++)
36334 op0 = gen_reg_rtx (first_imode);
36335 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36337 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36338 ops[j] = gen_reg_rtx (second_imode);
36339 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36342 /* Interleave low SECOND_IMODE vectors. */
36343 switch (second_imode)
36345 case V4SImode:
36346 for (i = j = 0; i < n / 2; i += 2, j++)
36348 op0 = gen_reg_rtx (second_imode);
36349 emit_insn (gen_interleave_second_low (op0, ops[i],
36350 ops[i + 1]));
36352 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36353 vector. */
36354 ops[j] = gen_reg_rtx (third_imode);
36355 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36357 second_imode = V2DImode;
36358 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36359 /* FALLTHRU */
36361 case V2DImode:
36362 op0 = gen_reg_rtx (second_imode);
36363 emit_insn (gen_interleave_second_low (op0, ops[0],
36364 ops[1]));
36366       /* Cast the SECOND_IMODE vector back to a vector in the original
36367 	 mode.  */
36368 emit_insn (gen_rtx_SET (VOIDmode, target,
36369 gen_lowpart (mode, op0)));
36370 break;
36372 default:
36373 gcc_unreachable ();
36377 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36378 all values variable, and none identical. */
36380 static void
36381 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36382 rtx target, rtx vals)
36384 rtx ops[32], op0, op1;
36385 enum machine_mode half_mode = VOIDmode;
36386 int n, i;
36388 switch (mode)
36390 case V2SFmode:
36391 case V2SImode:
36392 if (!mmx_ok && !TARGET_SSE)
36393 break;
36394 /* FALLTHRU */
36396 case V8SFmode:
36397 case V8SImode:
36398 case V4DFmode:
36399 case V4DImode:
36400 case V4SFmode:
36401 case V4SImode:
36402 case V2DFmode:
36403 case V2DImode:
36404 n = GET_MODE_NUNITS (mode);
36405 for (i = 0; i < n; i++)
36406 ops[i] = XVECEXP (vals, 0, i);
36407 ix86_expand_vector_init_concat (mode, target, ops, n);
36408 return;
36410 case V32QImode:
36411 half_mode = V16QImode;
36412 goto half;
36414 case V16HImode:
36415 half_mode = V8HImode;
36416 goto half;
36418 half:
36419 n = GET_MODE_NUNITS (mode);
36420 for (i = 0; i < n; i++)
36421 ops[i] = XVECEXP (vals, 0, i);
36422 op0 = gen_reg_rtx (half_mode);
36423 op1 = gen_reg_rtx (half_mode);
36424 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36425 n >> 2);
36426 ix86_expand_vector_init_interleave (half_mode, op1,
36427 &ops [n >> 1], n >> 2);
36428 emit_insn (gen_rtx_SET (VOIDmode, target,
36429 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36430 return;
36432 case V16QImode:
36433 if (!TARGET_SSE4_1)
36434 break;
36435 /* FALLTHRU */
36437 case V8HImode:
36438 if (!TARGET_SSE2)
36439 break;
36441 /* Don't use ix86_expand_vector_init_interleave if we can't
36442 move from GPR to SSE register directly. */
36443 if (!TARGET_INTER_UNIT_MOVES)
36444 break;
36446 n = GET_MODE_NUNITS (mode);
36447 for (i = 0; i < n; i++)
36448 ops[i] = XVECEXP (vals, 0, i);
36449 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36450 return;
36452 case V4HImode:
36453 case V8QImode:
36454 break;
36456 default:
36457 gcc_unreachable ();
36461 int i, j, n_elts, n_words, n_elt_per_word;
36462 enum machine_mode inner_mode;
36463 rtx words[4], shift;
36465 inner_mode = GET_MODE_INNER (mode);
36466 n_elts = GET_MODE_NUNITS (mode);
36467 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36468 n_elt_per_word = n_elts / n_words;
36469 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36471 for (i = 0; i < n_words; ++i)
36473 rtx word = NULL_RTX;
36475 for (j = 0; j < n_elt_per_word; ++j)
36477 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36478 elt = convert_modes (word_mode, inner_mode, elt, true);
36480 if (j == 0)
36481 word = elt;
36482 else
36484 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36485 word, 1, OPTAB_LIB_WIDEN);
36486 word = expand_simple_binop (word_mode, IOR, word, elt,
36487 word, 1, OPTAB_LIB_WIDEN);
36491 words[i] = word;
36494 if (n_words == 1)
36495 emit_move_insn (target, gen_lowpart (mode, words[0]));
36496 else if (n_words == 2)
36498 rtx tmp = gen_reg_rtx (mode);
36499 emit_clobber (tmp);
36500 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36501 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36502 emit_move_insn (target, tmp);
36504 else if (n_words == 4)
36506 rtx tmp = gen_reg_rtx (V4SImode);
36507 gcc_assert (word_mode == SImode);
36508 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36509 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36510 emit_move_insn (target, gen_lowpart (mode, tmp));
36512 else
36513 gcc_unreachable ();
36517 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36518 instructions unless MMX_OK is true. */
36520 void
36521 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36523 enum machine_mode mode = GET_MODE (target);
36524 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36525 int n_elts = GET_MODE_NUNITS (mode);
36526 int n_var = 0, one_var = -1;
36527 bool all_same = true, all_const_zero = true;
36528 int i;
36529 rtx x;
36531 for (i = 0; i < n_elts; ++i)
36533 x = XVECEXP (vals, 0, i);
36534 if (!(CONST_INT_P (x)
36535 || GET_CODE (x) == CONST_DOUBLE
36536 || GET_CODE (x) == CONST_FIXED))
36537 n_var++, one_var = i;
36538 else if (x != CONST0_RTX (inner_mode))
36539 all_const_zero = false;
36540 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36541 all_same = false;
36544 /* Constants are best loaded from the constant pool. */
36545 if (n_var == 0)
36547 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36548 return;
36551 /* If all values are identical, broadcast the value. */
36552 if (all_same
36553 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36554 XVECEXP (vals, 0, 0)))
36555 return;
36557 /* Values where only one field is non-constant are best loaded from
36558 the pool and overwritten via move later. */
36559 if (n_var == 1)
36561 if (all_const_zero
36562 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36563 XVECEXP (vals, 0, one_var),
36564 one_var))
36565 return;
36567 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36568 return;
36571 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36574 void
36575 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36577 enum machine_mode mode = GET_MODE (target);
36578 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36579 enum machine_mode half_mode;
36580 bool use_vec_merge = false;
36581 rtx tmp;
36582 static rtx (*gen_extract[6][2]) (rtx, rtx)
36584 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36585 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36586 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36587 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36588 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36589 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36591 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36593 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36594 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36595 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36596 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36597 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36598 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36600 int i, j, n;
36602 switch (mode)
36604 case V2SFmode:
36605 case V2SImode:
36606 if (mmx_ok)
36608 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36609 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36610 if (elt == 0)
36611 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36612 else
36613 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36614 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36615 return;
36617 break;
36619 case V2DImode:
36620 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36621 if (use_vec_merge)
36622 break;
36624 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36625 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36626 if (elt == 0)
36627 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36628 else
36629 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36630 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36631 return;
36633 case V2DFmode:
36635 rtx op0, op1;
36637 /* For the two element vectors, we implement a VEC_CONCAT with
36638 the extraction of the other element. */
36640 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36641 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36643 if (elt == 0)
36644 op0 = val, op1 = tmp;
36645 else
36646 op0 = tmp, op1 = val;
36648 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36649 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36651 return;
36653 case V4SFmode:
36654 use_vec_merge = TARGET_SSE4_1;
36655 if (use_vec_merge)
36656 break;
36658 switch (elt)
36660 case 0:
36661 use_vec_merge = true;
36662 break;
36664 case 1:
36665 /* tmp = target = A B C D */
36666 tmp = copy_to_reg (target);
36667 /* target = A A B B */
36668 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36669 /* target = X A B B */
36670 ix86_expand_vector_set (false, target, val, 0);
36671 /* target = A X C D */
36672 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36673 const1_rtx, const0_rtx,
36674 GEN_INT (2+4), GEN_INT (3+4)));
36675 return;
36677 case 2:
36678 /* tmp = target = A B C D */
36679 tmp = copy_to_reg (target);
36680 /* tmp = X B C D */
36681 ix86_expand_vector_set (false, tmp, val, 0);
36682 /* target = A B X D */
36683 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36684 const0_rtx, const1_rtx,
36685 GEN_INT (0+4), GEN_INT (3+4)));
36686 return;
36688 case 3:
36689 /* tmp = target = A B C D */
36690 tmp = copy_to_reg (target);
36691 /* tmp = X B C D */
36692 ix86_expand_vector_set (false, tmp, val, 0);
36693 /* target = A B X D */
36694 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36695 const0_rtx, const1_rtx,
36696 GEN_INT (2+4), GEN_INT (0+4)));
36697 return;
36699 default:
36700 gcc_unreachable ();
36702 break;
36704 case V4SImode:
36705 use_vec_merge = TARGET_SSE4_1;
36706 if (use_vec_merge)
36707 break;
36709 /* Element 0 handled by vec_merge below. */
36710 if (elt == 0)
36712 use_vec_merge = true;
36713 break;
36716 if (TARGET_SSE2)
36718 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36719 store into element 0, then shuffle them back. */
36721 rtx order[4];
36723 order[0] = GEN_INT (elt);
36724 order[1] = const1_rtx;
36725 order[2] = const2_rtx;
36726 order[3] = GEN_INT (3);
36727 order[elt] = const0_rtx;
36729 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36730 order[1], order[2], order[3]));
36732 ix86_expand_vector_set (false, target, val, 0);
36734 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36735 order[1], order[2], order[3]));
36737 else
36739 /* For SSE1, we have to reuse the V4SF code. */
36740 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36741 gen_lowpart (SFmode, val), elt);
36743 return;
36745 case V8HImode:
36746 use_vec_merge = TARGET_SSE2;
36747 break;
36748 case V4HImode:
36749 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36750 break;
36752 case V16QImode:
36753 use_vec_merge = TARGET_SSE4_1;
36754 break;
36756 case V8QImode:
36757 break;
36759 case V32QImode:
36760 half_mode = V16QImode;
36761 j = 0;
36762 n = 16;
36763 goto half;
36765 case V16HImode:
36766 half_mode = V8HImode;
36767 j = 1;
36768 n = 8;
36769 goto half;
36771 case V8SImode:
36772 half_mode = V4SImode;
36773 j = 2;
36774 n = 4;
36775 goto half;
36777 case V4DImode:
36778 half_mode = V2DImode;
36779 j = 3;
36780 n = 2;
36781 goto half;
36783 case V8SFmode:
36784 half_mode = V4SFmode;
36785 j = 4;
36786 n = 4;
36787 goto half;
36789 case V4DFmode:
36790 half_mode = V2DFmode;
36791 j = 5;
36792 n = 2;
36793 goto half;
36795 half:
36796 /* Compute offset. */
36797 i = elt / n;
36798 elt %= n;
36800 gcc_assert (i <= 1);
36802 /* Extract the half. */
36803 tmp = gen_reg_rtx (half_mode);
36804 emit_insn (gen_extract[j][i] (tmp, target));
36806 /* Put val in tmp at elt. */
36807 ix86_expand_vector_set (false, tmp, val, elt);
36809 /* Put it back. */
36810 emit_insn (gen_insert[j][i] (target, target, tmp));
36811 return;
36813 default:
36814 break;
36817 if (use_vec_merge)
36819 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36820 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36821 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36823 else
36825 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36827 emit_move_insn (mem, target);
36829 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36830 emit_move_insn (tmp, val);
36832 emit_move_insn (target, mem);
36836 void
36837 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36839 enum machine_mode mode = GET_MODE (vec);
36840 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36841 bool use_vec_extr = false;
36842 rtx tmp;
36844 switch (mode)
36846 case V2SImode:
36847 case V2SFmode:
36848 if (!mmx_ok)
36849 break;
36850 /* FALLTHRU */
36852 case V2DFmode:
36853 case V2DImode:
36854 use_vec_extr = true;
36855 break;
36857 case V4SFmode:
36858 use_vec_extr = TARGET_SSE4_1;
36859 if (use_vec_extr)
36860 break;
36862 switch (elt)
36864 case 0:
36865 tmp = vec;
36866 break;
36868 case 1:
36869 case 3:
36870 tmp = gen_reg_rtx (mode);
36871 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36872 GEN_INT (elt), GEN_INT (elt),
36873 GEN_INT (elt+4), GEN_INT (elt+4)));
36874 break;
36876 case 2:
36877 tmp = gen_reg_rtx (mode);
36878 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36879 break;
36881 default:
36882 gcc_unreachable ();
36884 vec = tmp;
36885 use_vec_extr = true;
36886 elt = 0;
36887 break;
36889 case V4SImode:
36890 use_vec_extr = TARGET_SSE4_1;
36891 if (use_vec_extr)
36892 break;
36894 if (TARGET_SSE2)
36896 switch (elt)
36898 case 0:
36899 tmp = vec;
36900 break;
36902 case 1:
36903 case 3:
36904 tmp = gen_reg_rtx (mode);
36905 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36906 GEN_INT (elt), GEN_INT (elt),
36907 GEN_INT (elt), GEN_INT (elt)));
36908 break;
36910 case 2:
36911 tmp = gen_reg_rtx (mode);
36912 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36913 break;
36915 default:
36916 gcc_unreachable ();
36918 vec = tmp;
36919 use_vec_extr = true;
36920 elt = 0;
36922 else
36924 /* For SSE1, we have to reuse the V4SF code. */
36925 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36926 gen_lowpart (V4SFmode, vec), elt);
36927 return;
36929 break;
36931 case V8HImode:
36932 use_vec_extr = TARGET_SSE2;
36933 break;
36934 case V4HImode:
36935 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36936 break;
36938 case V16QImode:
36939 use_vec_extr = TARGET_SSE4_1;
36940 break;
36942 case V8SFmode:
36943 if (TARGET_AVX)
36945 tmp = gen_reg_rtx (V4SFmode);
36946 if (elt < 4)
36947 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36948 else
36949 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36950 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36951 return;
36953 break;
36955 case V4DFmode:
36956 if (TARGET_AVX)
36958 tmp = gen_reg_rtx (V2DFmode);
36959 if (elt < 2)
36960 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36961 else
36962 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36963 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36964 return;
36966 break;
36968 case V32QImode:
36969 if (TARGET_AVX)
36971 tmp = gen_reg_rtx (V16QImode);
36972 if (elt < 16)
36973 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36974 else
36975 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36976 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36977 return;
36979 break;
36981 case V16HImode:
36982 if (TARGET_AVX)
36984 tmp = gen_reg_rtx (V8HImode);
36985 if (elt < 8)
36986 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36987 else
36988 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36989 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36990 return;
36992 break;
36994 case V8SImode:
36995 if (TARGET_AVX)
36997 tmp = gen_reg_rtx (V4SImode);
36998 if (elt < 4)
36999 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37000 else
37001 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37002 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37003 return;
37005 break;
37007 case V4DImode:
37008 if (TARGET_AVX)
37010 tmp = gen_reg_rtx (V2DImode);
37011 if (elt < 2)
37012 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37013 else
37014 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37015 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37016 return;
37018 break;
37020 case V8QImode:
37021 /* ??? Could extract the appropriate HImode element and shift. */
37022 default:
37023 break;
37026 if (use_vec_extr)
37028 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37029 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37031 /* Let the rtl optimizers know about the zero extension performed. */
37032 if (inner_mode == QImode || inner_mode == HImode)
37034 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37035 target = gen_lowpart (SImode, target);
37038 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37040 else
37042 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37044 emit_move_insn (mem, vec);
37046 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37047 emit_move_insn (target, tmp);
37051 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37052 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37053 The upper bits of DEST are undefined, though they shouldn't cause
37054 exceptions (some bits from src or all zeros are ok). */
37056 static void
37057 emit_reduc_half (rtx dest, rtx src, int i)
37059 rtx tem;
37060 switch (GET_MODE (src))
37062 case V4SFmode:
37063 if (i == 128)
37064 tem = gen_sse_movhlps (dest, src, src);
37065 else
37066 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37067 GEN_INT (1 + 4), GEN_INT (1 + 4));
37068 break;
37069 case V2DFmode:
37070 tem = gen_vec_interleave_highv2df (dest, src, src);
37071 break;
37072 case V16QImode:
37073 case V8HImode:
37074 case V4SImode:
37075 case V2DImode:
37076 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37077 gen_lowpart (V1TImode, src),
37078 GEN_INT (i / 2));
37079 break;
37080 case V8SFmode:
37081 if (i == 256)
37082 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37083 else
37084 tem = gen_avx_shufps256 (dest, src, src,
37085 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37086 break;
37087 case V4DFmode:
37088 if (i == 256)
37089 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37090 else
37091 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37092 break;
37093 case V32QImode:
37094 case V16HImode:
37095 case V8SImode:
37096 case V4DImode:
37097 if (i == 256)
37098 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37099 gen_lowpart (V4DImode, src),
37100 gen_lowpart (V4DImode, src),
37101 const1_rtx);
37102 else
37103 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37104 gen_lowpart (V2TImode, src),
37105 GEN_INT (i / 2));
37106 break;
37107 default:
37108 gcc_unreachable ();
37110 emit_insn (tem);
37113 /* Expand a vector reduction. FN is the binary pattern to reduce;
37114 DEST is the destination; IN is the input vector. */
37116 void
37117 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37119 rtx half, dst, vec = in;
37120 enum machine_mode mode = GET_MODE (in);
37121 int i;
37123 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37124 if (TARGET_SSE4_1
37125 && mode == V8HImode
37126 && fn == gen_uminv8hi3)
37128 emit_insn (gen_sse4_1_phminposuw (dest, in));
37129 return;
37132 for (i = GET_MODE_BITSIZE (mode);
37133 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37134 i >>= 1)
37136 half = gen_reg_rtx (mode);
37137 emit_reduc_half (half, vec, i);
37138 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37139 dst = dest;
37140 else
37141 dst = gen_reg_rtx (mode);
37142 emit_insn (fn (dst, half, vec));
37143 vec = dst;
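/* Illustrative scalar sketch, not part of the build: with FN standing for
   the reduction operation (e.g. max), the loop above is equivalent to

     n = number of elements;
     while (n > 1)
       {
         n = n / 2;
         for (k = 0; k < n; k++)
           v[k] = FN (v[k], v[k + n]);
       }

   so after log2(n) halving steps element 0 of the destination holds the
   reduced value and the remaining lanes are don't-cares.  */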
37147 /* Target hook for scalar_mode_supported_p. */
37148 static bool
37149 ix86_scalar_mode_supported_p (enum machine_mode mode)
37151 if (DECIMAL_FLOAT_MODE_P (mode))
37152 return default_decimal_float_supported_p ();
37153 else if (mode == TFmode)
37154 return true;
37155 else
37156 return default_scalar_mode_supported_p (mode);
37159 /* Implements target hook vector_mode_supported_p. */
37160 static bool
37161 ix86_vector_mode_supported_p (enum machine_mode mode)
37163 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37164 return true;
37165 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37166 return true;
37167 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37168 return true;
37169 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37170 return true;
37171 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37172 return true;
37173 return false;
37176 /* Target hook for c_mode_for_suffix. */
37177 static enum machine_mode
37178 ix86_c_mode_for_suffix (char suffix)
37180 if (suffix == 'q')
37181 return TFmode;
37182 if (suffix == 'w')
37183 return XFmode;
37185 return VOIDmode;
37188 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37190 We do this in the new i386 backend to maintain source compatibility
37191 with the old cc0-based compiler. */
37193 static tree
37194 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37195 tree inputs ATTRIBUTE_UNUSED,
37196 tree clobbers)
37198 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37199 clobbers);
37200 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37201 clobbers);
37202 return clobbers;
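/* Illustrative note, not part of the build: the effect is that an inline

     asm ("..." : "=r" (x) : "r" (y));

   is expanded as if "flags" and "fpsr" had also appeared in its clobber
   list.  */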
37205 /* Implements target vector targetm.asm.encode_section_info. */
37207 static void ATTRIBUTE_UNUSED
37208 ix86_encode_section_info (tree decl, rtx rtl, int first)
37210 default_encode_section_info (decl, rtl, first);
37212 if (TREE_CODE (decl) == VAR_DECL
37213 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37214 && ix86_in_large_data_p (decl))
37215 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37218 /* Worker function for REVERSE_CONDITION. */
37220 enum rtx_code
37221 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37223 return (mode != CCFPmode && mode != CCFPUmode
37224 ? reverse_condition (code)
37225 : reverse_condition_maybe_unordered (code));
37228 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37229 to OPERANDS[0]. */
37231 const char *
37232 output_387_reg_move (rtx insn, rtx *operands)
37234 if (REG_P (operands[0]))
37236 if (REG_P (operands[1])
37237 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37239 if (REGNO (operands[0]) == FIRST_STACK_REG)
37240 return output_387_ffreep (operands, 0);
37241 return "fstp\t%y0";
37243 if (STACK_TOP_P (operands[0]))
37244 return "fld%Z1\t%y1";
37245 return "fst\t%y0";
37247 else if (MEM_P (operands[0]))
37249 gcc_assert (REG_P (operands[1]));
37250 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37251 return "fstp%Z0\t%y0";
37252 else
37254 /* There is no non-popping store to memory for XFmode.
37255 So if we need one, follow the store with a load. */
37256 if (GET_MODE (operands[0]) == XFmode)
37257 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37258 else
37259 return "fst%Z0\t%y0";
37262 else
37263 gcc_unreachable();
37266 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37267 FP status register is set. */
37269 void
37270 ix86_emit_fp_unordered_jump (rtx label)
37272 rtx reg = gen_reg_rtx (HImode);
37273 rtx temp;
37275 emit_insn (gen_x86_fnstsw_1 (reg));
37277 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37279 emit_insn (gen_x86_sahf_1 (reg));
37281 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37282 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37284 else
37286 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37288 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37289 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37292 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37293 gen_rtx_LABEL_REF (VOIDmode, label),
37294 pc_rtx);
37295 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37297 emit_jump_insn (temp);
37298 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37301 /* Output code to perform a log1p XFmode calculation. */
37303 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37305 rtx label1 = gen_label_rtx ();
37306 rtx label2 = gen_label_rtx ();
37308 rtx tmp = gen_reg_rtx (XFmode);
37309 rtx tmp2 = gen_reg_rtx (XFmode);
37310 rtx test;
37312 emit_insn (gen_absxf2 (tmp, op1));
37313 test = gen_rtx_GE (VOIDmode, tmp,
37314 CONST_DOUBLE_FROM_REAL_VALUE (
37315 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37316 XFmode));
37317 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37319 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37320 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37321 emit_jump (label2);
37323 emit_label (label1);
37324 emit_move_insn (tmp, CONST1_RTX (XFmode));
37325 emit_insn (gen_addxf3 (tmp, op1, tmp));
37326 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37327 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37329 emit_label (label2);
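/* Illustrative scalar sketch, not part of the build: the 0.2928932...
   threshold is 1 - sqrt(2)/2, the domain limit of the x87 fyl2xp1
   instruction, so the sequence above behaves like

     double log1p_sketch (double x)
     {
       const double ln2 = 0.69314718055994530942;

       if (fabs (x) < 0.29289321881345247561810596348408353)
         return ln2 * LOG2P1 (x);
       return ln2 * log2 (1.0 + x);
     }

   where ln2 comes from fldln2 and LOG2P1 stands for the log2 (x + 1)
   computed directly by fyl2xp1, which is more accurate for small x.  */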
37332 /* Emit code for round calculation. */
37333 void ix86_emit_i387_round (rtx op0, rtx op1)
37335 enum machine_mode inmode = GET_MODE (op1);
37336 enum machine_mode outmode = GET_MODE (op0);
37337 rtx e1, e2, res, tmp, tmp1, half;
37338 rtx scratch = gen_reg_rtx (HImode);
37339 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37340 rtx jump_label = gen_label_rtx ();
37341 rtx insn;
37342 rtx (*gen_abs) (rtx, rtx);
37343 rtx (*gen_neg) (rtx, rtx);
37345 switch (inmode)
37347 case SFmode:
37348 gen_abs = gen_abssf2;
37349 break;
37350 case DFmode:
37351 gen_abs = gen_absdf2;
37352 break;
37353 case XFmode:
37354 gen_abs = gen_absxf2;
37355 break;
37356 default:
37357 gcc_unreachable ();
37360 switch (outmode)
37362 case SFmode:
37363 gen_neg = gen_negsf2;
37364 break;
37365 case DFmode:
37366 gen_neg = gen_negdf2;
37367 break;
37368 case XFmode:
37369 gen_neg = gen_negxf2;
37370 break;
37371 case HImode:
37372 gen_neg = gen_neghi2;
37373 break;
37374 case SImode:
37375 gen_neg = gen_negsi2;
37376 break;
37377 case DImode:
37378 gen_neg = gen_negdi2;
37379 break;
37380 default:
37381 gcc_unreachable ();
37384 e1 = gen_reg_rtx (inmode);
37385 e2 = gen_reg_rtx (inmode);
37386 res = gen_reg_rtx (outmode);
37388 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37390 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37392 /* scratch = fxam(op1) */
37393 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37394 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37395 UNSPEC_FXAM)));
37396 /* e1 = fabs(op1) */
37397 emit_insn (gen_abs (e1, op1));
37399 /* e2 = e1 + 0.5 */
37400 half = force_reg (inmode, half);
37401 emit_insn (gen_rtx_SET (VOIDmode, e2,
37402 gen_rtx_PLUS (inmode, e1, half)));
37404 /* res = floor(e2) */
37405 if (inmode != XFmode)
37407 tmp1 = gen_reg_rtx (XFmode);
37409 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37410 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37412 else
37413 tmp1 = e2;
37415 switch (outmode)
37417 case SFmode:
37418 case DFmode:
37420 rtx tmp0 = gen_reg_rtx (XFmode);
37422 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37424 emit_insn (gen_rtx_SET (VOIDmode, res,
37425 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37426 UNSPEC_TRUNC_NOOP)));
37428 break;
37429 case XFmode:
37430 emit_insn (gen_frndintxf2_floor (res, tmp1));
37431 break;
37432 case HImode:
37433 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37434 break;
37435 case SImode:
37436 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37437 break;
37438 case DImode:
37439 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37440 break;
37441 default:
37442 gcc_unreachable ();
37445 /* flags = signbit(a) */
37446 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37448 /* if (flags) then res = -res */
37449 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37450 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37451 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37452 pc_rtx);
37453 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37454 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37455 JUMP_LABEL (insn) = jump_label;
37457 emit_insn (gen_neg (res, res));
37459 emit_label (jump_label);
37460 LABEL_NUSES (jump_label) = 1;
37462 emit_move_insn (op0, res);
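/* Illustrative scalar sketch, not part of the build (function name
   hypothetical): the emitted sequence computes

     double round_sketch (double a)
     {
       double r = floor (fabs (a) + 0.5);
       return signbit (a) ? -r : r;
     }

   with the sign taken from the fxam result (the 0x02 test above), so
   e.g. -0.3 rounds to -0.0 rather than +0.0.  */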
37465 /* Output code to perform a Newton-Raphson approximation of a single precision
37466 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37468 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37470 rtx x0, x1, e0, e1;
37472 x0 = gen_reg_rtx (mode);
37473 e0 = gen_reg_rtx (mode);
37474 e1 = gen_reg_rtx (mode);
37475 x1 = gen_reg_rtx (mode);
37477 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37479 b = force_reg (mode, b);
37481 /* x0 = rcp(b) estimate */
37482 emit_insn (gen_rtx_SET (VOIDmode, x0,
37483 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37484 UNSPEC_RCP)));
37485 /* e0 = x0 * b */
37486 emit_insn (gen_rtx_SET (VOIDmode, e0,
37487 gen_rtx_MULT (mode, x0, b)));
37489 /* e0 = x0 * e0 */
37490 emit_insn (gen_rtx_SET (VOIDmode, e0,
37491 gen_rtx_MULT (mode, x0, e0)));
37493 /* e1 = x0 + x0 */
37494 emit_insn (gen_rtx_SET (VOIDmode, e1,
37495 gen_rtx_PLUS (mode, x0, x0)));
37497 /* x1 = e1 - e0 */
37498 emit_insn (gen_rtx_SET (VOIDmode, x1,
37499 gen_rtx_MINUS (mode, e1, e0)));
37501 /* res = a * x1 */
37502 emit_insn (gen_rtx_SET (VOIDmode, res,
37503 gen_rtx_MULT (mode, a, x1)));
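/* Illustrative scalar sketch, not part of the build (function name
   hypothetical): with x0 ~= 1/b from the rcpps estimate, the emitted
   sequence performs one Newton-Raphson refinement before the final
   multiply:

     static float
     swdiv_sketch (float a, float b, float x0)
     {
       float e0, e1, x1;

       e0 = x0 * b;
       e0 = x0 * e0;
       e1 = x0 + x0;
       x1 = e1 - e0;
       return a * x1;
     }

   i.e. x1 = x0 * (2 - b * x0), which roughly doubles the number of
   correct bits in the reciprocal estimate.  */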
37506 /* Output code to perform a Newton-Raphson approximation of a
37507 single precision floating point [reciprocal] square root. */
37509 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37510 bool recip)
37512 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37513 REAL_VALUE_TYPE r;
37515 x0 = gen_reg_rtx (mode);
37516 e0 = gen_reg_rtx (mode);
37517 e1 = gen_reg_rtx (mode);
37518 e2 = gen_reg_rtx (mode);
37519 e3 = gen_reg_rtx (mode);
37521 real_from_integer (&r, VOIDmode, -3, -1, 0);
37522 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37524 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37525 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37527 if (VECTOR_MODE_P (mode))
37529 mthree = ix86_build_const_vector (mode, true, mthree);
37530 mhalf = ix86_build_const_vector (mode, true, mhalf);
37533 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37534 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37536 a = force_reg (mode, a);
37538 /* x0 = rsqrt(a) estimate */
37539 emit_insn (gen_rtx_SET (VOIDmode, x0,
37540 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37541 UNSPEC_RSQRT)));
37543 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
37544 if (!recip)
37546 rtx zero, mask;
37548 zero = gen_reg_rtx (mode);
37549 mask = gen_reg_rtx (mode);
37551 zero = force_reg (mode, CONST0_RTX(mode));
37552 emit_insn (gen_rtx_SET (VOIDmode, mask,
37553 gen_rtx_NE (mode, zero, a)));
37555 emit_insn (gen_rtx_SET (VOIDmode, x0,
37556 gen_rtx_AND (mode, x0, mask)));
37559 /* e0 = x0 * a */
37560 emit_insn (gen_rtx_SET (VOIDmode, e0,
37561 gen_rtx_MULT (mode, x0, a)));
37562 /* e1 = e0 * x0 */
37563 emit_insn (gen_rtx_SET (VOIDmode, e1,
37564 gen_rtx_MULT (mode, e0, x0)));
37566 /* e2 = e1 - 3. */
37567 mthree = force_reg (mode, mthree);
37568 emit_insn (gen_rtx_SET (VOIDmode, e2,
37569 gen_rtx_PLUS (mode, e1, mthree)));
37571 mhalf = force_reg (mode, mhalf);
37572 if (recip)
37573 /* e3 = -.5 * x0 */
37574 emit_insn (gen_rtx_SET (VOIDmode, e3,
37575 gen_rtx_MULT (mode, x0, mhalf)));
37576 else
37577 /* e3 = -.5 * e0 */
37578 emit_insn (gen_rtx_SET (VOIDmode, e3,
37579 gen_rtx_MULT (mode, e0, mhalf)));
37580 /* ret = e2 * e3 */
37581 emit_insn (gen_rtx_SET (VOIDmode, res,
37582 gen_rtx_MULT (mode, e2, e3)));
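/* Illustrative scalar sketch, not part of the build (function name
   hypothetical): with x0 ~= 1/sqrt(a) from the rsqrtss estimate, the
   emitted sequence computes

     static float
     swsqrt_sketch (float a, float x0, int recip)
     {
       float e0, e1, e2, e3;

       e0 = x0 * a;
       e1 = e0 * x0;
       e2 = e1 - 3.0f;
       e3 = (recip ? x0 : e0) * -0.5f;
       return e2 * e3;
     }

   which is one Newton-Raphson step: rsqrt(a) ~= -0.5 * x0 * (a*x0*x0 - 3),
   and sqrt(a) ~= a * rsqrt(a) by using e0 = a * x0 in place of x0.  */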
37585 #ifdef TARGET_SOLARIS
37586 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37588 static void
37589 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37590 tree decl)
37592 /* With Binutils 2.15, the "@unwind" marker must be specified on
37593 every occurrence of the ".eh_frame" section, not just the first
37594 one. */
37595 if (TARGET_64BIT
37596 && strcmp (name, ".eh_frame") == 0)
37598 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37599 flags & SECTION_WRITE ? "aw" : "a");
37600 return;
37603 #ifndef USE_GAS
37604 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37606 solaris_elf_asm_comdat_section (name, flags, decl);
37607 return;
37609 #endif
37611 default_elf_asm_named_section (name, flags, decl);
37613 #endif /* TARGET_SOLARIS */
37615 /* Return the mangling of TYPE if it is an extended fundamental type. */
37617 static const char *
37618 ix86_mangle_type (const_tree type)
37620 type = TYPE_MAIN_VARIANT (type);
37622 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37623 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37624 return NULL;
37626 switch (TYPE_MODE (type))
37628 case TFmode:
37629 /* __float128 is "g". */
37630 return "g";
37631 case XFmode:
37632 /* "long double" or __float80 is "e". */
37633 return "e";
37634 default:
37635 return NULL;
37639 /* For 32-bit code we can save PIC register setup by using
37640 __stack_chk_fail_local hidden function instead of calling
37641 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37642 register, so it is better to call __stack_chk_fail directly. */
37644 static tree ATTRIBUTE_UNUSED
37645 ix86_stack_protect_fail (void)
37647 return TARGET_64BIT
37648 ? default_external_stack_protect_fail ()
37649 : default_hidden_stack_protect_fail ();
37652 /* Select a format to encode pointers in exception handling data. CODE
37653 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37654 true if the symbol may be affected by dynamic relocations.
37656 ??? All x86 object file formats are capable of representing this.
37657 After all, the relocation needed is the same as for the call insn.
37658 Whether or not a particular assembler allows us to enter such, I
37659 guess we'll have to see. */
37660 int
37661 asm_preferred_eh_data_format (int code, int global)
37663 if (flag_pic)
37665 int type = DW_EH_PE_sdata8;
37666 if (!TARGET_64BIT
37667 || ix86_cmodel == CM_SMALL_PIC
37668 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37669 type = DW_EH_PE_sdata4;
37670 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37672 if (ix86_cmodel == CM_SMALL
37673 || (ix86_cmodel == CM_MEDIUM && code))
37674 return DW_EH_PE_udata4;
37675 return DW_EH_PE_absptr;
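/* Illustrative note, not part of the build: e.g. 64-bit -fpic code with
   the small PIC model gets DW_EH_PE_pcrel | DW_EH_PE_sdata4, with
   DW_EH_PE_indirect added for symbols subject to dynamic relocation;
   large-model PIC keeps sdata8; non-PIC small-model references (and
   medium-model code references) use DW_EH_PE_udata4, and everything else
   falls back to DW_EH_PE_absptr.  */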
37678 /* Expand copysign from SIGN to the positive value ABS_VALUE
37679 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37680 the sign-bit. */
37681 static void
37682 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37684 enum machine_mode mode = GET_MODE (sign);
37685 rtx sgn = gen_reg_rtx (mode);
37686 if (mask == NULL_RTX)
37688 enum machine_mode vmode;
37690 if (mode == SFmode)
37691 vmode = V4SFmode;
37692 else if (mode == DFmode)
37693 vmode = V2DFmode;
37694 else
37695 vmode = mode;
37697 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37698 if (!VECTOR_MODE_P (mode))
37700 /* We need to generate a scalar mode mask in this case. */
37701 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37702 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37703 mask = gen_reg_rtx (mode);
37704 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37707 else
37708 mask = gen_rtx_NOT (mode, mask);
37709 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37710 gen_rtx_AND (mode, mask, sign)));
37711 emit_insn (gen_rtx_SET (VOIDmode, result,
37712 gen_rtx_IOR (mode, abs_value, sgn)));
37715 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37716 mask for masking out the sign-bit is stored in *SMASK, if that is
37717 non-null. */
37718 static rtx
37719 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37721 enum machine_mode vmode, mode = GET_MODE (op0);
37722 rtx xa, mask;
37724 xa = gen_reg_rtx (mode);
37725 if (mode == SFmode)
37726 vmode = V4SFmode;
37727 else if (mode == DFmode)
37728 vmode = V2DFmode;
37729 else
37730 vmode = mode;
37731 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37732 if (!VECTOR_MODE_P (mode))
37734 /* We need to generate a scalar mode mask in this case. */
37735 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37736 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37737 mask = gen_reg_rtx (mode);
37738 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37740 emit_insn (gen_rtx_SET (VOIDmode, xa,
37741 gen_rtx_AND (mode, op0, mask)));
37743 if (smask)
37744 *smask = mask;
37746 return xa;
37749 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37750 swapping the operands if SWAP_OPERANDS is true. The expanded
37751 code is a forward jump to a newly created label in case the
37752 comparison is true. The generated label rtx is returned. */
37753 static rtx
37754 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37755 bool swap_operands)
37757 rtx label, tmp;
37759 if (swap_operands)
37761 tmp = op0;
37762 op0 = op1;
37763 op1 = tmp;
37766 label = gen_label_rtx ();
37767 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37768 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37769 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37770 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37771 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37772 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37773 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37774 JUMP_LABEL (tmp) = label;
37776 return label;
37779 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37780 using comparison code CODE. Operands are swapped for the comparison if
37781 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37782 static rtx
37783 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37784 bool swap_operands)
37786 rtx (*insn)(rtx, rtx, rtx, rtx);
37787 enum machine_mode mode = GET_MODE (op0);
37788 rtx mask = gen_reg_rtx (mode);
37790 if (swap_operands)
37792 rtx tmp = op0;
37793 op0 = op1;
37794 op1 = tmp;
37797 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37799 emit_insn (insn (mask, op0, op1,
37800 gen_rtx_fmt_ee (code, mode, op0, op1)));
37801 return mask;
37804 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37805 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37806 static rtx
37807 ix86_gen_TWO52 (enum machine_mode mode)
37809 REAL_VALUE_TYPE TWO52r;
37810 rtx TWO52;
37812 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37813 TWO52 = const_double_from_real_value (TWO52r, mode);
37814 TWO52 = force_reg (mode, TWO52);
37816 return TWO52;
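/* Illustrative note, not part of the build (function name hypothetical):
   for 0 <= xa < 2**52, the value xa + 2**52 lies in a binade whose spacing
   is 1.0, so assuming the default round-to-nearest mode

     static double
     nearbyint_sketch (double xa)
     {
       return (xa + 4503599627370496.0) - 4503599627370496.0;
     }

   rounds XA to an integer.  The expanders below use TWO52 both as the
   "already an integer" guard and, in the *_32 variants, as this
   add/subtract trick itself.  */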
37819 /* Expand SSE sequence for computing lround from OP1 storing
37820 into OP0. */
37821 void
37822 ix86_expand_lround (rtx op0, rtx op1)
37824 /* C code for the stuff we're doing below:
37825 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37826 return (long)tmp;
37828 enum machine_mode mode = GET_MODE (op1);
37829 const struct real_format *fmt;
37830 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37831 rtx adj;
37833 /* load nextafter (0.5, 0.0) */
37834 fmt = REAL_MODE_FORMAT (mode);
37835 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37836 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37838 /* adj = copysign (0.5, op1) */
37839 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37840 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37842 /* adj = op1 + adj */
37843 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37845 /* op0 = (imode)adj */
37846 expand_fix (op0, adj, 0);
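/* Illustrative note, not part of the build (function name hypothetical):
   nextafter (0.5, 0.0) rather than 0.5 keeps values just below one half
   from being rounded up by the addition itself:

     static long
     lround_sketch (double x)
     {
       double adj = copysign (nextafter (0.5, 0.0), x);
       return (long) (x + adj);
     }

   For x = 0.49999999999999994 this yields 0, whereas x + 0.5 would round
   to exactly 1.0 and the truncation would return 1.  */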
37849 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37850 into OPERAND0. */
37851 void
37852 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37854 /* C code for the stuff we're doing below (for do_floor):
37855 xi = (long)op1;
37856 xi -= (double)xi > op1 ? 1 : 0;
37857 return xi;
37859 enum machine_mode fmode = GET_MODE (op1);
37860 enum machine_mode imode = GET_MODE (op0);
37861 rtx ireg, freg, label, tmp;
37863 /* reg = (long)op1 */
37864 ireg = gen_reg_rtx (imode);
37865 expand_fix (ireg, op1, 0);
37867 /* freg = (double)reg */
37868 freg = gen_reg_rtx (fmode);
37869 expand_float (freg, ireg, 0);
37871 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37872 label = ix86_expand_sse_compare_and_jump (UNLE,
37873 freg, op1, !do_floor);
37874 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37875 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37876 emit_move_insn (ireg, tmp);
37878 emit_label (label);
37879 LABEL_NUSES (label) = 1;
37881 emit_move_insn (op0, ireg);
37884 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37885 result in OPERAND0. */
37886 void
37887 ix86_expand_rint (rtx operand0, rtx operand1)
37889 /* C code for the stuff we're doing below:
37890 xa = fabs (operand1);
37891 if (!isless (xa, 2**52))
37892 return operand1;
37893 xa = xa + 2**52 - 2**52;
37894 return copysign (xa, operand1);
37896 enum machine_mode mode = GET_MODE (operand0);
37897 rtx res, xa, label, TWO52, mask;
37899 res = gen_reg_rtx (mode);
37900 emit_move_insn (res, operand1);
37902 /* xa = abs (operand1) */
37903 xa = ix86_expand_sse_fabs (res, &mask);
37905 /* if (!isless (xa, TWO52)) goto label; */
37906 TWO52 = ix86_gen_TWO52 (mode);
37907 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37909 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37910 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37912 ix86_sse_copysign_to_positive (res, xa, res, mask);
37914 emit_label (label);
37915 LABEL_NUSES (label) = 1;
37917 emit_move_insn (operand0, res);
37920 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37921 into OPERAND0. */
37922 void
37923 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37925 /* C code for the stuff we expand below.
37926 double xa = fabs (x), x2;
37927 if (!isless (xa, TWO52))
37928 return x;
37929 xa = xa + TWO52 - TWO52;
37930 x2 = copysign (xa, x);
37931 Compensate. Floor:
37932 if (x2 > x)
37933 x2 -= 1;
37934 Compensate. Ceil:
37935 if (x2 < x)
37936 x2 -= -1;
37937 return x2;
37939 enum machine_mode mode = GET_MODE (operand0);
37940 rtx xa, TWO52, tmp, label, one, res, mask;
37942 TWO52 = ix86_gen_TWO52 (mode);
37944 /* Temporary for holding the result, initialized to the input
37945 operand to ease control flow. */
37946 res = gen_reg_rtx (mode);
37947 emit_move_insn (res, operand1);
37949 /* xa = abs (operand1) */
37950 xa = ix86_expand_sse_fabs (res, &mask);
37952 /* if (!isless (xa, TWO52)) goto label; */
37953 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37955 /* xa = xa + TWO52 - TWO52; */
37956 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37957 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37959 /* xa = copysign (xa, operand1) */
37960 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37962 /* generate 1.0 or -1.0 */
37963 one = force_reg (mode,
37964 const_double_from_real_value (do_floor
37965 ? dconst1 : dconstm1, mode));
37967 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37968 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37969 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37970 gen_rtx_AND (mode, one, tmp)));
37971 /* We always need to subtract here to preserve signed zero. */
37972 tmp = expand_simple_binop (mode, MINUS,
37973 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37974 emit_move_insn (res, tmp);
37976 emit_label (label);
37977 LABEL_NUSES (label) = 1;
37979 emit_move_insn (operand0, res);
37982 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37983 into OPERAND0. */
37984 void
37985 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37987 /* C code for the stuff we expand below.
37988 double xa = fabs (x), x2;
37989 if (!isless (xa, TWO52))
37990 return x;
37991 x2 = (double)(long)x;
37992 Compensate. Floor:
37993 if (x2 > x)
37994 x2 -= 1;
37995 Compensate. Ceil:
37996 if (x2 < x)
37997 x2 += 1;
37998 if (HONOR_SIGNED_ZEROS (mode))
37999 return copysign (x2, x);
38000 return x2;
38002 enum machine_mode mode = GET_MODE (operand0);
38003 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38005 TWO52 = ix86_gen_TWO52 (mode);
38007 /* Temporary for holding the result, initialized to the input
38008 operand to ease control flow. */
38009 res = gen_reg_rtx (mode);
38010 emit_move_insn (res, operand1);
38012 /* xa = abs (operand1) */
38013 xa = ix86_expand_sse_fabs (res, &mask);
38015 /* if (!isless (xa, TWO52)) goto label; */
38016 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38018 /* xa = (double)(long)x */
38019 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38020 expand_fix (xi, res, 0);
38021 expand_float (xa, xi, 0);
38023 /* generate 1.0 */
38024 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38026 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38027 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38028 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38029 gen_rtx_AND (mode, one, tmp)));
38030 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38031 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38032 emit_move_insn (res, tmp);
38034 if (HONOR_SIGNED_ZEROS (mode))
38035 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38037 emit_label (label);
38038 LABEL_NUSES (label) = 1;
38040 emit_move_insn (operand0, res);
38043 /* Expand SSE sequence for computing round from OPERAND1 storing
38044 into OPERAND0. Sequence that works without relying on DImode truncation
38045 via cvttsd2siq, which is only available on 64-bit targets. */
38046 void
38047 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38049 /* C code for the stuff we expand below.
38050 double xa = fabs (x), xa2, x2;
38051 if (!isless (xa, TWO52))
38052 return x;
38053 Using the absolute value and copying back sign makes
38054 -0.0 -> -0.0 correct.
38055 xa2 = xa + TWO52 - TWO52;
38056 Compensate.
38057 dxa = xa2 - xa;
38058 if (dxa <= -0.5)
38059 xa2 += 1;
38060 else if (dxa > 0.5)
38061 xa2 -= 1;
38062 x2 = copysign (xa2, x);
38063 return x2;
38065 enum machine_mode mode = GET_MODE (operand0);
38066 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38068 TWO52 = ix86_gen_TWO52 (mode);
38070 /* Temporary for holding the result, initialized to the input
38071 operand to ease control flow. */
38072 res = gen_reg_rtx (mode);
38073 emit_move_insn (res, operand1);
38075 /* xa = abs (operand1) */
38076 xa = ix86_expand_sse_fabs (res, &mask);
38078 /* if (!isless (xa, TWO52)) goto label; */
38079 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38081 /* xa2 = xa + TWO52 - TWO52; */
38082 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38083 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38085 /* dxa = xa2 - xa; */
38086 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38088 /* generate 0.5, 1.0 and -0.5 */
38089 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38090 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38091 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38092 0, OPTAB_DIRECT);
38094 /* Compensate. */
38095 tmp = gen_reg_rtx (mode);
38096 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38097 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38098 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38099 gen_rtx_AND (mode, one, tmp)));
38100 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38101 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38102 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38103 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38104 gen_rtx_AND (mode, one, tmp)));
38105 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38107 /* res = copysign (xa2, operand1) */
38108 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38110 emit_label (label);
38111 LABEL_NUSES (label) = 1;
38113 emit_move_insn (operand0, res);
38116 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38117 into OPERAND0. */
38118 void
38119 ix86_expand_trunc (rtx operand0, rtx operand1)
38121 /* C code for SSE variant we expand below.
38122 double xa = fabs (x), x2;
38123 if (!isless (xa, TWO52))
38124 return x;
38125 x2 = (double)(long)x;
38126 if (HONOR_SIGNED_ZEROS (mode))
38127 return copysign (x2, x);
38128 return x2;
38130 enum machine_mode mode = GET_MODE (operand0);
38131 rtx xa, xi, TWO52, label, res, mask;
38133 TWO52 = ix86_gen_TWO52 (mode);
38135 /* Temporary for holding the result, initialized to the input
38136 operand to ease control flow. */
38137 res = gen_reg_rtx (mode);
38138 emit_move_insn (res, operand1);
38140 /* xa = abs (operand1) */
38141 xa = ix86_expand_sse_fabs (res, &mask);
38143 /* if (!isless (xa, TWO52)) goto label; */
38144 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38146 /* x = (double)(long)x */
38147 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38148 expand_fix (xi, res, 0);
38149 expand_float (res, xi, 0);
38151 if (HONOR_SIGNED_ZEROS (mode))
38152 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38154 emit_label (label);
38155 LABEL_NUSES (label) = 1;
38157 emit_move_insn (operand0, res);
38160 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38161 into OPERAND0. */
38162 void
38163 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38165 enum machine_mode mode = GET_MODE (operand0);
38166 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38168 /* C code for SSE variant we expand below.
38169 double xa = fabs (x), x2;
38170 if (!isless (xa, TWO52))
38171 return x;
38172 xa2 = xa + TWO52 - TWO52;
38173 Compensate:
38174 if (xa2 > xa)
38175 xa2 -= 1.0;
38176 x2 = copysign (xa2, x);
38177 return x2;
38180 TWO52 = ix86_gen_TWO52 (mode);
38182 /* Temporary for holding the result, initialized to the input
38183 operand to ease control flow. */
38184 res = gen_reg_rtx (mode);
38185 emit_move_insn (res, operand1);
38187 /* xa = abs (operand1) */
38188 xa = ix86_expand_sse_fabs (res, &smask);
38190 /* if (!isless (xa, TWO52)) goto label; */
38191 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38193 /* res = xa + TWO52 - TWO52; */
38194 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38195 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38196 emit_move_insn (res, tmp);
38198 /* generate 1.0 */
38199 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38201 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38202 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38203 emit_insn (gen_rtx_SET (VOIDmode, mask,
38204 gen_rtx_AND (mode, mask, one)));
38205 tmp = expand_simple_binop (mode, MINUS,
38206 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38207 emit_move_insn (res, tmp);
38209 /* res = copysign (res, operand1) */
38210 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38212 emit_label (label);
38213 LABEL_NUSES (label) = 1;
38215 emit_move_insn (operand0, res);
38218 /* Expand SSE sequence for computing round from OPERAND1 storing
38219 into OPERAND0. */
38220 void
38221 ix86_expand_round (rtx operand0, rtx operand1)
38223 /* C code for the stuff we're doing below:
38224 double xa = fabs (x);
38225 if (!isless (xa, TWO52))
38226 return x;
38227 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38228 return copysign (xa, x);
38230 enum machine_mode mode = GET_MODE (operand0);
38231 rtx res, TWO52, xa, label, xi, half, mask;
38232 const struct real_format *fmt;
38233 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38235 /* Temporary for holding the result, initialized to the input
38236 operand to ease control flow. */
38237 res = gen_reg_rtx (mode);
38238 emit_move_insn (res, operand1);
38240 TWO52 = ix86_gen_TWO52 (mode);
38241 xa = ix86_expand_sse_fabs (res, &mask);
38242 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38244 /* load nextafter (0.5, 0.0) */
38245 fmt = REAL_MODE_FORMAT (mode);
38246 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38247 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38249 /* xa = xa + 0.5 */
38250 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38251 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38253 /* xa = (double)(int64_t)xa */
38254 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38255 expand_fix (xi, xa, 0);
38256 expand_float (xa, xi, 0);
38258 /* res = copysign (xa, operand1) */
38259 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38261 emit_label (label);
38262 LABEL_NUSES (label) = 1;
38264 emit_move_insn (operand0, res);
38267 /* Expand SSE sequence for computing round
38268 from OP1 storing into OP0 using the SSE4.1 round insn. */
38269 void
38270 ix86_expand_round_sse4 (rtx op0, rtx op1)
38272 enum machine_mode mode = GET_MODE (op0);
38273 rtx e1, e2, res, half;
38274 const struct real_format *fmt;
38275 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38276 rtx (*gen_copysign) (rtx, rtx, rtx);
38277 rtx (*gen_round) (rtx, rtx, rtx);
38279 switch (mode)
38281 case SFmode:
38282 gen_copysign = gen_copysignsf3;
38283 gen_round = gen_sse4_1_roundsf2;
38284 break;
38285 case DFmode:
38286 gen_copysign = gen_copysigndf3;
38287 gen_round = gen_sse4_1_rounddf2;
38288 break;
38289 default:
38290 gcc_unreachable ();
38293 /* round (a) = trunc (a + copysign (0.5, a)) */
38295 /* load nextafter (0.5, 0.0) */
38296 fmt = REAL_MODE_FORMAT (mode);
38297 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38298 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38299 half = const_double_from_real_value (pred_half, mode);
38301 /* e1 = copysign (0.5, op1) */
38302 e1 = gen_reg_rtx (mode);
38303 emit_insn (gen_copysign (e1, half, op1));
38305 /* e2 = op1 + e1 */
38306 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38308 /* res = trunc (e2) */
38309 res = gen_reg_rtx (mode);
38310 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38312 emit_move_insn (op0, res);
38316 /* Table of valid machine attributes. */
38317 static const struct attribute_spec ix86_attribute_table[] =
38319 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38320 affects_type_identity } */
38321 /* Stdcall attribute says callee is responsible for popping arguments
38322 if they are not variable. */
38323 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38324 true },
38325 /* Fastcall attribute says callee is responsible for popping arguments
38326 if they are not variable. */
38327 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38328 true },
38329 /* Thiscall attribute says callee is responsible for popping arguments
38330 if they are not variable. */
38331 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38332 true },
38333 /* Cdecl attribute says the callee is a normal C declaration */
38334 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38335 true },
38336 /* Regparm attribute specifies how many integer arguments are to be
38337 passed in registers. */
38338 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38339 true },
38340 /* Sseregparm attribute says we are using x86_64 calling conventions
38341 for FP arguments. */
38342 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38343 true },
38344 /* The transactional memory builtins are implicitly regparm or fastcall
38345 depending on the ABI. Override the generic do-nothing attribute that
38346 these builtins were declared with. */
38347 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38348 true },
38349 /* force_align_arg_pointer says this function realigns the stack at entry. */
38350 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38351 false, true, true, ix86_handle_cconv_attribute, false },
38352 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38353 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38354 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38355 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38356 false },
38357 #endif
38358 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38359 false },
38360 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38361 false },
38362 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38363 SUBTARGET_ATTRIBUTE_TABLE,
38364 #endif
38365 /* ms_abi and sysv_abi calling convention function attributes. */
38366 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38367 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38368 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38369 false },
38370 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38371 ix86_handle_callee_pop_aggregate_return, true },
38372 /* End element. */
38373 { NULL, 0, 0, false, false, false, NULL, false }
38376 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38377 static int
38378 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38379 tree vectype,
38380 int misalign ATTRIBUTE_UNUSED)
38382 unsigned elements;
38384 switch (type_of_cost)
38386 case scalar_stmt:
38387 return ix86_cost->scalar_stmt_cost;
38389 case scalar_load:
38390 return ix86_cost->scalar_load_cost;
38392 case scalar_store:
38393 return ix86_cost->scalar_store_cost;
38395 case vector_stmt:
38396 return ix86_cost->vec_stmt_cost;
38398 case vector_load:
38399 return ix86_cost->vec_align_load_cost;
38401 case vector_store:
38402 return ix86_cost->vec_store_cost;
38404 case vec_to_scalar:
38405 return ix86_cost->vec_to_scalar_cost;
38407 case scalar_to_vec:
38408 return ix86_cost->scalar_to_vec_cost;
38410 case unaligned_load:
38411 case unaligned_store:
38412 return ix86_cost->vec_unalign_load_cost;
38414 case cond_branch_taken:
38415 return ix86_cost->cond_taken_branch_cost;
38417 case cond_branch_not_taken:
38418 return ix86_cost->cond_not_taken_branch_cost;
38420 case vec_perm:
38421 case vec_promote_demote:
38422 return ix86_cost->vec_stmt_cost;
38424 case vec_construct:
38425 elements = TYPE_VECTOR_SUBPARTS (vectype);
38426 return elements / 2 + 1;
38428 default:
38429 gcc_unreachable ();
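/* Illustrative note, not part of the build: the vec_construct cost above
   charges one unit per element pair plus one, e.g. building a V4SF from
   scalars is costed at 4/2 + 1 = 3 and a V8SF at 8/2 + 1 = 5.  */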
38433 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38434 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38435 insn every time. */
38437 static GTY(()) rtx vselect_insn;
38439 /* Initialize vselect_insn. */
38441 static void
38442 init_vselect_insn (void)
38444 unsigned i;
38445 rtx x;
38447 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38448 for (i = 0; i < MAX_VECT_LEN; ++i)
38449 XVECEXP (x, 0, i) = const0_rtx;
38450 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38451 const0_rtx), x);
38452 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38453 start_sequence ();
38454 vselect_insn = emit_insn (x);
38455 end_sequence ();
38458 /* Construct (set target (vec_select op0 (parallel perm))) and
38459 return true if that's a valid instruction in the active ISA. */
38461 static bool
38462 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38463 unsigned nelt, bool testing_p)
38465 unsigned int i;
38466 rtx x, save_vconcat;
38467 int icode;
38469 if (vselect_insn == NULL_RTX)
38470 init_vselect_insn ();
38472 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38473 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38474 for (i = 0; i < nelt; ++i)
38475 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38476 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38477 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38478 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38479 SET_DEST (PATTERN (vselect_insn)) = target;
38480 icode = recog_memoized (vselect_insn);
38482 if (icode >= 0 && !testing_p)
38483 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38485 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38486 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38487 INSN_CODE (vselect_insn) = -1;
38489 return icode >= 0;
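/* Illustrative note, not part of the build: for a V4SF target and
   perm = { 2, 3, 0, 1 } the cached insn is rewritten into

     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   and is emitted only if recog_memoized accepts that pattern for the
   active ISA.  */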
38492 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38494 static bool
38495 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38496 const unsigned char *perm, unsigned nelt,
38497 bool testing_p)
38499 enum machine_mode v2mode;
38500 rtx x;
38501 bool ok;
38503 if (vselect_insn == NULL_RTX)
38504 init_vselect_insn ();
38506 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38507 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38508 PUT_MODE (x, v2mode);
38509 XEXP (x, 0) = op0;
38510 XEXP (x, 1) = op1;
38511 ok = expand_vselect (target, x, perm, nelt, testing_p);
38512 XEXP (x, 0) = const0_rtx;
38513 XEXP (x, 1) = const0_rtx;
38514 return ok;
38517 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38518 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38520 static bool
38521 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38523 enum machine_mode vmode = d->vmode;
38524 unsigned i, mask, nelt = d->nelt;
38525 rtx target, op0, op1, x;
38526 rtx rperm[32], vperm;
38528 if (d->one_operand_p)
38529 return false;
38530 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38532 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38534 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38536 else
38537 return false;
38539 /* This is a blend, not a permute. Elements must stay in their
38540 respective lanes. */
38541 for (i = 0; i < nelt; ++i)
38543 unsigned e = d->perm[i];
38544 if (!(e == i || e == i + nelt))
38545 return false;
38548 if (d->testing_p)
38549 return true;
38551 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38552 decision should be extracted elsewhere, so that we only try that
38553 sequence once all budget==3 options have been tried. */
38554 target = d->target;
38555 op0 = d->op0;
38556 op1 = d->op1;
38557 mask = 0;
38559 switch (vmode)
38561 case V4DFmode:
38562 case V8SFmode:
38563 case V2DFmode:
38564 case V4SFmode:
38565 case V8HImode:
38566 case V8SImode:
38567 for (i = 0; i < nelt; ++i)
38568 mask |= (d->perm[i] >= nelt) << i;
38569 break;
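/* Worked example: for V4SFmode with perm == {0, 5, 2, 7}, every element
   is either i or i + nelt, so the blend check above passes; elements 1
   and 3 come from op1, and the loop above yields mask == 0b1010.  */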
38571 case V2DImode:
38572 for (i = 0; i < 2; ++i)
38573 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38574 vmode = V8HImode;
38575 goto do_subreg;
38577 case V4SImode:
38578 for (i = 0; i < 4; ++i)
38579 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38580 vmode = V8HImode;
38581 goto do_subreg;
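/* Worked example: for V4SImode with perm == {0, 5, 2, 7}, each dword
   taken from op1 sets two adjacent bits of the V8HImode (pblendw) mask,
   so the loop above produces mask == 0xcc.  */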
38583 case V16QImode:
38584 /* See if bytes move in pairs so we can use pblendw with
38585 an immediate argument, rather than pblendvb with a vector
38586 argument. */
38587 for (i = 0; i < 16; i += 2)
38588 if (d->perm[i] + 1 != d->perm[i + 1])
38590 use_pblendvb:
38591 for (i = 0; i < nelt; ++i)
38592 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38594 finish_pblendvb:
38595 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38596 vperm = force_reg (vmode, vperm);
38598 if (GET_MODE_SIZE (vmode) == 16)
38599 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38600 else
38601 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38602 return true;
38605 for (i = 0; i < 8; ++i)
38606 mask |= (d->perm[i * 2] >= 16) << i;
38607 vmode = V8HImode;
38608 /* FALLTHRU */
38610 do_subreg:
38611 target = gen_lowpart (vmode, target);
38612 op0 = gen_lowpart (vmode, op0);
38613 op1 = gen_lowpart (vmode, op1);
38614 break;
38616 case V32QImode:
38617 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38618 for (i = 0; i < 32; i += 2)
38619 if (d->perm[i] + 1 != d->perm[i + 1])
38620 goto use_pblendvb;
38621 /* See if bytes move in quadruplets. If yes, vpblendd
38622 with immediate can be used. */
38623 for (i = 0; i < 32; i += 4)
38624 if (d->perm[i] + 2 != d->perm[i + 2])
38625 break;
38626 if (i < 32)
38628 /* See if bytes move the same in both lanes. If yes,
38629 vpblendw with immediate can be used. */
38630 for (i = 0; i < 16; i += 2)
38631 if (d->perm[i] + 16 != d->perm[i + 16])
38632 goto use_pblendvb;
38634 /* Use vpblendw. */
38635 for (i = 0; i < 16; ++i)
38636 mask |= (d->perm[i * 2] >= 32) << i;
38637 vmode = V16HImode;
38638 goto do_subreg;
38641 /* Use vpblendd. */
38642 for (i = 0; i < 8; ++i)
38643 mask |= (d->perm[i * 4] >= 32) << i;
38644 vmode = V8SImode;
38645 goto do_subreg;
38647 case V16HImode:
38648 /* See if words move in pairs. If yes, vpblendd can be used. */
38649 for (i = 0; i < 16; i += 2)
38650 if (d->perm[i] + 1 != d->perm[i + 1])
38651 break;
38652 if (i < 16)
38654 /* See if words move the same in both lanes. If not,
38655 vpblendvb must be used. */
38656 for (i = 0; i < 8; i++)
38657 if (d->perm[i] + 8 != d->perm[i + 8])
38659 /* Use vpblendvb. */
38660 for (i = 0; i < 32; ++i)
38661 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38663 vmode = V32QImode;
38664 nelt = 32;
38665 target = gen_lowpart (vmode, target);
38666 op0 = gen_lowpart (vmode, op0);
38667 op1 = gen_lowpart (vmode, op1);
38668 goto finish_pblendvb;
38671 /* Use vpblendw. */
38672 for (i = 0; i < 16; ++i)
38673 mask |= (d->perm[i] >= 16) << i;
38674 break;
38677 /* Use vpblendd. */
38678 for (i = 0; i < 8; ++i)
38679 mask |= (d->perm[i * 2] >= 16) << i;
38680 vmode = V8SImode;
38681 goto do_subreg;
38683 case V4DImode:
38684 /* Use vpblendd. */
38685 for (i = 0; i < 4; ++i)
38686 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38687 vmode = V8SImode;
38688 goto do_subreg;
38690 default:
38691 gcc_unreachable ();
38694 /* This matches five different patterns with the different modes. */
38695 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38696 x = gen_rtx_SET (VOIDmode, target, x);
38697 emit_insn (x);
38699 return true;
38702 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38703 in terms of the variable form of vpermilps.
38705 Note that we will have already failed the immediate input vpermilps,
38706 which requires that the high and low part shuffle be identical; the
38707 variable form doesn't require that. */
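/* Worked example: the one-operand V8SFmode permutation
   {3, 2, 1, 0, 7, 6, 5, 4} reverses each 128-bit lane; the code below
   reduces it to the per-element selector vector {3, 2, 1, 0, 3, 2, 1, 0}
   used by the variable vpermilps.  */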
38709 static bool
38710 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38712 rtx rperm[8], vperm;
38713 unsigned i;
38715 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38716 return false;
38718 /* We can only permute within the 128-bit lane. */
38719 for (i = 0; i < 8; ++i)
38721 unsigned e = d->perm[i];
38722 if (i < 4 ? e >= 4 : e < 4)
38723 return false;
38726 if (d->testing_p)
38727 return true;
38729 for (i = 0; i < 8; ++i)
38731 unsigned e = d->perm[i];
38733 /* Within each 128-bit lane, the elements of op0 are numbered
38734 from 0 and the elements of op1 are numbered from 4. */
38735 if (e >= 8 + 4)
38736 e -= 8;
38737 else if (e >= 4)
38738 e -= 4;
38740 rperm[i] = GEN_INT (e);
38743 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38744 vperm = force_reg (V8SImode, vperm);
38745 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38747 return true;
38750 /* Return true if permutation D can be performed as VMODE permutation
38751 instead. */
38753 static bool
38754 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38756 unsigned int i, j, chunk;
38758 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38759 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38760 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38761 return false;
38763 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38764 return true;
38766 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38767 for (i = 0; i < d->nelt; i += chunk)
38768 if (d->perm[i] & (chunk - 1))
38769 return false;
38770 else
38771 for (j = 1; j < chunk; ++j)
38772 if (d->perm[i] + j != d->perm[i + j])
38773 return false;
38775 return true;
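/* Worked example: the V16QImode permutation
   {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} moves aligned
   groups of four consecutive bytes, so it is valid as the V4SImode
   permutation {1, 0, 3, 2}; a permutation whose first chunk starts at
   byte 2 would not be, because 2 & (chunk - 1) != 0.  */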
38778 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38779 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38781 static bool
38782 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38784 unsigned i, nelt, eltsz, mask;
38785 unsigned char perm[32];
38786 enum machine_mode vmode = V16QImode;
38787 rtx rperm[32], vperm, target, op0, op1;
38789 nelt = d->nelt;
38791 if (!d->one_operand_p)
38793 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38795 if (TARGET_AVX2
38796 && valid_perm_using_mode_p (V2TImode, d))
38798 if (d->testing_p)
38799 return true;
38801 /* Use vperm2i128 insn. The pattern uses
38802 V4DImode instead of V2TImode. */
38803 target = gen_lowpart (V4DImode, d->target);
38804 op0 = gen_lowpart (V4DImode, d->op0);
38805 op1 = gen_lowpart (V4DImode, d->op1);
38806 rperm[0]
38807 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
38808 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
38809 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38810 return true;
38812 return false;
38815 else
38817 if (GET_MODE_SIZE (d->vmode) == 16)
38819 if (!TARGET_SSSE3)
38820 return false;
38822 else if (GET_MODE_SIZE (d->vmode) == 32)
38824 if (!TARGET_AVX2)
38825 return false;
38827 /* V4DImode should already have been handled through
38828 expand_vselect by the vpermq instruction. */
38829 gcc_assert (d->vmode != V4DImode);
38831 vmode = V32QImode;
38832 if (d->vmode == V8SImode
38833 || d->vmode == V16HImode
38834 || d->vmode == V32QImode)
38836 /* First see if vpermq can be used for
38837 V8SImode/V16HImode/V32QImode. */
38838 if (valid_perm_using_mode_p (V4DImode, d))
38840 for (i = 0; i < 4; i++)
38841 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38842 if (d->testing_p)
38843 return true;
38844 return expand_vselect (gen_lowpart (V4DImode, d->target),
38845 gen_lowpart (V4DImode, d->op0),
38846 perm, 4, false);
38849 /* Next see if vpermd can be used. */
38850 if (valid_perm_using_mode_p (V8SImode, d))
38851 vmode = V8SImode;
38853 /* Or if vpermps can be used. */
38854 else if (d->vmode == V8SFmode)
38855 vmode = V8SImode;
38857 if (vmode == V32QImode)
38859 /* vpshufb only works within 128-bit lanes; it cannot
38860 shuffle bytes across lanes. */
38861 for (i = 0; i < nelt; ++i)
38862 if ((d->perm[i] ^ i) & (nelt / 2))
38863 return false;
38866 else
38867 return false;
38870 if (d->testing_p)
38871 return true;
38873 if (vmode == V8SImode)
38874 for (i = 0; i < 8; ++i)
38875 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38876 else
38878 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38879 if (!d->one_operand_p)
38880 mask = 2 * nelt - 1;
38881 else if (vmode == V16QImode)
38882 mask = nelt - 1;
38883 else
38884 mask = nelt / 2 - 1;
38886 for (i = 0; i < nelt; ++i)
38888 unsigned j, e = d->perm[i] & mask;
38889 for (j = 0; j < eltsz; ++j)
38890 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
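/* Worked example of the expansion above: a one-operand V8HImode
   permutation {1, 0, 3, 2, 5, 4, 7, 6} (eltsz == 2, mask == 7) becomes
   the byte-level control vector
   {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}.  */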
38894 vperm = gen_rtx_CONST_VECTOR (vmode,
38895 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38896 vperm = force_reg (vmode, vperm);
38898 target = gen_lowpart (vmode, d->target);
38899 op0 = gen_lowpart (vmode, d->op0);
38900 if (d->one_operand_p)
38902 if (vmode == V16QImode)
38903 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38904 else if (vmode == V32QImode)
38905 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38906 else if (vmode == V8SFmode)
38907 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38908 else
38909 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38911 else
38913 op1 = gen_lowpart (vmode, d->op1);
38914 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38917 return true;
38920 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38921 in a single instruction. */
38923 static bool
38924 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38926 unsigned i, nelt = d->nelt;
38927 unsigned char perm2[MAX_VECT_LEN];
38929 /* Check plain VEC_SELECT first, because AVX has instructions that could
38930 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38931 input where SEL+CONCAT may not. */
38932 if (d->one_operand_p)
38934 int mask = nelt - 1;
38935 bool identity_perm = true;
38936 bool broadcast_perm = true;
38938 for (i = 0; i < nelt; i++)
38940 perm2[i] = d->perm[i] & mask;
38941 if (perm2[i] != i)
38942 identity_perm = false;
38943 if (perm2[i])
38944 broadcast_perm = false;
38947 if (identity_perm)
38949 if (!d->testing_p)
38950 emit_move_insn (d->target, d->op0);
38951 return true;
38953 else if (broadcast_perm && TARGET_AVX2)
38955 /* Use vpbroadcast{b,w,d}. */
38956 rtx (*gen) (rtx, rtx) = NULL;
38957 switch (d->vmode)
38959 case V32QImode:
38960 gen = gen_avx2_pbroadcastv32qi_1;
38961 break;
38962 case V16HImode:
38963 gen = gen_avx2_pbroadcastv16hi_1;
38964 break;
38965 case V8SImode:
38966 gen = gen_avx2_pbroadcastv8si_1;
38967 break;
38968 case V16QImode:
38969 gen = gen_avx2_pbroadcastv16qi;
38970 break;
38971 case V8HImode:
38972 gen = gen_avx2_pbroadcastv8hi;
38973 break;
38974 case V8SFmode:
38975 gen = gen_avx2_vec_dupv8sf_1;
38976 break;
38977 /* For other modes prefer other shuffles this function creates. */
38978 default: break;
38980 if (gen != NULL)
38982 if (!d->testing_p)
38983 emit_insn (gen (d->target, d->op0));
38984 return true;
38988 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38989 return true;
38991 /* There are plenty of patterns in sse.md that are written for
38992 SEL+CONCAT and are not replicated for a single op. Perhaps
38993 that should be changed, to avoid the nastiness here. */
38995 /* Recognize interleave style patterns, which means incrementing
38996 every other permutation operand. */
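/* Worked example: the one-operand V4SImode permutation {0, 0, 1, 1}
   becomes perm2 == {0, 4, 1, 5} over (op0, op0), which matches the
   punpckldq pattern.  */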
38997 for (i = 0; i < nelt; i += 2)
38999 perm2[i] = d->perm[i] & mask;
39000 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39002 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39003 d->testing_p))
39004 return true;
39006 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
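/* Worked example: the one-operand V4SFmode permutation {0, 2, 0, 2}
   becomes perm2 == {0, 2, 4, 6} over (op0, op0), which matches shufps.  */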
39007 if (nelt >= 4)
39009 for (i = 0; i < nelt; i += 4)
39011 perm2[i + 0] = d->perm[i + 0] & mask;
39012 perm2[i + 1] = d->perm[i + 1] & mask;
39013 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39014 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39017 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39018 d->testing_p))
39019 return true;
39023 /* Finally, try the fully general two operand permute. */
39024 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39025 d->testing_p))
39026 return true;
39028 /* Recognize interleave style patterns with reversed operands. */
39029 if (!d->one_operand_p)
39031 for (i = 0; i < nelt; ++i)
39033 unsigned e = d->perm[i];
39034 if (e >= nelt)
39035 e -= nelt;
39036 else
39037 e += nelt;
39038 perm2[i] = e;
39041 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39042 d->testing_p))
39043 return true;
39046 /* Try the SSE4.1 blend variable merge instructions. */
39047 if (expand_vec_perm_blend (d))
39048 return true;
39050 /* Try one of the AVX vpermil variable permutations. */
39051 if (expand_vec_perm_vpermil (d))
39052 return true;
39054 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39055 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39056 if (expand_vec_perm_pshufb (d))
39057 return true;
39059 return false;
39062 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39063 in terms of a pair of pshuflw + pshufhw instructions. */
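/* Worked example: for the V8HImode permutation {3, 2, 1, 0, 7, 6, 5, 4}
   this emits pshuflw with {3, 2, 1, 0, 4, 5, 6, 7} followed by pshufhw
   with {0, 1, 2, 3, 7, 6, 5, 4}.  */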
39065 static bool
39066 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39068 unsigned char perm2[MAX_VECT_LEN];
39069 unsigned i;
39070 bool ok;
39072 if (d->vmode != V8HImode || !d->one_operand_p)
39073 return false;
39075 /* The two permutations only operate in 64-bit lanes. */
39076 for (i = 0; i < 4; ++i)
39077 if (d->perm[i] >= 4)
39078 return false;
39079 for (i = 4; i < 8; ++i)
39080 if (d->perm[i] < 4)
39081 return false;
39083 if (d->testing_p)
39084 return true;
39086 /* Emit the pshuflw. */
39087 memcpy (perm2, d->perm, 4);
39088 for (i = 4; i < 8; ++i)
39089 perm2[i] = i;
39090 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39091 gcc_assert (ok);
39093 /* Emit the pshufhw. */
39094 memcpy (perm2 + 4, d->perm + 4, 4);
39095 for (i = 0; i < 4; ++i)
39096 perm2[i] = i;
39097 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39098 gcc_assert (ok);
39100 return true;
39103 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39104 the permutation using the SSSE3 palignr instruction. This succeeds
39105 when all of the elements in PERM fit within one vector and we merely
39106 need to shift them down so that a single vector permutation has a
39107 chance to succeed. */
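/* Worked example: the two-operand V16QImode permutation {3, 4, ..., 18}
   has min == 3 and max == 18; after palignr shifts the concatenated
   operands down by 3 bytes, the residual single-operand permutation is
   the identity (the in_order case below).  */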
39109 static bool
39110 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39112 unsigned i, nelt = d->nelt;
39113 unsigned min, max;
39114 bool in_order, ok;
39115 rtx shift;
39117 /* Even with AVX, palignr only operates on 128-bit vectors. */
39118 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39119 return false;
39121 min = nelt, max = 0;
39122 for (i = 0; i < nelt; ++i)
39124 unsigned e = d->perm[i];
39125 if (e < min)
39126 min = e;
39127 if (e > max)
39128 max = e;
39130 if (min == 0 || max - min >= nelt)
39131 return false;
39133 /* Given that we have SSSE3, we know we'll be able to implement the
39134 single operand permutation after the palignr with pshufb. */
39135 if (d->testing_p)
39136 return true;
39138 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39139 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39140 gen_lowpart (TImode, d->op1),
39141 gen_lowpart (TImode, d->op0), shift));
39143 d->op0 = d->op1 = d->target;
39144 d->one_operand_p = true;
39146 in_order = true;
39147 for (i = 0; i < nelt; ++i)
39149 unsigned e = d->perm[i] - min;
39150 if (e != i)
39151 in_order = false;
39152 d->perm[i] = e;
39155 /* Test for the degenerate case where the alignment by itself
39156 produces the desired permutation. */
39157 if (in_order)
39158 return true;
39160 ok = expand_vec_perm_1 (d);
39161 gcc_assert (ok);
39163 return ok;
39166 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39168 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39169 a two vector permutation into a single vector permutation by using
39170 an interleave operation to merge the vectors. */
39172 static bool
39173 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39175 struct expand_vec_perm_d dremap, dfinal;
39176 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39177 unsigned HOST_WIDE_INT contents;
39178 unsigned char remap[2 * MAX_VECT_LEN];
39179 rtx seq;
39180 bool ok, same_halves = false;
39182 if (GET_MODE_SIZE (d->vmode) == 16)
39184 if (d->one_operand_p)
39185 return false;
39187 else if (GET_MODE_SIZE (d->vmode) == 32)
39189 if (!TARGET_AVX)
39190 return false;
39191 /* For 32-byte modes allow even d->one_operand_p.
39192 The lack of cross-lane shuffling in some instructions
39193 might prevent a single insn shuffle. */
39194 dfinal = *d;
39195 dfinal.testing_p = true;
39196 /* If expand_vec_perm_interleave3 can expand this into
39197 a 3 insn sequence, give up and let it be expanded that
39198 way. While that is one insn longer, it doesn't need a
39199 memory operand, and in the common case where the
39200 interleave low and interleave high permutations with the
39201 same operands are adjacent, the pair needs only 4 insns
39202 in total after CSE. */
39203 if (expand_vec_perm_interleave3 (&dfinal))
39204 return false;
39206 else
39207 return false;
39209 /* Examine from whence the elements come. */
39210 contents = 0;
39211 for (i = 0; i < nelt; ++i)
39212 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39214 memset (remap, 0xff, sizeof (remap));
39215 dremap = *d;
39217 if (GET_MODE_SIZE (d->vmode) == 16)
39219 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39221 /* Split the two input vectors into 4 halves. */
39222 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39223 h2 = h1 << nelt2;
39224 h3 = h2 << nelt2;
39225 h4 = h3 << nelt2;
39227 /* If all the elements come from the low halves, use interleave low;
39228 similarly for interleave high. If the elements are from mismatched
39229 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
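/* Worked example: for V4SImode, h1..h4 above are 0x3, 0xc, 0x30 and
   0xc0; the permutation {1, 5, 0, 4} has contents == 0x33, which is
   covered by h1 | h3, so the remap uses punpckldq and the final shuffle
   then operates on the interleaved vector.  */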
39230 if ((contents & (h1 | h3)) == contents)
39232 /* punpckl* */
39233 for (i = 0; i < nelt2; ++i)
39235 remap[i] = i * 2;
39236 remap[i + nelt] = i * 2 + 1;
39237 dremap.perm[i * 2] = i;
39238 dremap.perm[i * 2 + 1] = i + nelt;
39240 if (!TARGET_SSE2 && d->vmode == V4SImode)
39241 dremap.vmode = V4SFmode;
39243 else if ((contents & (h2 | h4)) == contents)
39245 /* punpckh* */
39246 for (i = 0; i < nelt2; ++i)
39248 remap[i + nelt2] = i * 2;
39249 remap[i + nelt + nelt2] = i * 2 + 1;
39250 dremap.perm[i * 2] = i + nelt2;
39251 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39253 if (!TARGET_SSE2 && d->vmode == V4SImode)
39254 dremap.vmode = V4SFmode;
39256 else if ((contents & (h1 | h4)) == contents)
39258 /* shufps */
39259 for (i = 0; i < nelt2; ++i)
39261 remap[i] = i;
39262 remap[i + nelt + nelt2] = i + nelt2;
39263 dremap.perm[i] = i;
39264 dremap.perm[i + nelt2] = i + nelt + nelt2;
39266 if (nelt != 4)
39268 /* shufpd */
39269 dremap.vmode = V2DImode;
39270 dremap.nelt = 2;
39271 dremap.perm[0] = 0;
39272 dremap.perm[1] = 3;
39275 else if ((contents & (h2 | h3)) == contents)
39277 /* shufps */
39278 for (i = 0; i < nelt2; ++i)
39280 remap[i + nelt2] = i;
39281 remap[i + nelt] = i + nelt2;
39282 dremap.perm[i] = i + nelt2;
39283 dremap.perm[i + nelt2] = i + nelt;
39285 if (nelt != 4)
39287 /* shufpd */
39288 dremap.vmode = V2DImode;
39289 dremap.nelt = 2;
39290 dremap.perm[0] = 1;
39291 dremap.perm[1] = 2;
39294 else
39295 return false;
39297 else
39299 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39300 unsigned HOST_WIDE_INT q[8];
39301 unsigned int nonzero_halves[4];
39303 /* Split the two input vectors into 8 quarters. */
39304 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39305 for (i = 1; i < 8; ++i)
39306 q[i] = q[0] << (nelt4 * i);
39307 for (i = 0; i < 4; ++i)
39308 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39310 nonzero_halves[nzcnt] = i;
39311 ++nzcnt;
39314 if (nzcnt == 1)
39316 gcc_assert (d->one_operand_p);
39317 nonzero_halves[1] = nonzero_halves[0];
39318 same_halves = true;
39320 else if (d->one_operand_p)
39322 gcc_assert (nonzero_halves[0] == 0);
39323 gcc_assert (nonzero_halves[1] == 1);
39326 if (nzcnt <= 2)
39328 if (d->perm[0] / nelt2 == nonzero_halves[1])
39330 /* Attempt to increase the likelihood that dfinal
39331 shuffle will be intra-lane. */
39332 char tmph = nonzero_halves[0];
39333 nonzero_halves[0] = nonzero_halves[1];
39334 nonzero_halves[1] = tmph;
39337 /* vperm2f128 or vperm2i128. */
39338 for (i = 0; i < nelt2; ++i)
39340 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39341 remap[i + nonzero_halves[0] * nelt2] = i;
39342 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39343 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39346 if (d->vmode != V8SFmode
39347 && d->vmode != V4DFmode
39348 && d->vmode != V8SImode)
39350 dremap.vmode = V8SImode;
39351 dremap.nelt = 8;
39352 for (i = 0; i < 4; ++i)
39354 dremap.perm[i] = i + nonzero_halves[0] * 4;
39355 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39359 else if (d->one_operand_p)
39360 return false;
39361 else if (TARGET_AVX2
39362 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39364 /* vpunpckl* */
39365 for (i = 0; i < nelt4; ++i)
39367 remap[i] = i * 2;
39368 remap[i + nelt] = i * 2 + 1;
39369 remap[i + nelt2] = i * 2 + nelt2;
39370 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39371 dremap.perm[i * 2] = i;
39372 dremap.perm[i * 2 + 1] = i + nelt;
39373 dremap.perm[i * 2 + nelt2] = i + nelt2;
39374 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39377 else if (TARGET_AVX2
39378 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39380 /* vpunpckh* */
39381 for (i = 0; i < nelt4; ++i)
39383 remap[i + nelt4] = i * 2;
39384 remap[i + nelt + nelt4] = i * 2 + 1;
39385 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39386 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39387 dremap.perm[i * 2] = i + nelt4;
39388 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39389 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39390 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39393 else
39394 return false;
39397 /* Use the remapping array set up above to move the elements from their
39398 swizzled locations into their final destinations. */
39399 dfinal = *d;
39400 for (i = 0; i < nelt; ++i)
39402 unsigned e = remap[d->perm[i]];
39403 gcc_assert (e < nelt);
39404 /* If same_halves is true, both halves of the remapped vector are the
39405 same. Avoid cross-lane accesses if possible. */
39406 if (same_halves && i >= nelt2)
39408 gcc_assert (e < nelt2);
39409 dfinal.perm[i] = e + nelt2;
39411 else
39412 dfinal.perm[i] = e;
39414 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39415 dfinal.op1 = dfinal.op0;
39416 dfinal.one_operand_p = true;
39417 dremap.target = dfinal.op0;
39419 /* Test if the final remap can be done with a single insn. For V4SFmode or
39420 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39421 start_sequence ();
39422 ok = expand_vec_perm_1 (&dfinal);
39423 seq = get_insns ();
39424 end_sequence ();
39426 if (!ok)
39427 return false;
39429 if (d->testing_p)
39430 return true;
39432 if (dremap.vmode != dfinal.vmode)
39434 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39435 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39436 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39439 ok = expand_vec_perm_1 (&dremap);
39440 gcc_assert (ok);
39442 emit_insn (seq);
39443 return true;
39446 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39447 a single vector cross-lane permutation into vpermq followed
39448 by any of the single insn permutations. */
39450 static bool
39451 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39453 struct expand_vec_perm_d dremap, dfinal;
39454 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39455 unsigned contents[2];
39456 bool ok;
39458 if (!(TARGET_AVX2
39459 && (d->vmode == V32QImode || d->vmode == V16HImode)
39460 && d->one_operand_p))
39461 return false;
39463 contents[0] = 0;
39464 contents[1] = 0;
39465 for (i = 0; i < nelt2; ++i)
39467 contents[0] |= 1u << (d->perm[i] / nelt4);
39468 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39471 for (i = 0; i < 2; ++i)
39473 unsigned int cnt = 0;
39474 for (j = 0; j < 4; ++j)
39475 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39476 return false;
39479 if (d->testing_p)
39480 return true;
39482 dremap = *d;
39483 dremap.vmode = V4DImode;
39484 dremap.nelt = 4;
39485 dremap.target = gen_reg_rtx (V4DImode);
39486 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39487 dremap.op1 = dremap.op0;
39488 dremap.one_operand_p = true;
39489 for (i = 0; i < 2; ++i)
39491 unsigned int cnt = 0;
39492 for (j = 0; j < 4; ++j)
39493 if ((contents[i] & (1u << j)) != 0)
39494 dremap.perm[2 * i + cnt++] = j;
39495 for (; cnt < 2; ++cnt)
39496 dremap.perm[2 * i + cnt] = 0;
39499 dfinal = *d;
39500 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39501 dfinal.op1 = dfinal.op0;
39502 dfinal.one_operand_p = true;
39503 for (i = 0, j = 0; i < nelt; ++i)
39505 if (i == nelt2)
39506 j = 2;
39507 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39508 if ((d->perm[i] / nelt4) == dremap.perm[j])
39510 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39511 dfinal.perm[i] |= nelt4;
39512 else
39513 gcc_unreachable ();
39516 ok = expand_vec_perm_1 (&dremap);
39517 gcc_assert (ok);
39519 ok = expand_vec_perm_1 (&dfinal);
39520 gcc_assert (ok);
39522 return true;
39525 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39526 a vector permutation using two instructions, vperm2f128 resp.
39527 vperm2i128 followed by any single in-lane permutation. */
39529 static bool
39530 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39532 struct expand_vec_perm_d dfirst, dsecond;
39533 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39534 bool ok;
39536 if (!TARGET_AVX
39537 || GET_MODE_SIZE (d->vmode) != 32
39538 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39539 return false;
39541 dsecond = *d;
39542 dsecond.one_operand_p = false;
39543 dsecond.testing_p = true;
39545 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39546 immediate. For perm < 16 the second permutation uses
39547 d->op0 as first operand, for perm >= 16 it uses d->op1
39548 as first operand. The second operand is the result of
39549 vperm2[fi]128. */
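/* Worked example: perm == 6 (binary 01 10) takes the low lane from
   quarter 2 (the low lane of d->op1) and the high lane from quarter 1
   (the high lane of d->op0); ((6 << 2) | 6) & 0x33 == 0x12.  */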
39550 for (perm = 0; perm < 32; perm++)
39552 /* Ignore permutations which do not move anything cross-lane. */
39553 if (perm < 16)
39555 /* The second shuffle for e.g. V4DFmode has
39556 0123 and ABCD operands.
39557 Ignore AB23, as 23 is already in the second lane
39558 of the first operand. */
39559 if ((perm & 0xc) == (1 << 2)) continue;
39560 /* And 01CD, as 01 is in the first lane of the first
39561 operand. */
39562 if ((perm & 3) == 0) continue;
39563 /* And 4567, as then the vperm2[fi]128 doesn't change
39564 anything on the original 4567 second operand. */
39565 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39567 else
39569 /* The second shuffle for e.g. V4DFmode has
39570 4567 and ABCD operands.
39571 Ignore AB67, as 67 is already in the second lane
39572 of the first operand. */
39573 if ((perm & 0xc) == (3 << 2)) continue;
39574 /* And 45CD, as 45 is in the first lane of the first
39575 operand. */
39576 if ((perm & 3) == 2) continue;
39577 /* And 0123, as then the vperm2[fi]128 doesn't change
39578 anything on the original 0123 first operand. */
39579 if ((perm & 0xf) == (1 << 2)) continue;
39582 for (i = 0; i < nelt; i++)
39584 j = d->perm[i] / nelt2;
39585 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39586 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39587 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39588 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39589 else
39590 break;
39593 if (i == nelt)
39595 start_sequence ();
39596 ok = expand_vec_perm_1 (&dsecond);
39597 end_sequence ();
39599 else
39600 ok = false;
39602 if (ok)
39604 if (d->testing_p)
39605 return true;
39607 /* Found a usable second shuffle. dfirst will be
39608 vperm2f128 on d->op0 and d->op1. */
39609 dsecond.testing_p = false;
39610 dfirst = *d;
39611 dfirst.target = gen_reg_rtx (d->vmode);
39612 for (i = 0; i < nelt; i++)
39613 dfirst.perm[i] = (i & (nelt2 - 1))
39614 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39616 ok = expand_vec_perm_1 (&dfirst);
39617 gcc_assert (ok);
39619 /* And dsecond is some single insn shuffle, taking
39620 d->op0 and result of vperm2f128 (if perm < 16) or
39621 d->op1 and result of vperm2f128 (otherwise). */
39622 dsecond.op1 = dfirst.target;
39623 if (perm >= 16)
39624 dsecond.op0 = dfirst.op1;
39626 ok = expand_vec_perm_1 (&dsecond);
39627 gcc_assert (ok);
39629 return true;
39632 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39633 if (d->one_operand_p)
39634 return false;
39637 return false;
39640 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39641 a two vector permutation using 2 intra-lane interleave insns
39642 and cross-lane shuffle for 32-byte vectors. */
39644 static bool
39645 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39647 unsigned i, nelt;
39648 rtx (*gen) (rtx, rtx, rtx);
39650 if (d->one_operand_p)
39651 return false;
39652 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39654 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39656 else
39657 return false;
39659 nelt = d->nelt;
39660 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39661 return false;
39662 for (i = 0; i < nelt; i += 2)
39663 if (d->perm[i] != d->perm[0] + i / 2
39664 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39665 return false;
39667 if (d->testing_p)
39668 return true;
39670 switch (d->vmode)
39672 case V32QImode:
39673 if (d->perm[0])
39674 gen = gen_vec_interleave_highv32qi;
39675 else
39676 gen = gen_vec_interleave_lowv32qi;
39677 break;
39678 case V16HImode:
39679 if (d->perm[0])
39680 gen = gen_vec_interleave_highv16hi;
39681 else
39682 gen = gen_vec_interleave_lowv16hi;
39683 break;
39684 case V8SImode:
39685 if (d->perm[0])
39686 gen = gen_vec_interleave_highv8si;
39687 else
39688 gen = gen_vec_interleave_lowv8si;
39689 break;
39690 case V4DImode:
39691 if (d->perm[0])
39692 gen = gen_vec_interleave_highv4di;
39693 else
39694 gen = gen_vec_interleave_lowv4di;
39695 break;
39696 case V8SFmode:
39697 if (d->perm[0])
39698 gen = gen_vec_interleave_highv8sf;
39699 else
39700 gen = gen_vec_interleave_lowv8sf;
39701 break;
39702 case V4DFmode:
39703 if (d->perm[0])
39704 gen = gen_vec_interleave_highv4df;
39705 else
39706 gen = gen_vec_interleave_lowv4df;
39707 break;
39708 default:
39709 gcc_unreachable ();
39712 emit_insn (gen (d->target, d->op0, d->op1));
39713 return true;
39716 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39717 a single vector permutation using a single intra-lane vector
39718 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39719 the non-swapped and swapped vectors together. */
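/* Worked example: for the one-operand V4DFmode permutation {0, 3, 2, 1},
   dfirst becomes the identity, dsecond swaps the two lanes, and the
   blend with msk == 0xa takes elements 1 and 3 from the lane-swapped
   copy.  */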
39721 static bool
39722 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39724 struct expand_vec_perm_d dfirst, dsecond;
39725 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39726 rtx seq;
39727 bool ok;
39728 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39730 if (!TARGET_AVX
39731 || TARGET_AVX2
39732 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39733 || !d->one_operand_p)
39734 return false;
39736 dfirst = *d;
39737 for (i = 0; i < nelt; i++)
39738 dfirst.perm[i] = 0xff;
39739 for (i = 0, msk = 0; i < nelt; i++)
39741 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39742 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39743 return false;
39744 dfirst.perm[j] = d->perm[i];
39745 if (j != i)
39746 msk |= (1 << i);
39748 for (i = 0; i < nelt; i++)
39749 if (dfirst.perm[i] == 0xff)
39750 dfirst.perm[i] = i;
39752 if (!d->testing_p)
39753 dfirst.target = gen_reg_rtx (dfirst.vmode);
39755 start_sequence ();
39756 ok = expand_vec_perm_1 (&dfirst);
39757 seq = get_insns ();
39758 end_sequence ();
39760 if (!ok)
39761 return false;
39763 if (d->testing_p)
39764 return true;
39766 emit_insn (seq);
39768 dsecond = *d;
39769 dsecond.op0 = dfirst.target;
39770 dsecond.op1 = dfirst.target;
39771 dsecond.one_operand_p = true;
39772 dsecond.target = gen_reg_rtx (dsecond.vmode);
39773 for (i = 0; i < nelt; i++)
39774 dsecond.perm[i] = i ^ nelt2;
39776 ok = expand_vec_perm_1 (&dsecond);
39777 gcc_assert (ok);
39779 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39780 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39781 return true;
39784 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39785 permutation using two vperm2f128, followed by a vshufpd insn blending
39786 the two vectors together. */
39788 static bool
39789 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39791 struct expand_vec_perm_d dfirst, dsecond, dthird;
39792 bool ok;
39794 if (!TARGET_AVX || (d->vmode != V4DFmode))
39795 return false;
39797 if (d->testing_p)
39798 return true;
39800 dfirst = *d;
39801 dsecond = *d;
39802 dthird = *d;
39804 dfirst.perm[0] = (d->perm[0] & ~1);
39805 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39806 dfirst.perm[2] = (d->perm[2] & ~1);
39807 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39808 dsecond.perm[0] = (d->perm[1] & ~1);
39809 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39810 dsecond.perm[2] = (d->perm[3] & ~1);
39811 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39812 dthird.perm[0] = (d->perm[0] % 2);
39813 dthird.perm[1] = (d->perm[1] % 2) + 4;
39814 dthird.perm[2] = (d->perm[2] % 2) + 2;
39815 dthird.perm[3] = (d->perm[3] % 2) + 6;
39817 dfirst.target = gen_reg_rtx (dfirst.vmode);
39818 dsecond.target = gen_reg_rtx (dsecond.vmode);
39819 dthird.op0 = dfirst.target;
39820 dthird.op1 = dsecond.target;
39821 dthird.one_operand_p = false;
39823 canonicalize_perm (&dfirst);
39824 canonicalize_perm (&dsecond);
39826 ok = expand_vec_perm_1 (&dfirst)
39827 && expand_vec_perm_1 (&dsecond)
39828 && expand_vec_perm_1 (&dthird);
39830 gcc_assert (ok);
39832 return true;
39835 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39836 permutation with two pshufb insns and an ior. We should have already
39837 failed all two instruction sequences. */
39839 static bool
39840 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39842 rtx rperm[2][16], vperm, l, h, op, m128;
39843 unsigned int i, nelt, eltsz;
39845 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39846 return false;
39847 gcc_assert (!d->one_operand_p);
39849 nelt = d->nelt;
39850 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39852 /* Generate two permutation masks. If the required element is within
39853 the given vector it is shuffled into the proper lane. If the required
39854 element is in the other vector, force a zero into the lane by setting
39855 bit 7 in the permutation mask. */
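/* Worked example: for the V16QImode extract-even permutation
   {0, 2, ..., 30} the first mask is {0, 2, ..., 14, -128 x 8} and the
   second is {-128 x 8, 0, 2, ..., 14}; the two pshufb results are then
   combined with por below.  */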
39856 m128 = GEN_INT (-128);
39857 for (i = 0; i < nelt; ++i)
39859 unsigned j, e = d->perm[i];
39860 unsigned which = (e >= nelt);
39861 if (e >= nelt)
39862 e -= nelt;
39864 for (j = 0; j < eltsz; ++j)
39866 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39867 rperm[1-which][i*eltsz + j] = m128;
39871 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39872 vperm = force_reg (V16QImode, vperm);
39874 l = gen_reg_rtx (V16QImode);
39875 op = gen_lowpart (V16QImode, d->op0);
39876 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39878 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39879 vperm = force_reg (V16QImode, vperm);
39881 h = gen_reg_rtx (V16QImode);
39882 op = gen_lowpart (V16QImode, d->op1);
39883 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39885 op = gen_lowpart (V16QImode, d->target);
39886 emit_insn (gen_iorv16qi3 (op, l, h));
39888 return true;
39891 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
39892 with two vpshufb insns, vpermq and vpor. We should have already failed
39893 all two or three instruction sequences. */
39895 static bool
39896 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39898 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39899 unsigned int i, nelt, eltsz;
39901 if (!TARGET_AVX2
39902 || !d->one_operand_p
39903 || (d->vmode != V32QImode && d->vmode != V16HImode))
39904 return false;
39906 if (d->testing_p)
39907 return true;
39909 nelt = d->nelt;
39910 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39912 /* Generate two permutation masks. If the required element is within
39913 the same lane, it is shuffled in. If the required element is from
39914 the other lane, force a zero by setting bit 7 in the permutation mask.
39915 The other mask has non-negative elements where an element is
39916 requested from the other lane; such elements are also moved to the
39917 other lane, so that the result of vpshufb can have its two V2TImode
39918 halves swapped. */
39919 m128 = GEN_INT (-128);
39920 for (i = 0; i < nelt; ++i)
39922 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39923 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39925 for (j = 0; j < eltsz; ++j)
39927 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39928 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39932 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39933 vperm = force_reg (V32QImode, vperm);
39935 h = gen_reg_rtx (V32QImode);
39936 op = gen_lowpart (V32QImode, d->op0);
39937 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39939 /* Swap the 128-bit lanes of h into hp. */
39940 hp = gen_reg_rtx (V4DImode);
39941 op = gen_lowpart (V4DImode, h);
39942 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39943 const1_rtx));
39945 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39946 vperm = force_reg (V32QImode, vperm);
39948 l = gen_reg_rtx (V32QImode);
39949 op = gen_lowpart (V32QImode, d->op0);
39950 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39952 op = gen_lowpart (V32QImode, d->target);
39953 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39955 return true;
39958 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39959 and extract-odd permutations of two V32QImode or V16HImode operands
39960 with two vpshufb insns, vpor and vpermq. We should have already
39961 failed all two or three instruction sequences. */
39963 static bool
39964 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39966 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39967 unsigned int i, nelt, eltsz;
39969 if (!TARGET_AVX2
39970 || d->one_operand_p
39971 || (d->vmode != V32QImode && d->vmode != V16HImode))
39972 return false;
39974 for (i = 0; i < d->nelt; ++i)
39975 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39976 return false;
39978 if (d->testing_p)
39979 return true;
39981 nelt = d->nelt;
39982 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39984 /* Generate two permutation masks. In the first permutation mask
39985 the first quarter will contain indexes for the first half
39986 of the op0, the second quarter will contain bit 7 set, third quarter
39987 will contain indexes for the second half of the op0 and the
39988 last quarter bit 7 set. In the second permutation mask
39989 the first quarter will contain bit 7 set, the second quarter
39990 indexes for the first half of the op1, the third quarter bit 7 set
39991 and last quarter indexes for the second half of the op1.
39992 I.e. the first mask e.g. for V32QImode extract even will be:
39993 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39994 (all values masked with 0xf except for -128) and second mask
39995 for extract even will be
39996 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39997 m128 = GEN_INT (-128);
39998 for (i = 0; i < nelt; ++i)
40000 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40001 unsigned which = d->perm[i] >= nelt;
40002 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40004 for (j = 0; j < eltsz; ++j)
40006 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40007 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40011 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40012 vperm = force_reg (V32QImode, vperm);
40014 l = gen_reg_rtx (V32QImode);
40015 op = gen_lowpart (V32QImode, d->op0);
40016 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40018 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40019 vperm = force_reg (V32QImode, vperm);
40021 h = gen_reg_rtx (V32QImode);
40022 op = gen_lowpart (V32QImode, d->op1);
40023 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40025 ior = gen_reg_rtx (V32QImode);
40026 emit_insn (gen_iorv32qi3 (ior, l, h));
40028 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40029 op = gen_lowpart (V4DImode, d->target);
40030 ior = gen_lowpart (V4DImode, ior);
40031 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40032 const1_rtx, GEN_INT (3)));
40034 return true;
40037 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40038 and extract-odd permutations. */
40040 static bool
40041 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40043 rtx t1, t2, t3;
40045 switch (d->vmode)
40047 case V4DFmode:
40048 t1 = gen_reg_rtx (V4DFmode);
40049 t2 = gen_reg_rtx (V4DFmode);
40051 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40052 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40053 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40055 /* Now an unpck[lh]pd will produce the result required. */
40056 if (odd)
40057 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40058 else
40059 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40060 emit_insn (t3);
40061 break;
40063 case V8SFmode:
40065 int mask = odd ? 0xdd : 0x88;
40067 t1 = gen_reg_rtx (V8SFmode);
40068 t2 = gen_reg_rtx (V8SFmode);
40069 t3 = gen_reg_rtx (V8SFmode);
40071 /* Shuffle within the 128-bit lanes to produce:
40072 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40073 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40074 GEN_INT (mask)));
40076 /* Shuffle the lanes around to produce:
40077 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40078 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40079 GEN_INT (0x3)));
40081 /* Shuffle within the 128-bit lanes to produce:
40082 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40083 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40085 /* Shuffle within the 128-bit lanes to produce:
40086 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40087 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40089 /* Shuffle the lanes around to produce:
40090 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40091 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40092 GEN_INT (0x20)));
40094 break;
40096 case V2DFmode:
40097 case V4SFmode:
40098 case V2DImode:
40099 case V4SImode:
40100 /* These are always directly implementable by expand_vec_perm_1. */
40101 gcc_unreachable ();
40103 case V8HImode:
40104 if (TARGET_SSSE3)
40105 return expand_vec_perm_pshufb2 (d);
40106 else
40108 /* We need 2*log2(N)-1 operations to achieve odd/even
40109 with interleave. */
40110 t1 = gen_reg_rtx (V8HImode);
40111 t2 = gen_reg_rtx (V8HImode);
40112 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40113 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40114 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40115 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40116 if (odd)
40117 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40118 else
40119 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40120 emit_insn (t3);
40122 break;
40124 case V16QImode:
40125 if (TARGET_SSSE3)
40126 return expand_vec_perm_pshufb2 (d);
40127 else
40129 t1 = gen_reg_rtx (V16QImode);
40130 t2 = gen_reg_rtx (V16QImode);
40131 t3 = gen_reg_rtx (V16QImode);
40132 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40133 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40134 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40135 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40136 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40137 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40138 if (odd)
40139 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40140 else
40141 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40142 emit_insn (t3);
40144 break;
40146 case V16HImode:
40147 case V32QImode:
40148 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40150 case V4DImode:
40151 if (!TARGET_AVX2)
40153 struct expand_vec_perm_d d_copy = *d;
40154 d_copy.vmode = V4DFmode;
40155 d_copy.target = gen_lowpart (V4DFmode, d->target);
40156 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40157 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40158 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40161 t1 = gen_reg_rtx (V4DImode);
40162 t2 = gen_reg_rtx (V4DImode);
40164 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40165 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40166 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40168 /* Now a vpunpck[lh]qdq will produce the required result. */
40169 if (odd)
40170 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40171 else
40172 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40173 emit_insn (t3);
40174 break;
40176 case V8SImode:
40177 if (!TARGET_AVX2)
40179 struct expand_vec_perm_d d_copy = *d;
40180 d_copy.vmode = V8SFmode;
40181 d_copy.target = gen_lowpart (V8SFmode, d->target);
40182 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40183 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40184 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40187 t1 = gen_reg_rtx (V8SImode);
40188 t2 = gen_reg_rtx (V8SImode);
40190 /* Shuffle the lanes around into
40191 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40192 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40193 gen_lowpart (V4DImode, d->op0),
40194 gen_lowpart (V4DImode, d->op1),
40195 GEN_INT (0x20)));
40196 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40197 gen_lowpart (V4DImode, d->op0),
40198 gen_lowpart (V4DImode, d->op1),
40199 GEN_INT (0x31)));
40201 /* Swap the 2nd and 3rd position in each lane into
40202 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40203 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40204 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40205 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40206 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40208 /* Now a vpunpck[lh]qdq will produce
40209 { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f } respectively. */
40210 if (odd)
40211 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40212 gen_lowpart (V4DImode, t1),
40213 gen_lowpart (V4DImode, t2));
40214 else
40215 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40216 gen_lowpart (V4DImode, t1),
40217 gen_lowpart (V4DImode, t2));
40218 emit_insn (t3);
40219 break;
40221 default:
40222 gcc_unreachable ();
40225 return true;
40228 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40229 extract-even and extract-odd permutations. */
40231 static bool
40232 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40234 unsigned i, odd, nelt = d->nelt;
40236 odd = d->perm[0];
40237 if (odd != 0 && odd != 1)
40238 return false;
40240 for (i = 1; i < nelt; ++i)
40241 if (d->perm[i] != 2 * i + odd)
40242 return false;
40244 return expand_vec_perm_even_odd_1 (d, odd);
40247 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40248 permutations. We assume that expand_vec_perm_1 has already failed. */
40250 static bool
40251 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40253 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40254 enum machine_mode vmode = d->vmode;
40255 unsigned char perm2[4];
40256 rtx op0 = d->op0;
40257 bool ok;
40259 switch (vmode)
40261 case V4DFmode:
40262 case V8SFmode:
40263 /* These are special-cased in sse.md so that we can optionally
40264 use the vbroadcast instruction. They expand to two insns
40265 if the input happens to be in a register. */
40266 gcc_unreachable ();
40268 case V2DFmode:
40269 case V2DImode:
40270 case V4SFmode:
40271 case V4SImode:
40272 /* These are always implementable using standard shuffle patterns. */
40273 gcc_unreachable ();
40275 case V8HImode:
40276 case V16QImode:
40277 /* These can be implemented via interleave. We save one insn by
40278 stopping once we have promoted to V4SImode and then use pshufd. */
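/* Worked example: broadcasting element 5 of a V16QImode vector first
   uses punpcklbw (5 < 8), duplicating byte 5 into word 5, then
   punpckhwd (5 >= 4, elt becomes 1), leaving dword 1 filled with four
   copies of the byte; the final pshufd then broadcasts dword 1.  */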
40281 rtx dest;
40282 rtx (*gen) (rtx, rtx, rtx)
40283 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40284 : gen_vec_interleave_lowv8hi;
40286 if (elt >= nelt2)
40288 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40289 : gen_vec_interleave_highv8hi;
40290 elt -= nelt2;
40292 nelt2 /= 2;
40294 dest = gen_reg_rtx (vmode);
40295 emit_insn (gen (dest, op0, op0));
40296 vmode = get_mode_wider_vector (vmode);
40297 op0 = gen_lowpart (vmode, dest);
40299 while (vmode != V4SImode);
40301 memset (perm2, elt, 4);
40302 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40303 d->testing_p);
40304 gcc_assert (ok);
40305 return true;
40307 case V32QImode:
40308 case V16HImode:
40309 case V8SImode:
40310 case V4DImode:
40311 /* For AVX2 broadcasts of the first element vpbroadcast* or
40312 vpermq should be used by expand_vec_perm_1. */
40313 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40314 return false;
40316 default:
40317 gcc_unreachable ();
40321 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40322 broadcast permutations. */
40324 static bool
40325 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40327 unsigned i, elt, nelt = d->nelt;
40329 if (!d->one_operand_p)
40330 return false;
40332 elt = d->perm[0];
40333 for (i = 1; i < nelt; ++i)
40334 if (d->perm[i] != elt)
40335 return false;
40337 return expand_vec_perm_broadcast_1 (d);
40340 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
40341 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40342 all the shorter instruction sequences. */
40344 static bool
40345 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40347 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40348 unsigned int i, nelt, eltsz;
40349 bool used[4];
40351 if (!TARGET_AVX2
40352 || d->one_operand_p
40353 || (d->vmode != V32QImode && d->vmode != V16HImode))
40354 return false;
40356 if (d->testing_p)
40357 return true;
40359 nelt = d->nelt;
40360 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40362 /* Generate 4 permutation masks. If the required element is within
40363 the same lane, it is shuffled in. If the required element is from
40364 the other lane, force a zero by setting bit 7 in the permutation mask.
40365 The cross-lane masks have non-negative elements where an element is
40366 requested from the other lane; such elements are also moved to the
40367 other lane, so that the result of vpshufb can have its two V2TImode
40368 halves swapped. */
40369 m128 = GEN_INT (-128);
40370 for (i = 0; i < 32; ++i)
40372 rperm[0][i] = m128;
40373 rperm[1][i] = m128;
40374 rperm[2][i] = m128;
40375 rperm[3][i] = m128;
40377 used[0] = false;
40378 used[1] = false;
40379 used[2] = false;
40380 used[3] = false;
40381 for (i = 0; i < nelt; ++i)
40383 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40384 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40385 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40387 for (j = 0; j < eltsz; ++j)
40388 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40389 used[which] = true;
40392 for (i = 0; i < 2; ++i)
40394 if (!used[2 * i + 1])
40396 h[i] = NULL_RTX;
40397 continue;
40399 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40400 gen_rtvec_v (32, rperm[2 * i + 1]));
40401 vperm = force_reg (V32QImode, vperm);
40402 h[i] = gen_reg_rtx (V32QImode);
40403 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40404 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40407 /* Swap the 128-bit lanes of h[X]. */
40408 for (i = 0; i < 2; ++i)
40410 if (h[i] == NULL_RTX)
40411 continue;
40412 op = gen_reg_rtx (V4DImode);
40413 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40414 const2_rtx, GEN_INT (3), const0_rtx,
40415 const1_rtx));
40416 h[i] = gen_lowpart (V32QImode, op);
40419 for (i = 0; i < 2; ++i)
40421 if (!used[2 * i])
40423 l[i] = NULL_RTX;
40424 continue;
40426 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40427 vperm = force_reg (V32QImode, vperm);
40428 l[i] = gen_reg_rtx (V32QImode);
40429 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40430 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40433 for (i = 0; i < 2; ++i)
40435 if (h[i] && l[i])
40437 op = gen_reg_rtx (V32QImode);
40438 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40439 l[i] = op;
40441 else if (h[i])
40442 l[i] = h[i];
40445 gcc_assert (l[0] && l[1]);
40446 op = gen_lowpart (V32QImode, d->target);
40447 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40448 return true;
40451 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40452 With all of the interface bits taken care of, perform the expansion
40453 in D and return true on success. */
40455 static bool
40456 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40458 /* Try a single instruction expansion. */
40459 if (expand_vec_perm_1 (d))
40460 return true;
40462 /* Try sequences of two instructions. */
40464 if (expand_vec_perm_pshuflw_pshufhw (d))
40465 return true;
40467 if (expand_vec_perm_palignr (d))
40468 return true;
40470 if (expand_vec_perm_interleave2 (d))
40471 return true;
40473 if (expand_vec_perm_broadcast (d))
40474 return true;
40476 if (expand_vec_perm_vpermq_perm_1 (d))
40477 return true;
40479 if (expand_vec_perm_vperm2f128 (d))
40480 return true;
40482 /* Try sequences of three instructions. */
40484 if (expand_vec_perm_2vperm2f128_vshuf (d))
40485 return true;
40487 if (expand_vec_perm_pshufb2 (d))
40488 return true;
40490 if (expand_vec_perm_interleave3 (d))
40491 return true;
40493 if (expand_vec_perm_vperm2f128_vblend (d))
40494 return true;
40496 /* Try sequences of four instructions. */
40498 if (expand_vec_perm_vpshufb2_vpermq (d))
40499 return true;
40501 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40502 return true;
40504 /* ??? Look for narrow permutations whose element orderings would
40505 allow the promotion to a wider mode. */
40507 /* ??? Look for sequences of interleave or a wider permute that place
40508 the data into the correct lanes for a half-vector shuffle like
40509 pshuf[lh]w or vpermilps. */
40511 /* ??? Look for sequences of interleave that produce the desired results.
40512 The combinatorics of punpck[lh] get pretty ugly... */
40514 if (expand_vec_perm_even_odd (d))
40515 return true;
40517 /* Even longer sequences. */
40518 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40519 return true;
40521 return false;
40524 /* If a permutation only uses one operand, make it clear. Returns true
40525 if the permutation references both operands. */
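/* Worked example: with nelt == 4, a selector {4, 6, 5, 7} references
   only the second operand (which == 2), so it is rewritten as
   {0, 2, 1, 3} and op0 is replaced by op1; a selector that names both
   operands while op0 and op1 are the same rtx is folded the same way.  */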
40527 static bool
40528 canonicalize_perm (struct expand_vec_perm_d *d)
40530 int i, which, nelt = d->nelt;
40532 for (i = which = 0; i < nelt; ++i)
40533 which |= (d->perm[i] < nelt ? 1 : 2);
40535 d->one_operand_p = true;
40536 switch (which)
40538 default:
40539 gcc_unreachable();
40541 case 3:
40542 if (!rtx_equal_p (d->op0, d->op1))
40544 d->one_operand_p = false;
40545 break;
40547 /* The elements of PERM do not suggest that only the first operand
40548 is used, but both operands are identical. Allow easier matching
40549 of the permutation by folding the permutation into the single
40550 input vector. */
40551 /* FALLTHRU */
40553 case 2:
40554 for (i = 0; i < nelt; ++i)
40555 d->perm[i] &= nelt - 1;
40556 d->op0 = d->op1;
40557 break;
40559 case 1:
40560 d->op1 = d->op0;
40561 break;
40564 return (which == 3);
40567 bool
40568 ix86_expand_vec_perm_const (rtx operands[4])
40570 struct expand_vec_perm_d d;
40571 unsigned char perm[MAX_VECT_LEN];
40572 int i, nelt;
40573 bool two_args;
40574 rtx sel;
40576 d.target = operands[0];
40577 d.op0 = operands[1];
40578 d.op1 = operands[2];
40579 sel = operands[3];
40581 d.vmode = GET_MODE (d.target);
40582 gcc_assert (VECTOR_MODE_P (d.vmode));
40583 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40584 d.testing_p = false;
40586 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40587 gcc_assert (XVECLEN (sel, 0) == nelt);
40588 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40590 for (i = 0; i < nelt; ++i)
40592 rtx e = XVECEXP (sel, 0, i);
40593 int ei = INTVAL (e) & (2 * nelt - 1);
40594 d.perm[i] = ei;
40595 perm[i] = ei;
40598 two_args = canonicalize_perm (&d);
40600 if (ix86_expand_vec_perm_const_1 (&d))
40601 return true;
40603 /* If the selector says both arguments are needed, but the operands are the
40604 same, the above tried to expand with one_operand_p and flattened selector.
40605 If that didn't work, retry without one_operand_p; we succeeded with that
40606 during testing. */
40607 if (two_args && d.one_operand_p)
40609 d.one_operand_p = false;
40610 memcpy (d.perm, perm, sizeof (perm));
40611 return ix86_expand_vec_perm_const_1 (&d);
40614 return false;
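/* Editorial note (illustrative, not from the original sources): the loop
   above reduces each selector constant modulo 2 * nelt before recording it.
   For a V4SImode permutation (nelt == 4) a selector entry of 9 therefore
   becomes 9 & 7 == 1 and selects element 1 of op0, while an entry of 5
   stays 5 and selects element 1 of op1 (indices >= nelt address op1).  */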
40617 /* Implement targetm.vectorize.vec_perm_const_ok. */
40619 static bool
40620 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40621 const unsigned char *sel)
40623 struct expand_vec_perm_d d;
40624 unsigned int i, nelt, which;
40625 bool ret;
40627 d.vmode = vmode;
40628 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40629 d.testing_p = true;
40631 /* Given sufficient ISA support we can just return true here
40632 for selected vector modes. */
40633 if (GET_MODE_SIZE (d.vmode) == 16)
40635 /* All implementable with a single vpperm insn. */
40636 if (TARGET_XOP)
40637 return true;
40638 /* All implementable with 2 pshufb + 1 ior. */
40639 if (TARGET_SSSE3)
40640 return true;
40641 /* All implementable with shufpd or unpck[lh]pd. */
40642 if (d.nelt == 2)
40643 return true;
40646 /* Extract the values from the vector CST into the permutation
40647 array in D. */
40648 memcpy (d.perm, sel, nelt);
40649 for (i = which = 0; i < nelt; ++i)
40651 unsigned char e = d.perm[i];
40652 gcc_assert (e < 2 * nelt);
40653 which |= (e < nelt ? 1 : 2);
40656 /* If all elements come from the second vector, fold them to the first. */
40657 if (which == 2)
40658 for (i = 0; i < nelt; ++i)
40659 d.perm[i] -= nelt;
40661 /* Check whether the mask can be applied to the vector type. */
40662 d.one_operand_p = (which != 3);
40664 /* Implementable with shufps or pshufd. */
40665 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40666 return true;
40668 /* Otherwise we have to go through the motions and see if we can
40669 figure out how to generate the requested permutation. */
40670 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40671 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40672 if (!d.one_operand_p)
40673 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40675 start_sequence ();
40676 ret = ix86_expand_vec_perm_const_1 (&d);
40677 end_sequence ();
40679 return ret;
40682 void
40683 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40685 struct expand_vec_perm_d d;
40686 unsigned i, nelt;
40688 d.target = targ;
40689 d.op0 = op0;
40690 d.op1 = op1;
40691 d.vmode = GET_MODE (targ);
40692 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40693 d.one_operand_p = false;
40694 d.testing_p = false;
40696 for (i = 0; i < nelt; ++i)
40697 d.perm[i] = i * 2 + odd;
40699 /* We'll either be able to implement the permutation directly... */
40700 if (expand_vec_perm_1 (&d))
40701 return;
40703 /* ... or we use the special-case patterns. */
40704 expand_vec_perm_even_odd_1 (&d, odd);
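/* Editorial illustration (not from the original sources): for nelt == 4
   the loop above builds the selector { odd, 2 + odd, 4 + odd, 6 + odd },
   i.e. { 0, 2, 4, 6 } to extract the even elements of the op0/op1 pair
   and { 1, 3, 5, 7 } to extract the odd ones (indices >= nelt come from
   op1).  In scalar form:

     for (i = 0; i < nelt; ++i)
       perm[i] = i * 2 + odd;
*/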
40707 static void
40708 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40710 struct expand_vec_perm_d d;
40711 unsigned i, nelt, base;
40712 bool ok;
40714 d.target = targ;
40715 d.op0 = op0;
40716 d.op1 = op1;
40717 d.vmode = GET_MODE (targ);
40718 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40719 d.one_operand_p = false;
40720 d.testing_p = false;
40722 base = high_p ? nelt / 2 : 0;
40723 for (i = 0; i < nelt / 2; ++i)
40725 d.perm[i * 2] = i + base;
40726 d.perm[i * 2 + 1] = i + base + nelt;
40729 /* Note that for AVX this isn't one instruction. */
40730 ok = ix86_expand_vec_perm_const_1 (&d);
40731 gcc_assert (ok);
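/* Editorial illustration (not from the original sources): with nelt == 8
   and high_p set, base is 4 and the loop above produces the selector
   { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the high halves of op0 and op1
   interleaved; with high_p clear it is { 0, 8, 1, 9, 2, 10, 3, 11 }.  */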
40735 /* Expand a vector operation CODE for a V*QImode in terms of the
40736 same operation on V*HImode. */
40738 void
40739 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40741 enum machine_mode qimode = GET_MODE (dest);
40742 enum machine_mode himode;
40743 rtx (*gen_il) (rtx, rtx, rtx);
40744 rtx (*gen_ih) (rtx, rtx, rtx);
40745 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40746 struct expand_vec_perm_d d;
40747 bool ok, full_interleave;
40748 bool uns_p = false;
40749 int i;
40751 switch (qimode)
40753 case V16QImode:
40754 himode = V8HImode;
40755 gen_il = gen_vec_interleave_lowv16qi;
40756 gen_ih = gen_vec_interleave_highv16qi;
40757 break;
40758 case V32QImode:
40759 himode = V16HImode;
40760 gen_il = gen_avx2_interleave_lowv32qi;
40761 gen_ih = gen_avx2_interleave_highv32qi;
40762 break;
40763 default:
40764 gcc_unreachable ();
40767 op2_l = op2_h = op2;
40768 switch (code)
40770 case MULT:
40771 /* Unpack data such that we've got a source byte in each low byte of
40772 each word. We don't care what goes into the high byte of each word.
40773 Rather than trying to get zero in there, most convenient is to let
40774 it be a copy of the low byte. */
40775 op2_l = gen_reg_rtx (qimode);
40776 op2_h = gen_reg_rtx (qimode);
40777 emit_insn (gen_il (op2_l, op2, op2));
40778 emit_insn (gen_ih (op2_h, op2, op2));
40779 /* FALLTHRU */
40781 op1_l = gen_reg_rtx (qimode);
40782 op1_h = gen_reg_rtx (qimode);
40783 emit_insn (gen_il (op1_l, op1, op1));
40784 emit_insn (gen_ih (op1_h, op1, op1));
40785 full_interleave = qimode == V16QImode;
40786 break;
40788 case ASHIFT:
40789 case LSHIFTRT:
40790 uns_p = true;
40791 /* FALLTHRU */
40792 case ASHIFTRT:
40793 op1_l = gen_reg_rtx (himode);
40794 op1_h = gen_reg_rtx (himode);
40795 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40796 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40797 full_interleave = true;
40798 break;
40799 default:
40800 gcc_unreachable ();
40803 /* Perform the operation. */
40804 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40805 1, OPTAB_DIRECT);
40806 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40807 1, OPTAB_DIRECT);
40808 gcc_assert (res_l && res_h);
40810 /* Merge the data back into the right place. */
40811 d.target = dest;
40812 d.op0 = gen_lowpart (qimode, res_l);
40813 d.op1 = gen_lowpart (qimode, res_h);
40814 d.vmode = qimode;
40815 d.nelt = GET_MODE_NUNITS (qimode);
40816 d.one_operand_p = false;
40817 d.testing_p = false;
40819 if (full_interleave)
40821 /* For SSE2, we used a full interleave, so the desired
40822 results are in the even elements. */
40823 for (i = 0; i < 32; ++i)
40824 d.perm[i] = i * 2;
40826 else
40828 /* For AVX, the interleave used above was not cross-lane. So the
40829 extraction is evens but with the second and third quarter swapped.
40830 Happily, that is even one insn shorter than even extraction. */
40831 for (i = 0; i < 32; ++i)
40832 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40835 ok = ix86_expand_vec_perm_const_1 (&d);
40836 gcc_assert (ok);
40838 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40839 gen_rtx_fmt_ee (code, qimode, op1, op2));
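/* Editorial illustration (not from the original sources): in the AVX case
   above, (i & 24) picks the quarter of the 32-element result that index i
   falls in, so the even-element extraction 2*i is adjusted by +16 for the
   second quarter (i == 8..15) and by -16 for the third (i == 16..23).
   For example d.perm[8] = 32, d.perm[16] = 16, while d.perm[0] = 0 and
   d.perm[24] = 48, which swaps the middle two quarters back into place
   after the per-lane interleave.  */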
40842 void
40843 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40844 bool uns_p, bool odd_p)
40846 enum machine_mode mode = GET_MODE (op1);
40847 enum machine_mode wmode = GET_MODE (dest);
40848 rtx x;
40850 /* We only play even/odd games with vectors of SImode. */
40851 gcc_assert (mode == V4SImode || mode == V8SImode);
40853 /* If we're looking for the odd results, shift those members down to
40854 the even slots. For some cpus this is faster than a PSHUFD. */
40855 if (odd_p)
40857 /* For XOP use vpmacsdqh, but only for smult, as it is only
40858 signed. */
40859 if (TARGET_XOP && mode == V4SImode && !uns_p)
40861 x = force_reg (wmode, CONST0_RTX (wmode));
40862 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40863 return;
40866 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40867 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40868 x, NULL, 1, OPTAB_DIRECT);
40869 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40870 x, NULL, 1, OPTAB_DIRECT);
40871 op1 = gen_lowpart (mode, op1);
40872 op2 = gen_lowpart (mode, op2);
40875 if (mode == V8SImode)
40877 if (uns_p)
40878 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40879 else
40880 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40882 else if (uns_p)
40883 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40884 else if (TARGET_SSE4_1)
40885 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40886 else
40888 rtx s1, s2, t0, t1, t2;
40890 /* The easiest way to implement this without PMULDQ is to go through
40891 the motions as if we are performing a full 64-bit multiply, except
40892 that we need to do less shuffling of the elements. */
40894 /* Compute the sign-extension, aka highparts, of the two operands. */
40895 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40896 op1, pc_rtx, pc_rtx);
40897 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40898 op2, pc_rtx, pc_rtx);
40900 /* Multiply LO(A) * HI(B), and vice-versa. */
40901 t1 = gen_reg_rtx (wmode);
40902 t2 = gen_reg_rtx (wmode);
40903 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40904 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40906 /* Multiply LO(A) * LO(B). */
40907 t0 = gen_reg_rtx (wmode);
40908 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40910 /* Combine and shift the highparts into place. */
40911 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40912 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40913 1, OPTAB_DIRECT);
40915 /* Combine high and low parts. */
40916 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40917 return;
40919 emit_insn (x);
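/* Editorial sketch (illustrative, not from the original sources): the
   SSE2 fallback above reconstructs a signed 32x32->64 multiply from the
   unsigned even multiply plus a sign correction.  Writing A and B for the
   signed inputs and a, b for their unsigned 32-bit images, we have
   A * B == a * b - 2^32 * (a * [B < 0] + b * [A < 0]) (mod 2^64), which is
   what adding the s1/s2 mask products and shifting left by 32 computes.
   A scalar model, assuming 64-bit unsigned long long and two's complement:

     static long long
     smul_widen (int a, int b)
     {
       unsigned long long lo = (unsigned long long) (unsigned int) a
                               * (unsigned int) b;
       unsigned long long fix = 0;
       if (a < 0) fix += (unsigned int) b;   /* s1 mask selects b */
       if (b < 0) fix += (unsigned int) a;   /* s2 mask selects a */
       return (long long) (lo - (fix << 32));
     }
*/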
40922 void
40923 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40924 bool uns_p, bool high_p)
40926 enum machine_mode wmode = GET_MODE (dest);
40927 enum machine_mode mode = GET_MODE (op1);
40928 rtx t1, t2, t3, t4, mask;
40930 switch (mode)
40932 case V4SImode:
40933 t1 = gen_reg_rtx (mode);
40934 t2 = gen_reg_rtx (mode);
40935 if (TARGET_XOP && !uns_p)
40937 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40938 shuffle the elements once so that all elements are in the right
40939 place for immediate use: { A C B D }. */
40940 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40941 const1_rtx, GEN_INT (3)));
40942 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40943 const1_rtx, GEN_INT (3)));
40945 else
40947 /* Put the elements into place for the multiply. */
40948 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40949 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40950 high_p = false;
40952 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40953 break;
40955 case V8SImode:
40956 /* Shuffle the elements between the lanes. After this we
40957 have { A B E F | C D G H } for each operand. */
40958 t1 = gen_reg_rtx (V4DImode);
40959 t2 = gen_reg_rtx (V4DImode);
40960 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40961 const0_rtx, const2_rtx,
40962 const1_rtx, GEN_INT (3)));
40963 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40964 const0_rtx, const2_rtx,
40965 const1_rtx, GEN_INT (3)));
40967 /* Shuffle the elements within the lanes. After this we
40968 have { A A B B | C C D D } or { E E F F | G G H H }. */
40969 t3 = gen_reg_rtx (V8SImode);
40970 t4 = gen_reg_rtx (V8SImode);
40971 mask = GEN_INT (high_p
40972 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40973 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
40974 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40975 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40977 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40978 break;
40980 case V8HImode:
40981 case V16HImode:
40982 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40983 uns_p, OPTAB_DIRECT);
40984 t2 = expand_binop (mode,
40985 uns_p ? umul_highpart_optab : smul_highpart_optab,
40986 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40987 gcc_assert (t1 && t2);
40989 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40990 break;
40992 case V16QImode:
40993 case V32QImode:
40994 t1 = gen_reg_rtx (wmode);
40995 t2 = gen_reg_rtx (wmode);
40996 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40997 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40999 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41000 break;
41002 default:
41003 gcc_unreachable ();
41007 void
41008 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41010 rtx res_1, res_2;
41012 res_1 = gen_reg_rtx (V4SImode);
41013 res_2 = gen_reg_rtx (V4SImode);
41014 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41015 op1, op2, true, false);
41016 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41017 op1, op2, true, true);
41019 /* Move the results in element 2 down to element 1; we don't care
41020 what goes in elements 2 and 3. Then we can merge the parts
41021 back together with an interleave.
41023 Note that two other sequences were tried:
41024 (1) Use interleaves at the start instead of psrldq, which allows
41025 us to use a single shufps to merge things back at the end.
41026 (2) Use shufps here to combine the two vectors, then pshufd to
41027 put the elements in the correct order.
41028 In both cases the cost of the reformatting stall was too high
41029 and the overall sequence slower. */
41031 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41032 const0_rtx, const0_rtx));
41033 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41034 const0_rtx, const0_rtx));
41035 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41037 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41040 void
41041 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41043 enum machine_mode mode = GET_MODE (op0);
41044 rtx t1, t2, t3, t4, t5, t6;
41046 if (TARGET_XOP && mode == V2DImode)
41048 /* op1: A,B,C,D, op2: E,F,G,H */
41049 op1 = gen_lowpart (V4SImode, op1);
41050 op2 = gen_lowpart (V4SImode, op2);
41052 t1 = gen_reg_rtx (V4SImode);
41053 t2 = gen_reg_rtx (V4SImode);
41054 t3 = gen_reg_rtx (V2DImode);
41055 t4 = gen_reg_rtx (V2DImode);
41057 /* t1: B,A,D,C */
41058 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41059 GEN_INT (1),
41060 GEN_INT (0),
41061 GEN_INT (3),
41062 GEN_INT (2)));
41064 /* t2: (B*E),(A*F),(D*G),(C*H) */
41065 emit_insn (gen_mulv4si3 (t2, t1, op2));
41067 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41068 emit_insn (gen_xop_phadddq (t3, t2));
41070 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41071 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41073 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41074 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41076 else
41078 enum machine_mode nmode;
41079 rtx (*umul) (rtx, rtx, rtx);
41081 if (mode == V2DImode)
41083 umul = gen_vec_widen_umult_even_v4si;
41084 nmode = V4SImode;
41086 else if (mode == V4DImode)
41088 umul = gen_vec_widen_umult_even_v8si;
41089 nmode = V8SImode;
41091 else
41092 gcc_unreachable ();
41095 /* Multiply low parts. */
41096 t1 = gen_reg_rtx (mode);
41097 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41099 /* Shift input vectors right 32 bits so we can multiply high parts. */
41100 t6 = GEN_INT (32);
41101 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41102 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41104 /* Multiply high parts by low parts. */
41105 t4 = gen_reg_rtx (mode);
41106 t5 = gen_reg_rtx (mode);
41107 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41108 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41110 /* Combine and shift the highparts back. */
41111 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41112 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41114 /* Combine high and low parts. */
41115 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41118 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41119 gen_rtx_MULT (mode, op1, op2));
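/* Editorial sketch (illustrative, not from the original sources): the
   generic path above is the usual schoolbook decomposition of a 64-bit
   low-part product into 32-bit pieces; the hi*hi term only affects bits
   above 63 and is dropped.  Scalar model, assuming 64-bit unsigned long long:

     static unsigned long long
     mul64_lowpart (unsigned long long a, unsigned long long b)
     {
       unsigned long long alo = a & 0xffffffffULL, ahi = a >> 32;
       unsigned long long blo = b & 0xffffffffULL, bhi = b >> 32;
       return alo * blo + ((ahi * blo + bhi * alo) << 32);  /* t1 + (t4+t5)<<32 */
     }
*/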
41122 /* Expand an insert into a vector register through pinsr insn.
41123 Return true if successful. */
41125 bool
41126 ix86_expand_pinsr (rtx *operands)
41128 rtx dst = operands[0];
41129 rtx src = operands[3];
41131 unsigned int size = INTVAL (operands[1]);
41132 unsigned int pos = INTVAL (operands[2]);
41134 if (GET_CODE (dst) == SUBREG)
41136 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41137 dst = SUBREG_REG (dst);
41140 if (GET_CODE (src) == SUBREG)
41141 src = SUBREG_REG (src);
41143 switch (GET_MODE (dst))
41145 case V16QImode:
41146 case V8HImode:
41147 case V4SImode:
41148 case V2DImode:
41150 enum machine_mode srcmode, dstmode;
41151 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41153 srcmode = mode_for_size (size, MODE_INT, 0);
41155 switch (srcmode)
41157 case QImode:
41158 if (!TARGET_SSE4_1)
41159 return false;
41160 dstmode = V16QImode;
41161 pinsr = gen_sse4_1_pinsrb;
41162 break;
41164 case HImode:
41165 if (!TARGET_SSE2)
41166 return false;
41167 dstmode = V8HImode;
41168 pinsr = gen_sse2_pinsrw;
41169 break;
41171 case SImode:
41172 if (!TARGET_SSE4_1)
41173 return false;
41174 dstmode = V4SImode;
41175 pinsr = gen_sse4_1_pinsrd;
41176 break;
41178 case DImode:
41179 gcc_assert (TARGET_64BIT);
41180 if (!TARGET_SSE4_1)
41181 return false;
41182 dstmode = V2DImode;
41183 pinsr = gen_sse4_1_pinsrq;
41184 break;
41186 default:
41187 return false;
41190 dst = gen_lowpart (dstmode, dst);
41191 src = gen_lowpart (srcmode, src);
41193 pos /= size;
41195 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41196 return true;
41199 default:
41200 return false;
41204 /* This function returns the calling-ABI-specific va_list type node.
41205 It returns the FNDECL-specific va_list type. */
41207 static tree
41208 ix86_fn_abi_va_list (tree fndecl)
41210 if (!TARGET_64BIT)
41211 return va_list_type_node;
41212 gcc_assert (fndecl != NULL_TREE);
41214 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41215 return ms_va_list_type_node;
41216 else
41217 return sysv_va_list_type_node;
41220 /* Returns the canonical va_list type specified by TYPE. If there
41221 is no valid TYPE provided, it returns NULL_TREE. */
41223 static tree
41224 ix86_canonical_va_list_type (tree type)
41226 tree wtype, htype;
41228 /* Resolve references and pointers to va_list type. */
41229 if (TREE_CODE (type) == MEM_REF)
41230 type = TREE_TYPE (type);
41231 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41232 type = TREE_TYPE (type);
41233 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41234 type = TREE_TYPE (type);
41236 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41238 wtype = va_list_type_node;
41239 gcc_assert (wtype != NULL_TREE);
41240 htype = type;
41241 if (TREE_CODE (wtype) == ARRAY_TYPE)
41243 /* If va_list is an array type, the argument may have decayed
41244 to a pointer type, e.g. by being passed to another function.
41245 In that case, unwrap both types so that we can compare the
41246 underlying records. */
41247 if (TREE_CODE (htype) == ARRAY_TYPE
41248 || POINTER_TYPE_P (htype))
41250 wtype = TREE_TYPE (wtype);
41251 htype = TREE_TYPE (htype);
41254 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41255 return va_list_type_node;
41256 wtype = sysv_va_list_type_node;
41257 gcc_assert (wtype != NULL_TREE);
41258 htype = type;
41259 if (TREE_CODE (wtype) == ARRAY_TYPE)
41261 /* If va_list is an array type, the argument may have decayed
41262 to a pointer type, e.g. by being passed to another function.
41263 In that case, unwrap both types so that we can compare the
41264 underlying records. */
41265 if (TREE_CODE (htype) == ARRAY_TYPE
41266 || POINTER_TYPE_P (htype))
41268 wtype = TREE_TYPE (wtype);
41269 htype = TREE_TYPE (htype);
41272 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41273 return sysv_va_list_type_node;
41274 wtype = ms_va_list_type_node;
41275 gcc_assert (wtype != NULL_TREE);
41276 htype = type;
41277 if (TREE_CODE (wtype) == ARRAY_TYPE)
41279 /* If va_list is an array type, the argument may have decayed
41280 to a pointer type, e.g. by being passed to another function.
41281 In that case, unwrap both types so that we can compare the
41282 underlying records. */
41283 if (TREE_CODE (htype) == ARRAY_TYPE
41284 || POINTER_TYPE_P (htype))
41286 wtype = TREE_TYPE (wtype);
41287 htype = TREE_TYPE (htype);
41290 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41291 return ms_va_list_type_node;
41292 return NULL_TREE;
41294 return std_canonical_va_list_type (type);
41297 /* Iterate through the target-specific builtin types for va_list.
41298 IDX denotes the iterator, *PTREE is set to the result type of
41299 the va_list builtin, and *PNAME to its internal type.
41300 Returns zero if there is no element for this index, otherwise
41301 IDX should be increased upon the next call.
41302 Note, do not iterate a base builtin's name like __builtin_va_list.
41303 Used from c_common_nodes_and_builtins. */
41305 static int
41306 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41308 if (TARGET_64BIT)
41310 switch (idx)
41312 default:
41313 break;
41315 case 0:
41316 *ptree = ms_va_list_type_node;
41317 *pname = "__builtin_ms_va_list";
41318 return 1;
41320 case 1:
41321 *ptree = sysv_va_list_type_node;
41322 *pname = "__builtin_sysv_va_list";
41323 return 1;
41327 return 0;
41330 #undef TARGET_SCHED_DISPATCH
41331 #define TARGET_SCHED_DISPATCH has_dispatch
41332 #undef TARGET_SCHED_DISPATCH_DO
41333 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41334 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41335 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41336 #undef TARGET_SCHED_REORDER
41337 #define TARGET_SCHED_REORDER ix86_sched_reorder
41338 #undef TARGET_SCHED_ADJUST_PRIORITY
41339 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41340 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41341 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41342 ix86_dependencies_evaluation_hook
41344 /* The size of the dispatch window is the total number of bytes of
41345 object code allowed in a window. */
41346 #define DISPATCH_WINDOW_SIZE 16
41348 /* Number of dispatch windows considered for scheduling. */
41349 #define MAX_DISPATCH_WINDOWS 3
41351 /* Maximum number of instructions in a window. */
41352 #define MAX_INSN 4
41354 /* Maximum number of immediate operands in a window. */
41355 #define MAX_IMM 4
41357 /* Maximum number of immediate bits allowed in a window. */
41358 #define MAX_IMM_SIZE 128
41360 /* Maximum number of 32 bit immediates allowed in a window. */
41361 #define MAX_IMM_32 4
41363 /* Maximum number of 64 bit immediates allowed in a window. */
41364 #define MAX_IMM_64 2
41366 /* Maximum total of loads or prefetches allowed in a window. */
41367 #define MAX_LOAD 2
41369 /* Maximum total of stores allowed in a window. */
41370 #define MAX_STORE 1
41372 #undef BIG
41373 #define BIG 100
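/* Editorial note (illustrative, not from the original sources): in the
   immediate accounting used by count_num_restricted below, a 64-bit
   immediate occupies two of the MAX_IMM_32 slots.  E.g. a window that
   already holds one 64-bit immediate can accept at most
   MAX_IMM_32 - 2 == 2 further 32-bit immediates:

     ok = (num_imm_64_in_window * 2 + new_imm32_operands <= MAX_IMM_32);
*/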
41376 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41377 enum dispatch_group {
41378 disp_no_group = 0,
41379 disp_load,
41380 disp_store,
41381 disp_load_store,
41382 disp_prefetch,
41383 disp_imm,
41384 disp_imm_32,
41385 disp_imm_64,
41386 disp_branch,
41387 disp_cmp,
41388 disp_jcc,
41389 disp_last
41392 /* Number of allowable groups in a dispatch window. It is an array
41393 indexed by dispatch_group enum. 100 is used as a big number,
41394 because the number of these kinds of operations does not have any
41395 effect on the dispatch window, but we need them for other reasons in
41396 the table. */
41397 static unsigned int num_allowable_groups[disp_last] = {
41398 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41401 char group_name[disp_last + 1][16] = {
41402 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41403 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41404 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41407 /* Instruction path. */
41408 enum insn_path {
41409 no_path = 0,
41410 path_single, /* Single micro op. */
41411 path_double, /* Double micro op. */
41412 path_multi, /* Instructions with more than 2 micro ops. */
41413 last_path
41416 /* sched_insn_info defines a window to the instructions scheduled in
41417 the basic block. It contains a pointer to the insn_info table and
41418 the instruction scheduled.
41420 Windows are allocated for each basic block and are linked
41421 together. */
41422 typedef struct sched_insn_info_s {
41423 rtx insn;
41424 enum dispatch_group group;
41425 enum insn_path path;
41426 int byte_len;
41427 int imm_bytes;
41428 } sched_insn_info;
41430 /* Linked list of dispatch windows. This is a two-way list of
41431 dispatch windows of a basic block. It contains information about
41432 the number of uops in the window and the total number of
41433 instructions and of bytes in the object code for this dispatch
41434 window. */
41435 typedef struct dispatch_windows_s {
41436 int num_insn; /* Number of insns in the window. */
41437 int num_uops; /* Number of uops in the window. */
41438 int window_size; /* Number of bytes in the window. */
41439 int window_num; /* Window number, either 0 or 1. */
41440 int num_imm; /* Number of immediates in the window. */
41441 int num_imm_32; /* Number of 32 bit immediates in the window. */
41442 int num_imm_64; /* Number of 64 bit immediates in the window. */
41443 int imm_size; /* Total size in bytes of immediates in the window. */
41444 int num_loads; /* Total memory loads in the window. */
41445 int num_stores; /* Total memory stores in the window. */
41446 int violation; /* Violation exists in window. */
41447 sched_insn_info *window; /* Pointer to the window. */
41448 struct dispatch_windows_s *next;
41449 struct dispatch_windows_s *prev;
41450 } dispatch_windows;
41452 /* Immediate values used in an insn. */
41453 typedef struct imm_info_s
41455 int imm;
41456 int imm32;
41457 int imm64;
41458 } imm_info;
41460 static dispatch_windows *dispatch_window_list;
41461 static dispatch_windows *dispatch_window_list1;
41463 /* Get dispatch group of insn. */
41465 static enum dispatch_group
41466 get_mem_group (rtx insn)
41468 enum attr_memory memory;
41470 if (INSN_CODE (insn) < 0)
41471 return disp_no_group;
41472 memory = get_attr_memory (insn);
41473 if (memory == MEMORY_STORE)
41474 return disp_store;
41476 if (memory == MEMORY_LOAD)
41477 return disp_load;
41479 if (memory == MEMORY_BOTH)
41480 return disp_load_store;
41482 return disp_no_group;
41485 /* Return true if insn is a compare instruction. */
41487 static bool
41488 is_cmp (rtx insn)
41490 enum attr_type type;
41492 type = get_attr_type (insn);
41493 return (type == TYPE_TEST
41494 || type == TYPE_ICMP
41495 || type == TYPE_FCMP
41496 || GET_CODE (PATTERN (insn)) == COMPARE);
41499 /* Return true if a dispatch violation was encountered. */
41501 static bool
41502 dispatch_violation (void)
41504 if (dispatch_window_list->next)
41505 return dispatch_window_list->next->violation;
41506 return dispatch_window_list->violation;
41509 /* Return true if insn is a branch instruction. */
41511 static bool
41512 is_branch (rtx insn)
41514 return (CALL_P (insn) || JUMP_P (insn));
41517 /* Return true if insn is a prefetch instruction. */
41519 static bool
41520 is_prefetch (rtx insn)
41522 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41525 /* This function initializes a dispatch window and the list container holding a
41526 pointer to the window. */
41528 static void
41529 init_window (int window_num)
41531 int i;
41532 dispatch_windows *new_list;
41534 if (window_num == 0)
41535 new_list = dispatch_window_list;
41536 else
41537 new_list = dispatch_window_list1;
41539 new_list->num_insn = 0;
41540 new_list->num_uops = 0;
41541 new_list->window_size = 0;
41542 new_list->next = NULL;
41543 new_list->prev = NULL;
41544 new_list->window_num = window_num;
41545 new_list->num_imm = 0;
41546 new_list->num_imm_32 = 0;
41547 new_list->num_imm_64 = 0;
41548 new_list->imm_size = 0;
41549 new_list->num_loads = 0;
41550 new_list->num_stores = 0;
41551 new_list->violation = false;
41553 for (i = 0; i < MAX_INSN; i++)
41555 new_list->window[i].insn = NULL;
41556 new_list->window[i].group = disp_no_group;
41557 new_list->window[i].path = no_path;
41558 new_list->window[i].byte_len = 0;
41559 new_list->window[i].imm_bytes = 0;
41561 return;
41564 /* This function allocates and initializes a dispatch window and the
41565 list container holding a pointer to the window. */
41567 static dispatch_windows *
41568 allocate_window (void)
41570 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41571 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41573 return new_list;
41576 /* This routine initializes the dispatch scheduling information. It
41577 initiates building dispatch scheduler tables and constructs the
41578 first dispatch window. */
41580 static void
41581 init_dispatch_sched (void)
41583 /* Allocate a dispatch list and a window. */
41584 dispatch_window_list = allocate_window ();
41585 dispatch_window_list1 = allocate_window ();
41586 init_window (0);
41587 init_window (1);
41590 /* This function returns true if a branch is detected. End of a basic block
41591 does not have to be a branch, but here we assume only branches end a
41592 window. */
41594 static bool
41595 is_end_basic_block (enum dispatch_group group)
41597 return group == disp_branch;
41600 /* This function is called when the end of a window processing is reached. */
41602 static void
41603 process_end_window (void)
41605 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41606 if (dispatch_window_list->next)
41608 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41609 gcc_assert (dispatch_window_list->window_size
41610 + dispatch_window_list1->window_size <= 48);
41611 init_window (1);
41613 init_window (0);
41616 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41617 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41618 for 48 bytes of instructions. Note that these windows are not dispatch
41619 windows whose sizes are DISPATCH_WINDOW_SIZE. */
41621 static dispatch_windows *
41622 allocate_next_window (int window_num)
41624 if (window_num == 0)
41626 if (dispatch_window_list->next)
41627 init_window (1);
41628 init_window (0);
41629 return dispatch_window_list;
41632 dispatch_window_list->next = dispatch_window_list1;
41633 dispatch_window_list1->prev = dispatch_window_list;
41635 return dispatch_window_list1;
41638 /* Increment the number of immediate operands of an instruction. */
41640 static int
41641 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41643 if (*in_rtx == 0)
41644 return 0;
41646 switch (GET_CODE (*in_rtx))
41648 case CONST:
41649 case SYMBOL_REF:
41650 case CONST_INT:
41651 (imm_values->imm)++;
41652 if (x86_64_immediate_operand (*in_rtx, SImode))
41653 (imm_values->imm32)++;
41654 else
41655 (imm_values->imm64)++;
41656 break;
41658 case CONST_DOUBLE:
41659 (imm_values->imm)++;
41660 (imm_values->imm64)++;
41661 break;
41663 case CODE_LABEL:
41664 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41666 (imm_values->imm)++;
41667 (imm_values->imm32)++;
41669 break;
41671 default:
41672 break;
41675 return 0;
41678 /* Compute number of immediate operands of an instruction. */
41680 static void
41681 find_constant (rtx in_rtx, imm_info *imm_values)
41683 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41684 (rtx_function) find_constant_1, (void *) imm_values);
41687 /* Return total size of immediate operands of an instruction along with number
41688 of corresponding immediate operands. It initializes its parameters to zero
41689 before calling FIND_CONSTANT.
41690 INSN is the input instruction. IMM is the total of immediates.
41691 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41692 bit immediates. */
41694 static int
41695 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41697 imm_info imm_values = {0, 0, 0};
41699 find_constant (insn, &imm_values);
41700 *imm = imm_values.imm;
41701 *imm32 = imm_values.imm32;
41702 *imm64 = imm_values.imm64;
41703 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
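/* Editorial example (not from the original sources): an insn carrying one
   32-bit and one 64-bit immediate yields *IMM == 2, *IMM32 == 1,
   *IMM64 == 1 and a return value of 1 * 4 + 1 * 8 == 12 bytes.  */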
41706 /* This function returns true if an operand of the instruction is an
41707 immediate. */
41709 static bool
41710 has_immediate (rtx insn)
41712 int num_imm_operand;
41713 int num_imm32_operand;
41714 int num_imm64_operand;
41716 if (insn)
41717 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41718 &num_imm64_operand);
41719 return false;
41722 /* Return single or double path for instructions. */
41724 static enum insn_path
41725 get_insn_path (rtx insn)
41727 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41729 if ((int)path == 0)
41730 return path_single;
41732 if ((int)path == 1)
41733 return path_double;
41735 return path_multi;
41738 /* Return insn dispatch group. */
41740 static enum dispatch_group
41741 get_insn_group (rtx insn)
41743 enum dispatch_group group = get_mem_group (insn);
41744 if (group)
41745 return group;
41747 if (is_branch (insn))
41748 return disp_branch;
41750 if (is_cmp (insn))
41751 return disp_cmp;
41753 if (has_immediate (insn))
41754 return disp_imm;
41756 if (is_prefetch (insn))
41757 return disp_prefetch;
41759 return disp_no_group;
41762 /* Count number of GROUP restricted instructions in a dispatch
41763 window WINDOW_LIST. */
41765 static int
41766 count_num_restricted (rtx insn, dispatch_windows *window_list)
41768 enum dispatch_group group = get_insn_group (insn);
41769 int imm_size;
41770 int num_imm_operand;
41771 int num_imm32_operand;
41772 int num_imm64_operand;
41774 if (group == disp_no_group)
41775 return 0;
41777 if (group == disp_imm)
41779 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41780 &num_imm64_operand);
41781 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41782 || num_imm_operand + window_list->num_imm > MAX_IMM
41783 || (num_imm32_operand > 0
41784 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41785 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41786 || (num_imm64_operand > 0
41787 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41788 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41789 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41790 && num_imm64_operand > 0
41791 && ((window_list->num_imm_64 > 0
41792 && window_list->num_insn >= 2)
41793 || window_list->num_insn >= 3)))
41794 return BIG;
41796 return 1;
41799 if ((group == disp_load_store
41800 && (window_list->num_loads >= MAX_LOAD
41801 || window_list->num_stores >= MAX_STORE))
41802 || ((group == disp_load
41803 || group == disp_prefetch)
41804 && window_list->num_loads >= MAX_LOAD)
41805 || (group == disp_store
41806 && window_list->num_stores >= MAX_STORE))
41807 return BIG;
41809 return 1;
41812 /* This function returns true if insn satisfies dispatch rules on the
41813 last window scheduled. */
41815 static bool
41816 fits_dispatch_window (rtx insn)
41818 dispatch_windows *window_list = dispatch_window_list;
41819 dispatch_windows *window_list_next = dispatch_window_list->next;
41820 unsigned int num_restrict;
41821 enum dispatch_group group = get_insn_group (insn);
41822 enum insn_path path = get_insn_path (insn);
41823 int sum;
41825 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41826 instructions should be given the lowest priority in the
41827 scheduling process in Haifa scheduler to make sure they will be
41828 scheduled in the same dispatch window as the reference to them. */
41829 if (group == disp_jcc || group == disp_cmp)
41830 return false;
41832 /* Check nonrestricted. */
41833 if (group == disp_no_group || group == disp_branch)
41834 return true;
41836 /* Get last dispatch window. */
41837 if (window_list_next)
41838 window_list = window_list_next;
41840 if (window_list->window_num == 1)
41842 sum = window_list->prev->window_size + window_list->window_size;
41844 if (sum == 32
41845 || (min_insn_size (insn) + sum) >= 48)
41846 /* Window 1 is full. Go for next window. */
41847 return true;
41850 num_restrict = count_num_restricted (insn, window_list);
41852 if (num_restrict > num_allowable_groups[group])
41853 return false;
41855 /* See if it fits in the first window. */
41856 if (window_list->window_num == 0)
41858 /* The first window should have only single and double path
41859 uops. */
41860 if (path == path_double
41861 && (window_list->num_uops + 2) > MAX_INSN)
41862 return false;
41863 else if (path != path_single)
41864 return false;
41866 return true;
41869 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41870 dispatch window WINDOW_LIST. */
41872 static void
41873 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41875 int byte_len = min_insn_size (insn);
41876 int num_insn = window_list->num_insn;
41877 int imm_size;
41878 sched_insn_info *window = window_list->window;
41879 enum dispatch_group group = get_insn_group (insn);
41880 enum insn_path path = get_insn_path (insn);
41881 int num_imm_operand;
41882 int num_imm32_operand;
41883 int num_imm64_operand;
41885 if (!window_list->violation && group != disp_cmp
41886 && !fits_dispatch_window (insn))
41887 window_list->violation = true;
41889 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41890 &num_imm64_operand);
41892 /* Initialize window with new instruction. */
41893 window[num_insn].insn = insn;
41894 window[num_insn].byte_len = byte_len;
41895 window[num_insn].group = group;
41896 window[num_insn].path = path;
41897 window[num_insn].imm_bytes = imm_size;
41899 window_list->window_size += byte_len;
41900 window_list->num_insn = num_insn + 1;
41901 window_list->num_uops = window_list->num_uops + num_uops;
41902 window_list->imm_size += imm_size;
41903 window_list->num_imm += num_imm_operand;
41904 window_list->num_imm_32 += num_imm32_operand;
41905 window_list->num_imm_64 += num_imm64_operand;
41907 if (group == disp_store)
41908 window_list->num_stores += 1;
41909 else if (group == disp_load
41910 || group == disp_prefetch)
41911 window_list->num_loads += 1;
41912 else if (group == disp_load_store)
41914 window_list->num_stores += 1;
41915 window_list->num_loads += 1;
41919 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41920 If the total bytes of instructions or the number of instructions in
41921 the window exceed the allowable limit, it allocates a new window. */
41923 static void
41924 add_to_dispatch_window (rtx insn)
41926 int byte_len;
41927 dispatch_windows *window_list;
41928 dispatch_windows *next_list;
41929 dispatch_windows *window0_list;
41930 enum insn_path path;
41931 enum dispatch_group insn_group;
41932 bool insn_fits;
41933 int num_insn;
41934 int num_uops;
41935 int window_num;
41936 int insn_num_uops;
41937 int sum;
41939 if (INSN_CODE (insn) < 0)
41940 return;
41942 byte_len = min_insn_size (insn);
41943 window_list = dispatch_window_list;
41944 next_list = window_list->next;
41945 path = get_insn_path (insn);
41946 insn_group = get_insn_group (insn);
41948 /* Get the last dispatch window. */
41949 if (next_list)
41950 window_list = dispatch_window_list->next;
41952 if (path == path_single)
41953 insn_num_uops = 1;
41954 else if (path == path_double)
41955 insn_num_uops = 2;
41956 else
41957 insn_num_uops = (int) path;
41959 /* If current window is full, get a new window.
41960 Window number zero is full if MAX_INSN uops are scheduled in it.
41961 Window number one is full if window zero's bytes plus window
41962 one's bytes equal 32, or if adding the bytes of the new instruction
41963 to the total makes it greater than 48, or if it already has MAX_INSN
41964 instructions in it. */
41965 num_insn = window_list->num_insn;
41966 num_uops = window_list->num_uops;
41967 window_num = window_list->window_num;
41968 insn_fits = fits_dispatch_window (insn);
41970 if (num_insn >= MAX_INSN
41971 || num_uops + insn_num_uops > MAX_INSN
41972 || !(insn_fits))
41974 window_num = ~window_num & 1;
41975 window_list = allocate_next_window (window_num);
41978 if (window_num == 0)
41980 add_insn_window (insn, window_list, insn_num_uops);
41981 if (window_list->num_insn >= MAX_INSN
41982 && insn_group == disp_branch)
41984 process_end_window ();
41985 return;
41988 else if (window_num == 1)
41990 window0_list = window_list->prev;
41991 sum = window0_list->window_size + window_list->window_size;
41992 if (sum == 32
41993 || (byte_len + sum) >= 48)
41995 process_end_window ();
41996 window_list = dispatch_window_list;
41999 add_insn_window (insn, window_list, insn_num_uops);
42001 else
42002 gcc_unreachable ();
42004 if (is_end_basic_block (insn_group))
42007 /* End of basic block is reached; do the end-of-basic-block processing. */
42007 process_end_window ();
42008 return;
42012 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42014 DEBUG_FUNCTION static void
42015 debug_dispatch_window_file (FILE *file, int window_num)
42017 dispatch_windows *list;
42018 int i;
42020 if (window_num == 0)
42021 list = dispatch_window_list;
42022 else
42023 list = dispatch_window_list1;
42025 fprintf (file, "Window #%d:\n", list->window_num);
42026 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42027 list->num_insn, list->num_uops, list->window_size);
42028 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42029 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42031 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42032 list->num_stores);
42033 fprintf (file, " insn info:\n");
42035 for (i = 0; i < MAX_INSN; i++)
42037 if (!list->window[i].insn)
42038 break;
42039 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42040 i, group_name[list->window[i].group],
42041 i, (void *)list->window[i].insn,
42042 i, list->window[i].path,
42043 i, list->window[i].byte_len,
42044 i, list->window[i].imm_bytes);
42048 /* Print to stdout a dispatch window. */
42050 DEBUG_FUNCTION void
42051 debug_dispatch_window (int window_num)
42053 debug_dispatch_window_file (stdout, window_num);
42056 /* Print INSN dispatch information to FILE. */
42058 DEBUG_FUNCTION static void
42059 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42061 int byte_len;
42062 enum insn_path path;
42063 enum dispatch_group group;
42064 int imm_size;
42065 int num_imm_operand;
42066 int num_imm32_operand;
42067 int num_imm64_operand;
42069 if (INSN_CODE (insn) < 0)
42070 return;
42072 byte_len = min_insn_size (insn);
42073 path = get_insn_path (insn);
42074 group = get_insn_group (insn);
42075 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42076 &num_imm64_operand);
42078 fprintf (file, " insn info:\n");
42079 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42080 group_name[group], path, byte_len);
42081 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42082 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42085 /* Print to STDOUT the status of the ready list with respect to
42086 dispatch windows. */
42088 DEBUG_FUNCTION void
42089 debug_ready_dispatch (void)
42091 int i;
42092 int no_ready = number_in_ready ();
42094 fprintf (stdout, "Number of ready: %d\n", no_ready);
42096 for (i = 0; i < no_ready; i++)
42097 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42100 /* This routine is the driver of the dispatch scheduler. */
42102 static void
42103 do_dispatch (rtx insn, int mode)
42105 if (mode == DISPATCH_INIT)
42106 init_dispatch_sched ();
42107 else if (mode == ADD_TO_DISPATCH_WINDOW)
42108 add_to_dispatch_window (insn);
42111 /* Return TRUE if Dispatch Scheduling is supported. */
42113 static bool
42114 has_dispatch (rtx insn, int action)
42116 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42117 && flag_dispatch_scheduler)
42118 switch (action)
42120 default:
42121 return false;
42123 case IS_DISPATCH_ON:
42124 return true;
42125 break;
42127 case IS_CMP:
42128 return is_cmp (insn);
42130 case DISPATCH_VIOLATION:
42131 return dispatch_violation ();
42133 case FITS_DISPATCH_WINDOW:
42134 return fits_dispatch_window (insn);
42137 return false;
42140 /* Implementation of reassociation_width target hook used by
42141 reassoc phase to identify parallelism level in reassociated
42142 tree. The statement's tree_code is passed in OPC. The arguments'
42143 type is passed in MODE.
42145 Currently parallel reassociation is enabled for Atom
42146 processors only and we set reassociation width to be 2
42147 because Atom may issue up to 2 instructions per cycle.
42149 Return value should be fixed if parallel reassociation is
42150 enabled for other processors. */
42152 static int
42153 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42154 enum machine_mode mode)
42156 int res = 1;
42158 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42159 res = 2;
42160 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42161 res = 2;
42163 return res;
42166 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42167 place emms and femms instructions. */
42169 static enum machine_mode
42170 ix86_preferred_simd_mode (enum machine_mode mode)
42172 if (!TARGET_SSE)
42173 return word_mode;
42175 switch (mode)
42177 case QImode:
42178 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42179 case HImode:
42180 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42181 case SImode:
42182 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42183 case DImode:
42184 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42186 case SFmode:
42187 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42188 return V8SFmode;
42189 else
42190 return V4SFmode;
42192 case DFmode:
42193 if (!TARGET_VECTORIZE_DOUBLE)
42194 return word_mode;
42195 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42196 return V4DFmode;
42197 else if (TARGET_SSE2)
42198 return V2DFmode;
42199 /* FALLTHRU */
42201 default:
42202 return word_mode;
42206 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42207 vectors. */
42209 static unsigned int
42210 ix86_autovectorize_vector_sizes (void)
42212 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
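/* Editorial note (illustrative, not from the original sources): the hook
   above returns a bitmask of vector sizes in bytes, so 32 | 16 == 48
   advertises both 256-bit and 128-bit candidates to the vectorizer, while
   0 is understood as "use only the preferred SIMD mode".  */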
42217 /* Return the class of registers which could be used for a pseudo of MODE
42218 and of class RCLASS for spilling instead of memory. Return NO_REGS
42219 if it is not possible or not profitable. */
42220 static reg_class_t
42221 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42223 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42224 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42225 && INTEGER_CLASS_P (rclass))
42226 return SSE_REGS;
42227 return NO_REGS;
42230 /* Implement targetm.vectorize.init_cost. */
42232 static void *
42233 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42235 unsigned *cost = XNEWVEC (unsigned, 3);
42236 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42237 return cost;
42240 /* Implement targetm.vectorize.add_stmt_cost. */
42242 static unsigned
42243 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42244 struct _stmt_vec_info *stmt_info, int misalign,
42245 enum vect_cost_model_location where)
42247 unsigned *cost = (unsigned *) data;
42248 unsigned retval = 0;
42250 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42251 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42253 /* Statements in an inner loop relative to the loop being
42254 vectorized are weighted more heavily. The value here is
42255 arbitrary and could potentially be improved with analysis. */
42256 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42257 count *= 50; /* FIXME. */
42259 retval = (unsigned) (count * stmt_cost);
42260 cost[where] += retval;
42262 return retval;
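/* Editorial sketch (illustrative, not from the original sources): the
   vectorizer drives these hooks as init_cost -> add_stmt_cost ... ->
   finish_cost -> destroy_cost_data, and the DATA pointer is simply the
   three-bucket array allocated above.  For example, a call with
   count == 2, where == vect_body and a per-statement cost of 3 adds 6 to
   cost[vect_body] and returns 6.  */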
42265 /* Implement targetm.vectorize.finish_cost. */
42267 static void
42268 ix86_finish_cost (void *data, unsigned *prologue_cost,
42269 unsigned *body_cost, unsigned *epilogue_cost)
42271 unsigned *cost = (unsigned *) data;
42272 *prologue_cost = cost[vect_prologue];
42273 *body_cost = cost[vect_body];
42274 *epilogue_cost = cost[vect_epilogue];
42277 /* Implement targetm.vectorize.destroy_cost_data. */
42279 static void
42280 ix86_destroy_cost_data (void *data)
42282 free (data);
42285 /* Validate target specific memory model bits in VAL. */
42287 static unsigned HOST_WIDE_INT
42288 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42290 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42291 bool strong;
42293 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42294 |MEMMODEL_MASK)
42295 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42297 warning (OPT_Winvalid_memory_model,
42298 "Unknown architecture specific memory model");
42299 return MEMMODEL_SEQ_CST;
42301 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42302 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42304 warning (OPT_Winvalid_memory_model,
42305 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42306 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42308 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42310 warning (OPT_Winvalid_memory_model,
42311 "HLE_RELEASE not used with RELEASE or stronger memory model");
42312 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42314 return val;
42317 /* Initialize the GCC target structure. */
42318 #undef TARGET_RETURN_IN_MEMORY
42319 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42321 #undef TARGET_LEGITIMIZE_ADDRESS
42322 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42324 #undef TARGET_ATTRIBUTE_TABLE
42325 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42326 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
42327 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
42328 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42329 # undef TARGET_MERGE_DECL_ATTRIBUTES
42330 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42331 #endif
42333 #undef TARGET_COMP_TYPE_ATTRIBUTES
42334 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42336 #undef TARGET_INIT_BUILTINS
42337 #define TARGET_INIT_BUILTINS ix86_init_builtins
42338 #undef TARGET_BUILTIN_DECL
42339 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42340 #undef TARGET_EXPAND_BUILTIN
42341 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42343 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42344 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42345 ix86_builtin_vectorized_function
42347 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42348 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42350 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42351 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42353 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42354 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42356 #undef TARGET_BUILTIN_RECIPROCAL
42357 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42359 #undef TARGET_ASM_FUNCTION_EPILOGUE
42360 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42362 #undef TARGET_ENCODE_SECTION_INFO
42363 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42364 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42365 #else
42366 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42367 #endif
42369 #undef TARGET_ASM_OPEN_PAREN
42370 #define TARGET_ASM_OPEN_PAREN ""
42371 #undef TARGET_ASM_CLOSE_PAREN
42372 #define TARGET_ASM_CLOSE_PAREN ""
42374 #undef TARGET_ASM_BYTE_OP
42375 #define TARGET_ASM_BYTE_OP ASM_BYTE
42377 #undef TARGET_ASM_ALIGNED_HI_OP
42378 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42379 #undef TARGET_ASM_ALIGNED_SI_OP
42380 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42381 #ifdef ASM_QUAD
42382 #undef TARGET_ASM_ALIGNED_DI_OP
42383 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42384 #endif
42386 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42387 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42389 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42390 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42392 #undef TARGET_ASM_UNALIGNED_HI_OP
42393 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42394 #undef TARGET_ASM_UNALIGNED_SI_OP
42395 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42396 #undef TARGET_ASM_UNALIGNED_DI_OP
42397 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42399 #undef TARGET_PRINT_OPERAND
42400 #define TARGET_PRINT_OPERAND ix86_print_operand
42401 #undef TARGET_PRINT_OPERAND_ADDRESS
42402 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42403 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42404 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42405 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42406 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42408 #undef TARGET_SCHED_INIT_GLOBAL
42409 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42410 #undef TARGET_SCHED_ADJUST_COST
42411 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42412 #undef TARGET_SCHED_ISSUE_RATE
42413 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42414 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42415 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42416 ia32_multipass_dfa_lookahead
42418 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42419 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42421 #undef TARGET_MEMMODEL_CHECK
42422 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42424 #ifdef HAVE_AS_TLS
42425 #undef TARGET_HAVE_TLS
42426 #define TARGET_HAVE_TLS true
42427 #endif
42428 #undef TARGET_CANNOT_FORCE_CONST_MEM
42429 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42430 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42431 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42433 #undef TARGET_DELEGITIMIZE_ADDRESS
42434 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42436 #undef TARGET_MS_BITFIELD_LAYOUT_P
42437 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42439 #if TARGET_MACHO
42440 #undef TARGET_BINDS_LOCAL_P
42441 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42442 #endif
42443 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42444 #undef TARGET_BINDS_LOCAL_P
42445 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42446 #endif
42448 #undef TARGET_ASM_OUTPUT_MI_THUNK
42449 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42450 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42451 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42453 #undef TARGET_ASM_FILE_START
42454 #define TARGET_ASM_FILE_START x86_file_start
42456 #undef TARGET_OPTION_OVERRIDE
42457 #define TARGET_OPTION_OVERRIDE ix86_option_override
42459 #undef TARGET_REGISTER_MOVE_COST
42460 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42461 #undef TARGET_MEMORY_MOVE_COST
42462 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42463 #undef TARGET_RTX_COSTS
42464 #define TARGET_RTX_COSTS ix86_rtx_costs
42465 #undef TARGET_ADDRESS_COST
42466 #define TARGET_ADDRESS_COST ix86_address_cost
42468 #undef TARGET_FIXED_CONDITION_CODE_REGS
42469 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42470 #undef TARGET_CC_MODES_COMPATIBLE
42471 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42473 #undef TARGET_MACHINE_DEPENDENT_REORG
42474 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42476 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42477 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
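
/* Argument passing and calling convention hooks.  */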
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif
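
/* Function return value hooks.  */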
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
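
/* Register class and reload hooks.  */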
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
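
/* Vectorizer cost model and vector capability hooks.  */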
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
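
/* Function-specific target option and attribute hooks.  */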
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
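
/* Build the target hook vector from the TARGET_* macros defined above.  */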
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"