* i386.h (TARGET_GENERIC32, TARGET_GENERIC64): Remove.
[official-gcc.git] / gcc / config / i386 / i386.c
blobb6f3c2d748f00d73d217c66723e4cc4be22cd40c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
65 #include "context.h"
66 #include "pass_manager.h"
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
70 static rtx legitimize_pe_coff_symbol (rtx, bool);
72 #ifndef CHECK_STACK_LIMIT
73 #define CHECK_STACK_LIMIT (-1)
74 #endif
76 /* Return index of given mode in mult and division cost tables. */
77 #define MODE_INDEX(mode) \
78 ((mode) == QImode ? 0 \
79 : (mode) == HImode ? 1 \
80 : (mode) == SImode ? 2 \
81 : (mode) == DImode ? 3 \
82 : 4)
84 /* Processor costs (relative to an add) */
85 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
86 #define COSTS_N_BYTES(N) ((N) * 2)
88 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
90 static stringop_algs ix86_size_memcpy[2] = {
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
92 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
93 static stringop_algs ix86_size_memset[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
97 const
98 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
99 COSTS_N_BYTES (2), /* cost of an add instruction */
100 COSTS_N_BYTES (3), /* cost of a lea instruction */
101 COSTS_N_BYTES (2), /* variable shift costs */
102 COSTS_N_BYTES (3), /* constant shift costs */
103 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
104 COSTS_N_BYTES (3), /* HI */
105 COSTS_N_BYTES (3), /* SI */
106 COSTS_N_BYTES (3), /* DI */
107 COSTS_N_BYTES (5)}, /* other */
108 0, /* cost of multiply per each bit set */
109 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 COSTS_N_BYTES (3), /* cost of movsx */
115 COSTS_N_BYTES (3), /* cost of movzx */
116 0, /* "large" insn */
117 2, /* MOVE_RATIO */
118 2, /* cost for loading QImode using movzbl */
119 {2, 2, 2}, /* cost of loading integer registers
120 in QImode, HImode and SImode.
121 Relative to reg-reg move (2). */
122 {2, 2, 2}, /* cost of storing integer registers */
123 2, /* cost of reg,reg fld/fst */
124 {2, 2, 2}, /* cost of loading fp registers
125 in SFmode, DFmode and XFmode */
126 {2, 2, 2}, /* cost of storing fp registers
127 in SFmode, DFmode and XFmode */
128 3, /* cost of moving MMX register */
129 {3, 3}, /* cost of loading MMX registers
130 in SImode and DImode */
131 {3, 3}, /* cost of storing MMX registers
132 in SImode and DImode */
133 3, /* cost of moving SSE register */
134 {3, 3, 3}, /* cost of loading SSE registers
135 in SImode, DImode and TImode */
136 {3, 3, 3}, /* cost of storing SSE registers
137 in SImode, DImode and TImode */
138 3, /* MMX or SSE register to integer */
139 0, /* size of l1 cache */
140 0, /* size of l2 cache */
141 0, /* size of prefetch block */
142 0, /* number of parallel prefetches */
143 2, /* Branch cost */
144 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
145 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
146 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
147 COSTS_N_BYTES (2), /* cost of FABS instruction. */
148 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
149 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
150 ix86_size_memcpy,
151 ix86_size_memset,
152 1, /* scalar_stmt_cost. */
153 1, /* scalar load_cost. */
154 1, /* scalar_store_cost. */
155 1, /* vec_stmt_cost. */
156 1, /* vec_to_scalar_cost. */
157 1, /* scalar_to_vec_cost. */
158 1, /* vec_align_load_cost. */
159 1, /* vec_unalign_load_cost. */
160 1, /* vec_store_cost. */
161 1, /* cond_taken_branch_cost. */
162 1, /* cond_not_taken_branch_cost. */
165 /* Processor costs (relative to an add) */
166 static stringop_algs i386_memcpy[2] = {
167 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
168 DUMMY_STRINGOP_ALGS};
169 static stringop_algs i386_memset[2] = {
170 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
171 DUMMY_STRINGOP_ALGS};
173 static const
174 struct processor_costs i386_cost = { /* 386 specific costs */
175 COSTS_N_INSNS (1), /* cost of an add instruction */
176 COSTS_N_INSNS (1), /* cost of a lea instruction */
177 COSTS_N_INSNS (3), /* variable shift costs */
178 COSTS_N_INSNS (2), /* constant shift costs */
179 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
180 COSTS_N_INSNS (6), /* HI */
181 COSTS_N_INSNS (6), /* SI */
182 COSTS_N_INSNS (6), /* DI */
183 COSTS_N_INSNS (6)}, /* other */
184 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
185 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
186 COSTS_N_INSNS (23), /* HI */
187 COSTS_N_INSNS (23), /* SI */
188 COSTS_N_INSNS (23), /* DI */
189 COSTS_N_INSNS (23)}, /* other */
190 COSTS_N_INSNS (3), /* cost of movsx */
191 COSTS_N_INSNS (2), /* cost of movzx */
192 15, /* "large" insn */
193 3, /* MOVE_RATIO */
194 4, /* cost for loading QImode using movzbl */
195 {2, 4, 2}, /* cost of loading integer registers
196 in QImode, HImode and SImode.
197 Relative to reg-reg move (2). */
198 {2, 4, 2}, /* cost of storing integer registers */
199 2, /* cost of reg,reg fld/fst */
200 {8, 8, 8}, /* cost of loading fp registers
201 in SFmode, DFmode and XFmode */
202 {8, 8, 8}, /* cost of storing fp registers
203 in SFmode, DFmode and XFmode */
204 2, /* cost of moving MMX register */
205 {4, 8}, /* cost of loading MMX registers
206 in SImode and DImode */
207 {4, 8}, /* cost of storing MMX registers
208 in SImode and DImode */
209 2, /* cost of moving SSE register */
210 {4, 8, 16}, /* cost of loading SSE registers
211 in SImode, DImode and TImode */
212 {4, 8, 16}, /* cost of storing SSE registers
213 in SImode, DImode and TImode */
214 3, /* MMX or SSE register to integer */
215 0, /* size of l1 cache */
216 0, /* size of l2 cache */
217 0, /* size of prefetch block */
218 0, /* number of parallel prefetches */
219 1, /* Branch cost */
220 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
221 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
222 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
223 COSTS_N_INSNS (22), /* cost of FABS instruction. */
224 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
225 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
226 i386_memcpy,
227 i386_memset,
228 1, /* scalar_stmt_cost. */
229 1, /* scalar load_cost. */
230 1, /* scalar_store_cost. */
231 1, /* vec_stmt_cost. */
232 1, /* vec_to_scalar_cost. */
233 1, /* scalar_to_vec_cost. */
234 1, /* vec_align_load_cost. */
235 2, /* vec_unalign_load_cost. */
236 1, /* vec_store_cost. */
237 3, /* cond_taken_branch_cost. */
238 1, /* cond_not_taken_branch_cost. */
241 static stringop_algs i486_memcpy[2] = {
242 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
243 DUMMY_STRINGOP_ALGS};
244 static stringop_algs i486_memset[2] = {
245 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
246 DUMMY_STRINGOP_ALGS};
248 static const
249 struct processor_costs i486_cost = { /* 486 specific costs */
250 COSTS_N_INSNS (1), /* cost of an add instruction */
251 COSTS_N_INSNS (1), /* cost of a lea instruction */
252 COSTS_N_INSNS (3), /* variable shift costs */
253 COSTS_N_INSNS (2), /* constant shift costs */
254 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
255 COSTS_N_INSNS (12), /* HI */
256 COSTS_N_INSNS (12), /* SI */
257 COSTS_N_INSNS (12), /* DI */
258 COSTS_N_INSNS (12)}, /* other */
259 1, /* cost of multiply per each bit set */
260 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
261 COSTS_N_INSNS (40), /* HI */
262 COSTS_N_INSNS (40), /* SI */
263 COSTS_N_INSNS (40), /* DI */
264 COSTS_N_INSNS (40)}, /* other */
265 COSTS_N_INSNS (3), /* cost of movsx */
266 COSTS_N_INSNS (2), /* cost of movzx */
267 15, /* "large" insn */
268 3, /* MOVE_RATIO */
269 4, /* cost for loading QImode using movzbl */
270 {2, 4, 2}, /* cost of loading integer registers
271 in QImode, HImode and SImode.
272 Relative to reg-reg move (2). */
273 {2, 4, 2}, /* cost of storing integer registers */
274 2, /* cost of reg,reg fld/fst */
275 {8, 8, 8}, /* cost of loading fp registers
276 in SFmode, DFmode and XFmode */
277 {8, 8, 8}, /* cost of storing fp registers
278 in SFmode, DFmode and XFmode */
279 2, /* cost of moving MMX register */
280 {4, 8}, /* cost of loading MMX registers
281 in SImode and DImode */
282 {4, 8}, /* cost of storing MMX registers
283 in SImode and DImode */
284 2, /* cost of moving SSE register */
285 {4, 8, 16}, /* cost of loading SSE registers
286 in SImode, DImode and TImode */
287 {4, 8, 16}, /* cost of storing SSE registers
288 in SImode, DImode and TImode */
289 3, /* MMX or SSE register to integer */
290 4, /* size of l1 cache. 486 has 8kB cache
291 shared for code and data, so 4kB is
292 not really precise. */
293 4, /* size of l2 cache */
294 0, /* size of prefetch block */
295 0, /* number of parallel prefetches */
296 1, /* Branch cost */
297 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
298 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
299 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
300 COSTS_N_INSNS (3), /* cost of FABS instruction. */
301 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
302 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
303 i486_memcpy,
304 i486_memset,
305 1, /* scalar_stmt_cost. */
306 1, /* scalar load_cost. */
307 1, /* scalar_store_cost. */
308 1, /* vec_stmt_cost. */
309 1, /* vec_to_scalar_cost. */
310 1, /* scalar_to_vec_cost. */
311 1, /* vec_align_load_cost. */
312 2, /* vec_unalign_load_cost. */
313 1, /* vec_store_cost. */
314 3, /* cond_taken_branch_cost. */
315 1, /* cond_not_taken_branch_cost. */
318 static stringop_algs pentium_memcpy[2] = {
319 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
320 DUMMY_STRINGOP_ALGS};
321 static stringop_algs pentium_memset[2] = {
322 {libcall, {{-1, rep_prefix_4_byte, false}}},
323 DUMMY_STRINGOP_ALGS};
325 static const
326 struct processor_costs pentium_cost = {
327 COSTS_N_INSNS (1), /* cost of an add instruction */
328 COSTS_N_INSNS (1), /* cost of a lea instruction */
329 COSTS_N_INSNS (4), /* variable shift costs */
330 COSTS_N_INSNS (1), /* constant shift costs */
331 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
332 COSTS_N_INSNS (11), /* HI */
333 COSTS_N_INSNS (11), /* SI */
334 COSTS_N_INSNS (11), /* DI */
335 COSTS_N_INSNS (11)}, /* other */
336 0, /* cost of multiply per each bit set */
337 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
338 COSTS_N_INSNS (25), /* HI */
339 COSTS_N_INSNS (25), /* SI */
340 COSTS_N_INSNS (25), /* DI */
341 COSTS_N_INSNS (25)}, /* other */
342 COSTS_N_INSNS (3), /* cost of movsx */
343 COSTS_N_INSNS (2), /* cost of movzx */
344 8, /* "large" insn */
345 6, /* MOVE_RATIO */
346 6, /* cost for loading QImode using movzbl */
347 {2, 4, 2}, /* cost of loading integer registers
348 in QImode, HImode and SImode.
349 Relative to reg-reg move (2). */
350 {2, 4, 2}, /* cost of storing integer registers */
351 2, /* cost of reg,reg fld/fst */
352 {2, 2, 6}, /* cost of loading fp registers
353 in SFmode, DFmode and XFmode */
354 {4, 4, 6}, /* cost of storing fp registers
355 in SFmode, DFmode and XFmode */
356 8, /* cost of moving MMX register */
357 {8, 8}, /* cost of loading MMX registers
358 in SImode and DImode */
359 {8, 8}, /* cost of storing MMX registers
360 in SImode and DImode */
361 2, /* cost of moving SSE register */
362 {4, 8, 16}, /* cost of loading SSE registers
363 in SImode, DImode and TImode */
364 {4, 8, 16}, /* cost of storing SSE registers
365 in SImode, DImode and TImode */
366 3, /* MMX or SSE register to integer */
367 8, /* size of l1 cache. */
368 8, /* size of l2 cache */
369 0, /* size of prefetch block */
370 0, /* number of parallel prefetches */
371 2, /* Branch cost */
372 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
373 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
374 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
375 COSTS_N_INSNS (1), /* cost of FABS instruction. */
376 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
377 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
378 pentium_memcpy,
379 pentium_memset,
380 1, /* scalar_stmt_cost. */
381 1, /* scalar load_cost. */
382 1, /* scalar_store_cost. */
383 1, /* vec_stmt_cost. */
384 1, /* vec_to_scalar_cost. */
385 1, /* scalar_to_vec_cost. */
386 1, /* vec_align_load_cost. */
387 2, /* vec_unalign_load_cost. */
388 1, /* vec_store_cost. */
389 3, /* cond_taken_branch_cost. */
390 1, /* cond_not_taken_branch_cost. */
393 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
394 (we ensure the alignment). For small blocks inline loop is still a
395 noticeable win, for bigger blocks either rep movsl or rep movsb is
396 way to go. Rep movsb has apparently more expensive startup time in CPU,
397 but after 4K the difference is down in the noise. */
398 static stringop_algs pentiumpro_memcpy[2] = {
399 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
400 {8192, rep_prefix_4_byte, false},
401 {-1, rep_prefix_1_byte, false}}},
402 DUMMY_STRINGOP_ALGS};
403 static stringop_algs pentiumpro_memset[2] = {
404 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
405 {8192, rep_prefix_4_byte, false},
406 {-1, libcall, false}}},
407 DUMMY_STRINGOP_ALGS};
408 static const
409 struct processor_costs pentiumpro_cost = {
410 COSTS_N_INSNS (1), /* cost of an add instruction */
411 COSTS_N_INSNS (1), /* cost of a lea instruction */
412 COSTS_N_INSNS (1), /* variable shift costs */
413 COSTS_N_INSNS (1), /* constant shift costs */
414 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
415 COSTS_N_INSNS (4), /* HI */
416 COSTS_N_INSNS (4), /* SI */
417 COSTS_N_INSNS (4), /* DI */
418 COSTS_N_INSNS (4)}, /* other */
419 0, /* cost of multiply per each bit set */
420 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
421 COSTS_N_INSNS (17), /* HI */
422 COSTS_N_INSNS (17), /* SI */
423 COSTS_N_INSNS (17), /* DI */
424 COSTS_N_INSNS (17)}, /* other */
425 COSTS_N_INSNS (1), /* cost of movsx */
426 COSTS_N_INSNS (1), /* cost of movzx */
427 8, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 4, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 2, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of storing fp registers
438 in SFmode, DFmode and XFmode */
439 2, /* cost of moving MMX register */
440 {2, 2}, /* cost of loading MMX registers
441 in SImode and DImode */
442 {2, 2}, /* cost of storing MMX registers
443 in SImode and DImode */
444 2, /* cost of moving SSE register */
445 {2, 2, 8}, /* cost of loading SSE registers
446 in SImode, DImode and TImode */
447 {2, 2, 8}, /* cost of storing SSE registers
448 in SImode, DImode and TImode */
449 3, /* MMX or SSE register to integer */
450 8, /* size of l1 cache. */
451 256, /* size of l2 cache */
452 32, /* size of prefetch block */
453 6, /* number of parallel prefetches */
454 2, /* Branch cost */
455 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
456 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
457 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
458 COSTS_N_INSNS (2), /* cost of FABS instruction. */
459 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
460 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
461 pentiumpro_memcpy,
462 pentiumpro_memset,
463 1, /* scalar_stmt_cost. */
464 1, /* scalar load_cost. */
465 1, /* scalar_store_cost. */
466 1, /* vec_stmt_cost. */
467 1, /* vec_to_scalar_cost. */
468 1, /* scalar_to_vec_cost. */
469 1, /* vec_align_load_cost. */
470 2, /* vec_unalign_load_cost. */
471 1, /* vec_store_cost. */
472 3, /* cond_taken_branch_cost. */
473 1, /* cond_not_taken_branch_cost. */
476 static stringop_algs geode_memcpy[2] = {
477 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
478 DUMMY_STRINGOP_ALGS};
479 static stringop_algs geode_memset[2] = {
480 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static const
483 struct processor_costs geode_cost = {
484 COSTS_N_INSNS (1), /* cost of an add instruction */
485 COSTS_N_INSNS (1), /* cost of a lea instruction */
486 COSTS_N_INSNS (2), /* variable shift costs */
487 COSTS_N_INSNS (1), /* constant shift costs */
488 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
489 COSTS_N_INSNS (4), /* HI */
490 COSTS_N_INSNS (7), /* SI */
491 COSTS_N_INSNS (7), /* DI */
492 COSTS_N_INSNS (7)}, /* other */
493 0, /* cost of multiply per each bit set */
494 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
495 COSTS_N_INSNS (23), /* HI */
496 COSTS_N_INSNS (39), /* SI */
497 COSTS_N_INSNS (39), /* DI */
498 COSTS_N_INSNS (39)}, /* other */
499 COSTS_N_INSNS (1), /* cost of movsx */
500 COSTS_N_INSNS (1), /* cost of movzx */
501 8, /* "large" insn */
502 4, /* MOVE_RATIO */
503 1, /* cost for loading QImode using movzbl */
504 {1, 1, 1}, /* cost of loading integer registers
505 in QImode, HImode and SImode.
506 Relative to reg-reg move (2). */
507 {1, 1, 1}, /* cost of storing integer registers */
508 1, /* cost of reg,reg fld/fst */
509 {1, 1, 1}, /* cost of loading fp registers
510 in SFmode, DFmode and XFmode */
511 {4, 6, 6}, /* cost of storing fp registers
512 in SFmode, DFmode and XFmode */
514 1, /* cost of moving MMX register */
515 {1, 1}, /* cost of loading MMX registers
516 in SImode and DImode */
517 {1, 1}, /* cost of storing MMX registers
518 in SImode and DImode */
519 1, /* cost of moving SSE register */
520 {1, 1, 1}, /* cost of loading SSE registers
521 in SImode, DImode and TImode */
522 {1, 1, 1}, /* cost of storing SSE registers
523 in SImode, DImode and TImode */
524 1, /* MMX or SSE register to integer */
525 64, /* size of l1 cache. */
526 128, /* size of l2 cache. */
527 32, /* size of prefetch block */
528 1, /* number of parallel prefetches */
529 1, /* Branch cost */
530 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
531 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
532 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
533 COSTS_N_INSNS (1), /* cost of FABS instruction. */
534 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
535 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
536 geode_memcpy,
537 geode_memset,
538 1, /* scalar_stmt_cost. */
539 1, /* scalar load_cost. */
540 1, /* scalar_store_cost. */
541 1, /* vec_stmt_cost. */
542 1, /* vec_to_scalar_cost. */
543 1, /* scalar_to_vec_cost. */
544 1, /* vec_align_load_cost. */
545 2, /* vec_unalign_load_cost. */
546 1, /* vec_store_cost. */
547 3, /* cond_taken_branch_cost. */
548 1, /* cond_not_taken_branch_cost. */
551 static stringop_algs k6_memcpy[2] = {
552 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
553 DUMMY_STRINGOP_ALGS};
554 static stringop_algs k6_memset[2] = {
555 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
556 DUMMY_STRINGOP_ALGS};
557 static const
558 struct processor_costs k6_cost = {
559 COSTS_N_INSNS (1), /* cost of an add instruction */
560 COSTS_N_INSNS (2), /* cost of a lea instruction */
561 COSTS_N_INSNS (1), /* variable shift costs */
562 COSTS_N_INSNS (1), /* constant shift costs */
563 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
564 COSTS_N_INSNS (3), /* HI */
565 COSTS_N_INSNS (3), /* SI */
566 COSTS_N_INSNS (3), /* DI */
567 COSTS_N_INSNS (3)}, /* other */
568 0, /* cost of multiply per each bit set */
569 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
570 COSTS_N_INSNS (18), /* HI */
571 COSTS_N_INSNS (18), /* SI */
572 COSTS_N_INSNS (18), /* DI */
573 COSTS_N_INSNS (18)}, /* other */
574 COSTS_N_INSNS (2), /* cost of movsx */
575 COSTS_N_INSNS (2), /* cost of movzx */
576 8, /* "large" insn */
577 4, /* MOVE_RATIO */
578 3, /* cost for loading QImode using movzbl */
579 {4, 5, 4}, /* cost of loading integer registers
580 in QImode, HImode and SImode.
581 Relative to reg-reg move (2). */
582 {2, 3, 2}, /* cost of storing integer registers */
583 4, /* cost of reg,reg fld/fst */
584 {6, 6, 6}, /* cost of loading fp registers
585 in SFmode, DFmode and XFmode */
586 {4, 4, 4}, /* cost of storing fp registers
587 in SFmode, DFmode and XFmode */
588 2, /* cost of moving MMX register */
589 {2, 2}, /* cost of loading MMX registers
590 in SImode and DImode */
591 {2, 2}, /* cost of storing MMX registers
592 in SImode and DImode */
593 2, /* cost of moving SSE register */
594 {2, 2, 8}, /* cost of loading SSE registers
595 in SImode, DImode and TImode */
596 {2, 2, 8}, /* cost of storing SSE registers
597 in SImode, DImode and TImode */
598 6, /* MMX or SSE register to integer */
599 32, /* size of l1 cache. */
600 32, /* size of l2 cache. Some models
601 have integrated l2 cache, but
602 optimizing for k6 is not important
603 enough to worry about that. */
604 32, /* size of prefetch block */
605 1, /* number of parallel prefetches */
606 1, /* Branch cost */
607 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
608 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
609 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
612 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
613 k6_memcpy,
614 k6_memset,
615 1, /* scalar_stmt_cost. */
616 1, /* scalar load_cost. */
617 1, /* scalar_store_cost. */
618 1, /* vec_stmt_cost. */
619 1, /* vec_to_scalar_cost. */
620 1, /* scalar_to_vec_cost. */
621 1, /* vec_align_load_cost. */
622 2, /* vec_unalign_load_cost. */
623 1, /* vec_store_cost. */
624 3, /* cond_taken_branch_cost. */
625 1, /* cond_not_taken_branch_cost. */
628 /* For some reason, Athlon deals better with REP prefix (relative to loops)
629 compared to K8. Alignment becomes important after 8 bytes for memcpy and
630 128 bytes for memset. */
631 static stringop_algs athlon_memcpy[2] = {
632 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
633 DUMMY_STRINGOP_ALGS};
634 static stringop_algs athlon_memset[2] = {
635 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
636 DUMMY_STRINGOP_ALGS};
637 static const
638 struct processor_costs athlon_cost = {
639 COSTS_N_INSNS (1), /* cost of an add instruction */
640 COSTS_N_INSNS (2), /* cost of a lea instruction */
641 COSTS_N_INSNS (1), /* variable shift costs */
642 COSTS_N_INSNS (1), /* constant shift costs */
643 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
644 COSTS_N_INSNS (5), /* HI */
645 COSTS_N_INSNS (5), /* SI */
646 COSTS_N_INSNS (5), /* DI */
647 COSTS_N_INSNS (5)}, /* other */
648 0, /* cost of multiply per each bit set */
649 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
650 COSTS_N_INSNS (26), /* HI */
651 COSTS_N_INSNS (42), /* SI */
652 COSTS_N_INSNS (74), /* DI */
653 COSTS_N_INSNS (74)}, /* other */
654 COSTS_N_INSNS (1), /* cost of movsx */
655 COSTS_N_INSNS (1), /* cost of movzx */
656 8, /* "large" insn */
657 9, /* MOVE_RATIO */
658 4, /* cost for loading QImode using movzbl */
659 {3, 4, 3}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {3, 4, 3}, /* cost of storing integer registers */
663 4, /* cost of reg,reg fld/fst */
664 {4, 4, 12}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {6, 6, 8}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {4, 4}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {4, 4}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, /* cost of moving SSE register */
674 {4, 4, 6}, /* cost of loading SSE registers
675 in SImode, DImode and TImode */
676 {4, 4, 5}, /* cost of storing SSE registers
677 in SImode, DImode and TImode */
678 5, /* MMX or SSE register to integer */
679 64, /* size of l1 cache. */
680 256, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 6, /* number of parallel prefetches */
683 5, /* Branch cost */
684 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
685 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
686 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
687 COSTS_N_INSNS (2), /* cost of FABS instruction. */
688 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
689 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
690 athlon_memcpy,
691 athlon_memset,
692 1, /* scalar_stmt_cost. */
693 1, /* scalar load_cost. */
694 1, /* scalar_store_cost. */
695 1, /* vec_stmt_cost. */
696 1, /* vec_to_scalar_cost. */
697 1, /* scalar_to_vec_cost. */
698 1, /* vec_align_load_cost. */
699 2, /* vec_unalign_load_cost. */
700 1, /* vec_store_cost. */
701 3, /* cond_taken_branch_cost. */
702 1, /* cond_not_taken_branch_cost. */
705 /* K8 has optimized REP instruction for medium sized blocks, but for very
706 small blocks it is better to use loop. For large blocks, libcall can
707 do nontemporary accesses and beat inline considerably. */
708 static stringop_algs k8_memcpy[2] = {
709 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
710 {-1, rep_prefix_4_byte, false}}},
711 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
712 {-1, libcall, false}}}};
713 static stringop_algs k8_memset[2] = {
714 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
715 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
716 {libcall, {{48, unrolled_loop, false},
717 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
718 static const
719 struct processor_costs k8_cost = {
720 COSTS_N_INSNS (1), /* cost of an add instruction */
721 COSTS_N_INSNS (2), /* cost of a lea instruction */
722 COSTS_N_INSNS (1), /* variable shift costs */
723 COSTS_N_INSNS (1), /* constant shift costs */
724 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
725 COSTS_N_INSNS (4), /* HI */
726 COSTS_N_INSNS (3), /* SI */
727 COSTS_N_INSNS (4), /* DI */
728 COSTS_N_INSNS (5)}, /* other */
729 0, /* cost of multiply per each bit set */
730 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
731 COSTS_N_INSNS (26), /* HI */
732 COSTS_N_INSNS (42), /* SI */
733 COSTS_N_INSNS (74), /* DI */
734 COSTS_N_INSNS (74)}, /* other */
735 COSTS_N_INSNS (1), /* cost of movsx */
736 COSTS_N_INSNS (1), /* cost of movzx */
737 8, /* "large" insn */
738 9, /* MOVE_RATIO */
739 4, /* cost for loading QImode using movzbl */
740 {3, 4, 3}, /* cost of loading integer registers
741 in QImode, HImode and SImode.
742 Relative to reg-reg move (2). */
743 {3, 4, 3}, /* cost of storing integer registers */
744 4, /* cost of reg,reg fld/fst */
745 {4, 4, 12}, /* cost of loading fp registers
746 in SFmode, DFmode and XFmode */
747 {6, 6, 8}, /* cost of storing fp registers
748 in SFmode, DFmode and XFmode */
749 2, /* cost of moving MMX register */
750 {3, 3}, /* cost of loading MMX registers
751 in SImode and DImode */
752 {4, 4}, /* cost of storing MMX registers
753 in SImode and DImode */
754 2, /* cost of moving SSE register */
755 {4, 3, 6}, /* cost of loading SSE registers
756 in SImode, DImode and TImode */
757 {4, 4, 5}, /* cost of storing SSE registers
758 in SImode, DImode and TImode */
759 5, /* MMX or SSE register to integer */
760 64, /* size of l1 cache. */
761 512, /* size of l2 cache. */
762 64, /* size of prefetch block */
763 /* New AMD processors never drop prefetches; if they cannot be performed
764 immediately, they are queued. We set number of simultaneous prefetches
765 to a large constant to reflect this (it probably is not a good idea not
766 to limit number of prefetches at all, as their execution also takes some
767 time). */
768 100, /* number of parallel prefetches */
769 3, /* Branch cost */
770 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
771 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
772 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
773 COSTS_N_INSNS (2), /* cost of FABS instruction. */
774 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
775 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
777 k8_memcpy,
778 k8_memset,
779 4, /* scalar_stmt_cost. */
780 2, /* scalar load_cost. */
781 2, /* scalar_store_cost. */
782 5, /* vec_stmt_cost. */
783 0, /* vec_to_scalar_cost. */
784 2, /* scalar_to_vec_cost. */
785 2, /* vec_align_load_cost. */
786 3, /* vec_unalign_load_cost. */
787 3, /* vec_store_cost. */
788 3, /* cond_taken_branch_cost. */
789 2, /* cond_not_taken_branch_cost. */
792 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
793 very small blocks it is better to use loop. For large blocks, libcall can
794 do nontemporary accesses and beat inline considerably. */
795 static stringop_algs amdfam10_memcpy[2] = {
796 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
797 {-1, rep_prefix_4_byte, false}}},
798 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
800 static stringop_algs amdfam10_memset[2] = {
801 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
802 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
803 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 struct processor_costs amdfam10_cost = {
806 COSTS_N_INSNS (1), /* cost of an add instruction */
807 COSTS_N_INSNS (2), /* cost of a lea instruction */
808 COSTS_N_INSNS (1), /* variable shift costs */
809 COSTS_N_INSNS (1), /* constant shift costs */
810 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
811 COSTS_N_INSNS (4), /* HI */
812 COSTS_N_INSNS (3), /* SI */
813 COSTS_N_INSNS (4), /* DI */
814 COSTS_N_INSNS (5)}, /* other */
815 0, /* cost of multiply per each bit set */
816 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
817 COSTS_N_INSNS (35), /* HI */
818 COSTS_N_INSNS (51), /* SI */
819 COSTS_N_INSNS (83), /* DI */
820 COSTS_N_INSNS (83)}, /* other */
821 COSTS_N_INSNS (1), /* cost of movsx */
822 COSTS_N_INSNS (1), /* cost of movzx */
823 8, /* "large" insn */
824 9, /* MOVE_RATIO */
825 4, /* cost for loading QImode using movzbl */
826 {3, 4, 3}, /* cost of loading integer registers
827 in QImode, HImode and SImode.
828 Relative to reg-reg move (2). */
829 {3, 4, 3}, /* cost of storing integer registers */
830 4, /* cost of reg,reg fld/fst */
831 {4, 4, 12}, /* cost of loading fp registers
832 in SFmode, DFmode and XFmode */
833 {6, 6, 8}, /* cost of storing fp registers
834 in SFmode, DFmode and XFmode */
835 2, /* cost of moving MMX register */
836 {3, 3}, /* cost of loading MMX registers
837 in SImode and DImode */
838 {4, 4}, /* cost of storing MMX registers
839 in SImode and DImode */
840 2, /* cost of moving SSE register */
841 {4, 4, 3}, /* cost of loading SSE registers
842 in SImode, DImode and TImode */
843 {4, 4, 5}, /* cost of storing SSE registers
844 in SImode, DImode and TImode */
845 3, /* MMX or SSE register to integer */
846 /* On K8:
847 MOVD reg64, xmmreg Double FSTORE 4
848 MOVD reg32, xmmreg Double FSTORE 4
849 On AMDFAM10:
850 MOVD reg64, xmmreg Double FADD 3
851 1/1 1/1
852 MOVD reg32, xmmreg Double FADD 3
853 1/1 1/1 */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set number of simultaneous prefetches
859 to a large constant to reflect this (it probably is not a good idea not
860 to limit number of prefetches at all, as their execution also takes some
861 time). */
862 100, /* number of parallel prefetches */
863 2, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
871 amdfam10_memcpy,
872 amdfam10_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 6, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 2, /* vec_unalign_load_cost. */
881 2, /* vec_store_cost. */
882 2, /* cond_taken_branch_cost. */
883 1, /* cond_not_taken_branch_cost. */
886 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
887 very small blocks it is better to use loop. For large blocks, libcall
888 can do nontemporary accesses and beat inline considerably. */
889 static stringop_algs bdver1_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs bdver1_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
900 const struct processor_costs bdver1_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (1), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (4), /* SI */
908 COSTS_N_INSNS (6), /* DI */
909 COSTS_N_INSNS (6)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {5, 5, 4}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {4, 4, 4}, /* cost of storing integer registers */
925 2, /* cost of reg,reg fld/fst */
926 {5, 5, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {4, 4, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {4, 4}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 4}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 4}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 2, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 16, /* size of l1 cache. */
950 2048, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set number of simultaneous prefetches
954 to a large constant to reflect this (it probably is not a good idea not
955 to limit number of prefetches at all, as their execution also takes some
956 time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
966 bdver1_memcpy,
967 bdver1_memset,
968 6, /* scalar_stmt_cost. */
969 4, /* scalar load_cost. */
970 4, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 4, /* vec_align_load_cost. */
975 4, /* vec_unalign_load_cost. */
976 4, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
981 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
982 very small blocks it is better to use loop. For large blocks, libcall
983 can do nontemporary accesses and beat inline considerably. */
985 static stringop_algs bdver2_memcpy[2] = {
986 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
987 {-1, rep_prefix_4_byte, false}}},
988 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990 static stringop_algs bdver2_memset[2] = {
991 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
992 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
993 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
994 {-1, libcall, false}}}};
996 const struct processor_costs bdver2_cost = {
997 COSTS_N_INSNS (1), /* cost of an add instruction */
998 COSTS_N_INSNS (1), /* cost of a lea instruction */
999 COSTS_N_INSNS (1), /* variable shift costs */
1000 COSTS_N_INSNS (1), /* constant shift costs */
1001 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1002 COSTS_N_INSNS (4), /* HI */
1003 COSTS_N_INSNS (4), /* SI */
1004 COSTS_N_INSNS (6), /* DI */
1005 COSTS_N_INSNS (6)}, /* other */
1006 0, /* cost of multiply per each bit set */
1007 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1008 COSTS_N_INSNS (35), /* HI */
1009 COSTS_N_INSNS (51), /* SI */
1010 COSTS_N_INSNS (83), /* DI */
1011 COSTS_N_INSNS (83)}, /* other */
1012 COSTS_N_INSNS (1), /* cost of movsx */
1013 COSTS_N_INSNS (1), /* cost of movzx */
1014 8, /* "large" insn */
1015 9, /* MOVE_RATIO */
1016 4, /* cost for loading QImode using movzbl */
1017 {5, 5, 4}, /* cost of loading integer registers
1018 in QImode, HImode and SImode.
1019 Relative to reg-reg move (2). */
1020 {4, 4, 4}, /* cost of storing integer registers */
1021 2, /* cost of reg,reg fld/fst */
1022 {5, 5, 12}, /* cost of loading fp registers
1023 in SFmode, DFmode and XFmode */
1024 {4, 4, 8}, /* cost of storing fp registers
1025 in SFmode, DFmode and XFmode */
1026 2, /* cost of moving MMX register */
1027 {4, 4}, /* cost of loading MMX registers
1028 in SImode and DImode */
1029 {4, 4}, /* cost of storing MMX registers
1030 in SImode and DImode */
1031 2, /* cost of moving SSE register */
1032 {4, 4, 4}, /* cost of loading SSE registers
1033 in SImode, DImode and TImode */
1034 {4, 4, 4}, /* cost of storing SSE registers
1035 in SImode, DImode and TImode */
1036 2, /* MMX or SSE register to integer */
1037 /* On K8:
1038 MOVD reg64, xmmreg Double FSTORE 4
1039 MOVD reg32, xmmreg Double FSTORE 4
1040 On AMDFAM10:
1041 MOVD reg64, xmmreg Double FADD 3
1042 1/1 1/1
1043 MOVD reg32, xmmreg Double FADD 3
1044 1/1 1/1 */
1045 16, /* size of l1 cache. */
1046 2048, /* size of l2 cache. */
1047 64, /* size of prefetch block */
1048 /* New AMD processors never drop prefetches; if they cannot be performed
1049 immediately, they are queued. We set number of simultaneous prefetches
1050 to a large constant to reflect this (it probably is not a good idea not
1051 to limit number of prefetches at all, as their execution also takes some
1052 time). */
1053 100, /* number of parallel prefetches */
1054 2, /* Branch cost */
1055 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1056 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1057 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1058 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1059 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1060 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1062 bdver2_memcpy,
1063 bdver2_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 2, /* cond_taken_branch_cost. */
1074 1, /* cond_not_taken_branch_cost. */
1078 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1079 very small blocks it is better to use loop. For large blocks, libcall
1080 can do nontemporary accesses and beat inline considerably. */
1081 static stringop_algs bdver3_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086 static stringop_algs bdver3_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091 struct processor_costs bdver3_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 16, /* size of l1 cache. */
1133 2048, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 2, /* Branch cost */
1142 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1149 bdver3_memcpy,
1150 bdver3_memset,
1151 6, /* scalar_stmt_cost. */
1152 4, /* scalar load_cost. */
1153 4, /* scalar_store_cost. */
1154 6, /* vec_stmt_cost. */
1155 0, /* vec_to_scalar_cost. */
1156 2, /* scalar_to_vec_cost. */
1157 4, /* vec_align_load_cost. */
1158 4, /* vec_unalign_load_cost. */
1159 4, /* vec_store_cost. */
1160 2, /* cond_taken_branch_cost. */
1161 1, /* cond_not_taken_branch_cost. */
1164 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1165 very small blocks it is better to use loop. For large blocks, libcall can
1166 do nontemporary accesses and beat inline considerably. */
1167 static stringop_algs btver1_memcpy[2] = {
1168 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1169 {-1, rep_prefix_4_byte, false}}},
1170 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
1172 static stringop_algs btver1_memset[2] = {
1173 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1174 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1175 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 const struct processor_costs btver1_cost = {
1178 COSTS_N_INSNS (1), /* cost of an add instruction */
1179 COSTS_N_INSNS (2), /* cost of a lea instruction */
1180 COSTS_N_INSNS (1), /* variable shift costs */
1181 COSTS_N_INSNS (1), /* constant shift costs */
1182 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1183 COSTS_N_INSNS (4), /* HI */
1184 COSTS_N_INSNS (3), /* SI */
1185 COSTS_N_INSNS (4), /* DI */
1186 COSTS_N_INSNS (5)}, /* other */
1187 0, /* cost of multiply per each bit set */
1188 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1189 COSTS_N_INSNS (35), /* HI */
1190 COSTS_N_INSNS (51), /* SI */
1191 COSTS_N_INSNS (83), /* DI */
1192 COSTS_N_INSNS (83)}, /* other */
1193 COSTS_N_INSNS (1), /* cost of movsx */
1194 COSTS_N_INSNS (1), /* cost of movzx */
1195 8, /* "large" insn */
1196 9, /* MOVE_RATIO */
1197 4, /* cost for loading QImode using movzbl */
1198 {3, 4, 3}, /* cost of loading integer registers
1199 in QImode, HImode and SImode.
1200 Relative to reg-reg move (2). */
1201 {3, 4, 3}, /* cost of storing integer registers */
1202 4, /* cost of reg,reg fld/fst */
1203 {4, 4, 12}, /* cost of loading fp registers
1204 in SFmode, DFmode and XFmode */
1205 {6, 6, 8}, /* cost of storing fp registers
1206 in SFmode, DFmode and XFmode */
1207 2, /* cost of moving MMX register */
1208 {3, 3}, /* cost of loading MMX registers
1209 in SImode and DImode */
1210 {4, 4}, /* cost of storing MMX registers
1211 in SImode and DImode */
1212 2, /* cost of moving SSE register */
1213 {4, 4, 3}, /* cost of loading SSE registers
1214 in SImode, DImode and TImode */
1215 {4, 4, 5}, /* cost of storing SSE registers
1216 in SImode, DImode and TImode */
1217 3, /* MMX or SSE register to integer */
1218 /* On K8:
1219 MOVD reg64, xmmreg Double FSTORE 4
1220 MOVD reg32, xmmreg Double FSTORE 4
1221 On AMDFAM10:
1222 MOVD reg64, xmmreg Double FADD 3
1223 1/1 1/1
1224 MOVD reg32, xmmreg Double FADD 3
1225 1/1 1/1 */
1226 32, /* size of l1 cache. */
1227 512, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 100, /* number of parallel prefetches */
1230 2, /* Branch cost */
1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1238 btver1_memcpy,
1239 btver1_memset,
1240 4, /* scalar_stmt_cost. */
1241 2, /* scalar load_cost. */
1242 2, /* scalar_store_cost. */
1243 6, /* vec_stmt_cost. */
1244 0, /* vec_to_scalar_cost. */
1245 2, /* scalar_to_vec_cost. */
1246 2, /* vec_align_load_cost. */
1247 2, /* vec_unalign_load_cost. */
1248 2, /* vec_store_cost. */
1249 2, /* cond_taken_branch_cost. */
1250 1, /* cond_not_taken_branch_cost. */
1253 static stringop_algs btver2_memcpy[2] = {
1254 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1255 {-1, rep_prefix_4_byte, false}}},
1256 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1257 {-1, libcall, false}}}};
1258 static stringop_algs btver2_memset[2] = {
1259 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1260 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1261 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 const struct processor_costs btver2_cost = {
1264 COSTS_N_INSNS (1), /* cost of an add instruction */
1265 COSTS_N_INSNS (2), /* cost of a lea instruction */
1266 COSTS_N_INSNS (1), /* variable shift costs */
1267 COSTS_N_INSNS (1), /* constant shift costs */
1268 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1269 COSTS_N_INSNS (4), /* HI */
1270 COSTS_N_INSNS (3), /* SI */
1271 COSTS_N_INSNS (4), /* DI */
1272 COSTS_N_INSNS (5)}, /* other */
1273 0, /* cost of multiply per each bit set */
1274 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1275 COSTS_N_INSNS (35), /* HI */
1276 COSTS_N_INSNS (51), /* SI */
1277 COSTS_N_INSNS (83), /* DI */
1278 COSTS_N_INSNS (83)}, /* other */
1279 COSTS_N_INSNS (1), /* cost of movsx */
1280 COSTS_N_INSNS (1), /* cost of movzx */
1281 8, /* "large" insn */
1282 9, /* MOVE_RATIO */
1283 4, /* cost for loading QImode using movzbl */
1284 {3, 4, 3}, /* cost of loading integer registers
1285 in QImode, HImode and SImode.
1286 Relative to reg-reg move (2). */
1287 {3, 4, 3}, /* cost of storing integer registers */
1288 4, /* cost of reg,reg fld/fst */
1289 {4, 4, 12}, /* cost of loading fp registers
1290 in SFmode, DFmode and XFmode */
1291 {6, 6, 8}, /* cost of storing fp registers
1292 in SFmode, DFmode and XFmode */
1293 2, /* cost of moving MMX register */
1294 {3, 3}, /* cost of loading MMX registers
1295 in SImode and DImode */
1296 {4, 4}, /* cost of storing MMX registers
1297 in SImode and DImode */
1298 2, /* cost of moving SSE register */
1299 {4, 4, 3}, /* cost of loading SSE registers
1300 in SImode, DImode and TImode */
1301 {4, 4, 5}, /* cost of storing SSE registers
1302 in SImode, DImode and TImode */
1303 3, /* MMX or SSE register to integer */
1304 /* On K8:
1305 MOVD reg64, xmmreg Double FSTORE 4
1306 MOVD reg32, xmmreg Double FSTORE 4
1307 On AMDFAM10:
1308 MOVD reg64, xmmreg Double FADD 3
1309 1/1 1/1
1310 MOVD reg32, xmmreg Double FADD 3
1311 1/1 1/1 */
1312 32, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 100, /* number of parallel prefetches */
1316 2, /* Branch cost */
1317 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1318 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1319 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1320 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1321 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1322 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1323 btver2_memcpy,
1324 btver2_memset,
1325 4, /* scalar_stmt_cost. */
1326 2, /* scalar load_cost. */
1327 2, /* scalar_store_cost. */
1328 6, /* vec_stmt_cost. */
1329 0, /* vec_to_scalar_cost. */
1330 2, /* scalar_to_vec_cost. */
1331 2, /* vec_align_load_cost. */
1332 2, /* vec_unalign_load_cost. */
1333 2, /* vec_store_cost. */
1334 2, /* cond_taken_branch_cost. */
1335 1, /* cond_not_taken_branch_cost. */
1338 static stringop_algs pentium4_memcpy[2] = {
1339 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1340 DUMMY_STRINGOP_ALGS};
1341 static stringop_algs pentium4_memset[2] = {
1342 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1343 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1344 DUMMY_STRINGOP_ALGS};
1346 static const
1347 struct processor_costs pentium4_cost = {
1348 COSTS_N_INSNS (1), /* cost of an add instruction */
1349 COSTS_N_INSNS (3), /* cost of a lea instruction */
1350 COSTS_N_INSNS (4), /* variable shift costs */
1351 COSTS_N_INSNS (4), /* constant shift costs */
1352 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1353 COSTS_N_INSNS (15), /* HI */
1354 COSTS_N_INSNS (15), /* SI */
1355 COSTS_N_INSNS (15), /* DI */
1356 COSTS_N_INSNS (15)}, /* other */
1357 0, /* cost of multiply per each bit set */
1358 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1359 COSTS_N_INSNS (56), /* HI */
1360 COSTS_N_INSNS (56), /* SI */
1361 COSTS_N_INSNS (56), /* DI */
1362 COSTS_N_INSNS (56)}, /* other */
1363 COSTS_N_INSNS (1), /* cost of movsx */
1364 COSTS_N_INSNS (1), /* cost of movzx */
1365 16, /* "large" insn */
1366 6, /* MOVE_RATIO */
1367 2, /* cost for loading QImode using movzbl */
1368 {4, 5, 4}, /* cost of loading integer registers
1369 in QImode, HImode and SImode.
1370 Relative to reg-reg move (2). */
1371 {2, 3, 2}, /* cost of storing integer registers */
1372 2, /* cost of reg,reg fld/fst */
1373 {2, 2, 6}, /* cost of loading fp registers
1374 in SFmode, DFmode and XFmode */
1375 {4, 4, 6}, /* cost of storing fp registers
1376 in SFmode, DFmode and XFmode */
1377 2, /* cost of moving MMX register */
1378 {2, 2}, /* cost of loading MMX registers
1379 in SImode and DImode */
1380 {2, 2}, /* cost of storing MMX registers
1381 in SImode and DImode */
1382 12, /* cost of moving SSE register */
1383 {12, 12, 12}, /* cost of loading SSE registers
1384 in SImode, DImode and TImode */
1385 {2, 2, 8}, /* cost of storing SSE registers
1386 in SImode, DImode and TImode */
1387 10, /* MMX or SSE register to integer */
1388 8, /* size of l1 cache. */
1389 256, /* size of l2 cache. */
1390 64, /* size of prefetch block */
1391 6, /* number of parallel prefetches */
1392 2, /* Branch cost */
1393 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1394 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1395 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1396 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1397 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1398 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1399 pentium4_memcpy,
1400 pentium4_memset,
1401 1, /* scalar_stmt_cost. */
1402 1, /* scalar load_cost. */
1403 1, /* scalar_store_cost. */
1404 1, /* vec_stmt_cost. */
1405 1, /* vec_to_scalar_cost. */
1406 1, /* scalar_to_vec_cost. */
1407 1, /* vec_align_load_cost. */
1408 2, /* vec_unalign_load_cost. */
1409 1, /* vec_store_cost. */
1410 3, /* cond_taken_branch_cost. */
1411 1, /* cond_not_taken_branch_cost. */
1414 static stringop_algs nocona_memcpy[2] = {
1415 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1416 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1417 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1419 static stringop_algs nocona_memset[2] = {
1420 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1421 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1422 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1423 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1425 static const
1426 struct processor_costs nocona_cost = {
1427 COSTS_N_INSNS (1), /* cost of an add instruction */
1428 COSTS_N_INSNS (1), /* cost of a lea instruction */
1429 COSTS_N_INSNS (1), /* variable shift costs */
1430 COSTS_N_INSNS (1), /* constant shift costs */
1431 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1432 COSTS_N_INSNS (10), /* HI */
1433 COSTS_N_INSNS (10), /* SI */
1434 COSTS_N_INSNS (10), /* DI */
1435 COSTS_N_INSNS (10)}, /* other */
1436 0, /* cost of multiply per each bit set */
1437 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1438 COSTS_N_INSNS (66), /* HI */
1439 COSTS_N_INSNS (66), /* SI */
1440 COSTS_N_INSNS (66), /* DI */
1441 COSTS_N_INSNS (66)}, /* other */
1442 COSTS_N_INSNS (1), /* cost of movsx */
1443 COSTS_N_INSNS (1), /* cost of movzx */
1444 16, /* "large" insn */
1445 17, /* MOVE_RATIO */
1446 4, /* cost for loading QImode using movzbl */
1447 {4, 4, 4}, /* cost of loading integer registers
1448 in QImode, HImode and SImode.
1449 Relative to reg-reg move (2). */
1450 {4, 4, 4}, /* cost of storing integer registers */
1451 3, /* cost of reg,reg fld/fst */
1452 {12, 12, 12}, /* cost of loading fp registers
1453 in SFmode, DFmode and XFmode */
1454 {4, 4, 4}, /* cost of storing fp registers
1455 in SFmode, DFmode and XFmode */
1456 6, /* cost of moving MMX register */
1457 {12, 12}, /* cost of loading MMX registers
1458 in SImode and DImode */
1459 {12, 12}, /* cost of storing MMX registers
1460 in SImode and DImode */
1461 6, /* cost of moving SSE register */
1462 {12, 12, 12}, /* cost of loading SSE registers
1463 in SImode, DImode and TImode */
1464 {12, 12, 12}, /* cost of storing SSE registers
1465 in SImode, DImode and TImode */
1466 8, /* MMX or SSE register to integer */
1467 8, /* size of l1 cache. */
1468 1024, /* size of l2 cache. */
1469 128, /* size of prefetch block */
1470 8, /* number of parallel prefetches */
1471 1, /* Branch cost */
1472 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1473 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1474 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1475 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1476 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1477 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1478 nocona_memcpy,
1479 nocona_memset,
1480 1, /* scalar_stmt_cost. */
1481 1, /* scalar load_cost. */
1482 1, /* scalar_store_cost. */
1483 1, /* vec_stmt_cost. */
1484 1, /* vec_to_scalar_cost. */
1485 1, /* scalar_to_vec_cost. */
1486 1, /* vec_align_load_cost. */
1487 2, /* vec_unalign_load_cost. */
1488 1, /* vec_store_cost. */
1489 3, /* cond_taken_branch_cost. */
1490 1, /* cond_not_taken_branch_cost. */
1493 static stringop_algs atom_memcpy[2] = {
1494 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1495 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1496 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1497 static stringop_algs atom_memset[2] = {
1498 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1499 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1500 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1501 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1502 static const
1503 struct processor_costs atom_cost = {
1504 COSTS_N_INSNS (1), /* cost of an add instruction */
1505 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1506 COSTS_N_INSNS (1), /* variable shift costs */
1507 COSTS_N_INSNS (1), /* constant shift costs */
1508 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1509 COSTS_N_INSNS (4), /* HI */
1510 COSTS_N_INSNS (3), /* SI */
1511 COSTS_N_INSNS (4), /* DI */
1512 COSTS_N_INSNS (2)}, /* other */
1513 0, /* cost of multiply per each bit set */
1514 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1515 COSTS_N_INSNS (26), /* HI */
1516 COSTS_N_INSNS (42), /* SI */
1517 COSTS_N_INSNS (74), /* DI */
1518 COSTS_N_INSNS (74)}, /* other */
1519 COSTS_N_INSNS (1), /* cost of movsx */
1520 COSTS_N_INSNS (1), /* cost of movzx */
1521 8, /* "large" insn */
1522 17, /* MOVE_RATIO */
1523 4, /* cost for loading QImode using movzbl */
1524 {4, 4, 4}, /* cost of loading integer registers
1525 in QImode, HImode and SImode.
1526 Relative to reg-reg move (2). */
1527 {4, 4, 4}, /* cost of storing integer registers */
1528 4, /* cost of reg,reg fld/fst */
1529 {12, 12, 12}, /* cost of loading fp registers
1530 in SFmode, DFmode and XFmode */
1531 {6, 6, 8}, /* cost of storing fp registers
1532 in SFmode, DFmode and XFmode */
1533 2, /* cost of moving MMX register */
1534 {8, 8}, /* cost of loading MMX registers
1535 in SImode and DImode */
1536 {8, 8}, /* cost of storing MMX registers
1537 in SImode and DImode */
1538 2, /* cost of moving SSE register */
1539 {8, 8, 8}, /* cost of loading SSE registers
1540 in SImode, DImode and TImode */
1541 {8, 8, 8}, /* cost of storing SSE registers
1542 in SImode, DImode and TImode */
1543 5, /* MMX or SSE register to integer */
1544 32, /* size of l1 cache. */
1545 256, /* size of l2 cache. */
1546 64, /* size of prefetch block */
1547 6, /* number of parallel prefetches */
1548 3, /* Branch cost */
1549 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1551 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1552 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1553 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1554 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1555 atom_memcpy,
1556 atom_memset,
1557 1, /* scalar_stmt_cost. */
1558 1, /* scalar load_cost. */
1559 1, /* scalar_store_cost. */
1560 1, /* vec_stmt_cost. */
1561 1, /* vec_to_scalar_cost. */
1562 1, /* scalar_to_vec_cost. */
1563 1, /* vec_align_load_cost. */
1564 2, /* vec_unalign_load_cost. */
1565 1, /* vec_store_cost. */
1566 3, /* cond_taken_branch_cost. */
1567 1, /* cond_not_taken_branch_cost. */
1570 static stringop_algs slm_memcpy[2] = {
1571 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1572 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1573 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1574 static stringop_algs slm_memset[2] = {
1575 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1576 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1577 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1578 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1579 static const
1580 struct processor_costs slm_cost = {
1581 COSTS_N_INSNS (1), /* cost of an add instruction */
1582 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1583 COSTS_N_INSNS (1), /* variable shift costs */
1584 COSTS_N_INSNS (1), /* constant shift costs */
1585 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1586 COSTS_N_INSNS (4), /* HI */
1587 COSTS_N_INSNS (3), /* SI */
1588 COSTS_N_INSNS (4), /* DI */
1589 COSTS_N_INSNS (2)}, /* other */
1590 0, /* cost of multiply per each bit set */
1591 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1592 COSTS_N_INSNS (26), /* HI */
1593 COSTS_N_INSNS (42), /* SI */
1594 COSTS_N_INSNS (74), /* DI */
1595 COSTS_N_INSNS (74)}, /* other */
1596 COSTS_N_INSNS (1), /* cost of movsx */
1597 COSTS_N_INSNS (1), /* cost of movzx */
1598 8, /* "large" insn */
1599 17, /* MOVE_RATIO */
1600 4, /* cost for loading QImode using movzbl */
1601 {4, 4, 4}, /* cost of loading integer registers
1602 in QImode, HImode and SImode.
1603 Relative to reg-reg move (2). */
1604 {4, 4, 4}, /* cost of storing integer registers */
1605 4, /* cost of reg,reg fld/fst */
1606 {12, 12, 12}, /* cost of loading fp registers
1607 in SFmode, DFmode and XFmode */
1608 {6, 6, 8}, /* cost of storing fp registers
1609 in SFmode, DFmode and XFmode */
1610 2, /* cost of moving MMX register */
1611 {8, 8}, /* cost of loading MMX registers
1612 in SImode and DImode */
1613 {8, 8}, /* cost of storing MMX registers
1614 in SImode and DImode */
1615 2, /* cost of moving SSE register */
1616 {8, 8, 8}, /* cost of loading SSE registers
1617 in SImode, DImode and TImode */
1618 {8, 8, 8}, /* cost of storing SSE registers
1619 in SImode, DImode and TImode */
1620 5, /* MMX or SSE register to integer */
1621 32, /* size of l1 cache. */
1622 256, /* size of l2 cache. */
1623 64, /* size of prefetch block */
1624 6, /* number of parallel prefetches */
1625 3, /* Branch cost */
1626 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1627 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1628 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1629 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1630 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1631 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1632 slm_memcpy,
1633 slm_memset,
1634 1, /* scalar_stmt_cost. */
1635 1, /* scalar load_cost. */
1636 1, /* scalar_store_cost. */
1637 1, /* vec_stmt_cost. */
1638 1, /* vec_to_scalar_cost. */
1639 1, /* scalar_to_vec_cost. */
1640 1, /* vec_align_load_cost. */
1641 2, /* vec_unalign_load_cost. */
1642 1, /* vec_store_cost. */
1643 3, /* cond_taken_branch_cost. */
1644 1, /* cond_not_taken_branch_cost. */
1647 /* Generic should produce code tuned for Core i7 (and newer chips)
1648 and btver1 (and newer chips). */
1650 static stringop_algs generic_memcpy[2] = {
1651 DUMMY_STRINGOP_ALGS,
1652 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1653 {-1, libcall, false}}}};
1654 static stringop_algs generic_memset[2] = {
1655 DUMMY_STRINGOP_ALGS,
1656 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1657 {-1, libcall, false}}}};
1658 static const
1659 struct processor_costs generic_cost = {
1660 COSTS_N_INSNS (1), /* cost of an add instruction */
1661 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1662 that higher cost, however, our current implementation of synth_mult ends up
1663 using unnecessary temporary registers, causing regressions on several
1664 SPECfp benchmarks. */
1665 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (4), /* HI */
1670 COSTS_N_INSNS (3), /* SI */
1671 COSTS_N_INSNS (4), /* DI */
1672 COSTS_N_INSNS (2)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (26), /* HI */
1676 COSTS_N_INSNS (42), /* SI */
1677 COSTS_N_INSNS (74), /* DI */
1678 COSTS_N_INSNS (74)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 8, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 4, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {6, 6, 8}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 2, /* cost of moving MMX register */
1694 {8, 8}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {8, 8}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 2, /* cost of moving SSE register */
1699 {8, 8, 8}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {8, 8, 8}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 5, /* MMX or SSE register to integer */
1704 32, /* size of l1 cache. */
1705 512, /* size of l2 cache. */
1706 64, /* size of prefetch block */
1707 6, /* number of parallel prefetches */
1708 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1709 value is increased to the perhaps more appropriate value of 5. */
1710 3, /* Branch cost */
1711 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1712 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1713 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1714 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1715 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1716 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1717 generic_memcpy,
1718 generic_memset,
1719 1, /* scalar_stmt_cost. */
1720 1, /* scalar load_cost. */
1721 1, /* scalar_store_cost. */
1722 1, /* vec_stmt_cost. */
1723 1, /* vec_to_scalar_cost. */
1724 1, /* scalar_to_vec_cost. */
1725 1, /* vec_align_load_cost. */
1726 2, /* vec_unalign_load_cost. */
1727 1, /* vec_store_cost. */
1728 3, /* cond_taken_branch_cost. */
1729 1, /* cond_not_taken_branch_cost. */
1732 /* core_cost should produce code tuned for the Core family of CPUs. */
1733 static stringop_algs core_memcpy[2] = {
1734 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1735 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1736 {-1, libcall, false}}}};
1737 static stringop_algs core_memset[2] = {
1738 {libcall, {{6, loop_1_byte, true},
1739 {24, loop, true},
1740 {8192, rep_prefix_4_byte, true},
1741 {-1, libcall, false}}},
1742 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1743 {-1, libcall, false}}}};
1745 static const
1746 struct processor_costs core_cost = {
1747 COSTS_N_INSNS (1), /* cost of an add instruction */
1748 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1749 that higher cost, however, our current implementation of synth_mult ends up
1750 using unnecessary temporary registers, causing regressions on several
1751 SPECfp benchmarks. */
1752 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1753 COSTS_N_INSNS (1), /* variable shift costs */
1754 COSTS_N_INSNS (1), /* constant shift costs */
1755 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1756 COSTS_N_INSNS (4), /* HI */
1757 COSTS_N_INSNS (3), /* SI */
1758 COSTS_N_INSNS (4), /* DI */
1759 COSTS_N_INSNS (2)}, /* other */
1760 0, /* cost of multiply per each bit set */
1761 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1762 COSTS_N_INSNS (26), /* HI */
1763 COSTS_N_INSNS (42), /* SI */
1764 COSTS_N_INSNS (74), /* DI */
1765 COSTS_N_INSNS (74)}, /* other */
1766 COSTS_N_INSNS (1), /* cost of movsx */
1767 COSTS_N_INSNS (1), /* cost of movzx */
1768 8, /* "large" insn */
1769 17, /* MOVE_RATIO */
1770 4, /* cost for loading QImode using movzbl */
1771 {4, 4, 4}, /* cost of loading integer registers
1772 in QImode, HImode and SImode.
1773 Relative to reg-reg move (2). */
1774 {4, 4, 4}, /* cost of storing integer registers */
1775 4, /* cost of reg,reg fld/fst */
1776 {12, 12, 12}, /* cost of loading fp registers
1777 in SFmode, DFmode and XFmode */
1778 {6, 6, 8}, /* cost of storing fp registers
1779 in SFmode, DFmode and XFmode */
1780 2, /* cost of moving MMX register */
1781 {8, 8}, /* cost of loading MMX registers
1782 in SImode and DImode */
1783 {8, 8}, /* cost of storing MMX registers
1784 in SImode and DImode */
1785 2, /* cost of moving SSE register */
1786 {8, 8, 8}, /* cost of loading SSE registers
1787 in SImode, DImode and TImode */
1788 {8, 8, 8}, /* cost of storing SSE registers
1789 in SImode, DImode and TImode */
1790 5, /* MMX or SSE register to integer */
1791 64, /* size of l1 cache. */
1792 512, /* size of l2 cache. */
1793 64, /* size of prefetch block */
1794 6, /* number of parallel prefetches */
1795 /* FIXME perhaps more appropriate value is 5. */
1796 3, /* Branch cost */
1797 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1798 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1799 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1800 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1801 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1802 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1803 core_memcpy,
1804 core_memset,
1805 1, /* scalar_stmt_cost. */
1806 1, /* scalar load_cost. */
1807 1, /* scalar_store_cost. */
1808 1, /* vec_stmt_cost. */
1809 1, /* vec_to_scalar_cost. */
1810 1, /* scalar_to_vec_cost. */
1811 1, /* vec_align_load_cost. */
1812 2, /* vec_unalign_load_cost. */
1813 1, /* vec_store_cost. */
1814 3, /* cond_taken_branch_cost. */
1815 1, /* cond_not_taken_branch_cost. */
1819 /* Set by -mtune. */
1820 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1822 /* Set by -mtune or -Os. */
1823 const struct processor_costs *ix86_cost = &pentium_cost;
1825 /* Processor feature/optimization bitmasks. */
1826 #define m_386 (1<<PROCESSOR_I386)
1827 #define m_486 (1<<PROCESSOR_I486)
1828 #define m_PENT (1<<PROCESSOR_PENTIUM)
1829 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1830 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1831 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1832 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1833 #define m_CORE2 (1<<PROCESSOR_CORE2)
1834 #define m_COREI7 (1<<PROCESSOR_COREI7)
1835 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1836 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1837 #define m_ATOM (1<<PROCESSOR_ATOM)
1838 #define m_SLM (1<<PROCESSOR_SLM)
1840 #define m_GEODE (1<<PROCESSOR_GEODE)
1841 #define m_K6 (1<<PROCESSOR_K6)
1842 #define m_K6_GEODE (m_K6 | m_GEODE)
1843 #define m_K8 (1<<PROCESSOR_K8)
1844 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1845 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1846 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1847 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1848 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1849 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1850 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1851 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1852 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1853 #define m_BTVER (m_BTVER1 | m_BTVER2)
1854 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1856 #define m_GENERIC (1<<PROCESSOR_GENERIC)
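/* Illustrative note (added commentary, not part of the upstream source): each
   tuning selector below is a bitmask built from these m_* macros, and
   set_ix86_tune_features enables a feature for the selected -mtune CPU when
   the bit (1u << ix86_tune) is set in the selector.  For example, a
   hypothetical selector

     m_P4_NOCONA | m_CORE_ALL | m_GENERIC

   would turn the feature on for Pentium 4, Nocona, all Core variants and the
   generic model, and leave it off for the AMD processors.  */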
1858 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1859 #undef DEF_TUNE
1860 #define DEF_TUNE(tune, name, selector) name,
1861 #include "x86-tune.def"
1862 #undef DEF_TUNE
1865 /* Feature tests against the various tunings. */
1866 unsigned char ix86_tune_features[X86_TUNE_LAST];
1868 /* Feature tests against the various tunings used to create ix86_tune_features
1869 based on the processor mask. */
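/* Illustrative sketch (hypothetical entry, not taken from x86-tune.def): an
   entry of the form

     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_CORE_ALL | m_GENERIC)

   expands to the string "example," in ix86_tune_feature_names above and to
   the selector "m_CORE_ALL | m_GENERIC," in the initializer below, so both
   tables stay in sync from a single definition.  */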
1870 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1871 #undef DEF_TUNE
1872 #define DEF_TUNE(tune, name, selector) selector,
1873 #include "x86-tune.def"
1874 #undef DEF_TUNE
1877 /* Feature tests against the various architecture variations. */
1878 unsigned char ix86_arch_features[X86_ARCH_LAST];
1880 /* Feature tests against the various architecture variations, used to create
1881 ix86_arch_features based on the processor mask. */
1882 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1883 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1884 ~(m_386 | m_486 | m_PENT | m_K6),
1886 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1887 ~m_386,
1889 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1890 ~(m_386 | m_486),
1892 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1893 ~m_386,
1895 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1896 ~m_386,
1899 static const unsigned int x86_accumulate_outgoing_args
1900 = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
1902 static const unsigned int x86_arch_always_fancy_math_387
1903 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
1905 static const unsigned int x86_avx256_split_unaligned_load
1906 = m_COREI7 | m_GENERIC;
1908 static const unsigned int x86_avx256_split_unaligned_store
1909 = m_COREI7 | m_BDVER | m_GENERIC;
1911 /* In case the average insn count for a single function invocation is
1912 lower than this constant, emit fast (but longer) prologue and
1913 epilogue code. */
1914 #define FAST_PROLOGUE_INSN_COUNT 20
1916 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
1917 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1918 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1919 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1921 /* Array of the smallest class containing reg number REGNO, indexed by
1922 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1924 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1926 /* ax, dx, cx, bx */
1927 AREG, DREG, CREG, BREG,
1928 /* si, di, bp, sp */
1929 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1930 /* FP registers */
1931 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1932 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1933 /* arg pointer */
1934 NON_Q_REGS,
1935 /* flags, fpsr, fpcr, frame */
1936 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1937 /* SSE registers */
1938 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1939 SSE_REGS, SSE_REGS,
1940 /* MMX registers */
1941 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1942 MMX_REGS, MMX_REGS,
1943 /* REX registers */
1944 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1945 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1946 /* SSE REX registers */
1947 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1948 SSE_REGS, SSE_REGS,
1949 /* AVX-512 SSE registers */
1950 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1951 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1952 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1953 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1954 /* Mask registers. */
1955 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1956 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1959 /* The "default" register map used in 32bit mode. */
1961 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1963 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1964 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1965 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1966 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1967 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1968 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1969 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1970 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
1971 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
1972 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
1975 /* The "default" register map used in 64bit mode. */
1977 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1979 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1980 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1981 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1982 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1983 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1984 8,9,10,11,12,13,14,15, /* extended integer registers */
1985 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1986 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
1987 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
1988 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
1991 /* Define the register numbers to be used in Dwarf debugging information.
1992 The SVR4 reference port C compiler uses the following register numbers
1993 in its Dwarf output code:
1994 0 for %eax (gcc regno = 0)
1995 1 for %ecx (gcc regno = 2)
1996 2 for %edx (gcc regno = 1)
1997 3 for %ebx (gcc regno = 3)
1998 4 for %esp (gcc regno = 7)
1999 5 for %ebp (gcc regno = 6)
2000 6 for %esi (gcc regno = 4)
2001 7 for %edi (gcc regno = 5)
2002 The following three DWARF register numbers are never generated by
2003 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2004 believes these numbers have these meanings.
2005 8 for %eip (no gcc equivalent)
2006 9 for %eflags (gcc regno = 17)
2007 10 for %trapno (no gcc equivalent)
2008 It is not at all clear how we should number the FP stack registers
2009 for the x86 architecture. If the version of SDB on x86/svr4 were
2010 a bit less brain dead with respect to floating-point then we would
2011 have a precedent to follow with respect to DWARF register numbers
2012 for x86 FP registers, but the SDB on x86/svr4 is so completely
2013 broken with respect to FP registers that it is hardly worth thinking
2014 of it as something to strive for compatibility with.
2015 The version of x86/svr4 SDB I have at the moment does (partially)
2016 seem to believe that DWARF register number 11 is associated with
2017 the x86 register %st(0), but that's about all. Higher DWARF
2018 register numbers don't seem to be associated with anything in
2019 particular, and even for DWARF regno 11, SDB only seems to under-
2020 stand that it should say that a variable lives in %st(0) (when
2021 asked via an `=' command) if we said it was in DWARF regno 11,
2022 but SDB still prints garbage when asked for the value of the
2023 variable in question (via a `/' command).
2024 (Also note that the labels SDB prints for various FP stack regs
2025 when doing an `x' command are all wrong.)
2026 Note that these problems generally don't affect the native SVR4
2027 C compiler because it doesn't allow the use of -O with -g and
2028 because when it is *not* optimizing, it allocates a memory
2029 location for each floating-point variable, and the memory
2030 location is what gets described in the DWARF AT_location
2031 attribute for the variable in question.
2032 Regardless of the severe mental illness of the x86/svr4 SDB, we
2033 do something sensible here and we use the following DWARF
2034 register numbers. Note that these are all stack-top-relative
2035 numbers.
2036 11 for %st(0) (gcc regno = 8)
2037 12 for %st(1) (gcc regno = 9)
2038 13 for %st(2) (gcc regno = 10)
2039 14 for %st(3) (gcc regno = 11)
2040 15 for %st(4) (gcc regno = 12)
2041 16 for %st(5) (gcc regno = 13)
2042 17 for %st(6) (gcc regno = 14)
2043 18 for %st(7) (gcc regno = 15)
2045 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2047 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2048 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2049 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2050 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2051 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2052 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2053 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2054 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2055 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2056 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2059 /* Define parameter passing and return registers. */
2061 static int const x86_64_int_parameter_registers[6] =
2063 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2066 static int const x86_64_ms_abi_int_parameter_registers[4] =
2068 CX_REG, DX_REG, R8_REG, R9_REG
2071 static int const x86_64_int_return_registers[4] =
2073 AX_REG, DX_REG, DI_REG, SI_REG
2076 /* Additional registers that are clobbered by SYSV calls. */
2078 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2080 SI_REG, DI_REG,
2081 XMM6_REG, XMM7_REG,
2082 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2083 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2086 /* Define the structure for the machine field in struct function. */
2088 struct GTY(()) stack_local_entry {
2089 unsigned short mode;
2090 unsigned short n;
2091 rtx rtl;
2092 struct stack_local_entry *next;
2095 /* Structure describing stack frame layout.
2096 Stack grows downward:
2098 [arguments]
2099 <- ARG_POINTER
2100 saved pc
2102 saved static chain if ix86_static_chain_on_stack
2104 saved frame pointer if frame_pointer_needed
2105 <- HARD_FRAME_POINTER
2106 [saved regs]
2107 <- regs_save_offset
2108 [padding0]
2110 [saved SSE regs]
2111 <- sse_regs_save_offset
2112 [padding1] |
2113 | <- FRAME_POINTER
2114 [va_arg registers] |
2116 [frame] |
2118 [padding2] | = to_allocate
2119 <- STACK_POINTER
2121 struct ix86_frame
2123 int nsseregs;
2124 int nregs;
2125 int va_arg_size;
2126 int red_zone_size;
2127 int outgoing_arguments_size;
2129 /* The offsets relative to ARG_POINTER. */
2130 HOST_WIDE_INT frame_pointer_offset;
2131 HOST_WIDE_INT hard_frame_pointer_offset;
2132 HOST_WIDE_INT stack_pointer_offset;
2133 HOST_WIDE_INT hfp_save_offset;
2134 HOST_WIDE_INT reg_save_offset;
2135 HOST_WIDE_INT sse_reg_save_offset;
2137 /* When save_regs_using_mov is set, emit prologue using
2138 move instead of push instructions. */
2139 bool save_regs_using_mov;
2142 /* Which cpu are we scheduling for. */
2143 enum attr_cpu ix86_schedule;
2145 /* Which cpu are we optimizing for. */
2146 enum processor_type ix86_tune;
2148 /* Which instruction set architecture to use. */
2149 enum processor_type ix86_arch;
2151 /* True if processor has SSE prefetch instruction. */
2152 unsigned char x86_prefetch_sse;
2154 /* -mstackrealign option */
2155 static const char ix86_force_align_arg_pointer_string[]
2156 = "force_align_arg_pointer";
2158 static rtx (*ix86_gen_leave) (void);
2159 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2160 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2161 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2162 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2163 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2164 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2165 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2166 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2167 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2168 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2169 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2171 /* Preferred alignment for stack boundary in bits. */
2172 unsigned int ix86_preferred_stack_boundary;
2174 /* Alignment for incoming stack boundary in bits specified at
2175 command line. */
2176 static unsigned int ix86_user_incoming_stack_boundary;
2178 /* Default alignment for incoming stack boundary in bits. */
2179 static unsigned int ix86_default_incoming_stack_boundary;
2181 /* Alignment for incoming stack boundary in bits. */
2182 unsigned int ix86_incoming_stack_boundary;
2184 /* Calling abi specific va_list type nodes. */
2185 static GTY(()) tree sysv_va_list_type_node;
2186 static GTY(()) tree ms_va_list_type_node;
2188 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2189 char internal_label_prefix[16];
2190 int internal_label_prefix_len;
2192 /* Fence to use after loop using movnt. */
2193 tree x86_mfence;
2195 /* Register class used for passing a given 64bit part of the argument.
2196 These represent classes as documented by the psABI, with the exception of
2197 the SSESF and SSEDF classes, which are basically the SSE class except that
2198 GCC uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2200 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2201 whenever possible (the upper half is just padding). */
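/* Illustrative example (assumed, following the description above rather than
   the classification code): a 'double' argument falls in the SSE class and,
   via the SSEDF refinement, is moved in DFmode; a 32-bit value gets
   X86_64_INTEGERSI_CLASS so it can be moved in SImode instead of DImode.  */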
2202 enum x86_64_reg_class
2204 X86_64_NO_CLASS,
2205 X86_64_INTEGER_CLASS,
2206 X86_64_INTEGERSI_CLASS,
2207 X86_64_SSE_CLASS,
2208 X86_64_SSESF_CLASS,
2209 X86_64_SSEDF_CLASS,
2210 X86_64_SSEUP_CLASS,
2211 X86_64_X87_CLASS,
2212 X86_64_X87UP_CLASS,
2213 X86_64_COMPLEX_X87_CLASS,
2214 X86_64_MEMORY_CLASS
2217 #define MAX_CLASSES 4
2219 /* Table of constants used by fldpi, fldln2, etc.... */
2220 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2221 static bool ext_80387_constants_init = 0;
2224 static struct machine_function * ix86_init_machine_status (void);
2225 static rtx ix86_function_value (const_tree, const_tree, bool);
2226 static bool ix86_function_value_regno_p (const unsigned int);
2227 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2228 const_tree);
2229 static rtx ix86_static_chain (const_tree, bool);
2230 static int ix86_function_regparm (const_tree, const_tree);
2231 static void ix86_compute_frame_layout (struct ix86_frame *);
2232 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2233 rtx, rtx, int);
2234 static void ix86_add_new_builtins (HOST_WIDE_INT);
2235 static tree ix86_canonical_va_list_type (tree);
2236 static void predict_jump (int);
2237 static unsigned int split_stack_prologue_scratch_regno (void);
2238 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2240 enum ix86_function_specific_strings
2242 IX86_FUNCTION_SPECIFIC_ARCH,
2243 IX86_FUNCTION_SPECIFIC_TUNE,
2244 IX86_FUNCTION_SPECIFIC_MAX
2247 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2248 const char *, enum fpmath_unit, bool);
2249 static void ix86_function_specific_save (struct cl_target_option *);
2250 static void ix86_function_specific_restore (struct cl_target_option *);
2251 static void ix86_function_specific_print (FILE *, int,
2252 struct cl_target_option *);
2253 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2254 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2255 struct gcc_options *);
2256 static bool ix86_can_inline_p (tree, tree);
2257 static void ix86_set_current_function (tree);
2258 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2260 static enum calling_abi ix86_function_abi (const_tree);
2263 #ifndef SUBTARGET32_DEFAULT_CPU
2264 #define SUBTARGET32_DEFAULT_CPU "i386"
2265 #endif
2267 /* Whether -mtune= or -march= were specified */
2268 static int ix86_tune_defaulted;
2269 static int ix86_arch_specified;
2271 /* Vectorization library interface and handlers. */
2272 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2274 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2275 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2277 /* Processor target table, indexed by processor number */
2278 struct ptt
2280 const struct processor_costs *cost; /* Processor costs */
2281 const int align_loop; /* Default alignments. */
2282 const int align_loop_max_skip;
2283 const int align_jump;
2284 const int align_jump_max_skip;
2285 const int align_func;
2288 static const struct ptt processor_target_table[PROCESSOR_max] =
2290 {&i386_cost, 4, 3, 4, 3, 4},
2291 {&i486_cost, 16, 15, 16, 15, 16},
2292 {&pentium_cost, 16, 7, 16, 7, 16},
2293 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2294 {&geode_cost, 0, 0, 0, 0, 0},
2295 {&k6_cost, 32, 7, 32, 7, 32},
2296 {&athlon_cost, 16, 7, 16, 7, 16},
2297 {&pentium4_cost, 0, 0, 0, 0, 0},
2298 {&k8_cost, 16, 7, 16, 7, 16},
2299 {&nocona_cost, 0, 0, 0, 0, 0},
2300 /* Core 2 */
2301 {&core_cost, 16, 10, 16, 10, 16},
2302 /* Core i7 */
2303 {&core_cost, 16, 10, 16, 10, 16},
2304 /* Core avx2 */
2305 {&core_cost, 16, 10, 16, 10, 16},
2306 {&generic_cost, 16, 10, 16, 10, 16},
2307 {&amdfam10_cost, 32, 24, 32, 7, 32},
2308 {&bdver1_cost, 16, 10, 16, 7, 11},
2309 {&bdver2_cost, 16, 10, 16, 7, 11},
2310 {&bdver3_cost, 16, 10, 16, 7, 11},
2311 {&btver1_cost, 16, 10, 16, 7, 11},
2312 {&btver2_cost, 16, 10, 16, 7, 11},
2313 {&atom_cost, 16, 15, 16, 7, 16},
2314 {&slm_cost, 16, 15, 16, 7, 16}
2317 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2319 "generic",
2320 "i386",
2321 "i486",
2322 "pentium",
2323 "pentium-mmx",
2324 "pentiumpro",
2325 "pentium2",
2326 "pentium3",
2327 "pentium4",
2328 "pentium-m",
2329 "prescott",
2330 "nocona",
2331 "core2",
2332 "corei7",
2333 "core-avx2",
2334 "atom",
2335 "slm",
2336 "geode",
2337 "k6",
2338 "k6-2",
2339 "k6-3",
2340 "athlon",
2341 "athlon-4",
2342 "k8",
2343 "amdfam10",
2344 "bdver1",
2345 "bdver2",
2346 "bdver3",
2347 "btver1",
2348 "btver2"
2351 static bool
2352 gate_insert_vzeroupper (void)
2354 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2357 static unsigned int
2358 rest_of_handle_insert_vzeroupper (void)
2360 int i;
2362 /* vzeroupper instructions are inserted immediately after reload to
2363 account for possible spills from 256bit registers. The pass
2364 reuses the mode switching infrastructure by re-running the mode insertion
2365 pass, so disable entities that have already been processed. */
2366 for (i = 0; i < MAX_386_ENTITIES; i++)
2367 ix86_optimize_mode_switching[i] = 0;
2369 ix86_optimize_mode_switching[AVX_U128] = 1;
2371 /* Call optimize_mode_switching. */
2372 g->get_passes ()->execute_pass_mode_switching ();
2373 return 0;
2376 namespace {
2378 const pass_data pass_data_insert_vzeroupper =
2380 RTL_PASS, /* type */
2381 "vzeroupper", /* name */
2382 OPTGROUP_NONE, /* optinfo_flags */
2383 true, /* has_gate */
2384 true, /* has_execute */
2385 TV_NONE, /* tv_id */
2386 0, /* properties_required */
2387 0, /* properties_provided */
2388 0, /* properties_destroyed */
2389 0, /* todo_flags_start */
2390 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2393 class pass_insert_vzeroupper : public rtl_opt_pass
2395 public:
2396 pass_insert_vzeroupper(gcc::context *ctxt)
2397 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2400 /* opt_pass methods: */
2401 bool gate () { return gate_insert_vzeroupper (); }
2402 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2404 }; // class pass_insert_vzeroupper
2406 } // anon namespace
2408 rtl_opt_pass *
2409 make_pass_insert_vzeroupper (gcc::context *ctxt)
2411 return new pass_insert_vzeroupper (ctxt);
2414 /* Return true if a red-zone is in use. */
2416 static inline bool
2417 ix86_using_red_zone (void)
2419 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2422 /* Return a string that documents the current -m options. The caller is
2423 responsible for freeing the string. */
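/* Illustrative example (hypothetical output; the exact string depends on the
   options in effect): for a 64-bit SSE2 target the result might look like

     "-march=x86-64 -mtune=generic -m64 -msse2 -msse -mmmx -mfxsr -mfpmath=sse"

   i.e. the -march=/-mtune= pair, the ABI switch, the enabled ISA and flag
   options, and finally -mfpmath=, separated by spaces (or by a
   backslash-newline break when ADD_NL_P is set and a line would exceed 70
   columns).  */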
2425 static char *
2426 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2427 const char *tune, enum fpmath_unit fpmath,
2428 bool add_nl_p)
2430 struct ix86_target_opts
2432 const char *option; /* option string */
2433 HOST_WIDE_INT mask; /* isa mask options */
2436 /* This table is ordered so that options like -msse4.2 that imply
2437 preceding options are matched first. */
2438 static struct ix86_target_opts isa_opts[] =
2440 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2441 { "-mfma", OPTION_MASK_ISA_FMA },
2442 { "-mxop", OPTION_MASK_ISA_XOP },
2443 { "-mlwp", OPTION_MASK_ISA_LWP },
2444 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2445 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2446 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2447 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2448 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2449 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2450 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2451 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2452 { "-msse3", OPTION_MASK_ISA_SSE3 },
2453 { "-msse2", OPTION_MASK_ISA_SSE2 },
2454 { "-msse", OPTION_MASK_ISA_SSE },
2455 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2456 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2457 { "-mmmx", OPTION_MASK_ISA_MMX },
2458 { "-mabm", OPTION_MASK_ISA_ABM },
2459 { "-mbmi", OPTION_MASK_ISA_BMI },
2460 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2461 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2462 { "-mhle", OPTION_MASK_ISA_HLE },
2463 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2464 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2465 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2466 { "-madx", OPTION_MASK_ISA_ADX },
2467 { "-mtbm", OPTION_MASK_ISA_TBM },
2468 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2469 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2470 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2471 { "-maes", OPTION_MASK_ISA_AES },
2472 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2473 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2474 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2475 { "-mf16c", OPTION_MASK_ISA_F16C },
2476 { "-mrtm", OPTION_MASK_ISA_RTM },
2477 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2478 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2481 /* Flag options. */
2482 static struct ix86_target_opts flag_opts[] =
2484 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2485 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2486 { "-m80387", MASK_80387 },
2487 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2488 { "-malign-double", MASK_ALIGN_DOUBLE },
2489 { "-mcld", MASK_CLD },
2490 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2491 { "-mieee-fp", MASK_IEEE_FP },
2492 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2493 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2494 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2495 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2496 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2497 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2498 { "-mno-red-zone", MASK_NO_RED_ZONE },
2499 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2500 { "-mrecip", MASK_RECIP },
2501 { "-mrtd", MASK_RTD },
2502 { "-msseregparm", MASK_SSEREGPARM },
2503 { "-mstack-arg-probe", MASK_STACK_PROBE },
2504 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2505 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2506 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2507 { "-mvzeroupper", MASK_VZEROUPPER },
2508 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2509 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2510 { "-mprefer-avx128", MASK_PREFER_AVX128},
2513 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2515 char isa_other[40];
2516 char target_other[40];
2517 unsigned num = 0;
2518 unsigned i, j;
2519 char *ret;
2520 char *ptr;
2521 size_t len;
2522 size_t line_len;
2523 size_t sep_len;
2524 const char *abi;
2526 memset (opts, '\0', sizeof (opts));
2528 /* Add -march= option. */
2529 if (arch)
2531 opts[num][0] = "-march=";
2532 opts[num++][1] = arch;
2535 /* Add -mtune= option. */
2536 if (tune)
2538 opts[num][0] = "-mtune=";
2539 opts[num++][1] = tune;
2542 /* Add -m32/-m64/-mx32. */
2543 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2545 if ((isa & OPTION_MASK_ABI_64) != 0)
2546 abi = "-m64";
2547 else
2548 abi = "-mx32";
2549 isa &= ~ (OPTION_MASK_ISA_64BIT
2550 | OPTION_MASK_ABI_64
2551 | OPTION_MASK_ABI_X32);
2553 else
2554 abi = "-m32";
2555 opts[num++][0] = abi;
2557 /* Pick out the options in isa options. */
2558 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2560 if ((isa & isa_opts[i].mask) != 0)
2562 opts[num++][0] = isa_opts[i].option;
2563 isa &= ~ isa_opts[i].mask;
2567 if (isa && add_nl_p)
2569 opts[num++][0] = isa_other;
2570 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2571 isa);
2574 /* Add flag options. */
2575 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2577 if ((flags & flag_opts[i].mask) != 0)
2579 opts[num++][0] = flag_opts[i].option;
2580 flags &= ~ flag_opts[i].mask;
2584 if (flags && add_nl_p)
2586 opts[num++][0] = target_other;
2587 sprintf (target_other, "(other flags: %#x)", flags);
2590 /* Add -fpmath= option. */
2591 if (fpmath)
2593 opts[num][0] = "-mfpmath=";
2594 switch ((int) fpmath)
2596 case FPMATH_387:
2597 opts[num++][1] = "387";
2598 break;
2600 case FPMATH_SSE:
2601 opts[num++][1] = "sse";
2602 break;
2604 case FPMATH_387 | FPMATH_SSE:
2605 opts[num++][1] = "sse+387";
2606 break;
2608 default:
2609 gcc_unreachable ();
2613 /* Any options? */
2614 if (num == 0)
2615 return NULL;
2617 gcc_assert (num < ARRAY_SIZE (opts));
2619 /* Size the string. */
2620 len = 0;
2621 sep_len = (add_nl_p) ? 3 : 1;
2622 for (i = 0; i < num; i++)
2624 len += sep_len;
2625 for (j = 0; j < 2; j++)
2626 if (opts[i][j])
2627 len += strlen (opts[i][j]);
2630 /* Build the string. */
2631 ret = ptr = (char *) xmalloc (len);
2632 line_len = 0;
2634 for (i = 0; i < num; i++)
2636 size_t len2[2];
2638 for (j = 0; j < 2; j++)
2639 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2641 if (i != 0)
2643 *ptr++ = ' ';
2644 line_len++;
2646 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2648 *ptr++ = '\\';
2649 *ptr++ = '\n';
2650 line_len = 0;
2654 for (j = 0; j < 2; j++)
2655 if (opts[i][j])
2657 memcpy (ptr, opts[i][j], len2[j]);
2658 ptr += len2[j];
2659 line_len += len2[j];
2663 *ptr = '\0';
2664 gcc_assert (ret + len >= ptr);
2666 return ret;
2669 /* Return true if profiling code should be emitted before the
2670 prologue, and false otherwise.
2671 Note: for x86 this is the case when "hotfix" support (-mfentry) is in use. */
2672 static bool
2673 ix86_profile_before_prologue (void)
2675 return flag_fentry != 0;
2678 /* Function that is callable from the debugger to print the current
2679 options. */
2680 void ATTRIBUTE_UNUSED
2681 ix86_debug_options (void)
2683 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2684 ix86_arch_string, ix86_tune_string,
2685 ix86_fpmath, true);
2687 if (opts)
2689 fprintf (stderr, "%s\n\n", opts);
2690 free (opts);
2692 else
2693 fputs ("<no options>\n\n", stderr);
2695 return;
2698 static const char *stringop_alg_names[] = {
2699 #define DEF_ENUM
2700 #define DEF_ALG(alg, name) #name,
2701 #include "stringop.def"
2702 #undef DEF_ENUM
2703 #undef DEF_ALG
2706 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2707 The string is of the following form (or a comma-separated list of such entries):
2709 strategy_alg:max_size:[align|noalign]
2711 where the full size range for the strategy is either [0, max_size] or
2712 [min_size, max_size], in which min_size is the max_size + 1 of the
2713 preceding range. The last size range must have max_size == -1.
2715 Examples:
2718 -mmemcpy-strategy=libcall:-1:noalign
2720 this is equivalent (for a known-size memcpy) to -mstringop-strategy=libcall
2724 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2726 This tells the compiler to use the following strategy for memset:
2727 1) when the expected size is between [1, 16], use rep_8byte strategy;
2728 2) when the size is between [17, 2048], use vector_loop;
2729 3) when the size is > 2048, use libcall. */
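/* Walk-through of the memset example above (added illustration, mirroring the
   parser below): the string is split on ',' into three ranges which fill
   input_ranges[] as

     { max = 16,   alg = rep_8byte,   noalign = true  }
     { max = 2048, alg = vector_loop, noalign = false }
     { max = -1,   alg = libcall,     noalign = true  }

   and those entries then overwrite the first three slots of the default
   stringop_algs table used for memset.  */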
2731 struct stringop_size_range
2733 int max;
2734 stringop_alg alg;
2735 bool noalign;
2738 static void
2739 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2741 const struct stringop_algs *default_algs;
2742 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2743 char *curr_range_str, *next_range_str;
2744 int i = 0, n = 0;
2746 if (is_memset)
2747 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2748 else
2749 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2751 curr_range_str = strategy_str;
2755 int maxs;
2756 stringop_alg alg;
2757 char alg_name[128];
2758 char align[16];
2759 next_range_str = strchr (curr_range_str, ',');
2760 if (next_range_str)
2761 *next_range_str++ = '\0';
2763 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2764 alg_name, &maxs, align))
2766 error ("wrong arg %s to option %s", curr_range_str,
2767 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2768 return;
2771 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2773 error ("size ranges of option %s should be increasing",
2774 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2775 return;
2778 for (i = 0; i < last_alg; i++)
2780 if (!strcmp (alg_name, stringop_alg_names[i]))
2782 alg = (stringop_alg) i;
2783 break;
2787 if (i == last_alg)
2789 error ("wrong stringop strategy name %s specified for option %s",
2790 alg_name,
2791 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2792 return;
2795 input_ranges[n].max = maxs;
2796 input_ranges[n].alg = alg;
2797 if (!strcmp (align, "align"))
2798 input_ranges[n].noalign = false;
2799 else if (!strcmp (align, "noalign"))
2800 input_ranges[n].noalign = true;
2801 else
2803 error ("unknown alignment %s specified for option %s",
2804 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2805 return;
2807 n++;
2808 curr_range_str = next_range_str;
2810 while (curr_range_str);
2812 if (input_ranges[n - 1].max != -1)
2814 error ("the max value for the last size range should be -1"
2815 " for option %s",
2816 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2817 return;
2820 if (n > MAX_STRINGOP_ALGS)
2822 error ("too many size ranges specified in option %s",
2823 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2824 return;
2827 /* Now override the default algs array. */
2828 for (i = 0; i < n; i++)
2830 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2831 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2832 = input_ranges[i].alg;
2833 *const_cast<int *>(&default_algs->size[i].noalign)
2834 = input_ranges[i].noalign;
2839 /* Parse the -mtune-ctrl= option. When DUMP is true,
2840 print the features that are explicitly set. */
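/* Illustrative usage (the feature names below are placeholders; the real
   names come from x86-tune.def via ix86_tune_feature_names):

     -mtune-ctrl=feature_a,^feature_b

   explicitly sets feature_a and clears feature_b, overriding whatever
   set_ix86_tune_features derived from the selected -mtune processor.  */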
2842 static void
2843 parse_mtune_ctrl_str (bool dump)
2845 if (!ix86_tune_ctrl_string)
2846 return;
2848 char *next_feature_string = NULL;
2849 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2850 char *orig = curr_feature_string;
2851 int i;
2854 bool clear = false;
2856 next_feature_string = strchr (curr_feature_string, ',');
2857 if (next_feature_string)
2858 *next_feature_string++ = '\0';
2859 if (*curr_feature_string == '^')
2861 curr_feature_string++;
2862 clear = true;
2864 for (i = 0; i < X86_TUNE_LAST; i++)
2866 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2868 ix86_tune_features[i] = !clear;
2869 if (dump)
2870 fprintf (stderr, "Explicitly %s feature %s\n",
2871 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2872 break;
2875 if (i == X86_TUNE_LAST)
2876 error ("Unknown parameter to option -mtune-ctrl: %s",
2877 clear ? curr_feature_string - 1 : curr_feature_string);
2878 curr_feature_string = next_feature_string;
2880 while (curr_feature_string);
2881 free (orig);
2884 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2885 processor type. */
2887 static void
2888 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2890 unsigned int ix86_tune_mask = 1u << ix86_tune;
2891 int i;
2893 for (i = 0; i < X86_TUNE_LAST; ++i)
2895 if (ix86_tune_no_default)
2896 ix86_tune_features[i] = 0;
2897 else
2898 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2901 if (dump)
2903 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2904 for (i = 0; i < X86_TUNE_LAST; i++)
2905 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2906 ix86_tune_features[i] ? "on" : "off");
2909 parse_mtune_ctrl_str (dump);
2913 /* Override various settings based on options. If MAIN_ARGS_P, the
2914 options are from the command line, otherwise they are from
2915 attributes. */
2917 static void
2918 ix86_option_override_internal (bool main_args_p)
2920 int i;
2921 unsigned int ix86_arch_mask, ix86_tune_mask;
2922 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2923 const char *prefix;
2924 const char *suffix;
2925 const char *sw;
2927 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2928 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2929 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2930 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2931 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2932 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2933 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2934 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2935 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2936 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2937 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2938 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2939 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2940 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2941 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2942 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2943 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2944 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2945 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2946 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2947 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2948 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2949 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2950 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2951 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2952 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2953 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2954 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2955 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2956 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2957 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2958 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2959 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2960 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2961 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2962 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2963 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2964 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2965 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2966 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2967 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
2968 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
2969 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
2970 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
2972 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
2974 static struct pta
2976 const char *const name; /* processor name or nickname. */
2977 const enum processor_type processor;
2978 const enum attr_cpu schedule;
2979 const unsigned HOST_WIDE_INT flags;
2981 const processor_alias_table[] =
2983 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2984 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2985 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2986 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2987 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2988 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2989 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2990 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2991 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2992 PTA_MMX | PTA_SSE | PTA_FXSR},
2993 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2994 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2995 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2996 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2997 PTA_MMX | PTA_SSE | PTA_FXSR},
2998 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2999 PTA_MMX | PTA_SSE | PTA_FXSR},
3000 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3001 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3002 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3003 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3004 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3005 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3006 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3007 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3008 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3009 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3010 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3011 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3012 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3013 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3014 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3015 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3016 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3017 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3018 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3019 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3020 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3021 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3022 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3023 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3024 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3025 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3026 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3027 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3028 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3029 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3030 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3031 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3032 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3033 | PTA_XSAVEOPT},
3034 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3035 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3036 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3037 {"slm", PROCESSOR_SLM, CPU_SLM,
3038 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3039 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3040 | PTA_FXSR},
3041 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3042 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3043 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3044 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3045 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3046 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3047 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3048 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3049 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3050 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3051 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3052 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3053 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3054 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3055 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3056 {"x86-64", PROCESSOR_K8, CPU_K8,
3057 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3058 {"k8", PROCESSOR_K8, CPU_K8,
3059 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3060 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3061 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3062 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3063 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3064 {"opteron", PROCESSOR_K8, CPU_K8,
3065 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3066 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3067 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3068 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3069 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3070 {"athlon64", PROCESSOR_K8, CPU_K8,
3071 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3072 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3073 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3074 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3075 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3076 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3077 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3078 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3079 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3080 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3081 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3082 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3083 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3084 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3085 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3086 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3087 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3088 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3089 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3090 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3091 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3092 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3093 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3094 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3095 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3096 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3097 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3098 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3099 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3100 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3101 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3102 | PTA_XSAVEOPT | PTA_FSGSBASE},
3103 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3104 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3105 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3106 | PTA_FXSR | PTA_XSAVE},
3107 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3108 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3109 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3110 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3111 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3112 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3114 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3115 PTA_64BIT
3116 | PTA_HLE /* flags are only used for -march switch. */ },
3119 /* -mrecip options. */
3120 static struct
3122 const char *string; /* option name */
3123 unsigned int mask; /* mask bits to set */
3125 const recip_options[] =
3127 { "all", RECIP_MASK_ALL },
3128 { "none", RECIP_MASK_NONE },
3129 { "div", RECIP_MASK_DIV },
3130 { "sqrt", RECIP_MASK_SQRT },
3131 { "vec-div", RECIP_MASK_VEC_DIV },
3132 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3135 int const pta_size = ARRAY_SIZE (processor_alias_table);
3137 /* Set up prefix/suffix so the error messages refer to either the command
3138 line argument, or the attribute(target). */
3139 if (main_args_p)
3141 prefix = "-m";
3142 suffix = "";
3143 sw = "switch";
3145 else
3147 prefix = "option(\"";
3148 suffix = "\")";
3149 sw = "attribute";
3152 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3153 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3154 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3155 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3156 #ifdef TARGET_BI_ARCH
3157 else
3159 #if TARGET_BI_ARCH == 1
3160 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3161 is on and OPTION_MASK_ABI_X32 is off. We turn off
3162 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3163 -mx32. */
3164 if (TARGET_X32)
3165 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3166 #else
3167 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3168 on and OPTION_MASK_ABI_64 is off. We turn off
3169 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3170 -m64. */
3171 if (TARGET_LP64)
3172 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3173 #endif
3175 #endif
3177 if (TARGET_X32)
3179 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3180 OPTION_MASK_ABI_64 for TARGET_X32. */
3181 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3182 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3184 else if (TARGET_LP64)
3186 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3187 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3188 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3189 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3192 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3193 SUBTARGET_OVERRIDE_OPTIONS;
3194 #endif
3196 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3197 SUBSUBTARGET_OVERRIDE_OPTIONS;
3198 #endif
3200 /* -fPIC is the default for 64-bit Darwin.  */
3201 if (TARGET_MACHO && TARGET_64BIT)
3202 flag_pic = 2;
3204 /* Need to check -mtune=generic first. */
3205 if (ix86_tune_string)
3207 if (!strcmp (ix86_tune_string, "generic")
3208 || !strcmp (ix86_tune_string, "i686")
3209 /* As special support for cross compilers we read -mtune=native
3210 as -mtune=generic.  With native compilers we won't see
3211 -mtune=native, since the driver has already replaced it.  */
3212 || !strcmp (ix86_tune_string, "native"))
3214 ix86_tune_string = "generic";
3216 /* If this call is for setting the option attribute, allow the
3217 generic that was previously set. */
3218 else if (!main_args_p
3219 && !strcmp (ix86_tune_string, "generic"))
3221 else if (!strncmp (ix86_tune_string, "generic", 7))
3222 error ("bad value (%s) for %stune=%s %s",
3223 ix86_tune_string, prefix, suffix, sw);
3224 else if (!strcmp (ix86_tune_string, "x86-64"))
3225 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3226 "%stune=k8%s or %stune=generic%s instead as appropriate",
3227 prefix, suffix, prefix, suffix, prefix, suffix);
3229 else
3231 if (ix86_arch_string)
3232 ix86_tune_string = ix86_arch_string;
3233 if (!ix86_tune_string)
3235 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3236 ix86_tune_defaulted = 1;
3239 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3240 need to use a sensible tune option. */
3241 if (!strcmp (ix86_tune_string, "generic")
3242 || !strcmp (ix86_tune_string, "x86-64")
3243 || !strcmp (ix86_tune_string, "i686"))
3245 ix86_tune_string = "generic";
3249 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3251 /* rep; movq isn't available in 32-bit code. */
3252 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3253 ix86_stringop_alg = no_stringop;
3256 if (!ix86_arch_string)
3257 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3258 else
3259 ix86_arch_specified = 1;
3261 if (global_options_set.x_ix86_pmode)
3263 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3264 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3265 error ("address mode %qs not supported in the %s bit mode",
3266 TARGET_64BIT ? "short" : "long",
3267 TARGET_64BIT ? "64" : "32");
3269 else
3270 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3272 if (!global_options_set.x_ix86_abi)
3273 ix86_abi = DEFAULT_ABI;
3275 /* For targets using the MS ABI, enable -fms-extensions if it was not
3276 explicitly turned off.  For non-MS ABIs, turn this option
3277 off.  */
3278 if (!global_options_set.x_flag_ms_extensions)
3279 flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3281 if (global_options_set.x_ix86_cmodel)
3283 switch (ix86_cmodel)
3285 case CM_SMALL:
3286 case CM_SMALL_PIC:
3287 if (flag_pic)
3288 ix86_cmodel = CM_SMALL_PIC;
3289 if (!TARGET_64BIT)
3290 error ("code model %qs not supported in the %s bit mode",
3291 "small", "32");
3292 break;
3294 case CM_MEDIUM:
3295 case CM_MEDIUM_PIC:
3296 if (flag_pic)
3297 ix86_cmodel = CM_MEDIUM_PIC;
3298 if (!TARGET_64BIT)
3299 error ("code model %qs not supported in the %s bit mode",
3300 "medium", "32");
3301 else if (TARGET_X32)
3302 error ("code model %qs not supported in x32 mode",
3303 "medium");
3304 break;
3306 case CM_LARGE:
3307 case CM_LARGE_PIC:
3308 if (flag_pic)
3309 ix86_cmodel = CM_LARGE_PIC;
3310 if (!TARGET_64BIT)
3311 error ("code model %qs not supported in the %s bit mode",
3312 "large", "32");
3313 else if (TARGET_X32)
3314 error ("code model %qs not supported in x32 mode",
3315 "large");
3316 break;
3318 case CM_32:
3319 if (flag_pic)
3320 error ("code model %s does not support PIC mode", "32");
3321 if (TARGET_64BIT)
3322 error ("code model %qs not supported in the %s bit mode",
3323 "32", "64");
3324 break;
3326 case CM_KERNEL:
3327 if (flag_pic)
3329 error ("code model %s does not support PIC mode", "kernel");
3330 ix86_cmodel = CM_32;
3332 if (!TARGET_64BIT)
3333 error ("code model %qs not supported in the %s bit mode",
3334 "kernel", "32");
3335 break;
3337 default:
3338 gcc_unreachable ();
3341 else
3343 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3344 use of rip-relative addressing. This eliminates fixups that
3345 would otherwise be needed if this object is to be placed in a
3346 DLL, and is essentially just as efficient as direct addressing. */
3347 if (TARGET_64BIT && (TARGET_RDOS || TARGET_PECOFF))
3348 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3349 else if (TARGET_64BIT)
3350 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3351 else
3352 ix86_cmodel = CM_32;
3354 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3356 error ("-masm=intel not supported in this configuration");
3357 ix86_asm_dialect = ASM_ATT;
3359 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3360 sorry ("%i-bit mode not compiled in",
3361 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
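/* Look the -march= CPU name up in processor_alias_table: remember its
   scheduling model, make it the default tuning target, and enable every
   ISA the selected CPU implies unless the user explicitly overrode that
   ISA on the command line.  */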
3363 for (i = 0; i < pta_size; i++)
3364 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3366 ix86_schedule = processor_alias_table[i].schedule;
3367 ix86_arch = processor_alias_table[i].processor;
3368 /* Default cpu tuning to the architecture. */
3369 ix86_tune = ix86_arch;
3371 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3372 error ("CPU you selected does not support x86-64 "
3373 "instruction set");
3375 if (processor_alias_table[i].flags & PTA_MMX
3376 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3377 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3378 if (processor_alias_table[i].flags & PTA_3DNOW
3379 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3380 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3381 if (processor_alias_table[i].flags & PTA_3DNOW_A
3382 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3383 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3384 if (processor_alias_table[i].flags & PTA_SSE
3385 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3386 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3387 if (processor_alias_table[i].flags & PTA_SSE2
3388 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3389 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3390 if (processor_alias_table[i].flags & PTA_SSE3
3391 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3392 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3393 if (processor_alias_table[i].flags & PTA_SSSE3
3394 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3395 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3396 if (processor_alias_table[i].flags & PTA_SSE4_1
3397 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3398 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3399 if (processor_alias_table[i].flags & PTA_SSE4_2
3400 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3401 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3402 if (processor_alias_table[i].flags & PTA_AVX
3403 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3404 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3405 if (processor_alias_table[i].flags & PTA_AVX2
3406 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3407 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3408 if (processor_alias_table[i].flags & PTA_FMA
3409 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3410 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3411 if (processor_alias_table[i].flags & PTA_SSE4A
3412 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3413 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3414 if (processor_alias_table[i].flags & PTA_FMA4
3415 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3416 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3417 if (processor_alias_table[i].flags & PTA_XOP
3418 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3419 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3420 if (processor_alias_table[i].flags & PTA_LWP
3421 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3422 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3423 if (processor_alias_table[i].flags & PTA_ABM
3424 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3425 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3426 if (processor_alias_table[i].flags & PTA_BMI
3427 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3428 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3429 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3430 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3431 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3432 if (processor_alias_table[i].flags & PTA_TBM
3433 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3434 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3435 if (processor_alias_table[i].flags & PTA_BMI2
3436 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3437 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3438 if (processor_alias_table[i].flags & PTA_CX16
3439 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3440 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3441 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3442 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3443 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3444 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3445 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3446 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3447 if (processor_alias_table[i].flags & PTA_MOVBE
3448 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3449 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3450 if (processor_alias_table[i].flags & PTA_AES
3451 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3452 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3453 if (processor_alias_table[i].flags & PTA_PCLMUL
3454 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3455 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3456 if (processor_alias_table[i].flags & PTA_FSGSBASE
3457 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3458 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3459 if (processor_alias_table[i].flags & PTA_RDRND
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3461 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3462 if (processor_alias_table[i].flags & PTA_F16C
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3464 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3465 if (processor_alias_table[i].flags & PTA_RTM
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3467 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3468 if (processor_alias_table[i].flags & PTA_HLE
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3470 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3471 if (processor_alias_table[i].flags & PTA_PRFCHW
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3473 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3474 if (processor_alias_table[i].flags & PTA_RDSEED
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3476 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3477 if (processor_alias_table[i].flags & PTA_ADX
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3479 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3480 if (processor_alias_table[i].flags & PTA_FXSR
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3482 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3483 if (processor_alias_table[i].flags & PTA_XSAVE
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3485 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3486 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3488 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3489 if (processor_alias_table[i].flags & PTA_AVX512F
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3491 ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3492 if (processor_alias_table[i].flags & PTA_AVX512ER
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3494 ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3495 if (processor_alias_table[i].flags & PTA_AVX512PF
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3497 ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3498 if (processor_alias_table[i].flags & PTA_AVX512CD
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3500 ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3501 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3502 x86_prefetch_sse = true;
3504 break;
3507 if (!strcmp (ix86_arch_string, "generic"))
3508 error ("generic CPU can be used only for %stune=%s %s",
3509 prefix, suffix, sw);
3510 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3511 error ("bad value (%s) for %sarch=%s %s",
3512 ix86_arch_string, prefix, suffix, sw);
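/* Initialize the ix86_arch_features[] tests for the architecture that
   was just selected.  */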
3514 ix86_arch_mask = 1u << ix86_arch;
3515 for (i = 0; i < X86_ARCH_LAST; ++i)
3516 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
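/* Resolve -mtune= against the same alias table.  Unlike -march, this
   only picks the scheduling model and tuning target; it does not change
   the enabled ISA flags.  */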
3518 for (i = 0; i < pta_size; i++)
3519 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3521 ix86_schedule = processor_alias_table[i].schedule;
3522 ix86_tune = processor_alias_table[i].processor;
3523 if (TARGET_64BIT)
3525 if (!(processor_alias_table[i].flags & PTA_64BIT))
3527 if (ix86_tune_defaulted)
3529 ix86_tune_string = "x86-64";
3530 for (i = 0; i < pta_size; i++)
3531 if (! strcmp (ix86_tune_string,
3532 processor_alias_table[i].name))
3533 break;
3534 ix86_schedule = processor_alias_table[i].schedule;
3535 ix86_tune = processor_alias_table[i].processor;
3537 else
3538 error ("CPU you selected does not support x86-64 "
3539 "instruction set");
3542 /* Intel CPUs have always interpreted SSE prefetch instructions as
3543 NOPs; so, we can enable SSE prefetch instructions even when
3544 -mtune (rather than -march) points us to a processor that has them.
3545 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3546 higher processors. */
3547 if (TARGET_CMOV
3548 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3549 x86_prefetch_sse = true;
3550 break;
3553 if (ix86_tune_specified && i == pta_size)
3554 error ("bad value (%s) for %stune=%s %s",
3555 ix86_tune_string, prefix, suffix, sw);
3557 set_ix86_tune_features (ix86_tune, ix86_dump_tunes);
3559 #ifndef USE_IX86_FRAME_POINTER
3560 #define USE_IX86_FRAME_POINTER 0
3561 #endif
3563 #ifndef USE_X86_64_FRAME_POINTER
3564 #define USE_X86_64_FRAME_POINTER 0
3565 #endif
3567 /* Set the default values for switches whose default depends on TARGET_64BIT
3568 in case they weren't overwritten by command line options. */
3569 if (TARGET_64BIT)
3571 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3572 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3573 if (flag_asynchronous_unwind_tables == 2)
3574 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3575 if (flag_pcc_struct_return == 2)
3576 flag_pcc_struct_return = 0;
3578 else
3580 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3581 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3582 if (flag_asynchronous_unwind_tables == 2)
3583 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3584 if (flag_pcc_struct_return == 2)
3585 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3588 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3589 if (optimize_size)
3590 ix86_cost = &ix86_size_cost;
3591 else
3592 ix86_cost = ix86_tune_cost;
3594 /* Arrange to set up i386_stack_locals for all functions. */
3595 init_machine_status = ix86_init_machine_status;
3597 /* Validate -mregparm= value. */
3598 if (global_options_set.x_ix86_regparm)
3600 if (TARGET_64BIT)
3601 warning (0, "-mregparm is ignored in 64-bit mode");
3602 if (ix86_regparm > REGPARM_MAX)
3604 error ("-mregparm=%d is not between 0 and %d",
3605 ix86_regparm, REGPARM_MAX);
3606 ix86_regparm = 0;
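/* The 64-bit ABIs pass arguments in registers anyway, so use the
   maximum regparm value there.  */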
3609 if (TARGET_64BIT)
3610 ix86_regparm = REGPARM_MAX;
3612 /* Default align_* from the processor table. */
3613 if (align_loops == 0)
3615 align_loops = processor_target_table[ix86_tune].align_loop;
3616 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3618 if (align_jumps == 0)
3620 align_jumps = processor_target_table[ix86_tune].align_jump;
3621 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3623 if (align_functions == 0)
3625 align_functions = processor_target_table[ix86_tune].align_func;
3628 /* Provide default for -mbranch-cost= value. */
3629 if (!global_options_set.x_ix86_branch_cost)
3630 ix86_branch_cost = ix86_cost->branch_cost;
3632 if (TARGET_64BIT)
3634 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3636 /* Enable by default the SSE and MMX builtins. Do allow the user to
3637 explicitly disable any of these. In particular, disabling SSE and
3638 MMX for kernel code is extremely useful. */
3639 if (!ix86_arch_specified)
3640 ix86_isa_flags
3641 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3642 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3644 if (TARGET_RTD)
3645 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3647 else
3649 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3651 if (!ix86_arch_specified)
3652 ix86_isa_flags
3653 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3655 /* The i386 ABI does not specify a red zone.  It still makes sense to use one
3656 when the programmer takes care to keep the stack from being clobbered.  */
3657 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3658 target_flags |= MASK_NO_RED_ZONE;
3661 /* Keep nonleaf frame pointers. */
3662 if (flag_omit_frame_pointer)
3663 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3664 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3665 flag_omit_frame_pointer = 1;
3667 /* If we're doing fast math, we don't care about comparison order
3668 wrt NaNs. This lets us use a shorter comparison sequence. */
3669 if (flag_finite_math_only)
3670 target_flags &= ~MASK_IEEE_FP;
3672 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3673 since the insns won't need emulation. */
3674 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3675 target_flags &= ~MASK_NO_FANCY_MATH_387;
3677 /* Likewise, if the target doesn't have a 387, or we've specified
3678 software floating point, don't use 387 inline intrinsics. */
3679 if (!TARGET_80387)
3680 target_flags |= MASK_NO_FANCY_MATH_387;
3682 /* Turn on MMX builtins for -msse. */
3683 if (TARGET_SSE)
3684 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3686 /* Enable SSE prefetch. */
3687 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3688 x86_prefetch_sse = true;
3690 /* Enable prefetch{,w} instructions for -m3dnow. */
3691 if (TARGET_3DNOW)
3692 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3694 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3695 if (TARGET_SSE4_2 || TARGET_ABM)
3696 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3698 /* Enable lzcnt instruction for -mabm. */
3699 if (TARGET_ABM)
3700 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3702 /* Validate -mpreferred-stack-boundary= value or default it to
3703 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3704 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3705 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3707 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3708 int max = (TARGET_SEH ? 4 : 12);
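/* The option argument is log2 of the boundary in bytes: 64-bit code
   needs at least an 8-byte boundary (16 with SSE), 32-bit code at
   least 4 bytes, and SEH caps the boundary at 16 bytes.  */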
3710 if (ix86_preferred_stack_boundary_arg < min
3711 || ix86_preferred_stack_boundary_arg > max)
3713 if (min == max)
3714 error ("-mpreferred-stack-boundary is not supported "
3715 "for this target");
3716 else
3717 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3718 ix86_preferred_stack_boundary_arg, min, max);
3720 else
3721 ix86_preferred_stack_boundary
3722 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3725 /* Set the default value for -mstackrealign. */
3726 if (ix86_force_align_arg_pointer == -1)
3727 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3729 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3731 /* Validate -mincoming-stack-boundary= value or default it to
3732 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3733 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3734 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3736 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3737 || ix86_incoming_stack_boundary_arg > 12)
3738 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3739 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3740 else
3742 ix86_user_incoming_stack_boundary
3743 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3744 ix86_incoming_stack_boundary
3745 = ix86_user_incoming_stack_boundary;
3749 /* Accept -msseregparm only if at least SSE support is enabled. */
3750 if (TARGET_SSEREGPARM
3751 && ! TARGET_SSE)
3752 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3754 if (global_options_set.x_ix86_fpmath)
3756 if (ix86_fpmath & FPMATH_SSE)
3758 if (!TARGET_SSE)
3760 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3761 ix86_fpmath = FPMATH_387;
3763 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3765 warning (0, "387 instruction set disabled, using SSE arithmetics");
3766 ix86_fpmath = FPMATH_SSE;
3770 else
3771 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3773 /* If the i387 is disabled, then do not return values in it. */
3774 if (!TARGET_80387)
3775 target_flags &= ~MASK_FLOAT_RETURNS;
3777 /* Use external vectorized library in vectorizing intrinsics. */
3778 if (global_options_set.x_ix86_veclibabi_type)
3779 switch (ix86_veclibabi_type)
3781 case ix86_veclibabi_type_svml:
3782 ix86_veclib_handler = ix86_veclibabi_svml;
3783 break;
3785 case ix86_veclibabi_type_acml:
3786 ix86_veclib_handler = ix86_veclibabi_acml;
3787 break;
3789 default:
3790 gcc_unreachable ();
3793 ix86_tune_mask = 1u << ix86_tune;
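/* Turn on -maccumulate-outgoing-args when the tuning tables ask for it,
   or when frame pointers are not used by default, unless the user set
   the flag explicitly or we are optimizing for size.  */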
3794 if ((!USE_IX86_FRAME_POINTER
3795 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3796 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3797 && !optimize_size)
3798 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3800 /* ??? Unwind info is not correct around the CFG unless either a frame
3801 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3802 unwind info generation to be aware of the CFG and propagating states
3803 around edges. */
3804 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3805 || flag_exceptions || flag_non_call_exceptions)
3806 && flag_omit_frame_pointer
3807 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3809 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3810 warning (0, "unwind tables currently require either a frame pointer "
3811 "or %saccumulate-outgoing-args%s for correctness",
3812 prefix, suffix);
3813 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3816 /* If stack probes are required, the space used for large function
3817 arguments on the stack must also be probed, so enable
3818 -maccumulate-outgoing-args so this happens in the prologue. */
3819 if (TARGET_STACK_PROBE
3820 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3822 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3823 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3824 "for correctness", prefix, suffix);
3825 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3828 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3830 char *p;
3831 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3832 p = strchr (internal_label_prefix, 'X');
3833 internal_label_prefix_len = p - internal_label_prefix;
3834 *p = '\0';
3837 /* When no scheduling description is available, disable the scheduler passes
3838 so they won't slow down the compilation or make x87 code slower.  */
3839 if (!TARGET_SCHEDULE)
3840 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
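/* Derive the prefetch and cache-size --param defaults from the cost
   table of the selected tuning target, unless the user has already set
   them.  */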
3842 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3843 ix86_tune_cost->simultaneous_prefetches,
3844 global_options.x_param_values,
3845 global_options_set.x_param_values);
3846 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3847 ix86_tune_cost->prefetch_block,
3848 global_options.x_param_values,
3849 global_options_set.x_param_values);
3850 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3851 ix86_tune_cost->l1_cache_size,
3852 global_options.x_param_values,
3853 global_options_set.x_param_values);
3854 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3855 ix86_tune_cost->l2_cache_size,
3856 global_options.x_param_values,
3857 global_options_set.x_param_values);
3859 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3860 if (flag_prefetch_loop_arrays < 0
3861 && HAVE_prefetch
3862 && (optimize >= 3 || flag_profile_use)
3863 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3864 flag_prefetch_loop_arrays = 1;
3866 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3867 can be optimized to ap = __builtin_next_arg (0). */
3868 if (!TARGET_64BIT && !flag_split_stack)
3869 targetm.expand_builtin_va_start = NULL;
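/* Pick the RTL generator functions that match the target word size, so
   later code can emit leave, stack-probe, TLS and monitor patterns
   without re-checking TARGET_64BIT or Pmode each time.  */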
3871 if (TARGET_64BIT)
3873 ix86_gen_leave = gen_leave_rex64;
3874 if (Pmode == DImode)
3876 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3877 ix86_gen_tls_local_dynamic_base_64
3878 = gen_tls_local_dynamic_base_64_di;
3880 else
3882 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3883 ix86_gen_tls_local_dynamic_base_64
3884 = gen_tls_local_dynamic_base_64_si;
3887 else
3888 ix86_gen_leave = gen_leave;
3890 if (Pmode == DImode)
3892 ix86_gen_add3 = gen_adddi3;
3893 ix86_gen_sub3 = gen_subdi3;
3894 ix86_gen_sub3_carry = gen_subdi3_carry;
3895 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3896 ix86_gen_andsp = gen_anddi3;
3897 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3898 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3899 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3900 ix86_gen_monitor = gen_sse3_monitor_di;
3902 else
3904 ix86_gen_add3 = gen_addsi3;
3905 ix86_gen_sub3 = gen_subsi3;
3906 ix86_gen_sub3_carry = gen_subsi3_carry;
3907 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3908 ix86_gen_andsp = gen_andsi3;
3909 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3910 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3911 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3912 ix86_gen_monitor = gen_sse3_monitor_si;
3915 #ifdef USE_IX86_CLD
3916 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3917 if (!TARGET_64BIT)
3918 target_flags |= MASK_CLD & ~target_flags_explicit;
3919 #endif
3921 if (!TARGET_64BIT && flag_pic)
3923 if (flag_fentry > 0)
3924 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3925 "with -fpic");
3926 flag_fentry = 0;
3928 else if (TARGET_SEH)
3930 if (flag_fentry == 0)
3931 sorry ("-mno-fentry isn%'t compatible with SEH");
3932 flag_fentry = 1;
3934 else if (flag_fentry < 0)
3936 #if defined(PROFILE_BEFORE_PROLOGUE)
3937 flag_fentry = 1;
3938 #else
3939 flag_fentry = 0;
3940 #endif
3943 /* When not optimizing for size, enable the vzeroupper optimization for
3944 TARGET_AVX together with -fexpensive-optimizations, and split 32-byte
3945 AVX unaligned loads/stores.  */
3946 if (!optimize_size)
3948 if (flag_expensive_optimizations
3949 && !(target_flags_explicit & MASK_VZEROUPPER))
3950 target_flags |= MASK_VZEROUPPER;
3951 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3952 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3953 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3954 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3955 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3956 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3957 /* Enable 128-bit AVX instruction generation
3958 for the auto-vectorizer. */
3959 if (TARGET_AVX128_OPTIMAL
3960 && !(target_flags_explicit & MASK_PREFER_AVX128))
3961 target_flags |= MASK_PREFER_AVX128;
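/* Parse the comma-separated -mrecip= string.  Each entry may be prefixed
   with '!' to disable that reciprocal optimization; the result is
   accumulated into recip_mask.  */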
3964 if (ix86_recip_name)
3966 char *p = ASTRDUP (ix86_recip_name);
3967 char *q;
3968 unsigned int mask, i;
3969 bool invert;
3971 while ((q = strtok (p, ",")) != NULL)
3973 p = NULL;
3974 if (*q == '!')
3976 invert = true;
3977 q++;
3979 else
3980 invert = false;
3982 if (!strcmp (q, "default"))
3983 mask = RECIP_MASK_ALL;
3984 else
3986 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3987 if (!strcmp (q, recip_options[i].string))
3989 mask = recip_options[i].mask;
3990 break;
3993 if (i == ARRAY_SIZE (recip_options))
3995 error ("unknown option for -mrecip=%s", q);
3996 invert = false;
3997 mask = RECIP_MASK_NONE;
4001 recip_mask_explicit |= mask;
4002 if (invert)
4003 recip_mask &= ~mask;
4004 else
4005 recip_mask |= mask;
4009 if (TARGET_RECIP)
4010 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4011 else if (target_flags_explicit & MASK_RECIP)
4012 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4014 /* Default long double to 64-bit for Bionic. */
4015 if (TARGET_HAS_BIONIC
4016 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4017 target_flags |= MASK_LONG_DOUBLE_64;
4019 /* Save the initial options in case the user uses function-specific
4020 options later.  */
4021 if (main_args_p)
4022 target_option_default_node = target_option_current_node
4023 = build_target_option_node ();
4025 /* Handle stack protector */
4026 if (!global_options_set.x_ix86_stack_protector_guard)
4027 ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4029 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4030 if (ix86_tune_memcpy_strategy)
4032 char *str = xstrdup (ix86_tune_memcpy_strategy);
4033 ix86_parse_stringop_strategy_string (str, false);
4034 free (str);
4037 if (ix86_tune_memset_strategy)
4039 char *str = xstrdup (ix86_tune_memset_strategy);
4040 ix86_parse_stringop_strategy_string (str, true);
4041 free (str);
4045 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4047 static void
4048 ix86_option_override (void)
4050 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4051 static struct register_pass_info insert_vzeroupper_info
4052 = { pass_insert_vzeroupper, "reload",
4053 1, PASS_POS_INSERT_AFTER
4056 ix86_option_override_internal (true);
4059 /* This needs to be done at start up. It's convenient to do it here. */
4060 register_pass (&insert_vzeroupper_info);
4063 /* Update register usage after having seen the compiler flags. */
4065 static void
4066 ix86_conditional_register_usage (void)
4068 int i, c_mask;
4069 unsigned int j;
4071 /* The PIC register, if it exists, is fixed. */
4072 j = PIC_OFFSET_TABLE_REGNUM;
4073 if (j != INVALID_REGNUM)
4074 fixed_regs[j] = call_used_regs[j] = 1;
4076 /* For 32-bit targets, squash the REX registers. */
4077 if (! TARGET_64BIT)
4079 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4080 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4081 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4082 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4083 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4084 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4087 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4088 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4089 : TARGET_64BIT ? (1 << 2)
4090 : (1 << 1));
4092 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4094 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4096 /* Set/reset conditionally defined registers from
4097 CALL_USED_REGISTERS initializer. */
4098 if (call_used_regs[i] > 1)
4099 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4101 /* Calculate registers of CLOBBERED_REGS register set
4102 as call used registers from GENERAL_REGS register set. */
4103 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4104 && call_used_regs[i])
4105 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4108 /* If MMX is disabled, squash the registers. */
4109 if (! TARGET_MMX)
4110 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4111 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4112 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4114 /* If SSE is disabled, squash the registers. */
4115 if (! TARGET_SSE)
4116 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4117 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4118 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4120 /* If the FPU is disabled, squash the registers. */
4121 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4122 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4123 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4124 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4126 /* If AVX512F is disabled, squash the registers. */
4127 if (! TARGET_AVX512F)
4129 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4130 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4132 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4133 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4138 /* Save the current options */
4140 static void
4141 ix86_function_specific_save (struct cl_target_option *ptr)
4143 ptr->arch = ix86_arch;
4144 ptr->schedule = ix86_schedule;
4145 ptr->tune = ix86_tune;
4146 ptr->branch_cost = ix86_branch_cost;
4147 ptr->tune_defaulted = ix86_tune_defaulted;
4148 ptr->arch_specified = ix86_arch_specified;
4149 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4150 ptr->ix86_target_flags_explicit = target_flags_explicit;
4151 ptr->x_recip_mask_explicit = recip_mask_explicit;
4153 /* The fields are char but the variables are not; make sure the
4154 values fit in the fields. */
4155 gcc_assert (ptr->arch == ix86_arch);
4156 gcc_assert (ptr->schedule == ix86_schedule);
4157 gcc_assert (ptr->tune == ix86_tune);
4158 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4161 /* Restore the current options */
4163 static void
4164 ix86_function_specific_restore (struct cl_target_option *ptr)
4166 enum processor_type old_tune = ix86_tune;
4167 enum processor_type old_arch = ix86_arch;
4168 unsigned int ix86_arch_mask;
4169 int i;
4171 ix86_arch = (enum processor_type) ptr->arch;
4172 ix86_schedule = (enum attr_cpu) ptr->schedule;
4173 ix86_tune = (enum processor_type) ptr->tune;
4174 ix86_branch_cost = ptr->branch_cost;
4175 ix86_tune_defaulted = ptr->tune_defaulted;
4176 ix86_arch_specified = ptr->arch_specified;
4177 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4178 target_flags_explicit = ptr->ix86_target_flags_explicit;
4179 recip_mask_explicit = ptr->x_recip_mask_explicit;
4181 /* Recreate the arch feature tests if the arch changed */
4182 if (old_arch != ix86_arch)
4184 ix86_arch_mask = 1u << ix86_arch;
4185 for (i = 0; i < X86_ARCH_LAST; ++i)
4186 ix86_arch_features[i]
4187 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4190 /* Recreate the tune optimization tests */
4191 if (old_tune != ix86_tune)
4192 set_ix86_tune_features (ix86_tune, false);
4195 /* Print the current options */
4197 static void
4198 ix86_function_specific_print (FILE *file, int indent,
4199 struct cl_target_option *ptr)
4201 char *target_string
4202 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4203 NULL, NULL, ptr->x_ix86_fpmath, false);
4205 fprintf (file, "%*sarch = %d (%s)\n",
4206 indent, "",
4207 ptr->arch,
4208 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4209 ? cpu_names[ptr->arch]
4210 : "<unknown>"));
4212 fprintf (file, "%*stune = %d (%s)\n",
4213 indent, "",
4214 ptr->tune,
4215 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4216 ? cpu_names[ptr->tune]
4217 : "<unknown>"));
4219 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4221 if (target_string)
4223 fprintf (file, "%*s%s\n", indent, "", target_string);
4224 free (target_string);
4229 /* Inner function to process the attribute((target(...))): take an argument and
4230 set the current options from that argument.  If the argument is a list,
4231 recursively process each element.  */
4233 static bool
4234 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4235 struct gcc_options *enum_opts_set)
4237 char *next_optstr;
4238 bool ret = true;
4240 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4241 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4242 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4243 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4244 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4246 enum ix86_opt_type
4248 ix86_opt_unknown,
4249 ix86_opt_yes,
4250 ix86_opt_no,
4251 ix86_opt_str,
4252 ix86_opt_enum,
4253 ix86_opt_isa
4256 static const struct
4258 const char *string;
4259 size_t len;
4260 enum ix86_opt_type type;
4261 int opt;
4262 int mask;
4263 } attrs[] = {
4264 /* isa options */
4265 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4266 IX86_ATTR_ISA ("abm", OPT_mabm),
4267 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4268 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4269 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4270 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4271 IX86_ATTR_ISA ("aes", OPT_maes),
4272 IX86_ATTR_ISA ("avx", OPT_mavx),
4273 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4274 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4275 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4276 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4277 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4278 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4279 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4280 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4281 IX86_ATTR_ISA ("sse", OPT_msse),
4282 IX86_ATTR_ISA ("sse2", OPT_msse2),
4283 IX86_ATTR_ISA ("sse3", OPT_msse3),
4284 IX86_ATTR_ISA ("sse4", OPT_msse4),
4285 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4286 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4287 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4288 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4289 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4290 IX86_ATTR_ISA ("fma", OPT_mfma),
4291 IX86_ATTR_ISA ("xop", OPT_mxop),
4292 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4293 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4294 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4295 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4296 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4297 IX86_ATTR_ISA ("hle", OPT_mhle),
4298 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4299 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4300 IX86_ATTR_ISA ("adx", OPT_madx),
4301 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4302 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4303 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4305 /* enum options */
4306 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4308 /* string options */
4309 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4310 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4312 /* flag options */
4313 IX86_ATTR_YES ("cld",
4314 OPT_mcld,
4315 MASK_CLD),
4317 IX86_ATTR_NO ("fancy-math-387",
4318 OPT_mfancy_math_387,
4319 MASK_NO_FANCY_MATH_387),
4321 IX86_ATTR_YES ("ieee-fp",
4322 OPT_mieee_fp,
4323 MASK_IEEE_FP),
4325 IX86_ATTR_YES ("inline-all-stringops",
4326 OPT_minline_all_stringops,
4327 MASK_INLINE_ALL_STRINGOPS),
4329 IX86_ATTR_YES ("inline-stringops-dynamically",
4330 OPT_minline_stringops_dynamically,
4331 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4333 IX86_ATTR_NO ("align-stringops",
4334 OPT_mno_align_stringops,
4335 MASK_NO_ALIGN_STRINGOPS),
4337 IX86_ATTR_YES ("recip",
4338 OPT_mrecip,
4339 MASK_RECIP),
4343 /* If this is a list, recurse to get the options. */
4344 if (TREE_CODE (args) == TREE_LIST)
4346 bool ret = true;
4348 for (; args; args = TREE_CHAIN (args))
4349 if (TREE_VALUE (args)
4350 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4351 p_strings, enum_opts_set))
4352 ret = false;
4354 return ret;
4357 else if (TREE_CODE (args) != STRING_CST)
4359 error ("attribute %<target%> argument not a string");
4360 return false;
4363 /* Handle multiple arguments separated by commas. */
4364 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4366 while (next_optstr && *next_optstr != '\0')
4368 char *p = next_optstr;
4369 char *orig_p = p;
4370 char *comma = strchr (next_optstr, ',');
4371 const char *opt_string;
4372 size_t len, opt_len;
4373 int opt;
4374 bool opt_set_p;
4375 char ch;
4376 unsigned i;
4377 enum ix86_opt_type type = ix86_opt_unknown;
4378 int mask = 0;
4380 if (comma)
4382 *comma = '\0';
4383 len = comma - next_optstr;
4384 next_optstr = comma + 1;
4386 else
4388 len = strlen (p);
4389 next_optstr = NULL;
4392 /* Recognize no-xxx. */
4393 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4395 opt_set_p = false;
4396 p += 3;
4397 len -= 3;
4399 else
4400 opt_set_p = true;
4402 /* Find the option. */
4403 ch = *p;
4404 opt = N_OPTS;
4405 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4407 type = attrs[i].type;
4408 opt_len = attrs[i].len;
4409 if (ch == attrs[i].string[0]
4410 && ((type != ix86_opt_str && type != ix86_opt_enum)
4411 ? len == opt_len
4412 : len > opt_len)
4413 && memcmp (p, attrs[i].string, opt_len) == 0)
4415 opt = attrs[i].opt;
4416 mask = attrs[i].mask;
4417 opt_string = attrs[i].string;
4418 break;
4422 /* Process the option. */
4423 if (opt == N_OPTS)
4425 error ("attribute(target(\"%s\")) is unknown", orig_p);
4426 ret = false;
4429 else if (type == ix86_opt_isa)
4431 struct cl_decoded_option decoded;
4433 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4434 ix86_handle_option (&global_options, &global_options_set,
4435 &decoded, input_location);
4438 else if (type == ix86_opt_yes || type == ix86_opt_no)
4440 if (type == ix86_opt_no)
4441 opt_set_p = !opt_set_p;
4443 if (opt_set_p)
4444 target_flags |= mask;
4445 else
4446 target_flags &= ~mask;
4449 else if (type == ix86_opt_str)
4451 if (p_strings[opt])
4453 error ("option(\"%s\") was already specified", opt_string);
4454 ret = false;
4456 else
4457 p_strings[opt] = xstrdup (p + opt_len);
4460 else if (type == ix86_opt_enum)
4462 bool arg_ok;
4463 int value;
4465 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4466 if (arg_ok)
4467 set_option (&global_options, enum_opts_set, opt, value,
4468 p + opt_len, DK_UNSPECIFIED, input_location,
4469 global_dc);
4470 else
4472 error ("attribute(target(\"%s\")) is unknown", orig_p);
4473 ret = false;
4477 else
4478 gcc_unreachable ();
4481 return ret;
4484 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4486 tree
4487 ix86_valid_target_attribute_tree (tree args)
4489 const char *orig_arch_string = ix86_arch_string;
4490 const char *orig_tune_string = ix86_tune_string;
4491 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4492 int orig_tune_defaulted = ix86_tune_defaulted;
4493 int orig_arch_specified = ix86_arch_specified;
4494 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4495 tree t = NULL_TREE;
4496 int i;
4497 struct cl_target_option *def
4498 = TREE_TARGET_OPTION (target_option_default_node);
4499 struct gcc_options enum_opts_set;
4501 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4503 /* Process each of the options on the chain. */
4504 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4505 &enum_opts_set))
4506 return error_mark_node;
4508 /* If the changed options are different from the default, rerun
4509 ix86_option_override_internal, and then save the options away.
4510 The string options are attribute options, and will be undone
4511 when we copy the save structure. */
4512 if (ix86_isa_flags != def->x_ix86_isa_flags
4513 || target_flags != def->x_target_flags
4514 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4515 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4516 || enum_opts_set.x_ix86_fpmath)
4518 /* If we are using the default tune= or arch=, undo the string assigned,
4519 and use the default. */
4520 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4521 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4522 else if (!orig_arch_specified)
4523 ix86_arch_string = NULL;
4525 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4526 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4527 else if (orig_tune_defaulted)
4528 ix86_tune_string = NULL;
4530 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4531 if (enum_opts_set.x_ix86_fpmath)
4532 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4533 else if (!TARGET_64BIT && TARGET_SSE)
4535 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4536 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4539 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4540 ix86_option_override_internal (false);
4542 /* Add any builtin functions with the new isa if any. */
4543 ix86_add_new_builtins (ix86_isa_flags);
4545 /* Save the current options unless we are validating options for
4546 #pragma. */
4547 t = build_target_option_node ();
4549 ix86_arch_string = orig_arch_string;
4550 ix86_tune_string = orig_tune_string;
4551 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4553 /* Free up memory allocated to hold the strings */
4554 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4555 free (option_strings[i]);
4558 return t;
4561 /* Hook to validate attribute((target("string"))). */
4563 static bool
4564 ix86_valid_target_attribute_p (tree fndecl,
4565 tree ARG_UNUSED (name),
4566 tree args,
4567 int ARG_UNUSED (flags))
4569 struct cl_target_option cur_target;
4570 bool ret = true;
4572 /* attribute((target("default"))) does nothing, beyond
4573 affecting multi-versioning. */
4574 if (TREE_VALUE (args)
4575 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4576 && TREE_CHAIN (args) == NULL_TREE
4577 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4578 return true;
4580 tree old_optimize = build_optimization_node ();
4581 tree new_target, new_optimize;
4582 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4584 /* If the function changed the optimization levels as well as setting target
4585 options, start with the optimizations specified. */
4586 if (func_optimize && func_optimize != old_optimize)
4587 cl_optimization_restore (&global_options,
4588 TREE_OPTIMIZATION (func_optimize));
4590 /* The target attributes may also change some optimization flags, so update
4591 the optimization options if necessary. */
4592 cl_target_option_save (&cur_target, &global_options);
4593 new_target = ix86_valid_target_attribute_tree (args);
4594 new_optimize = build_optimization_node ();
4596 if (new_target == error_mark_node)
4597 ret = false;
4599 else if (fndecl && new_target)
4601 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4603 if (old_optimize != new_optimize)
4604 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4607 cl_target_option_restore (&global_options, &cur_target);
4609 if (old_optimize != new_optimize)
4610 cl_optimization_restore (&global_options,
4611 TREE_OPTIMIZATION (old_optimize));
4613 return ret;
4617 /* Hook to determine if one function can safely inline another. */
4619 static bool
4620 ix86_can_inline_p (tree caller, tree callee)
4622 bool ret = false;
4623 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4624 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4626 /* If callee has no option attributes, then it is ok to inline. */
4627 if (!callee_tree)
4628 ret = true;
4630 /* If caller has no option attributes, but callee does then it is not ok to
4631 inline. */
4632 else if (!caller_tree)
4633 ret = false;
4635 else
4637 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4638 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4640 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4641 function can inline an SSE2 function, but an SSE2 function can't inline
4642 an SSE4 function.  */
4643 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4644 != callee_opts->x_ix86_isa_flags)
4645 ret = false;
4647 /* See if we have the same non-isa options. */
4648 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4649 ret = false;
4651 /* See if arch, tune, etc. are the same. */
4652 else if (caller_opts->arch != callee_opts->arch)
4653 ret = false;
4655 else if (caller_opts->tune != callee_opts->tune)
4656 ret = false;
4658 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4659 ret = false;
4661 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4662 ret = false;
4664 else
4665 ret = true;
4668 return ret;
4672 /* Remember the last target of ix86_set_current_function. */
4673 static GTY(()) tree ix86_previous_fndecl;
4675 /* Invalidate ix86_previous_fndecl cache. */
4676 void
4677 ix86_reset_previous_fndecl (void)
4679 ix86_previous_fndecl = NULL_TREE;
4682 /* Establish appropriate back-end context for processing the function
4683 FNDECL. The argument might be NULL to indicate processing at top
4684 level, outside of any function scope. */
4685 static void
4686 ix86_set_current_function (tree fndecl)
4688 /* Only change the context if the function changes. This hook is called
4689 several times in the course of compiling a function, and we don't want to
4690 slow things down too much or call target_reinit when it isn't safe. */
4691 if (fndecl && fndecl != ix86_previous_fndecl)
4693 tree old_tree = (ix86_previous_fndecl
4694 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4695 : NULL_TREE);
4697 tree new_tree = (fndecl
4698 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4699 : NULL_TREE);
4701 ix86_previous_fndecl = fndecl;
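/* If the target options are unchanged, there is nothing to do.  */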
4702 if (old_tree == new_tree)
4705 else if (new_tree)
4707 cl_target_option_restore (&global_options,
4708 TREE_TARGET_OPTION (new_tree));
4709 target_reinit ();
4712 else if (old_tree)
4714 struct cl_target_option *def
4715 = TREE_TARGET_OPTION (target_option_current_node);
4717 cl_target_option_restore (&global_options, def);
4718 target_reinit ();
4724 /* Return true if this goes in large data/bss. */
4726 static bool
4727 ix86_in_large_data_p (tree exp)
4729 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4730 return false;
4732 /* Functions are never large data. */
4733 if (TREE_CODE (exp) == FUNCTION_DECL)
4734 return false;
4736 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4738 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4739 if (strcmp (section, ".ldata") == 0
4740 || strcmp (section, ".lbss") == 0)
4741 return true;
4742 return false;
4744 else
4746 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4748 /* If this is an incomplete type with size 0, then we can't put it
4749 in data because it might be too big when completed. */
4750 if (!size || size > ix86_section_threshold)
4751 return true;
4754 return false;
4757 /* Switch to the appropriate section for output of DECL.
4758 DECL is either a `VAR_DECL' node or a constant of some sort.
4759 RELOC indicates whether forming the initial value of DECL requires
4760 link-time relocations. */
4762 ATTRIBUTE_UNUSED static section *
4763 x86_64_elf_select_section (tree decl, int reloc,
4764 unsigned HOST_WIDE_INT align)
4766 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4767 && ix86_in_large_data_p (decl))
4769 const char *sname = NULL;
4770 unsigned int flags = SECTION_WRITE;
4771 switch (categorize_decl_for_section (decl, reloc))
4773 case SECCAT_DATA:
4774 sname = ".ldata";
4775 break;
4776 case SECCAT_DATA_REL:
4777 sname = ".ldata.rel";
4778 break;
4779 case SECCAT_DATA_REL_LOCAL:
4780 sname = ".ldata.rel.local";
4781 break;
4782 case SECCAT_DATA_REL_RO:
4783 sname = ".ldata.rel.ro";
4784 break;
4785 case SECCAT_DATA_REL_RO_LOCAL:
4786 sname = ".ldata.rel.ro.local";
4787 break;
4788 case SECCAT_BSS:
4789 sname = ".lbss";
4790 flags |= SECTION_BSS;
4791 break;
4792 case SECCAT_RODATA:
4793 case SECCAT_RODATA_MERGE_STR:
4794 case SECCAT_RODATA_MERGE_STR_INIT:
4795 case SECCAT_RODATA_MERGE_CONST:
4796 sname = ".lrodata";
4797 flags = 0;
4798 break;
4799 case SECCAT_SRODATA:
4800 case SECCAT_SDATA:
4801 case SECCAT_SBSS:
4802 gcc_unreachable ();
4803 case SECCAT_TEXT:
4804 case SECCAT_TDATA:
4805 case SECCAT_TBSS:
4806 	  /* We don't split these for the medium model.  Place them into
4807 	     default sections and hope for the best.  */
4808 break;
4810 if (sname)
4812 /* We might get called with string constants, but get_named_section
4813 doesn't like them as they are not DECLs. Also, we need to set
4814 flags in that case. */
4815 if (!DECL_P (decl))
4816 return get_section (sname, flags, NULL);
4817 return get_named_section (decl, sname, reloc);
4820 return default_elf_select_section (decl, reloc, align);
4823 /* Select a set of attributes for section NAME based on the properties
4824 of DECL and whether or not RELOC indicates that DECL's initializer
4825 might contain runtime relocations. */
4827 static unsigned int ATTRIBUTE_UNUSED
4828 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4830 unsigned int flags = default_section_type_flags (decl, name, reloc);
4832 if (decl == NULL_TREE
4833 && (strcmp (name, ".ldata.rel.ro") == 0
4834 || strcmp (name, ".ldata.rel.ro.local") == 0))
4835 flags |= SECTION_RELRO;
4837 if (strcmp (name, ".lbss") == 0
4838       || strncmp (name, ".lbss.", 6) == 0
4839       || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4840 flags |= SECTION_BSS;
4842 return flags;
4845 /* Build up a unique section name, expressed as a
4846 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4847 RELOC indicates whether the initial value of EXP requires
4848 link-time relocations. */
4850 static void ATTRIBUTE_UNUSED
4851 x86_64_elf_unique_section (tree decl, int reloc)
4853 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4854 && ix86_in_large_data_p (decl))
4856 const char *prefix = NULL;
4857 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4858 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4860 switch (categorize_decl_for_section (decl, reloc))
4862 case SECCAT_DATA:
4863 case SECCAT_DATA_REL:
4864 case SECCAT_DATA_REL_LOCAL:
4865 case SECCAT_DATA_REL_RO:
4866 case SECCAT_DATA_REL_RO_LOCAL:
4867 prefix = one_only ? ".ld" : ".ldata";
4868 break;
4869 case SECCAT_BSS:
4870 prefix = one_only ? ".lb" : ".lbss";
4871 break;
4872 case SECCAT_RODATA:
4873 case SECCAT_RODATA_MERGE_STR:
4874 case SECCAT_RODATA_MERGE_STR_INIT:
4875 case SECCAT_RODATA_MERGE_CONST:
4876 prefix = one_only ? ".lr" : ".lrodata";
4877 break;
4878 case SECCAT_SRODATA:
4879 case SECCAT_SDATA:
4880 case SECCAT_SBSS:
4881 gcc_unreachable ();
4882 case SECCAT_TEXT:
4883 case SECCAT_TDATA:
4884 case SECCAT_TBSS:
4885 	  /* We don't split these for the medium model.  Place them into
4886 	     default sections and hope for the best.  */
4887 break;
4889 if (prefix)
4891 const char *name, *linkonce;
4892 char *string;
4894 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4895 name = targetm.strip_name_encoding (name);
4897 /* If we're using one_only, then there needs to be a .gnu.linkonce
4898 prefix to the section name. */
4899 linkonce = one_only ? ".gnu.linkonce" : "";
4901 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4903 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4904 return;
4907 default_unique_section (decl, reloc);
4910 #ifdef COMMON_ASM_OP
4911 /* This says how to output assembler code to declare an
4912 uninitialized external linkage data object.
4914    For medium model x86-64 we need to use the .largecomm directive for
4915    large objects.  */
4916 void
4917 x86_elf_aligned_common (FILE *file,
4918 const char *name, unsigned HOST_WIDE_INT size,
4919 int align)
4921 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4922 && size > (unsigned int)ix86_section_threshold)
4923 fputs (".largecomm\t", file);
4924 else
4925 fputs (COMMON_ASM_OP, file);
4926 assemble_name (file, name);
4927 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4928 size, align / BITS_PER_UNIT);
4930 #endif
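/* Illustrative note (not in the original source): assuming the default
   -mlarge-data-threshold, a sufficiently large uninitialized common object
   compiled with -mcmodel=medium would be emitted by the code above roughly as

       .largecomm  big_buf,131072,32

   i.e. name, size in bytes, and alignment in bytes, while smaller objects
   fall back to the ordinary COMMON_ASM_OP form.  The name big_buf is a
   made-up example.  */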
4932 /* Utility function for targets to use in implementing
4933 ASM_OUTPUT_ALIGNED_BSS. */
4935 void
4936 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4937 const char *name, unsigned HOST_WIDE_INT size,
4938 int align)
4940 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4941 && size > (unsigned int)ix86_section_threshold)
4942 switch_to_section (get_named_section (decl, ".lbss", 0));
4943 else
4944 switch_to_section (bss_section);
4945 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4946 #ifdef ASM_DECLARE_OBJECT_NAME
4947 last_assemble_variable_decl = decl;
4948 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4949 #else
4950 /* Standard thing is just output label for the object. */
4951 ASM_OUTPUT_LABEL (file, name);
4952 #endif /* ASM_DECLARE_OBJECT_NAME */
4953 ASM_OUTPUT_SKIP (file, size ? size : 1);
4956 /* Decide whether we must probe the stack before any space allocation
4957 on this target. It's essentially TARGET_STACK_PROBE except when
4958 -fstack-check causes the stack to be already probed differently. */
4960 bool
4961 ix86_target_stack_probe (void)
4963 /* Do not probe the stack twice if static stack checking is enabled. */
4964 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4965 return false;
4967 return TARGET_STACK_PROBE;
4970 /* Decide whether we can make a sibling call to a function. DECL is the
4971 declaration of the function being targeted by the call and EXP is the
4972 CALL_EXPR representing the call. */
4974 static bool
4975 ix86_function_ok_for_sibcall (tree decl, tree exp)
4977 tree type, decl_or_type;
4978 rtx a, b;
4980 /* If we are generating position-independent code, we cannot sibcall
4981 optimize any indirect call, or a direct call to a global function,
4982 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4983 if (!TARGET_MACHO
4984 && !TARGET_64BIT
4985 && flag_pic
4986 && (!decl || !targetm.binds_local_p (decl)))
4987 return false;
4989 /* If we need to align the outgoing stack, then sibcalling would
4990 unalign the stack, which may break the called function. */
4991 if (ix86_minimum_incoming_stack_boundary (true)
4992 < PREFERRED_STACK_BOUNDARY)
4993 return false;
4995 if (decl)
4997 decl_or_type = decl;
4998 type = TREE_TYPE (decl);
5000 else
5002 /* We're looking at the CALL_EXPR, we need the type of the function. */
5003 type = CALL_EXPR_FN (exp); /* pointer expression */
5004 type = TREE_TYPE (type); /* pointer type */
5005 type = TREE_TYPE (type); /* function type */
5006 decl_or_type = type;
5009   /* Check that the return value locations are the same.  For example,
5010      if we are returning floats on the 80387 register stack, we cannot
5011 make a sibcall from a function that doesn't return a float to a
5012 function that does or, conversely, from a function that does return
5013 a float to a function that doesn't; the necessary stack adjustment
5014 would not be executed. This is also the place we notice
5015 differences in the return value ABI. Note that it is ok for one
5016 of the functions to have void return type as long as the return
5017 value of the other is passed in a register. */
5018 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5019 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5020 cfun->decl, false);
5021 if (STACK_REG_P (a) || STACK_REG_P (b))
5023 if (!rtx_equal_p (a, b))
5024 return false;
5026 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5028 else if (!rtx_equal_p (a, b))
5029 return false;
5031 if (TARGET_64BIT)
5033 /* The SYSV ABI has more call-clobbered registers;
5034 disallow sibcalls from MS to SYSV. */
5035 if (cfun->machine->call_abi == MS_ABI
5036 && ix86_function_type_abi (type) == SYSV_ABI)
5037 return false;
5039 else
5041 /* If this call is indirect, we'll need to be able to use a
5042 call-clobbered register for the address of the target function.
5043 Make sure that all such registers are not used for passing
5044 parameters. Note that DLLIMPORT functions are indirect. */
5045 if (!decl
5046 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5048 if (ix86_function_regparm (type, NULL) >= 3)
5050 /* ??? Need to count the actual number of registers to be used,
5051 not the possible number of registers. Fix later. */
5052 return false;
5057 /* Otherwise okay. That also includes certain types of indirect calls. */
5058 return true;
5061 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5062 and "sseregparm" calling convention attributes;
5063 arguments as in struct attribute_spec.handler. */
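/* Illustrative usage (not in the original source): the attributes handled
   below are written on function declarations, e.g.

       int __attribute__((fastcall)) f (int a, int b);       // a, b in ECX, EDX
       int __attribute__((regparm (3))) g (int a, int b, int c);

   and the handler rejects incompatible combinations such as fastcall
   combined with regparm.  */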
5065 static tree
5066 ix86_handle_cconv_attribute (tree *node, tree name,
5067 tree args,
5068 int flags ATTRIBUTE_UNUSED,
5069 bool *no_add_attrs)
5071 if (TREE_CODE (*node) != FUNCTION_TYPE
5072 && TREE_CODE (*node) != METHOD_TYPE
5073 && TREE_CODE (*node) != FIELD_DECL
5074 && TREE_CODE (*node) != TYPE_DECL)
5076 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5077 name);
5078 *no_add_attrs = true;
5079 return NULL_TREE;
5082   /* Can combine regparm with all attributes but fastcall and thiscall.  */
5083 if (is_attribute_p ("regparm", name))
5085 tree cst;
5087 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5089 error ("fastcall and regparm attributes are not compatible");
5092 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5094 	  error ("regparm and thiscall attributes are not compatible");
5097 cst = TREE_VALUE (args);
5098 if (TREE_CODE (cst) != INTEGER_CST)
5100 warning (OPT_Wattributes,
5101 "%qE attribute requires an integer constant argument",
5102 name);
5103 *no_add_attrs = true;
5105 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5107 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5108 name, REGPARM_MAX);
5109 *no_add_attrs = true;
5112 return NULL_TREE;
5115 if (TARGET_64BIT)
5117 /* Do not warn when emulating the MS ABI. */
5118 if ((TREE_CODE (*node) != FUNCTION_TYPE
5119 && TREE_CODE (*node) != METHOD_TYPE)
5120 || ix86_function_type_abi (*node) != MS_ABI)
5121 warning (OPT_Wattributes, "%qE attribute ignored",
5122 name);
5123 *no_add_attrs = true;
5124 return NULL_TREE;
5127 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5128 if (is_attribute_p ("fastcall", name))
5130 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5132 error ("fastcall and cdecl attributes are not compatible");
5134 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5136 error ("fastcall and stdcall attributes are not compatible");
5138 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5140 error ("fastcall and regparm attributes are not compatible");
5142 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5144 error ("fastcall and thiscall attributes are not compatible");
5148 /* Can combine stdcall with fastcall (redundant), regparm and
5149 sseregparm. */
5150 else if (is_attribute_p ("stdcall", name))
5152 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5154 error ("stdcall and cdecl attributes are not compatible");
5156 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5158 error ("stdcall and fastcall attributes are not compatible");
5160 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5162 error ("stdcall and thiscall attributes are not compatible");
5166 /* Can combine cdecl with regparm and sseregparm. */
5167 else if (is_attribute_p ("cdecl", name))
5169 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5171 error ("stdcall and cdecl attributes are not compatible");
5173 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5175 error ("fastcall and cdecl attributes are not compatible");
5177 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5179 error ("cdecl and thiscall attributes are not compatible");
5182 else if (is_attribute_p ("thiscall", name))
5184 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5185 	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5186 name);
5187 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5189 error ("stdcall and thiscall attributes are not compatible");
5191 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5193 error ("fastcall and thiscall attributes are not compatible");
5195 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5197 error ("cdecl and thiscall attributes are not compatible");
5201 /* Can combine sseregparm with all attributes. */
5203 return NULL_TREE;
5206 /* The transactional memory builtins are implicitly regparm or fastcall
5207 depending on the ABI. Override the generic do-nothing attribute that
5208 these builtins were declared with, and replace it with one of the two
5209 attributes that we expect elsewhere. */
5211 static tree
5212 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5213 tree args ATTRIBUTE_UNUSED,
5214 int flags, bool *no_add_attrs)
5216 tree alt;
5218 /* In no case do we want to add the placeholder attribute. */
5219 *no_add_attrs = true;
5221 /* The 64-bit ABI is unchanged for transactional memory. */
5222 if (TARGET_64BIT)
5223 return NULL_TREE;
5225 /* ??? Is there a better way to validate 32-bit windows? We have
5226 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5227 if (CHECK_STACK_LIMIT > 0)
5228 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5229 else
5231 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5232 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5234 decl_attributes (node, alt, flags);
5236 return NULL_TREE;
5239 /* This function determines from TYPE the calling-convention. */
5241 unsigned int
5242 ix86_get_callcvt (const_tree type)
5244 unsigned int ret = 0;
5245 bool is_stdarg;
5246 tree attrs;
5248 if (TARGET_64BIT)
5249 return IX86_CALLCVT_CDECL;
5251 attrs = TYPE_ATTRIBUTES (type);
5252 if (attrs != NULL_TREE)
5254 if (lookup_attribute ("cdecl", attrs))
5255 ret |= IX86_CALLCVT_CDECL;
5256 else if (lookup_attribute ("stdcall", attrs))
5257 ret |= IX86_CALLCVT_STDCALL;
5258 else if (lookup_attribute ("fastcall", attrs))
5259 ret |= IX86_CALLCVT_FASTCALL;
5260 else if (lookup_attribute ("thiscall", attrs))
5261 ret |= IX86_CALLCVT_THISCALL;
5263       /* Regparm isn't allowed for thiscall and fastcall.  */
5264 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5266 if (lookup_attribute ("regparm", attrs))
5267 ret |= IX86_CALLCVT_REGPARM;
5268 if (lookup_attribute ("sseregparm", attrs))
5269 ret |= IX86_CALLCVT_SSEREGPARM;
5272 if (IX86_BASE_CALLCVT(ret) != 0)
5273 return ret;
5276 is_stdarg = stdarg_p (type);
5277 if (TARGET_RTD && !is_stdarg)
5278 return IX86_CALLCVT_STDCALL | ret;
5280 if (ret != 0
5281 || is_stdarg
5282 || TREE_CODE (type) != METHOD_TYPE
5283 || ix86_function_type_abi (type) != MS_ABI)
5284 return IX86_CALLCVT_CDECL | ret;
5286 return IX86_CALLCVT_THISCALL;
5289 /* Return 0 if the attributes for two types are incompatible, 1 if they
5290 are compatible, and 2 if they are nearly compatible (which causes a
5291 warning to be generated). */
5293 static int
5294 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5296 unsigned int ccvt1, ccvt2;
5298 if (TREE_CODE (type1) != FUNCTION_TYPE
5299 && TREE_CODE (type1) != METHOD_TYPE)
5300 return 1;
5302 ccvt1 = ix86_get_callcvt (type1);
5303 ccvt2 = ix86_get_callcvt (type2);
5304 if (ccvt1 != ccvt2)
5305 return 0;
5306 if (ix86_function_regparm (type1, NULL)
5307 != ix86_function_regparm (type2, NULL))
5308 return 0;
5310 return 1;
5313 /* Return the regparm value for a function with the indicated TYPE and DECL.
5314 DECL may be NULL when calling function indirectly
5315 or considering a libcall. */
5317 static int
5318 ix86_function_regparm (const_tree type, const_tree decl)
5320 tree attr;
5321 int regparm;
5322 unsigned int ccvt;
5324 if (TARGET_64BIT)
5325 return (ix86_function_type_abi (type) == SYSV_ABI
5326 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5327 ccvt = ix86_get_callcvt (type);
5328 regparm = ix86_regparm;
5330 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5332 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5333 if (attr)
5335 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5336 return regparm;
5339 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5340 return 2;
5341 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5342 return 1;
5344 /* Use register calling convention for local functions when possible. */
5345 if (decl
5346 && TREE_CODE (decl) == FUNCTION_DECL
5347 && optimize
5348 && !(profile_flag && !flag_fentry))
5350 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5351 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5352 if (i && i->local && i->can_change_signature)
5354 int local_regparm, globals = 0, regno;
5356 /* Make sure no regparm register is taken by a
5357 fixed register variable. */
5358 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5359 if (fixed_regs[local_regparm])
5360 break;
5362 /* We don't want to use regparm(3) for nested functions as
5363 these use a static chain pointer in the third argument. */
5364 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5365 local_regparm = 2;
5367 /* In 32-bit mode save a register for the split stack. */
5368 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5369 local_regparm = 2;
5371 	  /* Each fixed register usage increases register pressure,
5372 	     so fewer registers should be used for argument passing.
5373 	     This behavior can be overridden by an explicit
5374 	     regparm value.  */
5375 for (regno = AX_REG; regno <= DI_REG; regno++)
5376 if (fixed_regs[regno])
5377 globals++;
5379 local_regparm
5380 = globals < local_regparm ? local_regparm - globals : 0;
5382 if (local_regparm > regparm)
5383 regparm = local_regparm;
5387 return regparm;
5390 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5391 DFmode (2) arguments in SSE registers for a function with the
5392 indicated TYPE and DECL. DECL may be NULL when calling function
5393 indirectly or considering a libcall. Otherwise return 0. */
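/* Illustrative example (not in the original source): with -m32 -msse2,

       double __attribute__((sseregparm)) f (double x, float y);

   passes x and y in %xmm0 and %xmm1 rather than on the stack; without SSE
   enabled the error below is emitted instead.  */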
5395 static int
5396 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5398 gcc_assert (!TARGET_64BIT);
5400 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5401 by the sseregparm attribute. */
5402 if (TARGET_SSEREGPARM
5403 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5405 if (!TARGET_SSE)
5407 if (warn)
5409 if (decl)
5410 error ("calling %qD with attribute sseregparm without "
5411 "SSE/SSE2 enabled", decl);
5412 else
5413 error ("calling %qT with attribute sseregparm without "
5414 "SSE/SSE2 enabled", type);
5416 return 0;
5419 return 2;
5422 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5423 (and DFmode for SSE2) arguments in SSE registers. */
5424 if (decl && TARGET_SSE_MATH && optimize
5425 && !(profile_flag && !flag_fentry))
5427 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5428 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5429 if (i && i->local && i->can_change_signature)
5430 return TARGET_SSE2 ? 2 : 1;
5433 return 0;
5436 /* Return true if EAX is live at the start of the function. Used by
5437 ix86_expand_prologue to determine if we need special help before
5438 calling allocate_stack_worker. */
5440 static bool
5441 ix86_eax_live_at_start_p (void)
5443 /* Cheat. Don't bother working forward from ix86_function_regparm
5444 to the function type to whether an actual argument is located in
5445 eax. Instead just look at cfg info, which is still close enough
5446 to correct at this point. This gives false positives for broken
5447 functions that might use uninitialized data that happens to be
5448 allocated in eax, but who cares? */
5449 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5452 static bool
5453 ix86_keep_aggregate_return_pointer (tree fntype)
5455 tree attr;
5457 if (!TARGET_64BIT)
5459 attr = lookup_attribute ("callee_pop_aggregate_return",
5460 TYPE_ATTRIBUTES (fntype));
5461 if (attr)
5462 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5464 /* For 32-bit MS-ABI the default is to keep aggregate
5465 return pointer. */
5466 if (ix86_function_type_abi (fntype) == MS_ABI)
5467 return true;
5469 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5472 /* Value is the number of bytes of arguments automatically
5473 popped when returning from a subroutine call.
5474 FUNDECL is the declaration node of the function (as a tree),
5475 FUNTYPE is the data type of the function (as a tree),
5476 or for a library call it is an identifier node for the subroutine name.
5477 SIZE is the number of bytes of arguments passed on the stack.
5479 On the 80386, the RTD insn may be used to pop them if the number
5480 of args is fixed, but if the number is variable then the caller
5481 must pop them all. RTD can't be used for library calls now
5482 because the library is compiled with the Unix compiler.
5483 Use of RTD is a selectable option, since it is incompatible with
5484 standard Unix calling sequences. If the option is not selected,
5485 the caller must always pop the args.
5487 The attribute stdcall is equivalent to RTD on a per module basis. */
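/* Worked example (not in the original source): for

       int __attribute__((stdcall)) f (int a, int b);

   SIZE is 8, so this hook returns 8 and the callee pops its arguments with
   `ret $8'; for a plain cdecl function it returns 0 and the caller pops.  */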
5489 static int
5490 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5492 unsigned int ccvt;
5494 /* None of the 64-bit ABIs pop arguments. */
5495 if (TARGET_64BIT)
5496 return 0;
5498 ccvt = ix86_get_callcvt (funtype);
5500 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5501 | IX86_CALLCVT_THISCALL)) != 0
5502 && ! stdarg_p (funtype))
5503 return size;
5505 /* Lose any fake structure return argument if it is passed on the stack. */
5506 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5507 && !ix86_keep_aggregate_return_pointer (funtype))
5509 int nregs = ix86_function_regparm (funtype, fundecl);
5510 if (nregs == 0)
5511 return GET_MODE_SIZE (Pmode);
5514 return 0;
5517 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5519 static bool
5520 ix86_legitimate_combined_insn (rtx insn)
5522 /* Check operand constraints in case hard registers were propagated
5523 into insn pattern. This check prevents combine pass from
5524 generating insn patterns with invalid hard register operands.
5525 These invalid insns can eventually confuse reload to error out
5526 with a spill failure. See also PRs 46829 and 46843. */
5527 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5529 int i;
5531 extract_insn (insn);
5532 preprocess_constraints ();
5534 for (i = 0; i < recog_data.n_operands; i++)
5536 rtx op = recog_data.operand[i];
5537 enum machine_mode mode = GET_MODE (op);
5538 struct operand_alternative *op_alt;
5539 int offset = 0;
5540 bool win;
5541 int j;
5543 /* A unary operator may be accepted by the predicate, but it
5544 is irrelevant for matching constraints. */
5545 if (UNARY_P (op))
5546 op = XEXP (op, 0);
5548 if (GET_CODE (op) == SUBREG)
5550 if (REG_P (SUBREG_REG (op))
5551 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5552 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5553 GET_MODE (SUBREG_REG (op)),
5554 SUBREG_BYTE (op),
5555 GET_MODE (op));
5556 op = SUBREG_REG (op);
5559 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5560 continue;
5562 op_alt = recog_op_alt[i];
5564 /* Operand has no constraints, anything is OK. */
5565 win = !recog_data.n_alternatives;
5567 for (j = 0; j < recog_data.n_alternatives; j++)
5569 if (op_alt[j].anything_ok
5570 || (op_alt[j].matches != -1
5571 && operands_match_p
5572 (recog_data.operand[i],
5573 recog_data.operand[op_alt[j].matches]))
5574 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5576 win = true;
5577 break;
5581 if (!win)
5582 return false;
5586 return true;
5589 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5591 static unsigned HOST_WIDE_INT
5592 ix86_asan_shadow_offset (void)
5594 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5595 : HOST_WIDE_INT_C (0x7fff8000))
5596 : (HOST_WIDE_INT_1 << 29);
5599 /* Argument support functions. */
5601 /* Return true when register may be used to pass function parameters. */
5602 bool
5603 ix86_function_arg_regno_p (int regno)
5605 int i;
5606 const int *parm_regs;
5608 if (!TARGET_64BIT)
5610 if (TARGET_MACHO)
5611 return (regno < REGPARM_MAX
5612 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5613 else
5614 return (regno < REGPARM_MAX
5615 || (TARGET_MMX && MMX_REGNO_P (regno)
5616 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5617 || (TARGET_SSE && SSE_REGNO_P (regno)
5618 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5621 if (TARGET_SSE && SSE_REGNO_P (regno)
5622 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5623 return true;
5625 /* TODO: The function should depend on current function ABI but
5626 builtins.c would need updating then. Therefore we use the
5627 default ABI. */
5629 /* RAX is used as hidden argument to va_arg functions. */
5630 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5631 return true;
5633 if (ix86_abi == MS_ABI)
5634 parm_regs = x86_64_ms_abi_int_parameter_registers;
5635 else
5636 parm_regs = x86_64_int_parameter_registers;
5637 for (i = 0; i < (ix86_abi == MS_ABI
5638 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5639 if (regno == parm_regs[i])
5640 return true;
5641 return false;
5644 /* Return true if we do not know how to pass TYPE solely in registers.  */
5646 static bool
5647 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5649 if (must_pass_in_stack_var_size_or_pad (mode, type))
5650 return true;
5652 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5653 The layout_type routine is crafty and tries to trick us into passing
5654 currently unsupported vector types on the stack by using TImode. */
5655 return (!TARGET_64BIT && mode == TImode
5656 && type && TREE_CODE (type) != VECTOR_TYPE);
5659 /* Return the size, in bytes, of the area reserved for arguments passed
5660    in registers for the function represented by FNDECL, depending on the
5661    ABI used.  */
5662 int
5663 ix86_reg_parm_stack_space (const_tree fndecl)
5665 enum calling_abi call_abi = SYSV_ABI;
5666 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5667 call_abi = ix86_function_abi (fndecl);
5668 else
5669 call_abi = ix86_function_type_abi (fndecl);
5670 if (TARGET_64BIT && call_abi == MS_ABI)
5671 return 32;
5672 return 0;
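/* Illustrative note (not in the original source): for a 64-bit ms_abi
   function the caller must reserve the 32-byte register parameter area
   ("shadow space") above the return address; SYSV functions reserve none,
   which is why 0 is returned in every other case.  */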
5675 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5676    call ABI used.  */
5677 enum calling_abi
5678 ix86_function_type_abi (const_tree fntype)
5680 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5682 enum calling_abi abi = ix86_abi;
5683 if (abi == SYSV_ABI)
5685 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5686 abi = MS_ABI;
5688 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5689 abi = SYSV_ABI;
5690 return abi;
5692 return ix86_abi;
5695 /* We add this as a workaround in order to use libc_has_function
5696 hook in i386.md. */
5697 bool
5698 ix86_libc_has_function (enum function_class fn_class)
5700 return targetm.libc_has_function (fn_class);
5703 static bool
5704 ix86_function_ms_hook_prologue (const_tree fn)
5706 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5708 if (decl_function_context (fn) != NULL_TREE)
5709 error_at (DECL_SOURCE_LOCATION (fn),
5710 "ms_hook_prologue is not compatible with nested function");
5711 else
5712 return true;
5714 return false;
5717 static enum calling_abi
5718 ix86_function_abi (const_tree fndecl)
5720 if (! fndecl)
5721 return ix86_abi;
5722 return ix86_function_type_abi (TREE_TYPE (fndecl));
5725 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5726    call ABI used.  */
5727 enum calling_abi
5728 ix86_cfun_abi (void)
5730 if (! cfun)
5731 return ix86_abi;
5732 return cfun->machine->call_abi;
5735 /* Write the extra assembler code needed to declare a function properly. */
5737 void
5738 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5739 tree decl)
5741 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5743 if (is_ms_hook)
5745 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5746 unsigned int filler_cc = 0xcccccccc;
5748 for (i = 0; i < filler_count; i += 4)
5749 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5752 #ifdef SUBTARGET_ASM_UNWIND_INIT
5753 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5754 #endif
5756 ASM_OUTPUT_LABEL (asm_out_file, fname);
5758 /* Output magic byte marker, if hot-patch attribute is set. */
5759 if (is_ms_hook)
5761 if (TARGET_64BIT)
5763 /* leaq [%rsp + 0], %rsp */
5764 asm_fprintf (asm_out_file, ASM_BYTE
5765 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5767 else
5769 /* movl.s %edi, %edi
5770 push %ebp
5771 movl.s %esp, %ebp */
5772 asm_fprintf (asm_out_file, ASM_BYTE
5773 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5778 /* regclass.c */
5779 extern void init_regs (void);
5781 /* Implementation of the call ABI switching target hook.  The call
5782    register sets specific to FNDECL are set up.  See also
5783    ix86_conditional_register_usage for more details.  */
5784 void
5785 ix86_call_abi_override (const_tree fndecl)
5787 if (fndecl == NULL_TREE)
5788 cfun->machine->call_abi = ix86_abi;
5789 else
5790 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5793 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.  Avoid
5794    expensive re-initialization of init_regs each time we switch function context,
5795    since this is needed only during RTL expansion.  */
5796 static void
5797 ix86_maybe_switch_abi (void)
5799 if (TARGET_64BIT &&
5800 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5801 reinit_regs ();
5804 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5805 for a call to a function whose data type is FNTYPE.
5806 For a library call, FNTYPE is 0. */
5808 void
5809 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5810 tree fntype, /* tree ptr for function decl */
5811 rtx libname, /* SYMBOL_REF of library name or 0 */
5812 tree fndecl,
5813 int caller)
5815 struct cgraph_local_info *i;
5817 memset (cum, 0, sizeof (*cum));
5819 if (fndecl)
5821 i = cgraph_local_info (fndecl);
5822 cum->call_abi = ix86_function_abi (fndecl);
5824 else
5826 i = NULL;
5827 cum->call_abi = ix86_function_type_abi (fntype);
5830 cum->caller = caller;
5832 /* Set up the number of registers to use for passing arguments. */
5834 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5835 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5836 "or subtarget optimization implying it");
5837 cum->nregs = ix86_regparm;
5838 if (TARGET_64BIT)
5840 cum->nregs = (cum->call_abi == SYSV_ABI
5841 ? X86_64_REGPARM_MAX
5842 : X86_64_MS_REGPARM_MAX);
5844 if (TARGET_SSE)
5846 cum->sse_nregs = SSE_REGPARM_MAX;
5847 if (TARGET_64BIT)
5849 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5850 ? X86_64_SSE_REGPARM_MAX
5851 : X86_64_MS_SSE_REGPARM_MAX);
5854 if (TARGET_MMX)
5855 cum->mmx_nregs = MMX_REGPARM_MAX;
5856 cum->warn_avx = true;
5857 cum->warn_sse = true;
5858 cum->warn_mmx = true;
5860   /* Because types might mismatch between caller and callee, we need to
5861      use the actual type of the function for local calls.
5862      FIXME: cgraph_analyze can be told to actually record if the function uses
5863      va_start, so for local functions maybe_vaarg can be made more aggressive,
5864      helping K&R code.
5865      FIXME: once the type system is fixed, we won't need this code anymore.  */
5866 if (i && i->local && i->can_change_signature)
5867 fntype = TREE_TYPE (fndecl);
5868 cum->maybe_vaarg = (fntype
5869 ? (!prototype_p (fntype) || stdarg_p (fntype))
5870 : !libname);
5872 if (!TARGET_64BIT)
5874 /* If there are variable arguments, then we won't pass anything
5875 in registers in 32-bit mode. */
5876 if (stdarg_p (fntype))
5878 cum->nregs = 0;
5879 cum->sse_nregs = 0;
5880 cum->mmx_nregs = 0;
5881 cum->warn_avx = 0;
5882 cum->warn_sse = 0;
5883 cum->warn_mmx = 0;
5884 return;
5887 /* Use ecx and edx registers if function has fastcall attribute,
5888 else look for regparm information. */
5889 if (fntype)
5891 unsigned int ccvt = ix86_get_callcvt (fntype);
5892 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5894 cum->nregs = 1;
5895 cum->fastcall = 1; /* Same first register as in fastcall. */
5897 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5899 cum->nregs = 2;
5900 cum->fastcall = 1;
5902 else
5903 cum->nregs = ix86_function_regparm (fntype, fndecl);
5906 /* Set up the number of SSE registers used for passing SFmode
5907 and DFmode arguments. Warn for mismatching ABI. */
5908 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5912 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5913 But in the case of vector types, it is some vector mode.
5915 When we have only some of our vector isa extensions enabled, then there
5916 are some modes for which vector_mode_supported_p is false. For these
5917 modes, the generic vector support in gcc will choose some non-vector mode
5918 in order to implement the type. By computing the natural mode, we'll
5919 select the proper ABI location for the operand and not depend on whatever
5920 the middle-end decides to do with these vector types.
5922    The middle-end can't deal with vector types larger than 16 bytes.  In this
5923    case, we return the original mode and warn about the ABI change if CUM isn't
5924    NULL.  */
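/* Illustrative example (not in the original source): for

       typedef int v4si __attribute__((vector_size (16)));

   this returns V4SImode when SSE is enabled; if SSE is disabled the
   middle-end may have picked a non-vector mode for the type, and the
   function below still reports the vector mode (warning that the ABI
   changes) so the argument's ABI location stays consistent.  */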
5926 static enum machine_mode
5927 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5929 enum machine_mode mode = TYPE_MODE (type);
5931 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5933 HOST_WIDE_INT size = int_size_in_bytes (type);
5934 if ((size == 8 || size == 16 || size == 32)
5935 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5936 && TYPE_VECTOR_SUBPARTS (type) > 1)
5938 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5940 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5941 mode = MIN_MODE_VECTOR_FLOAT;
5942 else
5943 mode = MIN_MODE_VECTOR_INT;
5945 /* Get the mode which has this inner mode and number of units. */
5946 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5947 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5948 && GET_MODE_INNER (mode) == innermode)
5950 if (size == 32 && !TARGET_AVX)
5952 static bool warnedavx;
5954 if (cum
5955 && !warnedavx
5956 && cum->warn_avx)
5958 warnedavx = true;
5959 warning (0, "AVX vector argument without AVX "
5960 "enabled changes the ABI");
5962 return TYPE_MODE (type);
5964 else if ((size == 8 || size == 16) && !TARGET_SSE)
5966 static bool warnedsse;
5968 if (cum
5969 && !warnedsse
5970 && cum->warn_sse)
5972 warnedsse = true;
5973 warning (0, "SSE vector argument without SSE "
5974 "enabled changes the ABI");
5976 return mode;
5978 else
5979 return mode;
5982 gcc_unreachable ();
5986 return mode;
5989 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5990 this may not agree with the mode that the type system has chosen for the
5991 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5992 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5994 static rtx
5995 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5996 unsigned int regno)
5998 rtx tmp;
6000 if (orig_mode != BLKmode)
6001 tmp = gen_rtx_REG (orig_mode, regno);
6002 else
6004 tmp = gen_rtx_REG (mode, regno);
6005 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6006 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6009 return tmp;
6012 /* x86-64 register passing implementation.  See the x86-64 ABI for details.  The
6013    goal of this code is to classify each eightbyte of an incoming argument by
6014    register class and assign registers accordingly.  */
6016 /* Return the union class of CLASS1 and CLASS2.
6017 See the x86-64 PS ABI for details. */
6019 static enum x86_64_reg_class
6020 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6022 /* Rule #1: If both classes are equal, this is the resulting class. */
6023 if (class1 == class2)
6024 return class1;
6026 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6027 the other class. */
6028 if (class1 == X86_64_NO_CLASS)
6029 return class2;
6030 if (class2 == X86_64_NO_CLASS)
6031 return class1;
6033 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6034 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6035 return X86_64_MEMORY_CLASS;
6037 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6038 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6039 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6040 return X86_64_INTEGERSI_CLASS;
6041 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6042 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6043 return X86_64_INTEGER_CLASS;
6045 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6046 MEMORY is used. */
6047 if (class1 == X86_64_X87_CLASS
6048 || class1 == X86_64_X87UP_CLASS
6049 || class1 == X86_64_COMPLEX_X87_CLASS
6050 || class2 == X86_64_X87_CLASS
6051 || class2 == X86_64_X87UP_CLASS
6052 || class2 == X86_64_COMPLEX_X87_CLASS)
6053 return X86_64_MEMORY_CLASS;
6055 /* Rule #6: Otherwise class SSE is used. */
6056 return X86_64_SSE_CLASS;
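/* Worked example (not in the original source), following the SysV x86-64
   ABI: for

       struct s { double d; long l; };

   the first eightbyte classifies as SSE and the second as INTEGER, so the
   struct is passed in one SSE register and one general-purpose register.  */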
6059 /* Classify the argument of type TYPE and mode MODE.
6060 CLASSES will be filled by the register class used to pass each word
6061 of the operand. The number of words is returned. In case the parameter
6062 should be passed in memory, 0 is returned. As a special case for zero
6063 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6065    BIT_OFFSET is used internally for handling records and specifies the
6066    offset in bits modulo 256 to avoid overflow cases.
6068    See the x86-64 PS ABI for details.  */
6071 static int
6072 classify_argument (enum machine_mode mode, const_tree type,
6073 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6075 HOST_WIDE_INT bytes =
6076 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6077 int words
6078 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6080 /* Variable sized entities are always passed/returned in memory. */
6081 if (bytes < 0)
6082 return 0;
6084 if (mode != VOIDmode
6085 && targetm.calls.must_pass_in_stack (mode, type))
6086 return 0;
6088 if (type && AGGREGATE_TYPE_P (type))
6090 int i;
6091 tree field;
6092 enum x86_64_reg_class subclasses[MAX_CLASSES];
6094 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6095 if (bytes > 32)
6096 return 0;
6098 for (i = 0; i < words; i++)
6099 classes[i] = X86_64_NO_CLASS;
6101       /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
6102 	 signal the memory class, so handle it as a special case.  */
6103 if (!words)
6105 classes[0] = X86_64_NO_CLASS;
6106 return 1;
6109 /* Classify each field of record and merge classes. */
6110 switch (TREE_CODE (type))
6112 case RECORD_TYPE:
6113 /* And now merge the fields of structure. */
6114 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6116 if (TREE_CODE (field) == FIELD_DECL)
6118 int num;
6120 if (TREE_TYPE (field) == error_mark_node)
6121 continue;
6123 /* Bitfields are always classified as integer. Handle them
6124 early, since later code would consider them to be
6125 misaligned integers. */
6126 if (DECL_BIT_FIELD (field))
6128 for (i = (int_bit_position (field)
6129 + (bit_offset % 64)) / 8 / 8;
6130 i < ((int_bit_position (field) + (bit_offset % 64))
6131 + tree_low_cst (DECL_SIZE (field), 0)
6132 + 63) / 8 / 8; i++)
6133 classes[i] =
6134 merge_classes (X86_64_INTEGER_CLASS,
6135 classes[i]);
6137 else
6139 int pos;
6141 type = TREE_TYPE (field);
6143 /* Flexible array member is ignored. */
6144 if (TYPE_MODE (type) == BLKmode
6145 && TREE_CODE (type) == ARRAY_TYPE
6146 && TYPE_SIZE (type) == NULL_TREE
6147 && TYPE_DOMAIN (type) != NULL_TREE
6148 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6149 == NULL_TREE))
6151 static bool warned;
6153 if (!warned && warn_psabi)
6155 warned = true;
6156 inform (input_location,
6157 "the ABI of passing struct with"
6158 " a flexible array member has"
6159 " changed in GCC 4.4");
6161 continue;
6163 num = classify_argument (TYPE_MODE (type), type,
6164 subclasses,
6165 (int_bit_position (field)
6166 + bit_offset) % 256);
6167 if (!num)
6168 return 0;
6169 pos = (int_bit_position (field)
6170 + (bit_offset % 64)) / 8 / 8;
6171 for (i = 0; i < num && (i + pos) < words; i++)
6172 classes[i + pos] =
6173 merge_classes (subclasses[i], classes[i + pos]);
6177 break;
6179 case ARRAY_TYPE:
6180 /* Arrays are handled as small records. */
6182 int num;
6183 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6184 TREE_TYPE (type), subclasses, bit_offset);
6185 if (!num)
6186 return 0;
6188 /* The partial classes are now full classes. */
6189 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6190 subclasses[0] = X86_64_SSE_CLASS;
6191 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6192 && !((bit_offset % 64) == 0 && bytes == 4))
6193 subclasses[0] = X86_64_INTEGER_CLASS;
6195 for (i = 0; i < words; i++)
6196 classes[i] = subclasses[i % num];
6198 break;
6200 case UNION_TYPE:
6201 case QUAL_UNION_TYPE:
6202       /* Unions are similar to RECORD_TYPE but offset is always 0.  */
6204 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6206 if (TREE_CODE (field) == FIELD_DECL)
6208 int num;
6210 if (TREE_TYPE (field) == error_mark_node)
6211 continue;
6213 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6214 TREE_TYPE (field), subclasses,
6215 bit_offset);
6216 if (!num)
6217 return 0;
6218 for (i = 0; i < num; i++)
6219 classes[i] = merge_classes (subclasses[i], classes[i]);
6222 break;
6224 default:
6225 gcc_unreachable ();
6228 if (words > 2)
6230       /* When size > 16 bytes, if the first class isn't
6231 	 X86_64_SSE_CLASS or any of the others isn't
6232 	 X86_64_SSEUP_CLASS, everything should be passed in
6233 	 memory.  */
6234 if (classes[0] != X86_64_SSE_CLASS)
6235 return 0;
6237 for (i = 1; i < words; i++)
6238 if (classes[i] != X86_64_SSEUP_CLASS)
6239 return 0;
6242 /* Final merger cleanup. */
6243 for (i = 0; i < words; i++)
6245 /* If one class is MEMORY, everything should be passed in
6246 memory. */
6247 if (classes[i] == X86_64_MEMORY_CLASS)
6248 return 0;
6250 /* The X86_64_SSEUP_CLASS should be always preceded by
6251 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6252 if (classes[i] == X86_64_SSEUP_CLASS
6253 && classes[i - 1] != X86_64_SSE_CLASS
6254 && classes[i - 1] != X86_64_SSEUP_CLASS)
6256 /* The first one should never be X86_64_SSEUP_CLASS. */
6257 gcc_assert (i != 0);
6258 classes[i] = X86_64_SSE_CLASS;
6261 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6262 everything should be passed in memory. */
6263 if (classes[i] == X86_64_X87UP_CLASS
6264 && (classes[i - 1] != X86_64_X87_CLASS))
6266 static bool warned;
6268 /* The first one should never be X86_64_X87UP_CLASS. */
6269 gcc_assert (i != 0);
6270 if (!warned && warn_psabi)
6272 warned = true;
6273 inform (input_location,
6274 "the ABI of passing union with long double"
6275 " has changed in GCC 4.4");
6277 return 0;
6280 return words;
6283   /* Compute the alignment needed.  We align all types to natural boundaries,
6284      with the exception of XFmode, which is aligned to 64 bits.  */
6285 if (mode != VOIDmode && mode != BLKmode)
6287 int mode_alignment = GET_MODE_BITSIZE (mode);
6289 if (mode == XFmode)
6290 mode_alignment = 128;
6291 else if (mode == XCmode)
6292 mode_alignment = 256;
6293 if (COMPLEX_MODE_P (mode))
6294 mode_alignment /= 2;
6295 /* Misaligned fields are always returned in memory. */
6296 if (bit_offset % mode_alignment)
6297 return 0;
6300 /* for V1xx modes, just use the base mode */
6301 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6302 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6303 mode = GET_MODE_INNER (mode);
6305 /* Classification of atomic types. */
6306 switch (mode)
6308 case SDmode:
6309 case DDmode:
6310 classes[0] = X86_64_SSE_CLASS;
6311 return 1;
6312 case TDmode:
6313 classes[0] = X86_64_SSE_CLASS;
6314 classes[1] = X86_64_SSEUP_CLASS;
6315 return 2;
6316 case DImode:
6317 case SImode:
6318 case HImode:
6319 case QImode:
6320 case CSImode:
6321 case CHImode:
6322 case CQImode:
6324 	int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6326 if (size <= 32)
6328 classes[0] = X86_64_INTEGERSI_CLASS;
6329 return 1;
6331 else if (size <= 64)
6333 classes[0] = X86_64_INTEGER_CLASS;
6334 return 1;
6336 else if (size <= 64+32)
6338 classes[0] = X86_64_INTEGER_CLASS;
6339 classes[1] = X86_64_INTEGERSI_CLASS;
6340 return 2;
6342 else if (size <= 64+64)
6344 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6345 return 2;
6347 else
6348 gcc_unreachable ();
6350 case CDImode:
6351 case TImode:
6352 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6353 return 2;
6354 case COImode:
6355 case OImode:
6356 /* OImode shouldn't be used directly. */
6357 gcc_unreachable ();
6358 case CTImode:
6359 return 0;
6360 case SFmode:
6361 if (!(bit_offset % 64))
6362 classes[0] = X86_64_SSESF_CLASS;
6363 else
6364 classes[0] = X86_64_SSE_CLASS;
6365 return 1;
6366 case DFmode:
6367 classes[0] = X86_64_SSEDF_CLASS;
6368 return 1;
6369 case XFmode:
6370 classes[0] = X86_64_X87_CLASS;
6371 classes[1] = X86_64_X87UP_CLASS;
6372 return 2;
6373 case TFmode:
6374 classes[0] = X86_64_SSE_CLASS;
6375 classes[1] = X86_64_SSEUP_CLASS;
6376 return 2;
6377 case SCmode:
6378 classes[0] = X86_64_SSE_CLASS;
6379 if (!(bit_offset % 64))
6380 return 1;
6381 else
6383 static bool warned;
6385 if (!warned && warn_psabi)
6387 warned = true;
6388 inform (input_location,
6389 "the ABI of passing structure with complex float"
6390 " member has changed in GCC 4.4");
6392 classes[1] = X86_64_SSESF_CLASS;
6393 return 2;
6395 case DCmode:
6396 classes[0] = X86_64_SSEDF_CLASS;
6397 classes[1] = X86_64_SSEDF_CLASS;
6398 return 2;
6399 case XCmode:
6400 classes[0] = X86_64_COMPLEX_X87_CLASS;
6401 return 1;
6402 case TCmode:
6403       /* This mode is larger than 16 bytes.  */
6404 return 0;
6405 case V8SFmode:
6406 case V8SImode:
6407 case V32QImode:
6408 case V16HImode:
6409 case V4DFmode:
6410 case V4DImode:
6411 classes[0] = X86_64_SSE_CLASS;
6412 classes[1] = X86_64_SSEUP_CLASS;
6413 classes[2] = X86_64_SSEUP_CLASS;
6414 classes[3] = X86_64_SSEUP_CLASS;
6415 return 4;
6416 case V4SFmode:
6417 case V4SImode:
6418 case V16QImode:
6419 case V8HImode:
6420 case V2DFmode:
6421 case V2DImode:
6422 classes[0] = X86_64_SSE_CLASS;
6423 classes[1] = X86_64_SSEUP_CLASS;
6424 return 2;
6425 case V1TImode:
6426 case V1DImode:
6427 case V2SFmode:
6428 case V2SImode:
6429 case V4HImode:
6430 case V8QImode:
6431 classes[0] = X86_64_SSE_CLASS;
6432 return 1;
6433 case BLKmode:
6434 case VOIDmode:
6435 return 0;
6436 default:
6437 gcc_assert (VECTOR_MODE_P (mode));
6439 if (bytes > 16)
6440 return 0;
6442 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6444 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6445 classes[0] = X86_64_INTEGERSI_CLASS;
6446 else
6447 classes[0] = X86_64_INTEGER_CLASS;
6448 classes[1] = X86_64_INTEGER_CLASS;
6449 return 1 + (bytes > 8);
6453 /* Examine the argument and return the number of registers required in each
6454    class.  Return 0 iff the parameter should be passed in memory.  */
6455 static int
6456 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6457 int *int_nregs, int *sse_nregs)
6459 enum x86_64_reg_class regclass[MAX_CLASSES];
6460 int n = classify_argument (mode, type, regclass, 0);
6462 *int_nregs = 0;
6463 *sse_nregs = 0;
6464 if (!n)
6465 return 0;
6466 for (n--; n >= 0; n--)
6467 switch (regclass[n])
6469 case X86_64_INTEGER_CLASS:
6470 case X86_64_INTEGERSI_CLASS:
6471 (*int_nregs)++;
6472 break;
6473 case X86_64_SSE_CLASS:
6474 case X86_64_SSESF_CLASS:
6475 case X86_64_SSEDF_CLASS:
6476 (*sse_nregs)++;
6477 break;
6478 case X86_64_NO_CLASS:
6479 case X86_64_SSEUP_CLASS:
6480 break;
6481 case X86_64_X87_CLASS:
6482 case X86_64_X87UP_CLASS:
6483 if (!in_return)
6484 return 0;
6485 break;
6486 case X86_64_COMPLEX_X87_CLASS:
6487 return in_return ? 2 : 0;
6488 case X86_64_MEMORY_CLASS:
6489 gcc_unreachable ();
6491 return 1;
6494 /* Construct container for the argument used by GCC interface. See
6495 FUNCTION_ARG for the detailed description. */
6497 static rtx
6498 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6499 const_tree type, int in_return, int nintregs, int nsseregs,
6500 const int *intreg, int sse_regno)
6502 /* The following variables hold the static issued_error state. */
6503 static bool issued_sse_arg_error;
6504 static bool issued_sse_ret_error;
6505 static bool issued_x87_ret_error;
6507 enum machine_mode tmpmode;
6508 int bytes =
6509 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6510 enum x86_64_reg_class regclass[MAX_CLASSES];
6511 int n;
6512 int i;
6513 int nexps = 0;
6514 int needed_sseregs, needed_intregs;
6515 rtx exp[MAX_CLASSES];
6516 rtx ret;
6518 n = classify_argument (mode, type, regclass, 0);
6519 if (!n)
6520 return NULL;
6521 if (!examine_argument (mode, type, in_return, &needed_intregs,
6522 &needed_sseregs))
6523 return NULL;
6524 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6525 return NULL;
6527 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6528 some less clueful developer tries to use floating-point anyway. */
6529 if (needed_sseregs && !TARGET_SSE)
6531 if (in_return)
6533 if (!issued_sse_ret_error)
6535 error ("SSE register return with SSE disabled");
6536 issued_sse_ret_error = true;
6539 else if (!issued_sse_arg_error)
6541 error ("SSE register argument with SSE disabled");
6542 issued_sse_arg_error = true;
6544 return NULL;
6547 /* Likewise, error if the ABI requires us to return values in the
6548 x87 registers and the user specified -mno-80387. */
6549 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6550 for (i = 0; i < n; i++)
6551 if (regclass[i] == X86_64_X87_CLASS
6552 || regclass[i] == X86_64_X87UP_CLASS
6553 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6555 if (!issued_x87_ret_error)
6557 error ("x87 register return with x87 disabled");
6558 issued_x87_ret_error = true;
6560 return NULL;
6563   /* First construct the simple cases.  Avoid SCmode, since we want to use
6564      a single register to pass this type.  */
6565 if (n == 1 && mode != SCmode)
6566 switch (regclass[0])
6568 case X86_64_INTEGER_CLASS:
6569 case X86_64_INTEGERSI_CLASS:
6570 return gen_rtx_REG (mode, intreg[0]);
6571 case X86_64_SSE_CLASS:
6572 case X86_64_SSESF_CLASS:
6573 case X86_64_SSEDF_CLASS:
6574 if (mode != BLKmode)
6575 return gen_reg_or_parallel (mode, orig_mode,
6576 SSE_REGNO (sse_regno));
6577 break;
6578 case X86_64_X87_CLASS:
6579 case X86_64_COMPLEX_X87_CLASS:
6580 return gen_rtx_REG (mode, FIRST_STACK_REG);
6581 case X86_64_NO_CLASS:
6582 /* Zero sized array, struct or class. */
6583 return NULL;
6584 default:
6585 gcc_unreachable ();
6587 if (n == 2
6588 && regclass[0] == X86_64_SSE_CLASS
6589 && regclass[1] == X86_64_SSEUP_CLASS
6590 && mode != BLKmode)
6591 return gen_reg_or_parallel (mode, orig_mode,
6592 SSE_REGNO (sse_regno));
6593 if (n == 4
6594 && regclass[0] == X86_64_SSE_CLASS
6595 && regclass[1] == X86_64_SSEUP_CLASS
6596 && regclass[2] == X86_64_SSEUP_CLASS
6597 && regclass[3] == X86_64_SSEUP_CLASS
6598 && mode != BLKmode)
6599 return gen_reg_or_parallel (mode, orig_mode,
6600 SSE_REGNO (sse_regno));
6601 if (n == 2
6602 && regclass[0] == X86_64_X87_CLASS
6603 && regclass[1] == X86_64_X87UP_CLASS)
6604 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6606 if (n == 2
6607 && regclass[0] == X86_64_INTEGER_CLASS
6608 && regclass[1] == X86_64_INTEGER_CLASS
6609 && (mode == CDImode || mode == TImode || mode == TFmode)
6610 && intreg[0] + 1 == intreg[1])
6611 return gen_rtx_REG (mode, intreg[0]);
6613 /* Otherwise figure out the entries of the PARALLEL. */
6614 for (i = 0; i < n; i++)
6616 int pos;
6618 switch (regclass[i])
6620 case X86_64_NO_CLASS:
6621 break;
6622 case X86_64_INTEGER_CLASS:
6623 case X86_64_INTEGERSI_CLASS:
6624 /* Merge TImodes on aligned occasions here too. */
6625 if (i * 8 + 8 > bytes)
6626 tmpmode
6627 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6628 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6629 tmpmode = SImode;
6630 else
6631 tmpmode = DImode;
6632 	  /* We've requested 24 bytes for which we
6633 	     don't have a mode.  Use DImode.  */
6634 if (tmpmode == BLKmode)
6635 tmpmode = DImode;
6636 exp [nexps++]
6637 = gen_rtx_EXPR_LIST (VOIDmode,
6638 gen_rtx_REG (tmpmode, *intreg),
6639 GEN_INT (i*8));
6640 intreg++;
6641 break;
6642 case X86_64_SSESF_CLASS:
6643 exp [nexps++]
6644 = gen_rtx_EXPR_LIST (VOIDmode,
6645 gen_rtx_REG (SFmode,
6646 SSE_REGNO (sse_regno)),
6647 GEN_INT (i*8));
6648 sse_regno++;
6649 break;
6650 case X86_64_SSEDF_CLASS:
6651 exp [nexps++]
6652 = gen_rtx_EXPR_LIST (VOIDmode,
6653 gen_rtx_REG (DFmode,
6654 SSE_REGNO (sse_regno)),
6655 GEN_INT (i*8));
6656 sse_regno++;
6657 break;
6658 case X86_64_SSE_CLASS:
6659 pos = i;
6660 switch (n)
6662 case 1:
6663 tmpmode = DImode;
6664 break;
6665 case 2:
6666 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6668 tmpmode = TImode;
6669 i++;
6671 else
6672 tmpmode = DImode;
6673 break;
6674 case 4:
6675 gcc_assert (i == 0
6676 && regclass[1] == X86_64_SSEUP_CLASS
6677 && regclass[2] == X86_64_SSEUP_CLASS
6678 && regclass[3] == X86_64_SSEUP_CLASS);
6679 tmpmode = OImode;
6680 i += 3;
6681 break;
6682 default:
6683 gcc_unreachable ();
6685 exp [nexps++]
6686 = gen_rtx_EXPR_LIST (VOIDmode,
6687 gen_rtx_REG (tmpmode,
6688 SSE_REGNO (sse_regno)),
6689 GEN_INT (pos*8));
6690 sse_regno++;
6691 break;
6692 default:
6693 gcc_unreachable ();
6697 /* Empty aligned struct, union or class. */
6698 if (nexps == 0)
6699 return NULL;
6701 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6702 for (i = 0; i < nexps; i++)
6703 XVECEXP (ret, 0, i) = exp [i];
6704 return ret;
6707 /* Update the data in CUM to advance over an argument of mode MODE
6708 and data type TYPE. (TYPE is null for libcalls where that information
6709 may not be available.) */
6711 static void
6712 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6713 const_tree type, HOST_WIDE_INT bytes,
6714 HOST_WIDE_INT words)
6716 switch (mode)
6718 default:
6719 break;
6721 case BLKmode:
6722 if (bytes < 0)
6723 break;
6724 /* FALLTHRU */
6726 case DImode:
6727 case SImode:
6728 case HImode:
6729 case QImode:
6730 cum->words += words;
6731 cum->nregs -= words;
6732 cum->regno += words;
6734 if (cum->nregs <= 0)
6736 cum->nregs = 0;
6737 cum->regno = 0;
6739 break;
6741 case OImode:
6742 /* OImode shouldn't be used directly. */
6743 gcc_unreachable ();
6745 case DFmode:
6746 if (cum->float_in_sse < 2)
6747 break;
6748 case SFmode:
6749 if (cum->float_in_sse < 1)
6750 break;
6751 /* FALLTHRU */
6753 case V8SFmode:
6754 case V8SImode:
6755 case V32QImode:
6756 case V16HImode:
6757 case V4DFmode:
6758 case V4DImode:
6759 case TImode:
6760 case V16QImode:
6761 case V8HImode:
6762 case V4SImode:
6763 case V2DImode:
6764 case V4SFmode:
6765 case V2DFmode:
6766 if (!type || !AGGREGATE_TYPE_P (type))
6768 cum->sse_words += words;
6769 cum->sse_nregs -= 1;
6770 cum->sse_regno += 1;
6771 if (cum->sse_nregs <= 0)
6773 cum->sse_nregs = 0;
6774 cum->sse_regno = 0;
6777 break;
6779 case V8QImode:
6780 case V4HImode:
6781 case V2SImode:
6782 case V2SFmode:
6783 case V1TImode:
6784 case V1DImode:
6785 if (!type || !AGGREGATE_TYPE_P (type))
6787 cum->mmx_words += words;
6788 cum->mmx_nregs -= 1;
6789 cum->mmx_regno += 1;
6790 if (cum->mmx_nregs <= 0)
6792 cum->mmx_nregs = 0;
6793 cum->mmx_regno = 0;
6796 break;
6800 static void
6801 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6802 const_tree type, HOST_WIDE_INT words, bool named)
6804 int int_nregs, sse_nregs;
6806 /* Unnamed 256bit vector mode parameters are passed on stack. */
6807 if (!named && VALID_AVX256_REG_MODE (mode))
6808 return;
6810 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6811 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6813 cum->nregs -= int_nregs;
6814 cum->sse_nregs -= sse_nregs;
6815 cum->regno += int_nregs;
6816 cum->sse_regno += sse_nregs;
6818 else
6820 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6821 cum->words = (cum->words + align - 1) & ~(align - 1);
6822 cum->words += words;
6826 static void
6827 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6828 HOST_WIDE_INT words)
6830 /* Otherwise, this should be passed indirect. */
6831 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6833 cum->words += words;
6834 if (cum->nregs > 0)
6836 cum->nregs -= 1;
6837 cum->regno += 1;
6841 /* Update the data in CUM to advance over an argument of mode MODE and
6842 data type TYPE. (TYPE is null for libcalls where that information
6843 may not be available.) */
6845 static void
6846 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6847 const_tree type, bool named)
6849 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6850 HOST_WIDE_INT bytes, words;
6852 if (mode == BLKmode)
6853 bytes = int_size_in_bytes (type);
6854 else
6855 bytes = GET_MODE_SIZE (mode);
6856 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6858 if (type)
6859 mode = type_natural_mode (type, NULL);
6861 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6862 function_arg_advance_ms_64 (cum, bytes, words);
6863 else if (TARGET_64BIT)
6864 function_arg_advance_64 (cum, mode, type, words, named);
6865 else
6866 function_arg_advance_32 (cum, mode, type, bytes, words);
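/* For illustration: on a 32-bit target with UNITS_PER_WORD == 4, a
   12-byte BLKmode argument has bytes == 12 and words == (12 + 3) / 4 == 3,
   so function_arg_advance_32 bumps cum->words by 3 and drops cum->nregs
   by the same amount.  */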
6869 /* Define where to put the arguments to a function.
6870 Value is zero to push the argument on the stack,
6871 or a hard register in which to store the argument.
6873 MODE is the argument's machine mode.
6874 TYPE is the data type of the argument (as a tree).
6875 This is null for libcalls where that information may
6876 not be available.
6877 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6878 the preceding args and about the function being called.
6879 NAMED is nonzero if this argument is a named parameter
6880 (otherwise it is an extra parameter matching an ellipsis). */
6882 static rtx
6883 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6884 enum machine_mode orig_mode, const_tree type,
6885 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6887 static bool warnedsse, warnedmmx;
6889 /* Avoid the AL settings for the Unix64 ABI. */
6890 if (mode == VOIDmode)
6891 return constm1_rtx;
6893 switch (mode)
6895 default:
6896 break;
6898 case BLKmode:
6899 if (bytes < 0)
6900 break;
6901 /* FALLTHRU */
6902 case DImode:
6903 case SImode:
6904 case HImode:
6905 case QImode:
6906 if (words <= cum->nregs)
6908 int regno = cum->regno;
6910 /* Fastcall allocates the first two DWORD (SImode) or
6911 smaller arguments to ECX and EDX if it isn't an
6912 aggregate type. */
6913 if (cum->fastcall)
6915 if (mode == BLKmode
6916 || mode == DImode
6917 || (type && AGGREGATE_TYPE_P (type)))
6918 break;
6920 /* ECX, not EAX, is the first allocated register. */
6921 if (regno == AX_REG)
6922 regno = CX_REG;
6924 return gen_rtx_REG (mode, regno);
6926 break;
6928 case DFmode:
6929 if (cum->float_in_sse < 2)
6930 break;
6931 case SFmode:
6932 if (cum->float_in_sse < 1)
6933 break;
6934 /* FALLTHRU */
6935 case TImode:
6936 /* In 32bit, we pass TImode in xmm registers. */
6937 case V16QImode:
6938 case V8HImode:
6939 case V4SImode:
6940 case V2DImode:
6941 case V4SFmode:
6942 case V2DFmode:
6943 if (!type || !AGGREGATE_TYPE_P (type))
6945 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6947 warnedsse = true;
6948 warning (0, "SSE vector argument without SSE enabled "
6949 "changes the ABI");
6951 if (cum->sse_nregs)
6952 return gen_reg_or_parallel (mode, orig_mode,
6953 cum->sse_regno + FIRST_SSE_REG);
6955 break;
6957 case OImode:
6958 /* OImode shouldn't be used directly. */
6959 gcc_unreachable ();
6961 case V8SFmode:
6962 case V8SImode:
6963 case V32QImode:
6964 case V16HImode:
6965 case V4DFmode:
6966 case V4DImode:
6967 if (!type || !AGGREGATE_TYPE_P (type))
6969 if (cum->sse_nregs)
6970 return gen_reg_or_parallel (mode, orig_mode,
6971 cum->sse_regno + FIRST_SSE_REG);
6973 break;
6975 case V8QImode:
6976 case V4HImode:
6977 case V2SImode:
6978 case V2SFmode:
6979 case V1TImode:
6980 case V1DImode:
6981 if (!type || !AGGREGATE_TYPE_P (type))
6983 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6985 warnedmmx = true;
6986 warning (0, "MMX vector argument without MMX enabled "
6987 "changes the ABI");
6989 if (cum->mmx_nregs)
6990 return gen_reg_or_parallel (mode, orig_mode,
6991 cum->mmx_regno + FIRST_MMX_REG);
6993 break;
6996 return NULL_RTX;
6999 static rtx
7000 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7001 enum machine_mode orig_mode, const_tree type, bool named)
7003 /* Handle a hidden AL argument containing number of registers
7004 for varargs x86-64 functions. */
7005 if (mode == VOIDmode)
7006 return GEN_INT (cum->maybe_vaarg
7007 ? (cum->sse_nregs < 0
7008 ? X86_64_SSE_REGPARM_MAX
7009 : cum->sse_regno)
7010 : -1);
7012 switch (mode)
7014 default:
7015 break;
7017 case V8SFmode:
7018 case V8SImode:
7019 case V32QImode:
7020 case V16HImode:
7021 case V4DFmode:
7022 case V4DImode:
7023 /* Unnamed 256bit vector mode parameters are passed on stack. */
7024 if (!named)
7025 return NULL;
7026 break;
7029 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7030 cum->sse_nregs,
7031 &x86_64_int_parameter_registers [cum->regno],
7032 cum->sse_regno);
7035 static rtx
7036 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7037 enum machine_mode orig_mode, bool named,
7038 HOST_WIDE_INT bytes)
7040 unsigned int regno;
7042 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
7043 We use value of -2 to specify that current function call is MSABI. */
7044 if (mode == VOIDmode)
7045 return GEN_INT (-2);
7047 /* If we've run out of registers, it goes on the stack. */
7048 if (cum->nregs == 0)
7049 return NULL_RTX;
7051 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7053 /* Only floating point modes are passed in anything but integer regs. */
7054 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7056 if (named)
7057 regno = cum->regno + FIRST_SSE_REG;
7058 else
7060 rtx t1, t2;
7062 /* Unnamed floating parameters are passed in both the
7063 SSE and integer registers. */
7064 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7065 t2 = gen_rtx_REG (mode, regno);
7066 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7067 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7068 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7071 /* Handle aggregated types passed in register. */
7072 if (orig_mode == BLKmode)
7074 if (bytes > 0 && bytes <= 8)
7075 mode = (bytes > 4 ? DImode : SImode);
7076 if (mode == BLKmode)
7077 mode = DImode;
7080 return gen_reg_or_parallel (mode, orig_mode, regno);
7083 /* Return where to put the arguments to a function.
7084 Return zero to push the argument on the stack, or a hard register in
7085 which to store the argument.
7086 MODE is the argument's machine mode. TYPE is the data type of the
7087 argument. It is null for libcalls where that information may not be
7088 available. CUM gives information about the preceding args and about
7089 the function being called. NAMED is nonzero if this argument is a
7090 named parameter (otherwise it is an extra parameter matching an
7091 ellipsis). */
7093 static rtx
7094 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7095 const_tree type, bool named)
7097 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7098 enum machine_mode mode = omode;
7099 HOST_WIDE_INT bytes, words;
7100 rtx arg;
7102 if (mode == BLKmode)
7103 bytes = int_size_in_bytes (type);
7104 else
7105 bytes = GET_MODE_SIZE (mode);
7106 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7108 /* To simplify the code below, represent vector types with a vector mode
7109 even if MMX/SSE are not active. */
7110 if (type && TREE_CODE (type) == VECTOR_TYPE)
7111 mode = type_natural_mode (type, cum);
7113 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7114 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7115 else if (TARGET_64BIT)
7116 arg = function_arg_64 (cum, mode, omode, type, named);
7117 else
7118 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7120 return arg;
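/* Rough summary of the dispatch above: under the SysV x86-64 ABI the
   first integer argument lands in %rdi and the first float/double in
   %xmm0; under the MS ABI the first four slots use %rcx/%rdx/%r8/%r9 or
   %xmm0-%xmm3; on 32-bit targets arguments normally go on the stack
   unless regparm/fastcall conventions apply.  */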
7123 /* A C expression that indicates when an argument must be passed by
7124 reference. If nonzero for an argument, a copy of that argument is
7125 made in memory and a pointer to the argument is passed instead of
7126 the argument itself. The pointer is passed in whatever way is
7127 appropriate for passing a pointer to that type. */
7129 static bool
7130 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7131 const_tree type, bool named ATTRIBUTE_UNUSED)
7133 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7135 /* See Windows x64 Software Convention. */
7136 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7138 int msize = (int) GET_MODE_SIZE (mode);
7139 if (type)
7141 /* Arrays are passed by reference. */
7142 if (TREE_CODE (type) == ARRAY_TYPE)
7143 return true;
7145 if (AGGREGATE_TYPE_P (type))
7147 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7148 are passed by reference. */
7149 msize = int_size_in_bytes (type);
7153 /* __m128 is passed by reference. */
7154 switch (msize) {
7155 case 1: case 2: case 4: case 8:
7156 break;
7157 default:
7158 return true;
7161 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7162 return true;
7164 return false;
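/* Worked example for the MS ABI branch above: a 16-byte struct has
   msize == 16, which is not 1, 2, 4 or 8, so it is passed by reference;
   an 8-byte struct is passed by value in a register.  */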
7167 /* Return true when TYPE should be 128bit aligned for 32bit argument
7168 passing ABI. XXX: This function is obsolete and is only used for
7169 checking psABI compatibility with previous versions of GCC. */
7171 static bool
7172 ix86_compat_aligned_value_p (const_tree type)
7174 enum machine_mode mode = TYPE_MODE (type);
7175 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7176 || mode == TDmode
7177 || mode == TFmode
7178 || mode == TCmode)
7179 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7180 return true;
7181 if (TYPE_ALIGN (type) < 128)
7182 return false;
7184 if (AGGREGATE_TYPE_P (type))
7186 /* Walk the aggregates recursively. */
7187 switch (TREE_CODE (type))
7189 case RECORD_TYPE:
7190 case UNION_TYPE:
7191 case QUAL_UNION_TYPE:
7193 tree field;
7195 /* Walk all the structure fields. */
7196 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7198 if (TREE_CODE (field) == FIELD_DECL
7199 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7200 return true;
7202 break;
7205 case ARRAY_TYPE:
7206 /* Just for use if some languages pass arrays by value. */
7207 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7208 return true;
7209 break;
7211 default:
7212 gcc_unreachable ();
7215 return false;
7218 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7219 XXX: This function is obsolete and is only used for checking psABI
7220 compatibility with previous versions of GCC. */
7222 static unsigned int
7223 ix86_compat_function_arg_boundary (enum machine_mode mode,
7224 const_tree type, unsigned int align)
7226 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7227 natural boundaries. */
7228 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7230 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7231 make an exception for SSE modes since these require 128bit
7232 alignment.
7234 The handling here differs from field_alignment. ICC aligns MMX
7235 arguments to 4 byte boundaries, while structure fields are aligned
7236 to 8 byte boundaries. */
7237 if (!type)
7239 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7240 align = PARM_BOUNDARY;
7242 else
7244 if (!ix86_compat_aligned_value_p (type))
7245 align = PARM_BOUNDARY;
7248 if (align > BIGGEST_ALIGNMENT)
7249 align = BIGGEST_ALIGNMENT;
7250 return align;
7253 /* Return true when TYPE should be 128bit aligned for 32bit argument
7254 passing ABI. */
7256 static bool
7257 ix86_contains_aligned_value_p (const_tree type)
7259 enum machine_mode mode = TYPE_MODE (type);
7261 if (mode == XFmode || mode == XCmode)
7262 return false;
7264 if (TYPE_ALIGN (type) < 128)
7265 return false;
7267 if (AGGREGATE_TYPE_P (type))
7269 /* Walk the aggregates recursively. */
7270 switch (TREE_CODE (type))
7272 case RECORD_TYPE:
7273 case UNION_TYPE:
7274 case QUAL_UNION_TYPE:
7276 tree field;
7278 /* Walk all the structure fields. */
7279 for (field = TYPE_FIELDS (type);
7280 field;
7281 field = DECL_CHAIN (field))
7283 if (TREE_CODE (field) == FIELD_DECL
7284 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7285 return true;
7287 break;
7290 case ARRAY_TYPE:
7291 /* Just for use if some languages pass arrays by value. */
7292 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7293 return true;
7294 break;
7296 default:
7297 gcc_unreachable ();
7300 else
7301 return TYPE_ALIGN (type) >= 128;
7303 return false;
7306 /* Gives the alignment boundary, in bits, of an argument with the
7307 specified mode and type. */
7309 static unsigned int
7310 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7312 unsigned int align;
7313 if (type)
7315 /* Since the main variant type is used for the call, convert TYPE to
7316 its main variant. */
7317 type = TYPE_MAIN_VARIANT (type);
7318 align = TYPE_ALIGN (type);
7320 else
7321 align = GET_MODE_ALIGNMENT (mode);
7322 if (align < PARM_BOUNDARY)
7323 align = PARM_BOUNDARY;
7324 else
7326 static bool warned;
7327 unsigned int saved_align = align;
7329 if (!TARGET_64BIT)
7331 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7332 if (!type)
7334 if (mode == XFmode || mode == XCmode)
7335 align = PARM_BOUNDARY;
7337 else if (!ix86_contains_aligned_value_p (type))
7338 align = PARM_BOUNDARY;
7340 if (align < 128)
7341 align = PARM_BOUNDARY;
7344 if (warn_psabi
7345 && !warned
7346 && align != ix86_compat_function_arg_boundary (mode, type,
7347 saved_align))
7349 warned = true;
7350 inform (input_location,
7351 "The ABI for passing parameters with %d-byte"
7352 " alignment has changed in GCC 4.6",
7353 align / BITS_PER_UNIT);
7357 return align;
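/* Illustrative results: on a 32-bit target a plain double ends up at
   PARM_BOUNDARY (32 bits), while a __m128 argument keeps its 128-bit
   alignment because ix86_contains_aligned_value_p accepts it.  */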
7360 /* Return true if N is a possible register number of function value. */
7362 static bool
7363 ix86_function_value_regno_p (const unsigned int regno)
7365 switch (regno)
7367 case AX_REG:
7368 return true;
7370 case FIRST_FLOAT_REG:
7371 /* TODO: The function should depend on current function ABI but
7372 builtins.c would need updating then. Therefore we use the
7373 default ABI. */
7374 if (TARGET_64BIT && ix86_abi == MS_ABI)
7375 return false;
7376 return TARGET_FLOAT_RETURNS_IN_80387;
7378 case FIRST_SSE_REG:
7379 return TARGET_SSE;
7381 case FIRST_MMX_REG:
7382 if (TARGET_MACHO || TARGET_64BIT)
7383 return false;
7384 return TARGET_MMX;
7387 return false;
7390 /* Define how to find the value returned by a function.
7391 VALTYPE is the data type of the value (as a tree).
7392 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7393 otherwise, FUNC is 0. */
7395 static rtx
7396 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7397 const_tree fntype, const_tree fn)
7399 unsigned int regno;
7401 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7402 we normally prevent this case when mmx is not available. However
7403 some ABIs may require the result to be returned like DImode. */
7404 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7405 regno = FIRST_MMX_REG;
7407 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7408 we prevent this case when sse is not available. However some ABIs
7409 may require the result to be returned like integer TImode. */
7410 else if (mode == TImode
7411 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7412 regno = FIRST_SSE_REG;
7414 /* 32-byte vector modes in %ymm0. */
7415 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7416 regno = FIRST_SSE_REG;
7418 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7419 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7420 regno = FIRST_FLOAT_REG;
7421 else
7422 /* Most things go in %eax. */
7423 regno = AX_REG;
7425 /* Override FP return register with %xmm0 for local functions when
7426 SSE math is enabled or for functions with sseregparm attribute. */
7427 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7429 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7430 if ((sse_level >= 1 && mode == SFmode)
7431 || (sse_level == 2 && mode == DFmode))
7432 regno = FIRST_SSE_REG;
7435 /* OImode shouldn't be used directly. */
7436 gcc_assert (mode != OImode);
7438 return gen_rtx_REG (orig_mode, regno);
7441 static rtx
7442 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7443 const_tree valtype)
7445 rtx ret;
7447 /* Handle libcalls, which don't provide a type node. */
7448 if (valtype == NULL)
7450 unsigned int regno;
7452 switch (mode)
7454 case SFmode:
7455 case SCmode:
7456 case DFmode:
7457 case DCmode:
7458 case TFmode:
7459 case SDmode:
7460 case DDmode:
7461 case TDmode:
7462 regno = FIRST_SSE_REG;
7463 break;
7464 case XFmode:
7465 case XCmode:
7466 regno = FIRST_FLOAT_REG;
7467 break;
7468 case TCmode:
7469 return NULL;
7470 default:
7471 regno = AX_REG;
7474 return gen_rtx_REG (mode, regno);
7476 else if (POINTER_TYPE_P (valtype))
7478 /* Pointers are always returned in word_mode. */
7479 mode = word_mode;
7482 ret = construct_container (mode, orig_mode, valtype, 1,
7483 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7484 x86_64_int_return_registers, 0);
7486 /* For zero sized structures, construct_container returns NULL, but we
7487 need to keep rest of compiler happy by returning meaningful value. */
7488 if (!ret)
7489 ret = gen_rtx_REG (orig_mode, AX_REG);
7491 return ret;
7494 static rtx
7495 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7496 const_tree valtype)
7498 unsigned int regno = AX_REG;
7500 if (TARGET_SSE)
7502 switch (GET_MODE_SIZE (mode))
7504 case 16:
7505 if (valtype != NULL_TREE
7506 && !VECTOR_INTEGER_TYPE_P (valtype)
7508 && !INTEGRAL_TYPE_P (valtype)
7509 && !VECTOR_FLOAT_TYPE_P (valtype))
7510 break;
7511 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7512 && !COMPLEX_MODE_P (mode))
7513 regno = FIRST_SSE_REG;
7514 break;
7515 case 8:
7516 case 4:
7517 if (mode == SFmode || mode == DFmode)
7518 regno = FIRST_SSE_REG;
7519 break;
7520 default:
7521 break;
7524 return gen_rtx_REG (orig_mode, regno);
7527 static rtx
7528 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7529 enum machine_mode orig_mode, enum machine_mode mode)
7531 const_tree fn, fntype;
7533 fn = NULL_TREE;
7534 if (fntype_or_decl && DECL_P (fntype_or_decl))
7535 fn = fntype_or_decl;
7536 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7538 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7539 return function_value_ms_64 (orig_mode, mode, valtype);
7540 else if (TARGET_64BIT)
7541 return function_value_64 (orig_mode, mode, valtype);
7542 else
7543 return function_value_32 (orig_mode, mode, fntype, fn);
7546 static rtx
7547 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7548 bool outgoing ATTRIBUTE_UNUSED)
7550 enum machine_mode mode, orig_mode;
7552 orig_mode = TYPE_MODE (valtype);
7553 mode = type_natural_mode (valtype, NULL);
7554 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7557 /* Pointer function arguments and return values are promoted to
7558 word_mode. */
7560 static enum machine_mode
7561 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7562 int *punsignedp, const_tree fntype,
7563 int for_return)
7565 if (type != NULL_TREE && POINTER_TYPE_P (type))
7567 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7568 return word_mode;
7570 return default_promote_function_mode (type, mode, punsignedp, fntype,
7571 for_return);
7574 /* Return true if a structure, union or array with MODE containing FIELD
7575 should be accessed using BLKmode. */
7577 static bool
7578 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7580 /* Union with XFmode must be in BLKmode. */
7581 return (mode == XFmode
7582 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7583 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7587 ix86_libcall_value (enum machine_mode mode)
7589 return ix86_function_value_1 (NULL, NULL, mode, mode);
7592 /* Return true iff type is returned in memory. */
7594 static bool ATTRIBUTE_UNUSED
7595 return_in_memory_32 (const_tree type, enum machine_mode mode)
7597 HOST_WIDE_INT size;
7599 if (mode == BLKmode)
7600 return true;
7602 size = int_size_in_bytes (type);
7604 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7605 return false;
7607 if (VECTOR_MODE_P (mode) || mode == TImode)
7609 /* User-created vectors small enough to fit in EAX. */
7610 if (size < 8)
7611 return false;
7613 /* MMX/3dNow values are returned in MM0,
7614 except when it doesn't exist or the ABI prescribes otherwise. */
7615 if (size == 8)
7616 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7618 /* SSE values are returned in XMM0, except when it doesn't exist. */
7619 if (size == 16)
7620 return !TARGET_SSE;
7622 /* AVX values are returned in YMM0, except when it doesn't exist. */
7623 if (size == 32)
7624 return !TARGET_AVX;
7627 if (mode == XFmode)
7628 return false;
7630 if (size > 12)
7631 return true;
7633 /* OImode shouldn't be used directly. */
7634 gcc_assert (mode != OImode);
7636 return false;
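/* Examples of the 32-bit rules above: a BLKmode struct larger than 12
   bytes is returned in memory, an XFmode long double is not, and an
   8-byte vector is returned in memory only when MMX is unavailable or
   TARGET_VECT8_RETURNS says so.  */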
7639 static bool ATTRIBUTE_UNUSED
7640 return_in_memory_64 (const_tree type, enum machine_mode mode)
7642 int needed_intregs, needed_sseregs;
7643 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7646 static bool ATTRIBUTE_UNUSED
7647 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7649 HOST_WIDE_INT size = int_size_in_bytes (type);
7651 /* __m128 is returned in xmm0. */
7652 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7653 || VECTOR_FLOAT_TYPE_P (type))
7654 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7655 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7656 return false;
7658 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7659 return size != 1 && size != 2 && size != 4 && size != 8;
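/* Example: under this MS ABI rule a 3-byte or 12-byte aggregate is
   returned in memory, while 1-, 2-, 4- and 8-byte aggregates and
   __m128-sized values come back in registers.  */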
7662 static bool
7663 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7665 #ifdef SUBTARGET_RETURN_IN_MEMORY
7666 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7667 #else
7668 const enum machine_mode mode = type_natural_mode (type, NULL);
7670 if (TARGET_64BIT)
7672 if (ix86_function_type_abi (fntype) == MS_ABI)
7673 return return_in_memory_ms_64 (type, mode);
7674 else
7675 return return_in_memory_64 (type, mode);
7677 else
7678 return return_in_memory_32 (type, mode);
7679 #endif
7682 /* When returning SSE vector types, we have a choice of either
7683 (1) being abi incompatible with a -march switch, or
7684 (2) generating an error.
7685 Given no good solution, I think the safest thing is one warning.
7686 The user won't be able to use -Werror, but....
7688 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7689 called in response to actually generating a caller or callee that
7690 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7691 via aggregate_value_p for general type probing from tree-ssa. */
7693 static rtx
7694 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7696 static bool warnedsse, warnedmmx;
7698 if (!TARGET_64BIT && type)
7700 /* Look at the return type of the function, not the function type. */
7701 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7703 if (!TARGET_SSE && !warnedsse)
7705 if (mode == TImode
7706 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7708 warnedsse = true;
7709 warning (0, "SSE vector return without SSE enabled "
7710 "changes the ABI");
7714 if (!TARGET_MMX && !warnedmmx)
7716 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7718 warnedmmx = true;
7719 warning (0, "MMX vector return without MMX enabled "
7720 "changes the ABI");
7725 return NULL;
7729 /* Create the va_list data type. */
7731 /* Return the calling-convention-specific va_list data type.
7732 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7734 static tree
7735 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7737 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7739 /* For i386 we use a plain pointer to the argument area. */
7740 if (!TARGET_64BIT || abi == MS_ABI)
7741 return build_pointer_type (char_type_node);
7743 record = lang_hooks.types.make_type (RECORD_TYPE);
7744 type_decl = build_decl (BUILTINS_LOCATION,
7745 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7747 f_gpr = build_decl (BUILTINS_LOCATION,
7748 FIELD_DECL, get_identifier ("gp_offset"),
7749 unsigned_type_node);
7750 f_fpr = build_decl (BUILTINS_LOCATION,
7751 FIELD_DECL, get_identifier ("fp_offset"),
7752 unsigned_type_node);
7753 f_ovf = build_decl (BUILTINS_LOCATION,
7754 FIELD_DECL, get_identifier ("overflow_arg_area"),
7755 ptr_type_node);
7756 f_sav = build_decl (BUILTINS_LOCATION,
7757 FIELD_DECL, get_identifier ("reg_save_area"),
7758 ptr_type_node);
7760 va_list_gpr_counter_field = f_gpr;
7761 va_list_fpr_counter_field = f_fpr;
7763 DECL_FIELD_CONTEXT (f_gpr) = record;
7764 DECL_FIELD_CONTEXT (f_fpr) = record;
7765 DECL_FIELD_CONTEXT (f_ovf) = record;
7766 DECL_FIELD_CONTEXT (f_sav) = record;
7768 TYPE_STUB_DECL (record) = type_decl;
7769 TYPE_NAME (record) = type_decl;
7770 TYPE_FIELDS (record) = f_gpr;
7771 DECL_CHAIN (f_gpr) = f_fpr;
7772 DECL_CHAIN (f_fpr) = f_ovf;
7773 DECL_CHAIN (f_ovf) = f_sav;
7775 layout_type (record);
7777 /* The correct type is an array type of one element. */
7778 return build_array_type (record, build_index_type (size_zero_node));
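/* For reference, the record laid out above corresponds roughly to the
   SysV x86-64 va_list:

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
*/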
7781 /* Setup the builtin va_list data type and for 64-bit the additional
7782 calling convention specific va_list data types. */
7784 static tree
7785 ix86_build_builtin_va_list (void)
7787 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7789 /* Initialize abi specific va_list builtin types. */
7790 if (TARGET_64BIT)
7792 tree t;
7793 if (ix86_abi == MS_ABI)
7795 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7796 if (TREE_CODE (t) != RECORD_TYPE)
7797 t = build_variant_type_copy (t);
7798 sysv_va_list_type_node = t;
7800 else
7802 t = ret;
7803 if (TREE_CODE (t) != RECORD_TYPE)
7804 t = build_variant_type_copy (t);
7805 sysv_va_list_type_node = t;
7807 if (ix86_abi != MS_ABI)
7809 t = ix86_build_builtin_va_list_abi (MS_ABI);
7810 if (TREE_CODE (t) != RECORD_TYPE)
7811 t = build_variant_type_copy (t);
7812 ms_va_list_type_node = t;
7814 else
7816 t = ret;
7817 if (TREE_CODE (t) != RECORD_TYPE)
7818 t = build_variant_type_copy (t);
7819 ms_va_list_type_node = t;
7823 return ret;
7826 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7828 static void
7829 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7831 rtx save_area, mem;
7832 alias_set_type set;
7833 int i, max;
7835 /* GPR size of varargs save area. */
7836 if (cfun->va_list_gpr_size)
7837 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7838 else
7839 ix86_varargs_gpr_size = 0;
7841 /* FPR size of varargs save area. We don't need it if we don't pass
7842 anything in SSE registers. */
7843 if (TARGET_SSE && cfun->va_list_fpr_size)
7844 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7845 else
7846 ix86_varargs_fpr_size = 0;
7848 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7849 return;
7851 save_area = frame_pointer_rtx;
7852 set = get_varargs_alias_set ();
7854 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7855 if (max > X86_64_REGPARM_MAX)
7856 max = X86_64_REGPARM_MAX;
7858 for (i = cum->regno; i < max; i++)
7860 mem = gen_rtx_MEM (word_mode,
7861 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7862 MEM_NOTRAP_P (mem) = 1;
7863 set_mem_alias_set (mem, set);
7864 emit_move_insn (mem,
7865 gen_rtx_REG (word_mode,
7866 x86_64_int_parameter_registers[i]));
7869 if (ix86_varargs_fpr_size)
7871 enum machine_mode smode;
7872 rtx label, test;
7874 /* Now emit code to save SSE registers. The AX parameter contains the number
7875 of SSE parameter registers used to call this function, though all we
7876 actually check here is the zero/non-zero status. */
7878 label = gen_label_rtx ();
7879 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7880 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7881 label));
7883 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7884 we used movdqa (i.e. TImode) instead? Perhaps even better would
7885 be if we could determine the real mode of the data, via a hook
7886 into pass_stdarg. Ignore all that for now. */
7887 smode = V4SFmode;
7888 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7889 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7891 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7892 if (max > X86_64_SSE_REGPARM_MAX)
7893 max = X86_64_SSE_REGPARM_MAX;
7895 for (i = cum->sse_regno; i < max; ++i)
7897 mem = plus_constant (Pmode, save_area,
7898 i * 16 + ix86_varargs_gpr_size);
7899 mem = gen_rtx_MEM (smode, mem);
7900 MEM_NOTRAP_P (mem) = 1;
7901 set_mem_alias_set (mem, set);
7902 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7904 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7907 emit_label (label);
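/* Sketch of the save area laid out above (assuming X86_64_REGPARM_MAX == 6
   and X86_64_SSE_REGPARM_MAX == 8): 48 bytes of GPRs followed by 8 slots
   of 16 bytes for the SSE registers, matching the gp_offset/fp_offset
   values set up in ix86_va_start below.  */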
7911 static void
7912 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7914 alias_set_type set = get_varargs_alias_set ();
7915 int i;
7917 /* Reset to zero, as there might be a SysV va_arg used
7918 before. */
7919 ix86_varargs_gpr_size = 0;
7920 ix86_varargs_fpr_size = 0;
7922 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7924 rtx reg, mem;
7926 mem = gen_rtx_MEM (Pmode,
7927 plus_constant (Pmode, virtual_incoming_args_rtx,
7928 i * UNITS_PER_WORD));
7929 MEM_NOTRAP_P (mem) = 1;
7930 set_mem_alias_set (mem, set);
7932 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7933 emit_move_insn (mem, reg);
7937 static void
7938 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7939 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7940 int no_rtl)
7942 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7943 CUMULATIVE_ARGS next_cum;
7944 tree fntype;
7946 /* This argument doesn't appear to be used anymore. Which is good,
7947 because the old code here didn't suppress rtl generation. */
7948 gcc_assert (!no_rtl);
7950 if (!TARGET_64BIT)
7951 return;
7953 fntype = TREE_TYPE (current_function_decl);
7955 /* For varargs, we do not want to skip the dummy va_dcl argument.
7956 For stdargs, we do want to skip the last named argument. */
7957 next_cum = *cum;
7958 if (stdarg_p (fntype))
7959 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7960 true);
7962 if (cum->call_abi == MS_ABI)
7963 setup_incoming_varargs_ms_64 (&next_cum);
7964 else
7965 setup_incoming_varargs_64 (&next_cum);
7968 /* Check whether TYPE is a va_list of the plain char * kind. */
7970 static bool
7971 is_va_list_char_pointer (tree type)
7973 tree canonic;
7975 /* For 32-bit it is always true. */
7976 if (!TARGET_64BIT)
7977 return true;
7978 canonic = ix86_canonical_va_list_type (type);
7979 return (canonic == ms_va_list_type_node
7980 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7983 /* Implement va_start. */
7985 static void
7986 ix86_va_start (tree valist, rtx nextarg)
7988 HOST_WIDE_INT words, n_gpr, n_fpr;
7989 tree f_gpr, f_fpr, f_ovf, f_sav;
7990 tree gpr, fpr, ovf, sav, t;
7991 tree type;
7992 rtx ovf_rtx;
7994 if (flag_split_stack
7995 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7997 unsigned int scratch_regno;
7999 /* When we are splitting the stack, we can't refer to the stack
8000 arguments using internal_arg_pointer, because they may be on
8001 the old stack. The split stack prologue will arrange to
8002 leave a pointer to the old stack arguments in a scratch
8003 register, which we here copy to a pseudo-register. The split
8004 stack prologue can't set the pseudo-register directly because
8005 it (the prologue) runs before any registers have been saved. */
8007 scratch_regno = split_stack_prologue_scratch_regno ();
8008 if (scratch_regno != INVALID_REGNUM)
8010 rtx reg, seq;
8012 reg = gen_reg_rtx (Pmode);
8013 cfun->machine->split_stack_varargs_pointer = reg;
8015 start_sequence ();
8016 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8017 seq = get_insns ();
8018 end_sequence ();
8020 push_topmost_sequence ();
8021 emit_insn_after (seq, entry_of_function ());
8022 pop_topmost_sequence ();
8026 /* Only 64bit target needs something special. */
8027 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8029 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8030 std_expand_builtin_va_start (valist, nextarg);
8031 else
8033 rtx va_r, next;
8035 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8036 next = expand_binop (ptr_mode, add_optab,
8037 cfun->machine->split_stack_varargs_pointer,
8038 crtl->args.arg_offset_rtx,
8039 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8040 convert_move (va_r, next, 0);
8042 return;
8045 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8046 f_fpr = DECL_CHAIN (f_gpr);
8047 f_ovf = DECL_CHAIN (f_fpr);
8048 f_sav = DECL_CHAIN (f_ovf);
8050 valist = build_simple_mem_ref (valist);
8051 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8052 /* The following should be folded into the MEM_REF offset. */
8053 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8054 f_gpr, NULL_TREE);
8055 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8056 f_fpr, NULL_TREE);
8057 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8058 f_ovf, NULL_TREE);
8059 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8060 f_sav, NULL_TREE);
8062 /* Count number of gp and fp argument registers used. */
8063 words = crtl->args.info.words;
8064 n_gpr = crtl->args.info.regno;
8065 n_fpr = crtl->args.info.sse_regno;
8067 if (cfun->va_list_gpr_size)
8069 type = TREE_TYPE (gpr);
8070 t = build2 (MODIFY_EXPR, type,
8071 gpr, build_int_cst (type, n_gpr * 8));
8072 TREE_SIDE_EFFECTS (t) = 1;
8073 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8076 if (TARGET_SSE && cfun->va_list_fpr_size)
8078 type = TREE_TYPE (fpr);
8079 t = build2 (MODIFY_EXPR, type, fpr,
8080 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8081 TREE_SIDE_EFFECTS (t) = 1;
8082 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8085 /* Find the overflow area. */
8086 type = TREE_TYPE (ovf);
8087 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8088 ovf_rtx = crtl->args.internal_arg_pointer;
8089 else
8090 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8091 t = make_tree (type, ovf_rtx);
8092 if (words != 0)
8093 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8094 t = build2 (MODIFY_EXPR, type, ovf, t);
8095 TREE_SIDE_EFFECTS (t) = 1;
8096 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8098 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8100 /* Find the register save area.
8101 The function prologue saves it right above the stack frame. */
8102 type = TREE_TYPE (sav);
8103 t = make_tree (type, frame_pointer_rtx);
8104 if (!ix86_varargs_gpr_size)
8105 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8106 t = build2 (MODIFY_EXPR, type, sav, t);
8107 TREE_SIDE_EFFECTS (t) = 1;
8108 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
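/* Illustrative va_start values: for void f (int a, double b, ...) one GPR
   and one SSE register are consumed by named arguments, so the code above
   sets gp_offset to 1 * 8 == 8 and fp_offset to 1 * 16 + 8 * 6 == 64
   (assuming X86_64_REGPARM_MAX == 6).  */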
8112 /* Implement va_arg. */
8114 static tree
8115 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8116 gimple_seq *post_p)
8118 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8119 tree f_gpr, f_fpr, f_ovf, f_sav;
8120 tree gpr, fpr, ovf, sav, t;
8121 int size, rsize;
8122 tree lab_false, lab_over = NULL_TREE;
8123 tree addr, t2;
8124 rtx container;
8125 int indirect_p = 0;
8126 tree ptrtype;
8127 enum machine_mode nat_mode;
8128 unsigned int arg_boundary;
8130 /* Only 64bit target needs something special. */
8131 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8132 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8134 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8135 f_fpr = DECL_CHAIN (f_gpr);
8136 f_ovf = DECL_CHAIN (f_fpr);
8137 f_sav = DECL_CHAIN (f_ovf);
8139 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8140 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8141 valist = build_va_arg_indirect_ref (valist);
8142 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8143 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8144 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8146 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8147 if (indirect_p)
8148 type = build_pointer_type (type);
8149 size = int_size_in_bytes (type);
8150 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8152 nat_mode = type_natural_mode (type, NULL);
8153 switch (nat_mode)
8155 case V8SFmode:
8156 case V8SImode:
8157 case V32QImode:
8158 case V16HImode:
8159 case V4DFmode:
8160 case V4DImode:
8161 /* Unnamed 256bit vector mode parameters are passed on stack. */
8162 if (!TARGET_64BIT_MS_ABI)
8164 container = NULL;
8165 break;
8168 default:
8169 container = construct_container (nat_mode, TYPE_MODE (type),
8170 type, 0, X86_64_REGPARM_MAX,
8171 X86_64_SSE_REGPARM_MAX, intreg,
8173 break;
8176 /* Pull the value out of the saved registers. */
8178 addr = create_tmp_var (ptr_type_node, "addr");
8180 if (container)
8182 int needed_intregs, needed_sseregs;
8183 bool need_temp;
8184 tree int_addr, sse_addr;
8186 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8187 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8189 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8191 need_temp = (!REG_P (container)
8192 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8193 || TYPE_ALIGN (type) > 128));
8195 /* When passing a structure, verify that it forms a consecutive block
8196 in the register save area. If not, we need to do moves. */
8197 if (!need_temp && !REG_P (container))
8199 /* Verify that all registers are strictly consecutive. */
8200 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8202 int i;
8204 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8206 rtx slot = XVECEXP (container, 0, i);
8207 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8208 || INTVAL (XEXP (slot, 1)) != i * 16)
8209 need_temp = 1;
8212 else
8214 int i;
8216 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8218 rtx slot = XVECEXP (container, 0, i);
8219 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8220 || INTVAL (XEXP (slot, 1)) != i * 8)
8221 need_temp = 1;
8225 if (!need_temp)
8227 int_addr = addr;
8228 sse_addr = addr;
8230 else
8232 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8233 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8236 /* First ensure that we fit completely in registers. */
8237 if (needed_intregs)
8239 t = build_int_cst (TREE_TYPE (gpr),
8240 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8241 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8242 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8243 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8244 gimplify_and_add (t, pre_p);
8246 if (needed_sseregs)
8248 t = build_int_cst (TREE_TYPE (fpr),
8249 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8250 + X86_64_REGPARM_MAX * 8);
8251 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8252 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8253 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8254 gimplify_and_add (t, pre_p);
8257 /* Compute index to start of area used for integer regs. */
8258 if (needed_intregs)
8260 /* int_addr = gpr + sav; */
8261 t = fold_build_pointer_plus (sav, gpr);
8262 gimplify_assign (int_addr, t, pre_p);
8264 if (needed_sseregs)
8266 /* sse_addr = fpr + sav; */
8267 t = fold_build_pointer_plus (sav, fpr);
8268 gimplify_assign (sse_addr, t, pre_p);
8270 if (need_temp)
8272 int i, prev_size = 0;
8273 tree temp = create_tmp_var (type, "va_arg_tmp");
8275 /* addr = &temp; */
8276 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8277 gimplify_assign (addr, t, pre_p);
8279 for (i = 0; i < XVECLEN (container, 0); i++)
8281 rtx slot = XVECEXP (container, 0, i);
8282 rtx reg = XEXP (slot, 0);
8283 enum machine_mode mode = GET_MODE (reg);
8284 tree piece_type;
8285 tree addr_type;
8286 tree daddr_type;
8287 tree src_addr, src;
8288 int src_offset;
8289 tree dest_addr, dest;
8290 int cur_size = GET_MODE_SIZE (mode);
8292 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8293 prev_size = INTVAL (XEXP (slot, 1));
8294 if (prev_size + cur_size > size)
8296 cur_size = size - prev_size;
8297 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8298 if (mode == BLKmode)
8299 mode = QImode;
8301 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8302 if (mode == GET_MODE (reg))
8303 addr_type = build_pointer_type (piece_type);
8304 else
8305 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8306 true);
8307 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8308 true);
8310 if (SSE_REGNO_P (REGNO (reg)))
8312 src_addr = sse_addr;
8313 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8315 else
8317 src_addr = int_addr;
8318 src_offset = REGNO (reg) * 8;
8320 src_addr = fold_convert (addr_type, src_addr);
8321 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8323 dest_addr = fold_convert (daddr_type, addr);
8324 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8325 if (cur_size == GET_MODE_SIZE (mode))
8327 src = build_va_arg_indirect_ref (src_addr);
8328 dest = build_va_arg_indirect_ref (dest_addr);
8330 gimplify_assign (dest, src, pre_p);
8332 else
8334 tree copy
8335 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8336 3, dest_addr, src_addr,
8337 size_int (cur_size));
8338 gimplify_and_add (copy, pre_p);
8340 prev_size += cur_size;
8344 if (needed_intregs)
8346 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8347 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8348 gimplify_assign (gpr, t, pre_p);
8351 if (needed_sseregs)
8353 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8354 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8355 gimplify_assign (fpr, t, pre_p);
8358 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8360 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8363 /* ... otherwise out of the overflow area. */
8365 /* When the caller aligns a parameter on the stack, any alignment
8366 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8367 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with the
8368 caller here. */
8369 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8370 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8371 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8373 /* Care for on-stack alignment if needed. */
8374 if (arg_boundary <= 64 || size == 0)
8375 t = ovf;
8376 else
8378 HOST_WIDE_INT align = arg_boundary / 8;
8379 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8380 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8381 build_int_cst (TREE_TYPE (t), -align));
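/* Worked example for the rounding above: with arg_boundary == 128,
   align == 16 and the two statements compute t = (ovf + 15) & -16.  */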
8384 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8385 gimplify_assign (addr, t, pre_p);
8387 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8388 gimplify_assign (unshare_expr (ovf), t, pre_p);
8390 if (container)
8391 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8393 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8394 addr = fold_convert (ptrtype, addr);
8396 if (indirect_p)
8397 addr = build_va_arg_indirect_ref (addr);
8398 return build_va_arg_indirect_ref (addr);
8401 /* Return true if OPNUM's MEM should be matched
8402 in movabs* patterns. */
8404 bool
8405 ix86_check_movabs (rtx insn, int opnum)
8407 rtx set, mem;
8409 set = PATTERN (insn);
8410 if (GET_CODE (set) == PARALLEL)
8411 set = XVECEXP (set, 0, 0);
8412 gcc_assert (GET_CODE (set) == SET);
8413 mem = XEXP (set, opnum);
8414 while (GET_CODE (mem) == SUBREG)
8415 mem = SUBREG_REG (mem);
8416 gcc_assert (MEM_P (mem));
8417 return volatile_ok || !MEM_VOLATILE_P (mem);
8420 /* Initialize the table of extra 80387 mathematical constants. */
8422 static void
8423 init_ext_80387_constants (void)
8425 static const char * cst[5] =
8427 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8428 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8429 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8430 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8431 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8433 int i;
8435 for (i = 0; i < 5; i++)
8437 real_from_string (&ext_80387_constants_table[i], cst[i]);
8438 /* Ensure each constant is rounded to XFmode precision. */
8439 real_convert (&ext_80387_constants_table[i],
8440 XFmode, &ext_80387_constants_table[i]);
8443 ext_80387_constants_init = 1;
8446 /* Return non-zero if the constant is something that
8447 can be loaded with a special instruction. */
8450 standard_80387_constant_p (rtx x)
8452 enum machine_mode mode = GET_MODE (x);
8454 REAL_VALUE_TYPE r;
8456 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8457 return -1;
8459 if (x == CONST0_RTX (mode))
8460 return 1;
8461 if (x == CONST1_RTX (mode))
8462 return 2;
8464 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8466 /* For XFmode constants, try to find a special 80387 instruction when
8467 optimizing for size or on those CPUs that benefit from them. */
8468 if (mode == XFmode
8469 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8471 int i;
8473 if (! ext_80387_constants_init)
8474 init_ext_80387_constants ();
8476 for (i = 0; i < 5; i++)
8477 if (real_identical (&r, &ext_80387_constants_table[i]))
8478 return i + 3;
8481 /* A load of the constant -0.0 or -1.0 will be split into an
8482 fldz;fchs or fld1;fchs sequence. */
8483 if (real_isnegzero (&r))
8484 return 8;
8485 if (real_identical (&r, &dconstm1))
8486 return 9;
8488 return 0;
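/* Summary of the return values above: 1 = fldz, 2 = fld1, 3..7 = the
   ext_80387_constants_table entries (fldlg2, fldln2, fldl2e, fldl2t,
   fldpi), 8 = -0.0, 9 = -1.0 (both split into fldz/fld1 followed by
   fchs), and 0 means no special instruction applies.  */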
8491 /* Return the opcode of the special instruction to be used to load
8492 the constant X. */
8494 const char *
8495 standard_80387_constant_opcode (rtx x)
8497 switch (standard_80387_constant_p (x))
8499 case 1:
8500 return "fldz";
8501 case 2:
8502 return "fld1";
8503 case 3:
8504 return "fldlg2";
8505 case 4:
8506 return "fldln2";
8507 case 5:
8508 return "fldl2e";
8509 case 6:
8510 return "fldl2t";
8511 case 7:
8512 return "fldpi";
8513 case 8:
8514 case 9:
8515 return "#";
8516 default:
8517 gcc_unreachable ();
8521 /* Return the CONST_DOUBLE representing the 80387 constant that is
8522 loaded by the specified special instruction. The argument IDX
8523 matches the return value from standard_80387_constant_p. */
8526 standard_80387_constant_rtx (int idx)
8528 int i;
8530 if (! ext_80387_constants_init)
8531 init_ext_80387_constants ();
8533 switch (idx)
8535 case 3:
8536 case 4:
8537 case 5:
8538 case 6:
8539 case 7:
8540 i = idx - 3;
8541 break;
8543 default:
8544 gcc_unreachable ();
8547 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8548 XFmode);
8551 /* Return 1 if X is all 0s and 2 if X is all 1s
8552 in a supported SSE/AVX vector mode. */
8555 standard_sse_constant_p (rtx x)
8557 enum machine_mode mode = GET_MODE (x);
8559 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8560 return 1;
8561 if (vector_all_ones_operand (x, mode))
8562 switch (mode)
8564 case V16QImode:
8565 case V8HImode:
8566 case V4SImode:
8567 case V2DImode:
8568 if (TARGET_SSE2)
8569 return 2;
8570 case V32QImode:
8571 case V16HImode:
8572 case V8SImode:
8573 case V4DImode:
8574 if (TARGET_AVX2)
8575 return 2;
8576 default:
8577 break;
8580 return 0;
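/* Illustrative uses: CONST0_RTX (V4SFmode) yields 1 (loadable with a
   register xor), an all-ones V4SImode vector yields 2 when SSE2 is
   enabled (loadable with pcmpeqd), and anything else yields 0.  */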
8583 /* Return the opcode of the special instruction to be used to load
8584 the constant X. */
8586 const char *
8587 standard_sse_constant_opcode (rtx insn, rtx x)
8589 switch (standard_sse_constant_p (x))
8591 case 1:
8592 switch (get_attr_mode (insn))
8594 case MODE_TI:
8595 return "%vpxor\t%0, %d0";
8596 case MODE_V2DF:
8597 return "%vxorpd\t%0, %d0";
8598 case MODE_V4SF:
8599 return "%vxorps\t%0, %d0";
8601 case MODE_OI:
8602 return "vpxor\t%x0, %x0, %x0";
8603 case MODE_V4DF:
8604 return "vxorpd\t%x0, %x0, %x0";
8605 case MODE_V8SF:
8606 return "vxorps\t%x0, %x0, %x0";
8608 default:
8609 break;
8612 case 2:
8613 if (get_attr_mode (insn) == MODE_XI
8614 || get_attr_mode (insn) == MODE_V8DF
8615 || get_attr_mode (insn) == MODE_V16SF)
8616 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8617 if (TARGET_AVX)
8618 return "vpcmpeqd\t%0, %0, %0";
8619 else
8620 return "pcmpeqd\t%0, %0";
8622 default:
8623 break;
8625 gcc_unreachable ();
8628 /* Return true if OP contains a symbol reference. */
8630 bool
8631 symbolic_reference_mentioned_p (rtx op)
8633 const char *fmt;
8634 int i;
8636 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8637 return true;
8639 fmt = GET_RTX_FORMAT (GET_CODE (op));
8640 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8642 if (fmt[i] == 'E')
8644 int j;
8646 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8647 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8648 return true;
8651 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8652 return true;
8655 return false;
8658 /* Return true if it is appropriate to emit `ret' instructions in the
8659 body of a function. Do this only if the epilogue is simple, needing a
8660 couple of insns. Prior to reloading, we can't tell how many registers
8661 must be saved, so return false then. Return false if there is no frame
8662 marker to de-allocate. */
8664 bool
8665 ix86_can_use_return_insn_p (void)
8667 struct ix86_frame frame;
8669 if (! reload_completed || frame_pointer_needed)
8670 return 0;
8672 /* Don't allow more than 32k pop, since that's all we can do
8673 with one instruction. */
8674 if (crtl->args.pops_args && crtl->args.size >= 32768)
8675 return 0;
8677 ix86_compute_frame_layout (&frame);
8678 return (frame.stack_pointer_offset == UNITS_PER_WORD
8679 && (frame.nregs + frame.nsseregs) == 0);
8682 /* Value should be nonzero if functions must have frame pointers.
8683 Zero means the frame pointer need not be set up (and parms may
8684 be accessed via the stack pointer) in functions that seem suitable. */
8686 static bool
8687 ix86_frame_pointer_required (void)
8689 /* If we accessed previous frames, then the generated code expects
8690 to be able to access the saved ebp value in our frame. */
8691 if (cfun->machine->accesses_prev_frame)
8692 return true;
8694 /* Several x86 OSes need a frame pointer for other reasons,
8695 usually pertaining to setjmp. */
8696 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8697 return true;
8699 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8700 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8701 return true;
8703 /* With Win64 SEH, very large frames need a frame pointer, as the
8704 maximum stack allocation is 4GB. */
8705 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8706 return true;
8708 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8709 turns off the frame pointer by default. Turn it back on now if
8710 we've not got a leaf function. */
8711 if (TARGET_OMIT_LEAF_FRAME_POINTER
8712 && (!crtl->is_leaf
8713 || ix86_current_function_calls_tls_descriptor))
8714 return true;
8716 if (crtl->profile && !flag_fentry)
8717 return true;
8719 return false;
8722 /* Record that the current function accesses previous call frames. */
8724 void
8725 ix86_setup_frame_addresses (void)
8727 cfun->machine->accesses_prev_frame = 1;
8730 #ifndef USE_HIDDEN_LINKONCE
8731 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8732 # define USE_HIDDEN_LINKONCE 1
8733 # else
8734 # define USE_HIDDEN_LINKONCE 0
8735 # endif
8736 #endif
8738 static int pic_labels_used;
8740 /* Fills in the label name that should be used for a pc thunk for
8741 the given register. */
8743 static void
8744 get_pc_thunk_name (char name[32], unsigned int regno)
8746 gcc_assert (!TARGET_64BIT);
8748 if (USE_HIDDEN_LINKONCE)
8749 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8750 else
8751 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8755 /* This function generates code for -fpic that loads %ebx with
8756 the return address of the caller and then returns. */
8758 static void
8759 ix86_code_end (void)
8761 rtx xops[2];
8762 int regno;
8764 for (regno = AX_REG; regno <= SP_REG; regno++)
8766 char name[32];
8767 tree decl;
8769 if (!(pic_labels_used & (1 << regno)))
8770 continue;
8772 get_pc_thunk_name (name, regno);
8774 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8775 get_identifier (name),
8776 build_function_type_list (void_type_node, NULL_TREE));
8777 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8778 NULL_TREE, void_type_node);
8779 TREE_PUBLIC (decl) = 1;
8780 TREE_STATIC (decl) = 1;
8781 DECL_IGNORED_P (decl) = 1;
8783 #if TARGET_MACHO
8784 if (TARGET_MACHO)
8786 switch_to_section (darwin_sections[text_coal_section]);
8787 fputs ("\t.weak_definition\t", asm_out_file);
8788 assemble_name (asm_out_file, name);
8789 fputs ("\n\t.private_extern\t", asm_out_file);
8790 assemble_name (asm_out_file, name);
8791 putc ('\n', asm_out_file);
8792 ASM_OUTPUT_LABEL (asm_out_file, name);
8793 DECL_WEAK (decl) = 1;
8795 else
8796 #endif
8797 if (USE_HIDDEN_LINKONCE)
8799 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8801 targetm.asm_out.unique_section (decl, 0);
8802 switch_to_section (get_named_section (decl, NULL, 0));
8804 targetm.asm_out.globalize_label (asm_out_file, name);
8805 fputs ("\t.hidden\t", asm_out_file);
8806 assemble_name (asm_out_file, name);
8807 putc ('\n', asm_out_file);
8808 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8810 else
8812 switch_to_section (text_section);
8813 ASM_OUTPUT_LABEL (asm_out_file, name);
8816 DECL_INITIAL (decl) = make_node (BLOCK);
8817 current_function_decl = decl;
8818 init_function_start (decl);
8819 first_function_block_is_cold = false;
8820 /* Make sure unwind info is emitted for the thunk if needed. */
8821 final_start_function (emit_barrier (), asm_out_file, 1);
8823 /* Pad stack IP move with 4 instructions (two NOPs count
8824 as one instruction). */
8825 if (TARGET_PAD_SHORT_FUNCTION)
8827 int i = 8;
8829 while (i--)
8830 fputs ("\tnop\n", asm_out_file);
8833 xops[0] = gen_rtx_REG (Pmode, regno);
8834 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8835 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8836 fputs ("\tret\n", asm_out_file);
8837 final_end_function ();
8838 init_insn_lengths ();
8839 free_after_compilation (cfun);
8840 set_cfun (NULL);
8841 current_function_decl = NULL;
8844 if (flag_split_stack)
8845 file_end_indicate_split_stack ();
8848 /* Emit code for the SET_GOT patterns. */
8850 const char *
8851 output_set_got (rtx dest, rtx label)
8853 rtx xops[3];
8855 xops[0] = dest;
8857 if (TARGET_VXWORKS_RTP && flag_pic)
8859 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8860 xops[2] = gen_rtx_MEM (Pmode,
8861 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8862 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8864 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8865 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8866 an unadorned address. */
8867 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8868 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8869 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8870 return "";
8873 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8875 if (!flag_pic)
8877 if (TARGET_MACHO)
8878 /* We don't need a pic base, we're not producing pic. */
8879 gcc_unreachable ();
8881 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8882 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8883 targetm.asm_out.internal_label (asm_out_file, "L",
8884 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8886 else
8888 char name[32];
8889 get_pc_thunk_name (name, REGNO (dest));
8890 pic_labels_used |= 1 << REGNO (dest);
8892 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8893 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8894 output_asm_insn ("call\t%X2", xops);
8896 #if TARGET_MACHO
8897 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8898 This is what will be referenced by the Mach-O PIC subsystem. */
8899 if (machopic_should_output_picbase_label () || !label)
8900 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8902 /* When we are restoring the pic base at the site of a nonlocal label,
8903 and we decided to emit the pic base above, we will still output a
8904 local label used for calculating the correction offset (even though
8905 the offset will be 0 in that case). */
8906 if (label)
8907 targetm.asm_out.internal_label (asm_out_file, "L",
8908 CODE_LABEL_NUMBER (label));
8909 #endif
8912 if (!TARGET_MACHO)
8913 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8915 return "";
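/* Illustrative sketch, not part of the original source: on a typical
   32-bit ELF target the PIC path above emits roughly

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. a call to the hidden pc-thunk for the chosen register followed by
   the add of the GOT symbol; the exact thunk name and register depend on
   the configuration.  */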
 8918 /* Generate a "push" pattern for input ARG. */
8920 static rtx
8921 gen_push (rtx arg)
8923 struct machine_function *m = cfun->machine;
8925 if (m->fs.cfa_reg == stack_pointer_rtx)
8926 m->fs.cfa_offset += UNITS_PER_WORD;
8927 m->fs.sp_offset += UNITS_PER_WORD;
8929 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8930 arg = gen_rtx_REG (word_mode, REGNO (arg));
8932 return gen_rtx_SET (VOIDmode,
8933 gen_rtx_MEM (word_mode,
8934 gen_rtx_PRE_DEC (Pmode,
8935 stack_pointer_rtx)),
8936 arg);
 8939 /* Generate a "pop" pattern for input ARG. */
8941 static rtx
8942 gen_pop (rtx arg)
8944 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8945 arg = gen_rtx_REG (word_mode, REGNO (arg));
8947 return gen_rtx_SET (VOIDmode,
8948 arg,
8949 gen_rtx_MEM (word_mode,
8950 gen_rtx_POST_INC (Pmode,
8951 stack_pointer_rtx)));
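/* Illustrative sketch, not part of the original source: on a 64-bit
   target, gen_push (di) builds RTL along the lines of

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di))

   and gen_pop (di) the mirror image

     (set (reg:DI di) (mem:DI (post_inc:DI (reg:DI sp))))

   which the backend patterns match as ordinary push/pop instructions.  */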
8954 /* Return >= 0 if there is an unused call-clobbered register available
8955 for the entire function. */
8957 static unsigned int
8958 ix86_select_alt_pic_regnum (void)
8960 if (crtl->is_leaf
8961 && !crtl->profile
8962 && !ix86_current_function_calls_tls_descriptor)
8964 int i, drap;
8965 /* Can't use the same register for both PIC and DRAP. */
8966 if (crtl->drap_reg)
8967 drap = REGNO (crtl->drap_reg);
8968 else
8969 drap = -1;
8970 for (i = 2; i >= 0; --i)
8971 if (i != drap && !df_regs_ever_live_p (i))
8972 return i;
8975 return INVALID_REGNUM;
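/* Illustration, not part of the original source: assuming the usual i386
   hard register numbering (0 = %eax, 1 = %edx, 2 = %ecx), the loop above
   tries %ecx, then %edx, then %eax as an alternate PIC register in leaf
   functions that neither profile nor call a TLS descriptor.  */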
8978 /* Return TRUE if we need to save REGNO. */
8980 static bool
8981 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8983 if (pic_offset_table_rtx
8984 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8985 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8986 || crtl->profile
8987 || crtl->calls_eh_return
8988 || crtl->uses_const_pool
8989 || cfun->has_nonlocal_label))
8990 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8992 if (crtl->calls_eh_return && maybe_eh_return)
8994 unsigned i;
8995 for (i = 0; ; i++)
8997 unsigned test = EH_RETURN_DATA_REGNO (i);
8998 if (test == INVALID_REGNUM)
8999 break;
9000 if (test == regno)
9001 return true;
9005 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9006 return true;
9008 return (df_regs_ever_live_p (regno)
9009 && !call_used_regs[regno]
9010 && !fixed_regs[regno]
9011 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
 9014 /* Return number of saved general purpose registers. */
9016 static int
9017 ix86_nsaved_regs (void)
9019 int nregs = 0;
9020 int regno;
9022 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9023 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9024 nregs ++;
9025 return nregs;
 9028 /* Return number of saved SSE registers. */
9030 static int
9031 ix86_nsaved_sseregs (void)
9033 int nregs = 0;
9034 int regno;
9036 if (!TARGET_64BIT_MS_ABI)
9037 return 0;
9038 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9039 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9040 nregs ++;
9041 return nregs;
9044 /* Given FROM and TO register numbers, say whether this elimination is
9045 allowed. If stack alignment is needed, we can only replace argument
9046 pointer with hard frame pointer, or replace frame pointer with stack
9047 pointer. Otherwise, frame pointer elimination is automatically
9048 handled and all other eliminations are valid. */
9050 static bool
9051 ix86_can_eliminate (const int from, const int to)
9053 if (stack_realign_fp)
9054 return ((from == ARG_POINTER_REGNUM
9055 && to == HARD_FRAME_POINTER_REGNUM)
9056 || (from == FRAME_POINTER_REGNUM
9057 && to == STACK_POINTER_REGNUM));
9058 else
9059 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9062 /* Return the offset between two registers, one to be eliminated, and the other
9063 its replacement, at the start of a routine. */
9065 HOST_WIDE_INT
9066 ix86_initial_elimination_offset (int from, int to)
9068 struct ix86_frame frame;
9069 ix86_compute_frame_layout (&frame);
9071 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9072 return frame.hard_frame_pointer_offset;
9073 else if (from == FRAME_POINTER_REGNUM
9074 && to == HARD_FRAME_POINTER_REGNUM)
9075 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9076 else
9078 gcc_assert (to == STACK_POINTER_REGNUM);
9080 if (from == ARG_POINTER_REGNUM)
9081 return frame.stack_pointer_offset;
9083 gcc_assert (from == FRAME_POINTER_REGNUM);
9084 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9088 /* In a dynamically-aligned function, we can't know the offset from
9089 stack pointer to frame pointer, so we must ensure that setjmp
9090 eliminates fp against the hard fp (%ebp) rather than trying to
9091 index from %esp up to the top of the frame across a gap that is
9092 of unknown (at compile-time) size. */
9093 static rtx
9094 ix86_builtin_setjmp_frame_value (void)
9096 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9099 /* When using -fsplit-stack, the allocation routines set a field in
9100 the TCB to the bottom of the stack plus this much space, measured
9101 in bytes. */
9103 #define SPLIT_STACK_AVAILABLE 256
9105 /* Fill structure ix86_frame about frame of currently computed function. */
9107 static void
9108 ix86_compute_frame_layout (struct ix86_frame *frame)
9110 unsigned HOST_WIDE_INT stack_alignment_needed;
9111 HOST_WIDE_INT offset;
9112 unsigned HOST_WIDE_INT preferred_alignment;
9113 HOST_WIDE_INT size = get_frame_size ();
9114 HOST_WIDE_INT to_allocate;
9116 frame->nregs = ix86_nsaved_regs ();
9117 frame->nsseregs = ix86_nsaved_sseregs ();
9119 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9120 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
 9122 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
 9123 except for function prologues and leaf functions. */
9124 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9125 && (!crtl->is_leaf || cfun->calls_alloca != 0
9126 || ix86_current_function_calls_tls_descriptor))
9128 preferred_alignment = 16;
9129 stack_alignment_needed = 16;
9130 crtl->preferred_stack_boundary = 128;
9131 crtl->stack_alignment_needed = 128;
9134 gcc_assert (!size || stack_alignment_needed);
9135 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9136 gcc_assert (preferred_alignment <= stack_alignment_needed);
9138 /* For SEH we have to limit the amount of code movement into the prologue.
9139 At present we do this via a BLOCKAGE, at which point there's very little
9140 scheduling that can be done, which means that there's very little point
9141 in doing anything except PUSHs. */
9142 if (TARGET_SEH)
9143 cfun->machine->use_fast_prologue_epilogue = false;
 9145 /* During the reload iteration the number of registers saved can change.
 9146 Recompute the value as needed. Do not recompute when the number of
 9147 registers didn't change, as reload makes multiple calls to the function
 9148 and does not expect the decision to change within a single iteration. */
9149 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9150 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9152 int count = frame->nregs;
9153 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9155 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9157 /* The fast prologue uses move instead of push to save registers. This
9158 is significantly longer, but also executes faster as modern hardware
9159 can execute the moves in parallel, but can't do that for push/pop.
 9161 Be careful about choosing which prologue to emit: when the function
 9162 takes many instructions to execute we may use the slow version, as well
 9163 as when the function is known to be outside a hot spot (this is known
 9164 with feedback only). Weight the size of the function by the number of
 9165 registers to save, as it is cheap to use one or two push instructions
 9166 but very slow to use many of them. */
9167 if (count)
9168 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9169 if (node->frequency < NODE_FREQUENCY_NORMAL
9170 || (flag_branch_probabilities
9171 && node->frequency < NODE_FREQUENCY_HOT))
9172 cfun->machine->use_fast_prologue_epilogue = false;
9173 else
9174 cfun->machine->use_fast_prologue_epilogue
9175 = !expensive_function_p (count);
9178 frame->save_regs_using_mov
9179 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9180 /* If static stack checking is enabled and done with probes,
9181 the registers need to be saved before allocating the frame. */
9182 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9184 /* Skip return address. */
9185 offset = UNITS_PER_WORD;
9187 /* Skip pushed static chain. */
9188 if (ix86_static_chain_on_stack)
9189 offset += UNITS_PER_WORD;
9191 /* Skip saved base pointer. */
9192 if (frame_pointer_needed)
9193 offset += UNITS_PER_WORD;
9194 frame->hfp_save_offset = offset;
9196 /* The traditional frame pointer location is at the top of the frame. */
9197 frame->hard_frame_pointer_offset = offset;
9199 /* Register save area */
9200 offset += frame->nregs * UNITS_PER_WORD;
9201 frame->reg_save_offset = offset;
9203 /* On SEH target, registers are pushed just before the frame pointer
9204 location. */
9205 if (TARGET_SEH)
9206 frame->hard_frame_pointer_offset = offset;
9208 /* Align and set SSE register save area. */
9209 if (frame->nsseregs)
9211 /* The only ABI that has saved SSE registers (Win64) also has a
9212 16-byte aligned default stack, and thus we don't need to be
9213 within the re-aligned local stack frame to save them. */
9214 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9215 offset = (offset + 16 - 1) & -16;
9216 offset += frame->nsseregs * 16;
9218 frame->sse_reg_save_offset = offset;
9220 /* The re-aligned stack starts here. Values before this point are not
9221 directly comparable with values below this point. In order to make
9222 sure that no value happens to be the same before and after, force
9223 the alignment computation below to add a non-zero value. */
9224 if (stack_realign_fp)
9225 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9227 /* Va-arg area */
9228 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9229 offset += frame->va_arg_size;
9231 /* Align start of frame for local function. */
9232 if (stack_realign_fp
9233 || offset != frame->sse_reg_save_offset
9234 || size != 0
9235 || !crtl->is_leaf
9236 || cfun->calls_alloca
9237 || ix86_current_function_calls_tls_descriptor)
9238 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9240 /* Frame pointer points here. */
9241 frame->frame_pointer_offset = offset;
9243 offset += size;
 9245 /* Add the outgoing arguments area. Can be skipped if we eliminated
 9246 all the function calls as dead code.
 9247 Skipping is, however, impossible when the function calls alloca: the
 9248 alloca expander assumes that the last crtl->outgoing_args_size bytes
 9249 of the stack frame are unused. */
9250 if (ACCUMULATE_OUTGOING_ARGS
9251 && (!crtl->is_leaf || cfun->calls_alloca
9252 || ix86_current_function_calls_tls_descriptor))
9254 offset += crtl->outgoing_args_size;
9255 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9257 else
9258 frame->outgoing_arguments_size = 0;
9260 /* Align stack boundary. Only needed if we're calling another function
9261 or using alloca. */
9262 if (!crtl->is_leaf || cfun->calls_alloca
9263 || ix86_current_function_calls_tls_descriptor)
9264 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9266 /* We've reached end of stack frame. */
9267 frame->stack_pointer_offset = offset;
9269 /* Size prologue needs to allocate. */
9270 to_allocate = offset - frame->sse_reg_save_offset;
9272 if ((!to_allocate && frame->nregs <= 1)
9273 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9274 frame->save_regs_using_mov = false;
9276 if (ix86_using_red_zone ()
9277 && crtl->sp_is_unchanging
9278 && crtl->is_leaf
9279 && !ix86_current_function_calls_tls_descriptor)
9281 frame->red_zone_size = to_allocate;
9282 if (frame->save_regs_using_mov)
9283 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9284 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9285 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9287 else
9288 frame->red_zone_size = 0;
9289 frame->stack_pointer_offset -= frame->red_zone_size;
9291 /* The SEH frame pointer location is near the bottom of the frame.
9292 This is enforced by the fact that the difference between the
9293 stack pointer and the frame pointer is limited to 240 bytes in
9294 the unwind data structure. */
9295 if (TARGET_SEH)
9297 HOST_WIDE_INT diff;
 9299 /* If we can leave the frame pointer where it is, do so; this also
 9300 returns the establisher frame for __builtin_frame_address (0). */
9301 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9302 if (diff <= SEH_MAX_FRAME_SIZE
9303 && (diff > 240 || (diff & 15) != 0)
9304 && !crtl->accesses_prior_frames)
9306 /* Ideally we'd determine what portion of the local stack frame
9307 (within the constraint of the lowest 240) is most heavily used.
9308 But without that complication, simply bias the frame pointer
9309 by 128 bytes so as to maximize the amount of the local stack
9310 frame that is addressable with 8-bit offsets. */
9311 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
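/* Illustrative summary, not part of the original source, of the layout
   computed above, from the CFA downwards:

       return address                  UNITS_PER_WORD
       pushed static chain             UNITS_PER_WORD, if any
       saved frame pointer             UNITS_PER_WORD, if needed
       GP register save area           nregs * UNITS_PER_WORD
       SSE register save area          nsseregs * 16, 16-byte aligned
       va-arg register save area
       local variables                 get_frame_size ()
       outgoing argument area          crtl->outgoing_args_size

   Each alignment step uses the usual round-up idiom; e.g. with a
   hypothetical offset of 44 and a 16-byte requirement,
   (44 + 16 - 1) & -16 == 48.  */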
9316 /* This is semi-inlined memory_address_length, but simplified
9317 since we know that we're always dealing with reg+offset, and
9318 to avoid having to create and discard all that rtl. */
9320 static inline int
9321 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9323 int len = 4;
9325 if (offset == 0)
9327 /* EBP and R13 cannot be encoded without an offset. */
9328 len = (regno == BP_REG || regno == R13_REG);
9330 else if (IN_RANGE (offset, -128, 127))
9331 len = 1;
9333 /* ESP and R12 must be encoded with a SIB byte. */
9334 if (regno == SP_REG || regno == R12_REG)
9335 len++;
9337 return len;
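/* Illustrative values, not part of the original source, following the
   rules above:

     choose_baseaddr_len (AX_REG,   0)  == 0   no displacement byte
     choose_baseaddr_len (BP_REG,   0)  == 1   %ebp always needs a disp8
     choose_baseaddr_len (AX_REG,  -8)  == 1   disp8
     choose_baseaddr_len (AX_REG, 512)  == 4   disp32
     choose_baseaddr_len (SP_REG,  -8)  == 2   disp8 plus the SIB byte  */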
9340 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9341 The valid base registers are taken from CFUN->MACHINE->FS. */
9343 static rtx
9344 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9346 const struct machine_function *m = cfun->machine;
9347 rtx base_reg = NULL;
9348 HOST_WIDE_INT base_offset = 0;
9350 if (m->use_fast_prologue_epilogue)
9352 /* Choose the base register most likely to allow the most scheduling
9353 opportunities. Generally FP is valid throughout the function,
9354 while DRAP must be reloaded within the epilogue. But choose either
9355 over the SP due to increased encoding size. */
9357 if (m->fs.fp_valid)
9359 base_reg = hard_frame_pointer_rtx;
9360 base_offset = m->fs.fp_offset - cfa_offset;
9362 else if (m->fs.drap_valid)
9364 base_reg = crtl->drap_reg;
9365 base_offset = 0 - cfa_offset;
9367 else if (m->fs.sp_valid)
9369 base_reg = stack_pointer_rtx;
9370 base_offset = m->fs.sp_offset - cfa_offset;
9373 else
9375 HOST_WIDE_INT toffset;
9376 int len = 16, tlen;
9378 /* Choose the base register with the smallest address encoding.
9379 With a tie, choose FP > DRAP > SP. */
9380 if (m->fs.sp_valid)
9382 base_reg = stack_pointer_rtx;
9383 base_offset = m->fs.sp_offset - cfa_offset;
9384 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9386 if (m->fs.drap_valid)
9388 toffset = 0 - cfa_offset;
9389 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9390 if (tlen <= len)
9392 base_reg = crtl->drap_reg;
9393 base_offset = toffset;
9394 len = tlen;
9397 if (m->fs.fp_valid)
9399 toffset = m->fs.fp_offset - cfa_offset;
9400 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9401 if (tlen <= len)
9403 base_reg = hard_frame_pointer_rtx;
9404 base_offset = toffset;
9405 len = tlen;
9409 gcc_assert (base_reg != NULL);
9411 return plus_constant (Pmode, base_reg, base_offset);
9414 /* Emit code to save registers in the prologue. */
9416 static void
9417 ix86_emit_save_regs (void)
9419 unsigned int regno;
9420 rtx insn;
9422 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9423 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9425 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9426 RTX_FRAME_RELATED_P (insn) = 1;
9430 /* Emit a single register save at CFA - CFA_OFFSET. */
9432 static void
9433 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9434 HOST_WIDE_INT cfa_offset)
9436 struct machine_function *m = cfun->machine;
9437 rtx reg = gen_rtx_REG (mode, regno);
9438 rtx mem, addr, base, insn;
9440 addr = choose_baseaddr (cfa_offset);
9441 mem = gen_frame_mem (mode, addr);
9443 /* For SSE saves, we need to indicate the 128-bit alignment. */
9444 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9446 insn = emit_move_insn (mem, reg);
9447 RTX_FRAME_RELATED_P (insn) = 1;
9449 base = addr;
9450 if (GET_CODE (base) == PLUS)
9451 base = XEXP (base, 0);
9452 gcc_checking_assert (REG_P (base));
9454 /* When saving registers into a re-aligned local stack frame, avoid
9455 any tricky guessing by dwarf2out. */
9456 if (m->fs.realigned)
9458 gcc_checking_assert (stack_realign_drap);
9460 if (regno == REGNO (crtl->drap_reg))
9462 /* A bit of a hack. We force the DRAP register to be saved in
9463 the re-aligned stack frame, which provides us with a copy
9464 of the CFA that will last past the prologue. Install it. */
9465 gcc_checking_assert (cfun->machine->fs.fp_valid);
9466 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9467 cfun->machine->fs.fp_offset - cfa_offset);
9468 mem = gen_rtx_MEM (mode, addr);
9469 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9471 else
9473 /* The frame pointer is a stable reference within the
9474 aligned frame. Use it. */
9475 gcc_checking_assert (cfun->machine->fs.fp_valid);
9476 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9477 cfun->machine->fs.fp_offset - cfa_offset);
9478 mem = gen_rtx_MEM (mode, addr);
9479 add_reg_note (insn, REG_CFA_EXPRESSION,
9480 gen_rtx_SET (VOIDmode, mem, reg));
9484 /* The memory may not be relative to the current CFA register,
9485 which means that we may need to generate a new pattern for
9486 use by the unwind info. */
9487 else if (base != m->fs.cfa_reg)
9489 addr = plus_constant (Pmode, m->fs.cfa_reg,
9490 m->fs.cfa_offset - cfa_offset);
9491 mem = gen_rtx_MEM (mode, addr);
9492 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9496 /* Emit code to save registers using MOV insns.
9497 First register is stored at CFA - CFA_OFFSET. */
9498 static void
9499 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9501 unsigned int regno;
9503 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9504 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9506 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9507 cfa_offset -= UNITS_PER_WORD;
9511 /* Emit code to save SSE registers using MOV insns.
9512 First register is stored at CFA - CFA_OFFSET. */
9513 static void
9514 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9516 unsigned int regno;
9518 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9519 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9521 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9522 cfa_offset -= 16;
9526 static GTY(()) rtx queued_cfa_restores;
9528 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9529 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9530 Don't add the note if the previously saved value will be left untouched
9531 within stack red-zone till return, as unwinders can find the same value
9532 in the register and on the stack. */
9534 static void
9535 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9537 if (!crtl->shrink_wrapped
9538 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9539 return;
9541 if (insn)
9543 add_reg_note (insn, REG_CFA_RESTORE, reg);
9544 RTX_FRAME_RELATED_P (insn) = 1;
9546 else
9547 queued_cfa_restores
9548 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9551 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9553 static void
9554 ix86_add_queued_cfa_restore_notes (rtx insn)
9556 rtx last;
9557 if (!queued_cfa_restores)
9558 return;
9559 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9561 XEXP (last, 1) = REG_NOTES (insn);
9562 REG_NOTES (insn) = queued_cfa_restores;
9563 queued_cfa_restores = NULL_RTX;
9564 RTX_FRAME_RELATED_P (insn) = 1;
9567 /* Expand prologue or epilogue stack adjustment.
 9568 The pattern exists to put a dependency on all ebp-based memory accesses.
9569 STYLE should be negative if instructions should be marked as frame related,
9570 zero if %r11 register is live and cannot be freely used and positive
9571 otherwise. */
9573 static void
9574 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9575 int style, bool set_cfa)
9577 struct machine_function *m = cfun->machine;
9578 rtx insn;
9579 bool add_frame_related_expr = false;
9581 if (Pmode == SImode)
9582 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9583 else if (x86_64_immediate_operand (offset, DImode))
9584 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9585 else
9587 rtx tmp;
9588 /* r11 is used by indirect sibcall return as well, set before the
9589 epilogue and used after the epilogue. */
9590 if (style)
9591 tmp = gen_rtx_REG (DImode, R11_REG);
9592 else
9594 gcc_assert (src != hard_frame_pointer_rtx
9595 && dest != hard_frame_pointer_rtx);
9596 tmp = hard_frame_pointer_rtx;
9598 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9599 if (style < 0)
9600 add_frame_related_expr = true;
9602 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9605 insn = emit_insn (insn);
9606 if (style >= 0)
9607 ix86_add_queued_cfa_restore_notes (insn);
9609 if (set_cfa)
9611 rtx r;
9613 gcc_assert (m->fs.cfa_reg == src);
9614 m->fs.cfa_offset += INTVAL (offset);
9615 m->fs.cfa_reg = dest;
9617 r = gen_rtx_PLUS (Pmode, src, offset);
9618 r = gen_rtx_SET (VOIDmode, dest, r);
9619 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9620 RTX_FRAME_RELATED_P (insn) = 1;
9622 else if (style < 0)
9624 RTX_FRAME_RELATED_P (insn) = 1;
9625 if (add_frame_related_expr)
9627 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9628 r = gen_rtx_SET (VOIDmode, dest, r);
9629 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9633 if (dest == stack_pointer_rtx)
9635 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9636 bool valid = m->fs.sp_valid;
9638 if (src == hard_frame_pointer_rtx)
9640 valid = m->fs.fp_valid;
9641 ooffset = m->fs.fp_offset;
9643 else if (src == crtl->drap_reg)
9645 valid = m->fs.drap_valid;
9646 ooffset = 0;
9648 else
 9650 /* Else there are two possibilities: SP itself, which we set
 9651 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
 9652 taken care of by hand along the eh_return path. */
9653 gcc_checking_assert (src == stack_pointer_rtx
9654 || offset == const0_rtx);
9657 m->fs.sp_offset = ooffset - INTVAL (offset);
9658 m->fs.sp_valid = valid;
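/* Illustrative sketch, not part of the original source: an adjustment
   that fits a 32-bit signed immediate is emitted directly as an add/sub
   of the stack pointer, while a larger one on x86-64 goes through the
   temporary register chosen above, roughly

	movabsq	$-0x100000000, %r11
	addq	%r11, %rsp

   (the exact instruction selection is up to the adjust_stack patterns).  */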
 9662 /* Find an available register to be used as the dynamic realign
 9663 argument pointer register. Such a register will be written in the
 9664 prologue and used at the beginning of the body, so it must not be
 9665 1. a parameter passing register.
 9666 2. the GOT pointer.
 9667 We reuse the static-chain register if it is available. Otherwise,
 9668 we use DI for i386 and R13 for x86-64; we chose R13 since it has a
 9669 shorter encoding.
 9671 Return: the regno of the chosen register. */
9673 static unsigned int
9674 find_drap_reg (void)
9676 tree decl = cfun->decl;
9678 if (TARGET_64BIT)
 9680 /* Use R13 for a nested function or a function that needs a static
 9681 chain. Since a function with a tail call may use any caller-saved
 9682 registers in the epilogue, DRAP must not use a caller-saved
 9683 register in that case. */
9684 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9685 return R13_REG;
9687 return R10_REG;
9689 else
 9691 /* Use DI for a nested function or a function that needs a static
 9692 chain. Since a function with a tail call may use any caller-saved
 9693 registers in the epilogue, DRAP must not use a caller-saved
 9694 register in that case. */
9695 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9696 return DI_REG;
9698 /* Reuse static chain register if it isn't used for parameter
9699 passing. */
9700 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9702 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9703 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9704 return CX_REG;
9706 return DI_REG;
9710 /* Return minimum incoming stack alignment. */
9712 static unsigned int
9713 ix86_minimum_incoming_stack_boundary (bool sibcall)
9715 unsigned int incoming_stack_boundary;
9717 /* Prefer the one specified at command line. */
9718 if (ix86_user_incoming_stack_boundary)
9719 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
 9720 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
 9721 boundary if -mstackrealign is used, this is not the sibcall check,
 9722 and the estimated stack alignment is 128 bits. */
9723 else if (!sibcall
9724 && !TARGET_64BIT
9725 && ix86_force_align_arg_pointer
9726 && crtl->stack_alignment_estimated == 128)
9727 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9728 else
9729 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9731 /* Incoming stack alignment can be changed on individual functions
9732 via force_align_arg_pointer attribute. We use the smallest
9733 incoming stack boundary. */
9734 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9735 && lookup_attribute (ix86_force_align_arg_pointer_string,
9736 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9737 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9739 /* The incoming stack frame has to be aligned at least at
9740 parm_stack_boundary. */
9741 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9742 incoming_stack_boundary = crtl->parm_stack_boundary;
9744 /* Stack at entrance of main is aligned by runtime. We use the
9745 smallest incoming stack boundary. */
9746 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9747 && DECL_NAME (current_function_decl)
9748 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9749 && DECL_FILE_SCOPE_P (current_function_decl))
9750 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9752 return incoming_stack_boundary;
9755 /* Update incoming stack boundary and estimated stack alignment. */
9757 static void
9758 ix86_update_stack_boundary (void)
9760 ix86_incoming_stack_boundary
9761 = ix86_minimum_incoming_stack_boundary (false);
 9763 /* x86_64 varargs need 16-byte stack alignment for the register save
 9764 area. */
9765 if (TARGET_64BIT
9766 && cfun->stdarg
9767 && crtl->stack_alignment_estimated < 128)
9768 crtl->stack_alignment_estimated = 128;
9771 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9772 needed or an rtx for DRAP otherwise. */
9774 static rtx
9775 ix86_get_drap_rtx (void)
9777 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9778 crtl->need_drap = true;
9780 if (stack_realign_drap)
 9782 /* Assign DRAP to vDRAP and return vDRAP. */
9783 unsigned int regno = find_drap_reg ();
9784 rtx drap_vreg;
9785 rtx arg_ptr;
9786 rtx seq, insn;
9788 arg_ptr = gen_rtx_REG (Pmode, regno);
9789 crtl->drap_reg = arg_ptr;
9791 start_sequence ();
9792 drap_vreg = copy_to_reg (arg_ptr);
9793 seq = get_insns ();
9794 end_sequence ();
9796 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9797 if (!optimize)
9799 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9800 RTX_FRAME_RELATED_P (insn) = 1;
9802 return drap_vreg;
9804 else
9805 return NULL;
9808 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9810 static rtx
9811 ix86_internal_arg_pointer (void)
9813 return virtual_incoming_args_rtx;
9816 struct scratch_reg {
9817 rtx reg;
9818 bool saved;
9821 /* Return a short-lived scratch register for use on function entry.
9822 In 32-bit mode, it is valid only after the registers are saved
9823 in the prologue. This register must be released by means of
9824 release_scratch_register_on_entry once it is dead. */
9826 static void
9827 get_scratch_register_on_entry (struct scratch_reg *sr)
9829 int regno;
9831 sr->saved = false;
9833 if (TARGET_64BIT)
9835 /* We always use R11 in 64-bit mode. */
9836 regno = R11_REG;
9838 else
9840 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9841 bool fastcall_p
9842 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9843 bool thiscall_p
9844 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9845 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9846 int regparm = ix86_function_regparm (fntype, decl);
9847 int drap_regno
9848 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9850 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9851 for the static chain register. */
9852 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9853 && drap_regno != AX_REG)
9854 regno = AX_REG;
9855 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9856 for the static chain register. */
9857 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9858 regno = AX_REG;
9859 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9860 regno = DX_REG;
9861 /* ecx is the static chain register. */
9862 else if (regparm < 3 && !fastcall_p && !thiscall_p
9863 && !static_chain_p
9864 && drap_regno != CX_REG)
9865 regno = CX_REG;
9866 else if (ix86_save_reg (BX_REG, true))
9867 regno = BX_REG;
9868 /* esi is the static chain register. */
9869 else if (!(regparm == 3 && static_chain_p)
9870 && ix86_save_reg (SI_REG, true))
9871 regno = SI_REG;
9872 else if (ix86_save_reg (DI_REG, true))
9873 regno = DI_REG;
9874 else
9876 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9877 sr->saved = true;
9881 sr->reg = gen_rtx_REG (Pmode, regno);
9882 if (sr->saved)
9884 rtx insn = emit_insn (gen_push (sr->reg));
9885 RTX_FRAME_RELATED_P (insn) = 1;
9889 /* Release a scratch register obtained from the preceding function. */
9891 static void
9892 release_scratch_register_on_entry (struct scratch_reg *sr)
9894 if (sr->saved)
9896 struct machine_function *m = cfun->machine;
9897 rtx x, insn = emit_insn (gen_pop (sr->reg));
9899 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9900 RTX_FRAME_RELATED_P (insn) = 1;
9901 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9902 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9903 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9904 m->fs.sp_offset -= UNITS_PER_WORD;
9908 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9910 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9912 static void
9913 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
 9915 /* We skip the probe for the first interval + a small dope of 4 words and
 9916 probe that many bytes past the specified size to maintain a protection
 9917 area at the bottom of the stack. */
9918 const int dope = 4 * UNITS_PER_WORD;
9919 rtx size_rtx = GEN_INT (size), last;
9921 /* See if we have a constant small number of probes to generate. If so,
9922 that's the easy case. The run-time loop is made up of 11 insns in the
9923 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9924 for n # of intervals. */
9925 if (size <= 5 * PROBE_INTERVAL)
9927 HOST_WIDE_INT i, adjust;
9928 bool first_probe = true;
9930 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9931 values of N from 1 until it exceeds SIZE. If only one probe is
9932 needed, this will not generate any code. Then adjust and probe
9933 to PROBE_INTERVAL + SIZE. */
9934 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9936 if (first_probe)
9938 adjust = 2 * PROBE_INTERVAL + dope;
9939 first_probe = false;
9941 else
9942 adjust = PROBE_INTERVAL;
9944 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9945 plus_constant (Pmode, stack_pointer_rtx,
9946 -adjust)));
9947 emit_stack_probe (stack_pointer_rtx);
9950 if (first_probe)
9951 adjust = size + PROBE_INTERVAL + dope;
9952 else
9953 adjust = size + PROBE_INTERVAL - i;
9955 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9956 plus_constant (Pmode, stack_pointer_rtx,
9957 -adjust)));
9958 emit_stack_probe (stack_pointer_rtx);
9960 /* Adjust back to account for the additional first interval. */
9961 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9962 plus_constant (Pmode, stack_pointer_rtx,
9963 PROBE_INTERVAL + dope)));
9966 /* Otherwise, do the same as above, but in a loop. Note that we must be
9967 extra careful with variables wrapping around because we might be at
9968 the very top (or the very bottom) of the address space and we have
9969 to be able to handle this case properly; in particular, we use an
9970 equality test for the loop condition. */
9971 else
9973 HOST_WIDE_INT rounded_size;
9974 struct scratch_reg sr;
9976 get_scratch_register_on_entry (&sr);
9979 /* Step 1: round SIZE to the previous multiple of the interval. */
9981 rounded_size = size & -PROBE_INTERVAL;
9984 /* Step 2: compute initial and final value of the loop counter. */
9986 /* SP = SP_0 + PROBE_INTERVAL. */
9987 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9988 plus_constant (Pmode, stack_pointer_rtx,
9989 - (PROBE_INTERVAL + dope))));
9991 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9992 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9993 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9994 gen_rtx_PLUS (Pmode, sr.reg,
9995 stack_pointer_rtx)));
9998 /* Step 3: the loop
10000 while (SP != LAST_ADDR)
10002 SP = SP + PROBE_INTERVAL
10003 probe at SP
10006 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10007 values of N from 1 until it is equal to ROUNDED_SIZE. */
10009 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10012 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10013 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10015 if (size != rounded_size)
10017 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10018 plus_constant (Pmode, stack_pointer_rtx,
10019 rounded_size - size)));
10020 emit_stack_probe (stack_pointer_rtx);
10023 /* Adjust back to account for the additional first interval. */
10024 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10025 plus_constant (Pmode, stack_pointer_rtx,
10026 PROBE_INTERVAL + dope)));
10028 release_scratch_register_on_entry (&sr);
10031 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10033 /* Even if the stack pointer isn't the CFA register, we need to correctly
10034 describe the adjustments made to it, in particular differentiate the
10035 frame-related ones from the frame-unrelated ones. */
10036 if (size > 0)
10038 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10039 XVECEXP (expr, 0, 0)
10040 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10041 plus_constant (Pmode, stack_pointer_rtx, -size));
10042 XVECEXP (expr, 0, 1)
10043 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10044 plus_constant (Pmode, stack_pointer_rtx,
10045 PROBE_INTERVAL + dope + size));
10046 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10047 RTX_FRAME_RELATED_P (last) = 1;
10049 cfun->machine->fs.sp_offset += size;
10052 /* Make sure nothing is scheduled before we are done. */
10053 emit_insn (gen_blockage ());
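/* Worked example, not part of the original source, assuming
   PROBE_INTERVAL == 4096 and the 64-bit dope of 32 bytes, for
   size == 8192 on the constant path above:

	sub	$8224, %rsp		2*PROBE_INTERVAL + dope
	or	$0, (%rsp)		probe
	sub	$4096, %rsp		remaining interval
	or	$0, (%rsp)		probe
	add	$4128, %rsp		give back PROBE_INTERVAL + dope

   for a net adjustment of exactly -8192 while touching every interval
   of the newly allocated area.  */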
10056 /* Adjust the stack pointer up to REG while probing it. */
10058 const char *
10059 output_adjust_stack_and_probe (rtx reg)
10061 static int labelno = 0;
10062 char loop_lab[32], end_lab[32];
10063 rtx xops[2];
10065 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10066 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10068 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10070 /* Jump to END_LAB if SP == LAST_ADDR. */
10071 xops[0] = stack_pointer_rtx;
10072 xops[1] = reg;
10073 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10074 fputs ("\tje\t", asm_out_file);
10075 assemble_name_raw (asm_out_file, end_lab);
10076 fputc ('\n', asm_out_file);
10078 /* SP = SP + PROBE_INTERVAL. */
10079 xops[1] = GEN_INT (PROBE_INTERVAL);
10080 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10082 /* Probe at SP. */
10083 xops[1] = const0_rtx;
10084 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10086 fprintf (asm_out_file, "\tjmp\t");
10087 assemble_name_raw (asm_out_file, loop_lab);
10088 fputc ('\n', asm_out_file);
10090 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10092 return "";
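/* For illustration, not part of the original source: with
   PROBE_INTERVAL == 4096 the loop printed above looks roughly like

   .LPSRL0:
	cmpq	%r11, %rsp
	je	.LPSRE0
	subq	$4096, %rsp
	orq	$0, (%rsp)
	jmp	.LPSRL0
   .LPSRE0:

   assuming the scratch register holding LAST_ADDR is %r11.  */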
10095 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10096 inclusive. These are offsets from the current stack pointer. */
10098 static void
10099 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10101 /* See if we have a constant small number of probes to generate. If so,
10102 that's the easy case. The run-time loop is made up of 7 insns in the
10103 generic case while the compile-time loop is made up of n insns for n #
10104 of intervals. */
10105 if (size <= 7 * PROBE_INTERVAL)
10107 HOST_WIDE_INT i;
10109 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10110 it exceeds SIZE. If only one probe is needed, this will not
10111 generate any code. Then probe at FIRST + SIZE. */
10112 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10113 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10114 -(first + i)));
10116 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10117 -(first + size)));
10120 /* Otherwise, do the same as above, but in a loop. Note that we must be
10121 extra careful with variables wrapping around because we might be at
10122 the very top (or the very bottom) of the address space and we have
10123 to be able to handle this case properly; in particular, we use an
10124 equality test for the loop condition. */
10125 else
10127 HOST_WIDE_INT rounded_size, last;
10128 struct scratch_reg sr;
10130 get_scratch_register_on_entry (&sr);
10133 /* Step 1: round SIZE to the previous multiple of the interval. */
10135 rounded_size = size & -PROBE_INTERVAL;
10138 /* Step 2: compute initial and final value of the loop counter. */
10140 /* TEST_OFFSET = FIRST. */
10141 emit_move_insn (sr.reg, GEN_INT (-first));
10143 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10144 last = first + rounded_size;
10147 /* Step 3: the loop
10149 while (TEST_ADDR != LAST_ADDR)
10151 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10152 probe at TEST_ADDR
10155 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10156 until it is equal to ROUNDED_SIZE. */
10158 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10161 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10162 that SIZE is equal to ROUNDED_SIZE. */
10164 if (size != rounded_size)
10165 emit_stack_probe (plus_constant (Pmode,
10166 gen_rtx_PLUS (Pmode,
10167 stack_pointer_rtx,
10168 sr.reg),
10169 rounded_size - size));
10171 release_scratch_register_on_entry (&sr);
10174 /* Make sure nothing is scheduled before we are done. */
10175 emit_insn (gen_blockage ());
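/* Worked example, not part of the original source, assuming
   PROBE_INTERVAL == 4096 with hypothetical FIRST == 8192 and
   SIZE == 12288: the constant path above emits probes at

     sp - 12288, sp - 16384, sp - 20480

   i.e. at FIRST + N * PROBE_INTERVAL for N = 1, 2 and finally at
   FIRST + SIZE.  */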
10178 /* Probe a range of stack addresses from REG to END, inclusive. These are
10179 offsets from the current stack pointer. */
10181 const char *
10182 output_probe_stack_range (rtx reg, rtx end)
10184 static int labelno = 0;
10185 char loop_lab[32], end_lab[32];
10186 rtx xops[3];
10188 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10189 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10191 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10193 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10194 xops[0] = reg;
10195 xops[1] = end;
10196 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10197 fputs ("\tje\t", asm_out_file);
10198 assemble_name_raw (asm_out_file, end_lab);
10199 fputc ('\n', asm_out_file);
10201 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10202 xops[1] = GEN_INT (PROBE_INTERVAL);
10203 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10205 /* Probe at TEST_ADDR. */
10206 xops[0] = stack_pointer_rtx;
10207 xops[1] = reg;
10208 xops[2] = const0_rtx;
10209 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10211 fprintf (asm_out_file, "\tjmp\t");
10212 assemble_name_raw (asm_out_file, loop_lab);
10213 fputc ('\n', asm_out_file);
10215 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10217 return "";
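/* For illustration, not part of the original source: on x86-64 with the
   test offset in %r11 and a hypothetical LAST_ADDR operand of -16384,
   the loop printed above is roughly

   .LPSRL0:
	cmpq	$-16384, %r11
	je	.LPSRE0
	subq	$4096, %r11
	orq	$0, (%rsp,%r11)
	jmp	.LPSRL0
   .LPSRE0:                                                              */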
10220 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10221 to be generated in correct form. */
10222 static void
10223 ix86_finalize_stack_realign_flags (void)
 10225 /* Check if stack realignment is really needed after reload, and
 10226 store the result in cfun. */
10227 unsigned int incoming_stack_boundary
10228 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10229 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10230 unsigned int stack_realign = (incoming_stack_boundary
10231 < (crtl->is_leaf
10232 ? crtl->max_used_stack_slot_alignment
10233 : crtl->stack_alignment_needed));
10235 if (crtl->stack_realign_finalized)
 10237 /* After stack_realign_needed is finalized, we can no longer
 10238 change it. */
10239 gcc_assert (crtl->stack_realign_needed == stack_realign);
10240 return;
10243 /* If the only reason for frame_pointer_needed is that we conservatively
10244 assumed stack realignment might be needed, but in the end nothing that
10245 needed the stack alignment had been spilled, clear frame_pointer_needed
10246 and say we don't need stack realignment. */
10247 if (stack_realign
10248 && !crtl->need_drap
10249 && frame_pointer_needed
10250 && crtl->is_leaf
10251 && flag_omit_frame_pointer
10252 && crtl->sp_is_unchanging
10253 && !ix86_current_function_calls_tls_descriptor
10254 && !crtl->accesses_prior_frames
10255 && !cfun->calls_alloca
10256 && !crtl->calls_eh_return
10257 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10258 && !ix86_frame_pointer_required ()
10259 && get_frame_size () == 0
10260 && ix86_nsaved_sseregs () == 0
10261 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10263 HARD_REG_SET set_up_by_prologue, prologue_used;
10264 basic_block bb;
10266 CLEAR_HARD_REG_SET (prologue_used);
10267 CLEAR_HARD_REG_SET (set_up_by_prologue);
10268 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10269 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10270 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10271 HARD_FRAME_POINTER_REGNUM);
10272 FOR_EACH_BB (bb)
10274 rtx insn;
10275 FOR_BB_INSNS (bb, insn)
10276 if (NONDEBUG_INSN_P (insn)
10277 && requires_stack_frame_p (insn, prologue_used,
10278 set_up_by_prologue))
10280 crtl->stack_realign_needed = stack_realign;
10281 crtl->stack_realign_finalized = true;
10282 return;
10286 frame_pointer_needed = false;
10287 stack_realign = false;
10288 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10289 crtl->stack_alignment_needed = incoming_stack_boundary;
10290 crtl->stack_alignment_estimated = incoming_stack_boundary;
10291 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10292 crtl->preferred_stack_boundary = incoming_stack_boundary;
10293 df_finish_pass (true);
10294 df_scan_alloc (NULL);
10295 df_scan_blocks ();
10296 df_compute_regs_ever_live (true);
10297 df_analyze ();
10300 crtl->stack_realign_needed = stack_realign;
10301 crtl->stack_realign_finalized = true;
10304 /* Expand the prologue into a bunch of separate insns. */
10306 void
10307 ix86_expand_prologue (void)
10309 struct machine_function *m = cfun->machine;
10310 rtx insn, t;
10311 bool pic_reg_used;
10312 struct ix86_frame frame;
10313 HOST_WIDE_INT allocate;
10314 bool int_registers_saved;
10315 bool sse_registers_saved;
10317 ix86_finalize_stack_realign_flags ();
10319 /* DRAP should not coexist with stack_realign_fp */
10320 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10322 memset (&m->fs, 0, sizeof (m->fs));
10324 /* Initialize CFA state for before the prologue. */
10325 m->fs.cfa_reg = stack_pointer_rtx;
10326 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10328 /* Track SP offset to the CFA. We continue tracking this after we've
10329 swapped the CFA register away from SP. In the case of re-alignment
 10330 this is fudged; we're interested in offsets within the local frame. */
10331 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10332 m->fs.sp_valid = true;
10334 ix86_compute_frame_layout (&frame);
10336 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10338 /* We should have already generated an error for any use of
10339 ms_hook on a nested function. */
10340 gcc_checking_assert (!ix86_static_chain_on_stack);
 10342 /* Check if profiling is active and we shall use the
 10343 profiling-before-prologue variant. If so, sorry. */
10344 if (crtl->profile && flag_fentry != 0)
10345 sorry ("ms_hook_prologue attribute isn%'t compatible "
10346 "with -mfentry for 32-bit");
10348 /* In ix86_asm_output_function_label we emitted:
10349 8b ff movl.s %edi,%edi
10350 55 push %ebp
10351 8b ec movl.s %esp,%ebp
10353 This matches the hookable function prologue in Win32 API
10354 functions in Microsoft Windows XP Service Pack 2 and newer.
10355 Wine uses this to enable Windows apps to hook the Win32 API
10356 functions provided by Wine.
10358 What that means is that we've already set up the frame pointer. */
10360 if (frame_pointer_needed
10361 && !(crtl->drap_reg && crtl->stack_realign_needed))
10363 rtx push, mov;
10365 /* We've decided to use the frame pointer already set up.
10366 Describe this to the unwinder by pretending that both
10367 push and mov insns happen right here.
10369 Putting the unwind info here at the end of the ms_hook
10370 is done so that we can make absolutely certain we get
10371 the required byte sequence at the start of the function,
10372 rather than relying on an assembler that can produce
10373 the exact encoding required.
10375 However it does mean (in the unpatched case) that we have
10376 a 1 insn window where the asynchronous unwind info is
10377 incorrect. However, if we placed the unwind info at
10378 its correct location we would have incorrect unwind info
10379 in the patched case. Which is probably all moot since
10380 I don't expect Wine generates dwarf2 unwind info for the
10381 system libraries that use this feature. */
10383 insn = emit_insn (gen_blockage ());
10385 push = gen_push (hard_frame_pointer_rtx);
10386 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10387 stack_pointer_rtx);
10388 RTX_FRAME_RELATED_P (push) = 1;
10389 RTX_FRAME_RELATED_P (mov) = 1;
10391 RTX_FRAME_RELATED_P (insn) = 1;
10392 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10393 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10395 /* Note that gen_push incremented m->fs.cfa_offset, even
10396 though we didn't emit the push insn here. */
10397 m->fs.cfa_reg = hard_frame_pointer_rtx;
10398 m->fs.fp_offset = m->fs.cfa_offset;
10399 m->fs.fp_valid = true;
10401 else
10403 /* The frame pointer is not needed so pop %ebp again.
10404 This leaves us with a pristine state. */
10405 emit_insn (gen_pop (hard_frame_pointer_rtx));
10409 /* The first insn of a function that accepts its static chain on the
10410 stack is to push the register that would be filled in by a direct
10411 call. This insn will be skipped by the trampoline. */
10412 else if (ix86_static_chain_on_stack)
10414 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10415 emit_insn (gen_blockage ());
10417 /* We don't want to interpret this push insn as a register save,
10418 only as a stack adjustment. The real copy of the register as
10419 a save will be done later, if needed. */
10420 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10421 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10422 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10423 RTX_FRAME_RELATED_P (insn) = 1;
 10426 /* Emit prologue code to adjust stack alignment and set up DRAP, in
 10427 case DRAP is needed and stack realignment is really needed after reload. */
10428 if (stack_realign_drap)
10430 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10432 /* Only need to push parameter pointer reg if it is caller saved. */
10433 if (!call_used_regs[REGNO (crtl->drap_reg)])
10435 /* Push arg pointer reg */
10436 insn = emit_insn (gen_push (crtl->drap_reg));
10437 RTX_FRAME_RELATED_P (insn) = 1;
10440 /* Grab the argument pointer. */
10441 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10442 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10443 RTX_FRAME_RELATED_P (insn) = 1;
10444 m->fs.cfa_reg = crtl->drap_reg;
10445 m->fs.cfa_offset = 0;
10447 /* Align the stack. */
10448 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10449 stack_pointer_rtx,
10450 GEN_INT (-align_bytes)));
10451 RTX_FRAME_RELATED_P (insn) = 1;
10453 /* Replicate the return address on the stack so that return
10454 address can be reached via (argp - 1) slot. This is needed
10455 to implement macro RETURN_ADDR_RTX and intrinsic function
10456 expand_builtin_return_addr etc. */
10457 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10458 t = gen_frame_mem (word_mode, t);
10459 insn = emit_insn (gen_push (t));
10460 RTX_FRAME_RELATED_P (insn) = 1;
10462 /* For the purposes of frame and register save area addressing,
10463 we've started over with a new frame. */
10464 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10465 m->fs.realigned = true;
10468 int_registers_saved = (frame.nregs == 0);
10469 sse_registers_saved = (frame.nsseregs == 0);
10471 if (frame_pointer_needed && !m->fs.fp_valid)
10473 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10474 slower on all targets. Also sdb doesn't like it. */
10475 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10476 RTX_FRAME_RELATED_P (insn) = 1;
10478 /* Push registers now, before setting the frame pointer
10479 on SEH target. */
10480 if (!int_registers_saved
10481 && TARGET_SEH
10482 && !frame.save_regs_using_mov)
10484 ix86_emit_save_regs ();
10485 int_registers_saved = true;
10486 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10489 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10491 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10492 RTX_FRAME_RELATED_P (insn) = 1;
10494 if (m->fs.cfa_reg == stack_pointer_rtx)
10495 m->fs.cfa_reg = hard_frame_pointer_rtx;
10496 m->fs.fp_offset = m->fs.sp_offset;
10497 m->fs.fp_valid = true;
10501 if (!int_registers_saved)
10503 /* If saving registers via PUSH, do so now. */
10504 if (!frame.save_regs_using_mov)
10506 ix86_emit_save_regs ();
10507 int_registers_saved = true;
10508 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
 10511 /* When using the red zone we may start register saving before allocating
 10512 the stack frame, saving one cycle of the prologue. However, avoid
 10513 doing this if we have to probe the stack; at least on x86_64 the
 10514 stack probe can turn into a call that clobbers a red zone location. */
10515 else if (ix86_using_red_zone ()
10516 && (! TARGET_STACK_PROBE
10517 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10519 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10520 int_registers_saved = true;
10524 if (stack_realign_fp)
10526 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10527 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10529 /* The computation of the size of the re-aligned stack frame means
10530 that we must allocate the size of the register save area before
10531 performing the actual alignment. Otherwise we cannot guarantee
10532 that there's enough storage above the realignment point. */
10533 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10534 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10535 GEN_INT (m->fs.sp_offset
10536 - frame.sse_reg_save_offset),
10537 -1, false);
10539 /* Align the stack. */
10540 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10541 stack_pointer_rtx,
10542 GEN_INT (-align_bytes)));
10544 /* For the purposes of register save area addressing, the stack
10545 pointer is no longer valid. As for the value of sp_offset,
10546 see ix86_compute_frame_layout, which we need to match in order
10547 to pass verification of stack_pointer_offset at the end. */
10548 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10549 m->fs.sp_valid = false;
10552 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10554 if (flag_stack_usage_info)
10556 /* We start to count from ARG_POINTER. */
10557 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10559 /* If it was realigned, take into account the fake frame. */
10560 if (stack_realign_drap)
10562 if (ix86_static_chain_on_stack)
10563 stack_size += UNITS_PER_WORD;
10565 if (!call_used_regs[REGNO (crtl->drap_reg)])
10566 stack_size += UNITS_PER_WORD;
10568 /* This over-estimates by 1 minimal-stack-alignment-unit but
10569 mitigates that by counting in the new return address slot. */
10570 current_function_dynamic_stack_size
10571 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10574 current_function_static_stack_size = stack_size;
10577 /* On SEH target with very large frame size, allocate an area to save
10578 SSE registers (as the very large allocation won't be described). */
10579 if (TARGET_SEH
10580 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10581 && !sse_registers_saved)
10583 HOST_WIDE_INT sse_size =
10584 frame.sse_reg_save_offset - frame.reg_save_offset;
10586 gcc_assert (int_registers_saved);
10588 /* No need to do stack checking as the area will be immediately
10589 written. */
10590 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10591 GEN_INT (-sse_size), -1,
10592 m->fs.cfa_reg == stack_pointer_rtx);
10593 allocate -= sse_size;
10594 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10595 sse_registers_saved = true;
10598 /* The stack has already been decremented by the instruction calling us
10599 so probe if the size is non-negative to preserve the protection area. */
10600 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10602 /* We expect the registers to be saved when probes are used. */
10603 gcc_assert (int_registers_saved);
10605 if (STACK_CHECK_MOVING_SP)
10607 ix86_adjust_stack_and_probe (allocate);
10608 allocate = 0;
10610 else
10612 HOST_WIDE_INT size = allocate;
10614 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10615 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10617 if (TARGET_STACK_PROBE)
10618 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10619 else
10620 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10624 if (allocate == 0)
10626 else if (!ix86_target_stack_probe ()
10627 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10629 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10630 GEN_INT (-allocate), -1,
10631 m->fs.cfa_reg == stack_pointer_rtx);
10633 else
10635 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10636 rtx r10 = NULL;
10637 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10638 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10639 bool eax_live = false;
10640 bool r10_live = false;
10642 if (TARGET_64BIT)
10643 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10644 if (!TARGET_64BIT_MS_ABI)
10645 eax_live = ix86_eax_live_at_start_p ();
10647 /* Note that SEH directives need to continue tracking the stack
10648 pointer even after the frame pointer has been set up. */
10649 if (eax_live)
10651 insn = emit_insn (gen_push (eax));
10652 allocate -= UNITS_PER_WORD;
10653 if (sp_is_cfa_reg || TARGET_SEH)
10655 if (sp_is_cfa_reg)
10656 m->fs.cfa_offset += UNITS_PER_WORD;
10657 RTX_FRAME_RELATED_P (insn) = 1;
10661 if (r10_live)
10663 r10 = gen_rtx_REG (Pmode, R10_REG);
10664 insn = emit_insn (gen_push (r10));
10665 allocate -= UNITS_PER_WORD;
10666 if (sp_is_cfa_reg || TARGET_SEH)
10668 if (sp_is_cfa_reg)
10669 m->fs.cfa_offset += UNITS_PER_WORD;
10670 RTX_FRAME_RELATED_P (insn) = 1;
10674 emit_move_insn (eax, GEN_INT (allocate));
10675 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10677 /* Use the fact that AX still contains ALLOCATE. */
10678 adjust_stack_insn = (Pmode == DImode
10679 ? gen_pro_epilogue_adjust_stack_di_sub
10680 : gen_pro_epilogue_adjust_stack_si_sub);
10682 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10683 stack_pointer_rtx, eax));
10685 if (sp_is_cfa_reg || TARGET_SEH)
10687 if (sp_is_cfa_reg)
10688 m->fs.cfa_offset += allocate;
10689 RTX_FRAME_RELATED_P (insn) = 1;
10690 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10691 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10692 plus_constant (Pmode, stack_pointer_rtx,
10693 -allocate)));
10695 m->fs.sp_offset += allocate;
10697 if (r10_live && eax_live)
10699 t = choose_baseaddr (m->fs.sp_offset - allocate);
10700 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10701 gen_frame_mem (word_mode, t));
10702 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10703 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10704 gen_frame_mem (word_mode, t));
10706 else if (eax_live || r10_live)
10708 t = choose_baseaddr (m->fs.sp_offset - allocate);
10709 emit_move_insn (gen_rtx_REG (word_mode,
10710 (eax_live ? AX_REG : R10_REG)),
10711 gen_frame_mem (word_mode, t));
10714 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10716 /* If we haven't already set up the frame pointer, do so now. */
10717 if (frame_pointer_needed && !m->fs.fp_valid)
10719 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10720 GEN_INT (frame.stack_pointer_offset
10721 - frame.hard_frame_pointer_offset));
10722 insn = emit_insn (insn);
10723 RTX_FRAME_RELATED_P (insn) = 1;
10724 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10726 if (m->fs.cfa_reg == stack_pointer_rtx)
10727 m->fs.cfa_reg = hard_frame_pointer_rtx;
10728 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10729 m->fs.fp_valid = true;
10732 if (!int_registers_saved)
10733 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10734 if (!sse_registers_saved)
10735 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10737 pic_reg_used = false;
10738 /* We don't use a pic-register for the pe-coff target. */
10739 if (pic_offset_table_rtx
10740 && !TARGET_PECOFF
10741 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10742 || crtl->profile))
10744 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10746 if (alt_pic_reg_used != INVALID_REGNUM)
10747 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10749 pic_reg_used = true;
10752 if (pic_reg_used)
10754 if (TARGET_64BIT)
10756 if (ix86_cmodel == CM_LARGE_PIC)
10758 rtx label, tmp_reg;
10760 gcc_assert (Pmode == DImode);
10761 label = gen_label_rtx ();
10762 emit_label (label);
10763 LABEL_PRESERVE_P (label) = 1;
10764 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10765 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10766 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10767 label));
10768 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10769 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10770 pic_offset_table_rtx, tmp_reg));
10772 else
10773 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10775 else
10777 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10778 RTX_FRAME_RELATED_P (insn) = 1;
10779 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10783 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10784 when mcount needs it. A blockage to avoid call movement across the mcount
10785 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10786 note. */
10787 if (crtl->profile && !flag_fentry && pic_reg_used)
10788 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10790 if (crtl->drap_reg && !crtl->stack_realign_needed)
10792 /* The vDRAP was set up, but after reload it turns out stack realignment
10793 isn't necessary; emit the prologue to set up the DRAP
10794 without the stack realignment adjustment. */
10795 t = choose_baseaddr (0);
10796 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10799 /* Prevent instructions from being scheduled into the register save push
10800 sequence when access to the redzone area is done through the frame pointer.
10801 The offset between the frame pointer and the stack pointer is calculated
10802 relative to the value of the stack pointer at the end of the function
10803 prologue, and moving instructions that access the redzone area via the frame
10804 pointer inside the push sequence violates this assumption. */
10805 if (frame_pointer_needed && frame.red_zone_size)
10806 emit_insn (gen_memory_blockage ());
10808 /* Emit cld instruction if stringops are used in the function. */
10809 if (TARGET_CLD && ix86_current_function_needs_cld)
10810 emit_insn (gen_cld ());
10812 /* SEH requires that the prologue end within 256 bytes of the start of
10813 the function. Prevent instruction schedules that would extend that.
10814 Further, prevent alloca modifications to the stack pointer from being
10815 combined with prologue modifications. */
10816 if (TARGET_SEH)
10817 emit_insn (gen_prologue_use (stack_pointer_rtx));
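
/* Illustrative sketch only (not emitted verbatim from here): for a plain
   64-bit frame with a frame pointer, one saved GPR and a small local area,
   the insns built above roughly assemble to

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx
	subq	$NN, %rsp

   where NN stands for the remaining allocation; the exact sequence depends
   on the stack-probe, SEH, DRAP and PIC paths handled above.  */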
10820 /* Emit code to restore REG using a POP insn. */
10822 static void
10823 ix86_emit_restore_reg_using_pop (rtx reg)
10825 struct machine_function *m = cfun->machine;
10826 rtx insn = emit_insn (gen_pop (reg));
10828 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10829 m->fs.sp_offset -= UNITS_PER_WORD;
10831 if (m->fs.cfa_reg == crtl->drap_reg
10832 && REGNO (reg) == REGNO (crtl->drap_reg))
10834 /* Previously we'd represented the CFA as an expression
10835 like *(%ebp - 8). We've just popped that value from
10836 the stack, which means we need to reset the CFA to
10837 the drap register. This will remain until we restore
10838 the stack pointer. */
10839 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10840 RTX_FRAME_RELATED_P (insn) = 1;
10842 /* This means that the DRAP register is valid for addressing too. */
10843 m->fs.drap_valid = true;
10844 return;
10847 if (m->fs.cfa_reg == stack_pointer_rtx)
10849 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10850 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10851 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10852 RTX_FRAME_RELATED_P (insn) = 1;
10854 m->fs.cfa_offset -= UNITS_PER_WORD;
10857 /* When the frame pointer is the CFA, and we pop it, we are
10858 swapping back to the stack pointer as the CFA. This happens
10859 for stack frames that don't allocate other data, so we assume
10860 the stack pointer is now pointing at the return address, i.e.
10861 the function entry state, which makes the offset be 1 word. */
10862 if (reg == hard_frame_pointer_rtx)
10864 m->fs.fp_valid = false;
10865 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10867 m->fs.cfa_reg = stack_pointer_rtx;
10868 m->fs.cfa_offset -= UNITS_PER_WORD;
10870 add_reg_note (insn, REG_CFA_DEF_CFA,
10871 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10872 GEN_INT (m->fs.cfa_offset)));
10873 RTX_FRAME_RELATED_P (insn) = 1;
10878 /* Emit code to restore saved registers using POP insns. */
10880 static void
10881 ix86_emit_restore_regs_using_pop (void)
10883 unsigned int regno;
10885 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10886 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10887 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10890 /* Emit code and notes for the LEAVE instruction. */
10892 static void
10893 ix86_emit_leave (void)
10895 struct machine_function *m = cfun->machine;
10896 rtx insn = emit_insn (ix86_gen_leave ());
10898 ix86_add_queued_cfa_restore_notes (insn);
10900 gcc_assert (m->fs.fp_valid);
10901 m->fs.sp_valid = true;
10902 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10903 m->fs.fp_valid = false;
10905 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10907 m->fs.cfa_reg = stack_pointer_rtx;
10908 m->fs.cfa_offset = m->fs.sp_offset;
10910 add_reg_note (insn, REG_CFA_DEF_CFA,
10911 plus_constant (Pmode, stack_pointer_rtx,
10912 m->fs.sp_offset));
10913 RTX_FRAME_RELATED_P (insn) = 1;
10915 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10916 m->fs.fp_offset);
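
/* Note for illustration: "leave" behaves like

	movq	%rbp, %rsp
	popq	%rbp

   which is why the code above revalidates the stack pointer at
   fp_offset - UNITS_PER_WORD, invalidates the frame pointer, and moves
   the CFA back to the stack pointer when it was based on the frame
   pointer.  */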
10919 /* Emit code to restore saved registers using MOV insns.
10920 First register is restored from CFA - CFA_OFFSET. */
10921 static void
10922 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10923 bool maybe_eh_return)
10925 struct machine_function *m = cfun->machine;
10926 unsigned int regno;
10928 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10929 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10931 rtx reg = gen_rtx_REG (word_mode, regno);
10932 rtx insn, mem;
10934 mem = choose_baseaddr (cfa_offset);
10935 mem = gen_frame_mem (word_mode, mem);
10936 insn = emit_move_insn (reg, mem);
10938 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10940 /* Previously we'd represented the CFA as an expression
10941 like *(%ebp - 8). We've just popped that value from
10942 the stack, which means we need to reset the CFA to
10943 the drap register. This will remain until we restore
10944 the stack pointer. */
10945 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10946 RTX_FRAME_RELATED_P (insn) = 1;
10948 /* This means that the DRAP register is valid for addressing. */
10949 m->fs.drap_valid = true;
10951 else
10952 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10954 cfa_offset -= UNITS_PER_WORD;
10958 /* Emit code to restore saved SSE registers using MOV insns.
10959 The first register is restored from CFA - CFA_OFFSET. */
10960 static void
10961 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10962 bool maybe_eh_return)
10964 unsigned int regno;
10966 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10967 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10969 rtx reg = gen_rtx_REG (V4SFmode, regno);
10970 rtx mem;
10972 mem = choose_baseaddr (cfa_offset);
10973 mem = gen_rtx_MEM (V4SFmode, mem);
10974 set_mem_align (mem, 128);
10975 emit_move_insn (reg, mem);
10977 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10979 cfa_offset -= 16;
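
/* Illustrative example: because each save slot is 16-byte aligned
   (set_mem_align to 128 bits above), a restore emitted here can use an
   aligned vector load such as

	movaps	OFFSET(%rsp), %xmm6

   with OFFSET computed from CFA_OFFSET by choose_baseaddr.  */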
10983 /* Restore function stack, frame, and registers. */
10985 void
10986 ix86_expand_epilogue (int style)
10988 struct machine_function *m = cfun->machine;
10989 struct machine_frame_state frame_state_save = m->fs;
10990 struct ix86_frame frame;
10991 bool restore_regs_via_mov;
10992 bool using_drap;
10994 ix86_finalize_stack_realign_flags ();
10995 ix86_compute_frame_layout (&frame);
10997 m->fs.sp_valid = (!frame_pointer_needed
10998 || (crtl->sp_is_unchanging
10999 && !stack_realign_fp));
11000 gcc_assert (!m->fs.sp_valid
11001 || m->fs.sp_offset == frame.stack_pointer_offset);
11003 /* The FP must be valid if the frame pointer is present. */
11004 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11005 gcc_assert (!m->fs.fp_valid
11006 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11008 /* We must have *some* valid pointer to the stack frame. */
11009 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11011 /* The DRAP is never valid at this point. */
11012 gcc_assert (!m->fs.drap_valid);
11014 /* See the comment about red zone and frame
11015 pointer usage in ix86_expand_prologue. */
11016 if (frame_pointer_needed && frame.red_zone_size)
11017 emit_insn (gen_memory_blockage ());
11019 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11020 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11022 /* Determine the CFA offset of the end of the red-zone. */
11023 m->fs.red_zone_offset = 0;
11024 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11026 /* The red-zone begins below the return address. */
11027 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11029 /* When the register save area is in the aligned portion of
11030 the stack, determine the maximum runtime displacement that
11031 matches up with the aligned frame. */
11032 if (stack_realign_drap)
11033 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11034 + UNITS_PER_WORD);
11037 /* Special care must be taken for the normal return case of a function
11038 using eh_return: the eax and edx registers are marked as saved, but
11039 not restored along this path. Adjust the save location to match. */
11040 if (crtl->calls_eh_return && style != 2)
11041 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11043 /* EH_RETURN requires the use of moves to function properly. */
11044 if (crtl->calls_eh_return)
11045 restore_regs_via_mov = true;
11046 /* SEH requires the use of pops to identify the epilogue. */
11047 else if (TARGET_SEH)
11048 restore_regs_via_mov = false;
11049 /* If we're only restoring one register and sp is not valid then
11050 use a move instruction to restore the register, since it's
11051 less work than reloading sp and popping the register. */
11052 else if (!m->fs.sp_valid && frame.nregs <= 1)
11053 restore_regs_via_mov = true;
11054 else if (TARGET_EPILOGUE_USING_MOVE
11055 && cfun->machine->use_fast_prologue_epilogue
11056 && (frame.nregs > 1
11057 || m->fs.sp_offset != frame.reg_save_offset))
11058 restore_regs_via_mov = true;
11059 else if (frame_pointer_needed
11060 && !frame.nregs
11061 && m->fs.sp_offset != frame.reg_save_offset)
11062 restore_regs_via_mov = true;
11063 else if (frame_pointer_needed
11064 && TARGET_USE_LEAVE
11065 && cfun->machine->use_fast_prologue_epilogue
11066 && frame.nregs == 1)
11067 restore_regs_via_mov = true;
11068 else
11069 restore_regs_via_mov = false;
11071 if (restore_regs_via_mov || frame.nsseregs)
11073 /* Ensure that the entire register save area is addressable via
11074 the stack pointer, if we will restore via sp. */
11075 if (TARGET_64BIT
11076 && m->fs.sp_offset > 0x7fffffff
11077 && !(m->fs.fp_valid || m->fs.drap_valid)
11078 && (frame.nsseregs + frame.nregs) != 0)
11080 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11081 GEN_INT (m->fs.sp_offset
11082 - frame.sse_reg_save_offset),
11083 style,
11084 m->fs.cfa_reg == stack_pointer_rtx);
11088 /* If there are any SSE registers to restore, then we have to do it
11089 via moves, since there's obviously no pop for SSE regs. */
11090 if (frame.nsseregs)
11091 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11092 style == 2);
11094 if (restore_regs_via_mov)
11096 rtx t;
11098 if (frame.nregs)
11099 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11101 /* eh_return epilogues need %ecx added to the stack pointer. */
11102 if (style == 2)
11104 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11106 /* Stack align doesn't work with eh_return. */
11107 gcc_assert (!stack_realign_drap);
11108 /* Neither do regparm nested functions. */
11109 gcc_assert (!ix86_static_chain_on_stack);
11111 if (frame_pointer_needed)
11113 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11114 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11115 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11117 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11118 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11120 /* Note that we use SA as a temporary CFA, as the return
11121 address is at the proper place relative to it. We
11122 pretend this happens at the FP restore insn because
11123 prior to this insn the FP would be stored at the wrong
11124 offset relative to SA, and after this insn we have no
11125 other reasonable register to use for the CFA. We don't
11126 bother resetting the CFA to the SP for the duration of
11127 the return insn. */
11128 add_reg_note (insn, REG_CFA_DEF_CFA,
11129 plus_constant (Pmode, sa, UNITS_PER_WORD));
11130 ix86_add_queued_cfa_restore_notes (insn);
11131 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11132 RTX_FRAME_RELATED_P (insn) = 1;
11134 m->fs.cfa_reg = sa;
11135 m->fs.cfa_offset = UNITS_PER_WORD;
11136 m->fs.fp_valid = false;
11138 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11139 const0_rtx, style, false);
11141 else
11143 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11144 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11145 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11146 ix86_add_queued_cfa_restore_notes (insn);
11148 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11149 if (m->fs.cfa_offset != UNITS_PER_WORD)
11151 m->fs.cfa_offset = UNITS_PER_WORD;
11152 add_reg_note (insn, REG_CFA_DEF_CFA,
11153 plus_constant (Pmode, stack_pointer_rtx,
11154 UNITS_PER_WORD));
11155 RTX_FRAME_RELATED_P (insn) = 1;
11158 m->fs.sp_offset = UNITS_PER_WORD;
11159 m->fs.sp_valid = true;
11162 else
11164 /* SEH requires that the function end with (1) a stack adjustment
11165 if necessary, (2) a sequence of pops, and (3) a return or
11166 jump instruction. Prevent insns from the function body from
11167 being scheduled into this sequence. */
11168 if (TARGET_SEH)
11170 /* Prevent a catch region from being adjacent to the standard
11171 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11172 several other flags that would be interesting to test are
11173 yet set up. */
11174 if (flag_non_call_exceptions)
11175 emit_insn (gen_nops (const1_rtx));
11176 else
11177 emit_insn (gen_blockage ());
11180 /* The first step is to deallocate the stack frame so that we can
11181 pop the registers. Also do it on SEH targets for a very large
11182 frame, as the emitted instructions aren't allowed by the ABI in
11183 epilogues. */
11184 if (!m->fs.sp_valid
11185 || (TARGET_SEH
11186 && (m->fs.sp_offset - frame.reg_save_offset
11187 >= SEH_MAX_FRAME_SIZE)))
11189 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11190 GEN_INT (m->fs.fp_offset
11191 - frame.reg_save_offset),
11192 style, false);
11194 else if (m->fs.sp_offset != frame.reg_save_offset)
11196 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11197 GEN_INT (m->fs.sp_offset
11198 - frame.reg_save_offset),
11199 style,
11200 m->fs.cfa_reg == stack_pointer_rtx);
11203 ix86_emit_restore_regs_using_pop ();
11206 /* If we used a frame pointer and haven't already got rid of it,
11207 then do so now. */
11208 if (m->fs.fp_valid)
11210 /* If the stack pointer is valid and pointing at the frame
11211 pointer store address, then we only need a pop. */
11212 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11213 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11214 /* Leave results in shorter dependency chains on CPUs that are
11215 able to grok it fast. */
11216 else if (TARGET_USE_LEAVE
11217 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11218 || !cfun->machine->use_fast_prologue_epilogue)
11219 ix86_emit_leave ();
11220 else
11222 pro_epilogue_adjust_stack (stack_pointer_rtx,
11223 hard_frame_pointer_rtx,
11224 const0_rtx, style, !using_drap);
11225 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11229 if (using_drap)
11231 int param_ptr_offset = UNITS_PER_WORD;
11232 rtx insn;
11234 gcc_assert (stack_realign_drap);
11236 if (ix86_static_chain_on_stack)
11237 param_ptr_offset += UNITS_PER_WORD;
11238 if (!call_used_regs[REGNO (crtl->drap_reg)])
11239 param_ptr_offset += UNITS_PER_WORD;
11241 insn = emit_insn (gen_rtx_SET
11242 (VOIDmode, stack_pointer_rtx,
11243 gen_rtx_PLUS (Pmode,
11244 crtl->drap_reg,
11245 GEN_INT (-param_ptr_offset))));
11246 m->fs.cfa_reg = stack_pointer_rtx;
11247 m->fs.cfa_offset = param_ptr_offset;
11248 m->fs.sp_offset = param_ptr_offset;
11249 m->fs.realigned = false;
11251 add_reg_note (insn, REG_CFA_DEF_CFA,
11252 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11253 GEN_INT (param_ptr_offset)));
11254 RTX_FRAME_RELATED_P (insn) = 1;
11256 if (!call_used_regs[REGNO (crtl->drap_reg)])
11257 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11260 /* At this point the stack pointer must be valid, and we must have
11261 restored all of the registers. We may not have deallocated the
11262 entire stack frame. We've delayed this until now because it may
11263 be possible to merge the local stack deallocation with the
11264 deallocation forced by ix86_static_chain_on_stack. */
11265 gcc_assert (m->fs.sp_valid);
11266 gcc_assert (!m->fs.fp_valid);
11267 gcc_assert (!m->fs.realigned);
11268 if (m->fs.sp_offset != UNITS_PER_WORD)
11270 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11271 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11272 style, true);
11274 else
11275 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11277 /* Sibcall epilogues don't want a return instruction. */
11278 if (style == 0)
11280 m->fs = frame_state_save;
11281 return;
11284 if (crtl->args.pops_args && crtl->args.size)
11286 rtx popc = GEN_INT (crtl->args.pops_args);
11288 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11289 address, do explicit add, and jump indirectly to the caller. */
11291 if (crtl->args.pops_args >= 65536)
11293 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11294 rtx insn;
11296 /* There is no "pascal" calling convention in any 64bit ABI. */
11297 gcc_assert (!TARGET_64BIT);
11299 insn = emit_insn (gen_pop (ecx));
11300 m->fs.cfa_offset -= UNITS_PER_WORD;
11301 m->fs.sp_offset -= UNITS_PER_WORD;
11303 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11304 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11305 add_reg_note (insn, REG_CFA_REGISTER,
11306 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11307 RTX_FRAME_RELATED_P (insn) = 1;
11309 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11310 popc, -1, true);
11311 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11313 else
11314 emit_jump_insn (gen_simple_return_pop_internal (popc));
11316 else
11317 emit_jump_insn (gen_simple_return_internal ());
11319 /* Restore the state back to the state from the prologue,
11320 so that it's correct for the next epilogue. */
11321 m->fs = frame_state_save;
11324 /* Reset from the function's potential modifications. */
11326 static void
11327 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11328 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11330 if (pic_offset_table_rtx)
11331 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11332 #if TARGET_MACHO
11333 /* Mach-O doesn't support labels at the end of objects, so if
11334 it looks like we might want one, insert a NOP. */
11336 rtx insn = get_last_insn ();
11337 rtx deleted_debug_label = NULL_RTX;
11338 while (insn
11339 && NOTE_P (insn)
11340 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11342 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11343 notes only; instead set their CODE_LABEL_NUMBER to -1,
11344 otherwise there would be code generation differences
11345 between -g and -g0. */
11346 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11347 deleted_debug_label = insn;
11348 insn = PREV_INSN (insn);
11350 if (insn
11351 && (LABEL_P (insn)
11352 || (NOTE_P (insn)
11353 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11354 fputs ("\tnop\n", file);
11355 else if (deleted_debug_label)
11356 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11357 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11358 CODE_LABEL_NUMBER (insn) = -1;
11360 #endif
11364 /* Return a scratch register to use in the split stack prologue. The
11365 split stack prologue is used for -fsplit-stack. It consists of the first
11366 instructions in the function, even before the regular prologue.
11367 The scratch register can be any caller-saved register which is not
11368 used for parameters or for the static chain. */
11370 static unsigned int
11371 split_stack_prologue_scratch_regno (void)
11373 if (TARGET_64BIT)
11374 return R11_REG;
11375 else
11377 bool is_fastcall, is_thiscall;
11378 int regparm;
11380 is_fastcall = (lookup_attribute ("fastcall",
11381 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11382 != NULL);
11383 is_thiscall = (lookup_attribute ("thiscall",
11384 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11385 != NULL);
11386 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11388 if (is_fastcall)
11390 if (DECL_STATIC_CHAIN (cfun->decl))
11392 sorry ("-fsplit-stack does not support fastcall with "
11393 "nested function");
11394 return INVALID_REGNUM;
11396 return AX_REG;
11398 else if (is_thiscall)
11400 if (!DECL_STATIC_CHAIN (cfun->decl))
11401 return DX_REG;
11402 return AX_REG;
11404 else if (regparm < 3)
11406 if (!DECL_STATIC_CHAIN (cfun->decl))
11407 return CX_REG;
11408 else
11410 if (regparm >= 2)
11412 sorry ("-fsplit-stack does not support 2 register "
11413 " parameters for a nested function");
11414 return INVALID_REGNUM;
11416 return DX_REG;
11419 else
11421 /* FIXME: We could make this work by pushing a register
11422 around the addition and comparison. */
11423 sorry ("-fsplit-stack does not support 3 register parameters");
11424 return INVALID_REGNUM;
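
/* Summary of the choice above, for reference: 64-bit code always gets
   %r11.  32-bit code gets %eax for fastcall (unless a static chain makes
   that impossible), %edx or %eax for thiscall, and %ecx when fewer than
   three regparm arguments are used; with a static chain %edx is used
   instead, and the remaining combinations report sorry ().  */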
11429 /* A SYMBOL_REF for the function which allocates new stack space for
11430 -fsplit-stack. */
11432 static GTY(()) rtx split_stack_fn;
11434 /* A SYMBOL_REF for the more stack function when using the large
11435 model. */
11437 static GTY(()) rtx split_stack_fn_large;
11439 /* Handle -fsplit-stack. These are the first instructions in the
11440 function, even before the regular prologue. */
11442 void
11443 ix86_expand_split_stack_prologue (void)
11445 struct ix86_frame frame;
11446 HOST_WIDE_INT allocate;
11447 unsigned HOST_WIDE_INT args_size;
11448 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11449 rtx scratch_reg = NULL_RTX;
11450 rtx varargs_label = NULL_RTX;
11451 rtx fn;
11453 gcc_assert (flag_split_stack && reload_completed);
11455 ix86_finalize_stack_realign_flags ();
11456 ix86_compute_frame_layout (&frame);
11457 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11459 /* This is the label we will branch to if we have enough stack
11460 space. We expect the basic block reordering pass to reverse this
11461 branch if optimizing, so that we branch in the unlikely case. */
11462 label = gen_label_rtx ();
11464 /* We need to compare the stack pointer minus the frame size with
11465 the stack boundary in the TCB. The stack boundary always gives
11466 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11467 can compare directly. Otherwise we need to do an addition. */
11469 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11470 UNSPEC_STACK_CHECK);
11471 limit = gen_rtx_CONST (Pmode, limit);
11472 limit = gen_rtx_MEM (Pmode, limit);
11473 if (allocate < SPLIT_STACK_AVAILABLE)
11474 current = stack_pointer_rtx;
11475 else
11477 unsigned int scratch_regno;
11478 rtx offset;
11480 /* We need a scratch register to hold the stack pointer minus
11481 the required frame size. Since this is the very start of the
11482 function, the scratch register can be any caller-saved
11483 register which is not used for parameters. */
11484 offset = GEN_INT (- allocate);
11485 scratch_regno = split_stack_prologue_scratch_regno ();
11486 if (scratch_regno == INVALID_REGNUM)
11487 return;
11488 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11489 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11491 /* We don't use ix86_gen_add3 in this case because it will
11492 want to split to lea, but when not optimizing the insn
11493 will not be split after this point. */
11494 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11495 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11496 offset)));
11498 else
11500 emit_move_insn (scratch_reg, offset);
11501 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11502 stack_pointer_rtx));
11504 current = scratch_reg;
11507 ix86_expand_branch (GEU, current, limit, label);
11508 jump_insn = get_last_insn ();
11509 JUMP_LABEL (jump_insn) = label;
11511 /* Mark the jump as very likely to be taken. */
11512 add_reg_note (jump_insn, REG_BR_PROB,
11513 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11515 if (split_stack_fn == NULL_RTX)
11516 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11517 fn = split_stack_fn;
11519 /* Get more stack space. We pass in the desired stack space and the
11520 size of the arguments to copy to the new stack. In 32-bit mode
11521 we push the parameters; __morestack will return on a new stack
11522 anyhow. In 64-bit mode we pass the parameters in r10 and
11523 r11. */
11524 allocate_rtx = GEN_INT (allocate);
11525 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11526 call_fusage = NULL_RTX;
11527 if (TARGET_64BIT)
11529 rtx reg10, reg11;
11531 reg10 = gen_rtx_REG (Pmode, R10_REG);
11532 reg11 = gen_rtx_REG (Pmode, R11_REG);
11534 /* If this function uses a static chain, it will be in %r10.
11535 Preserve it across the call to __morestack. */
11536 if (DECL_STATIC_CHAIN (cfun->decl))
11538 rtx rax;
11540 rax = gen_rtx_REG (word_mode, AX_REG);
11541 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11542 use_reg (&call_fusage, rax);
11545 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11546 && !TARGET_PECOFF)
11548 HOST_WIDE_INT argval;
11550 gcc_assert (Pmode == DImode);
11551 /* When using the large model we need to load the address
11552 into a register, and we've run out of registers. So we
11553 switch to a different calling convention, and we call a
11554 different function: __morestack_large. We pass the
11555 argument size in the upper 32 bits of r10 and pass the
11556 frame size in the lower 32 bits. */
11557 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11558 gcc_assert ((args_size & 0xffffffff) == args_size);
11560 if (split_stack_fn_large == NULL_RTX)
11561 split_stack_fn_large =
11562 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11564 if (ix86_cmodel == CM_LARGE_PIC)
11566 rtx label, x;
11568 label = gen_label_rtx ();
11569 emit_label (label);
11570 LABEL_PRESERVE_P (label) = 1;
11571 emit_insn (gen_set_rip_rex64 (reg10, label));
11572 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11573 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11574 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11575 UNSPEC_GOT);
11576 x = gen_rtx_CONST (Pmode, x);
11577 emit_move_insn (reg11, x);
11578 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11579 x = gen_const_mem (Pmode, x);
11580 emit_move_insn (reg11, x);
11582 else
11583 emit_move_insn (reg11, split_stack_fn_large);
11585 fn = reg11;
11587 argval = ((args_size << 16) << 16) + allocate;
11588 emit_move_insn (reg10, GEN_INT (argval));
11590 else
11592 emit_move_insn (reg10, allocate_rtx);
11593 emit_move_insn (reg11, GEN_INT (args_size));
11594 use_reg (&call_fusage, reg11);
11597 use_reg (&call_fusage, reg10);
11599 else
11601 emit_insn (gen_push (GEN_INT (args_size)));
11602 emit_insn (gen_push (allocate_rtx));
11604 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11605 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11606 NULL_RTX, false);
11607 add_function_usage_to (call_insn, call_fusage);
11609 /* In order to make call/return prediction work right, we now need
11610 to execute a return instruction. See
11611 libgcc/config/i386/morestack.S for the details on how this works.
11613 For flow purposes gcc must not see this as a return
11614 instruction--we need control flow to continue at the subsequent
11615 label. Therefore, we use an unspec. */
11616 gcc_assert (crtl->args.pops_args < 65536);
11617 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11619 /* If we are in 64-bit mode and this function uses a static chain,
11620 we saved %r10 in %rax before calling __morestack. */
11621 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11622 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11623 gen_rtx_REG (word_mode, AX_REG));
11625 /* If this function calls va_start, we need to store a pointer to
11626 the arguments on the old stack, because they may not have been
11627 all copied to the new stack. At this point the old stack can be
11628 found at the frame pointer value used by __morestack, because
11629 __morestack has set that up before calling back to us. Here we
11630 store that pointer in a scratch register, and in
11631 ix86_expand_prologue we store the scratch register in a stack
11632 slot. */
11633 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11635 unsigned int scratch_regno;
11636 rtx frame_reg;
11637 int words;
11639 scratch_regno = split_stack_prologue_scratch_regno ();
11640 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11641 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11643 /* 64-bit:
11644 fp -> old fp value
11645 return address within this function
11646 return address of caller of this function
11647 stack arguments
11648 So we add three words to get to the stack arguments.
11650 32-bit:
11651 fp -> old fp value
11652 return address within this function
11653 first argument to __morestack
11654 second argument to __morestack
11655 return address of caller of this function
11656 stack arguments
11657 So we add five words to get to the stack arguments.
11659 words = TARGET_64BIT ? 3 : 5;
11660 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11661 gen_rtx_PLUS (Pmode, frame_reg,
11662 GEN_INT (words * UNITS_PER_WORD))));
11664 varargs_label = gen_label_rtx ();
11665 emit_jump_insn (gen_jump (varargs_label));
11666 JUMP_LABEL (get_last_insn ()) = varargs_label;
11668 emit_barrier ();
11671 emit_label (label);
11672 LABEL_NUSES (label) = 1;
11674 /* If this function calls va_start, we now have to set the scratch
11675 register for the case where we do not call __morestack. In this
11676 case we need to set it based on the stack pointer. */
11677 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11679 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11680 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11681 GEN_INT (UNITS_PER_WORD))));
11683 emit_label (varargs_label);
11684 LABEL_NUSES (varargs_label) = 1;
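
/* Rough sketch of the 64-bit sequence built above, assuming a small frame,
   no static chain and the small code model (OFFSET, FRAME and ARGS are
   placeholders for the TCB stack-limit slot, the frame size and the
   argument size):

	cmpq	%fs:OFFSET, %rsp
	jae	.Lenough
	movq	$FRAME, %r10
	movq	$ARGS, %r11
	callq	__morestack
	retq
   .Lenough:

   See libgcc/config/i386/morestack.S for why the ret is needed.  */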
11688 /* We may have to tell the dataflow pass that the split stack prologue
11689 is initializing a scratch register. */
11691 static void
11692 ix86_live_on_entry (bitmap regs)
11694 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11696 gcc_assert (flag_split_stack);
11697 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11701 /* Determine if op is a suitable SUBREG RTX for an address. */
11703 static bool
11704 ix86_address_subreg_operand (rtx op)
11706 enum machine_mode mode;
11708 if (!REG_P (op))
11709 return false;
11711 mode = GET_MODE (op);
11713 if (GET_MODE_CLASS (mode) != MODE_INT)
11714 return false;
11716 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11717 failures when the register is one word out of a two word structure. */
11718 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11719 return false;
11721 /* Allow only SUBREGs of non-eliminable hard registers. */
11722 return register_no_elim_operand (op, mode);
11725 /* Extract the parts of an RTL expression that is a valid memory address
11726 for an instruction. Return 0 if the structure of the address is
11727 grossly off. Return -1 if the address contains ASHIFT, so it is not
11728 strictly valid, but is still used for computing the length of an lea instruction. */
11731 ix86_decompose_address (rtx addr, struct ix86_address *out)
11733 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11734 rtx base_reg, index_reg;
11735 HOST_WIDE_INT scale = 1;
11736 rtx scale_rtx = NULL_RTX;
11737 rtx tmp;
11738 int retval = 1;
11739 enum ix86_address_seg seg = SEG_DEFAULT;
11741 /* Allow zero-extended SImode addresses,
11742 they will be emitted with addr32 prefix. */
11743 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11745 if (GET_CODE (addr) == ZERO_EXTEND
11746 && GET_MODE (XEXP (addr, 0)) == SImode)
11748 addr = XEXP (addr, 0);
11749 if (CONST_INT_P (addr))
11750 return 0;
11752 else if (GET_CODE (addr) == AND
11753 && const_32bit_mask (XEXP (addr, 1), DImode))
11755 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11756 if (addr == NULL_RTX)
11757 return 0;
11759 if (CONST_INT_P (addr))
11760 return 0;
11764 /* Allow SImode subregs of DImode addresses,
11765 they will be emitted with addr32 prefix. */
11766 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11768 if (GET_CODE (addr) == SUBREG
11769 && GET_MODE (SUBREG_REG (addr)) == DImode)
11771 addr = SUBREG_REG (addr);
11772 if (CONST_INT_P (addr))
11773 return 0;
11777 if (REG_P (addr))
11778 base = addr;
11779 else if (GET_CODE (addr) == SUBREG)
11781 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11782 base = addr;
11783 else
11784 return 0;
11786 else if (GET_CODE (addr) == PLUS)
11788 rtx addends[4], op;
11789 int n = 0, i;
11791 op = addr;
11794 if (n >= 4)
11795 return 0;
11796 addends[n++] = XEXP (op, 1);
11797 op = XEXP (op, 0);
11799 while (GET_CODE (op) == PLUS);
11800 if (n >= 4)
11801 return 0;
11802 addends[n] = op;
11804 for (i = n; i >= 0; --i)
11806 op = addends[i];
11807 switch (GET_CODE (op))
11809 case MULT:
11810 if (index)
11811 return 0;
11812 index = XEXP (op, 0);
11813 scale_rtx = XEXP (op, 1);
11814 break;
11816 case ASHIFT:
11817 if (index)
11818 return 0;
11819 index = XEXP (op, 0);
11820 tmp = XEXP (op, 1);
11821 if (!CONST_INT_P (tmp))
11822 return 0;
11823 scale = INTVAL (tmp);
11824 if ((unsigned HOST_WIDE_INT) scale > 3)
11825 return 0;
11826 scale = 1 << scale;
11827 break;
11829 case ZERO_EXTEND:
11830 op = XEXP (op, 0);
11831 if (GET_CODE (op) != UNSPEC)
11832 return 0;
11833 /* FALLTHRU */
11835 case UNSPEC:
11836 if (XINT (op, 1) == UNSPEC_TP
11837 && TARGET_TLS_DIRECT_SEG_REFS
11838 && seg == SEG_DEFAULT)
11839 seg = DEFAULT_TLS_SEG_REG;
11840 else
11841 return 0;
11842 break;
11844 case SUBREG:
11845 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11846 return 0;
11847 /* FALLTHRU */
11849 case REG:
11850 if (!base)
11851 base = op;
11852 else if (!index)
11853 index = op;
11854 else
11855 return 0;
11856 break;
11858 case CONST:
11859 case CONST_INT:
11860 case SYMBOL_REF:
11861 case LABEL_REF:
11862 if (disp)
11863 return 0;
11864 disp = op;
11865 break;
11867 default:
11868 return 0;
11872 else if (GET_CODE (addr) == MULT)
11874 index = XEXP (addr, 0); /* index*scale */
11875 scale_rtx = XEXP (addr, 1);
11877 else if (GET_CODE (addr) == ASHIFT)
11879 /* We're called for lea too, which implements ashift on occasion. */
11880 index = XEXP (addr, 0);
11881 tmp = XEXP (addr, 1);
11882 if (!CONST_INT_P (tmp))
11883 return 0;
11884 scale = INTVAL (tmp);
11885 if ((unsigned HOST_WIDE_INT) scale > 3)
11886 return 0;
11887 scale = 1 << scale;
11888 retval = -1;
11890 else if (CONST_INT_P (addr))
11892 if (!x86_64_immediate_operand (addr, VOIDmode))
11893 return 0;
11895 /* Constant addresses are sign extended to 64bit, we have to
11896 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11897 if (TARGET_X32
11898 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11899 return 0;
11901 disp = addr;
11903 else
11904 disp = addr; /* displacement */
11906 if (index)
11908 if (REG_P (index))
11910 else if (GET_CODE (index) == SUBREG
11911 && ix86_address_subreg_operand (SUBREG_REG (index)))
11913 else
11914 return 0;
11917 /* Address override works only on the (%reg) part of %fs:(%reg). */
11918 if (seg != SEG_DEFAULT
11919 && ((base && GET_MODE (base) != word_mode)
11920 || (index && GET_MODE (index) != word_mode)))
11921 return 0;
11923 /* Extract the integral value of scale. */
11924 if (scale_rtx)
11926 if (!CONST_INT_P (scale_rtx))
11927 return 0;
11928 scale = INTVAL (scale_rtx);
11931 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11932 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11934 /* Avoid useless 0 displacement. */
11935 if (disp == const0_rtx && (base || index))
11936 disp = NULL_RTX;
11938 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11939 if (base_reg && index_reg && scale == 1
11940 && (index_reg == arg_pointer_rtx
11941 || index_reg == frame_pointer_rtx
11942 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11944 rtx tmp;
11945 tmp = base, base = index, index = tmp;
11946 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11949 /* Special case: %ebp cannot be encoded as a base without a displacement.
11950 Similarly %r13. */
11951 if (!disp
11952 && base_reg
11953 && (base_reg == hard_frame_pointer_rtx
11954 || base_reg == frame_pointer_rtx
11955 || base_reg == arg_pointer_rtx
11956 || (REG_P (base_reg)
11957 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11958 || REGNO (base_reg) == R13_REG))))
11959 disp = const0_rtx;
11961 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11962 Avoid this by transforming to [%esi+0].
11963 Reload calls address legitimization without cfun defined, so we need
11964 to test cfun for being non-NULL. */
11965 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11966 && base_reg && !index_reg && !disp
11967 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11968 disp = const0_rtx;
11970 /* Special case: encode reg+reg instead of reg*2. */
11971 if (!base && index && scale == 2)
11972 base = index, base_reg = index_reg, scale = 1;
11974 /* Special case: scaling cannot be encoded without base or displacement. */
11975 if (!base && !disp && index && scale != 1)
11976 disp = const0_rtx;
11978 out->base = base;
11979 out->index = index;
11980 out->disp = disp;
11981 out->scale = scale;
11982 out->seg = seg;
11984 return retval;
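
/* Worked example (illustrative): an address such as

	(plus:DI (plus:DI (mult:DI (reg:DI i) (const_int 4))
			  (reg:DI b))
		 (const_int 16))

   decomposes into out->base = b, out->index = i, out->scale = 4,
   out->disp = (const_int 16) and out->seg = SEG_DEFAULT, i.e. the
   operand of a "16(%b,%i,4)" memory reference.  A top-level ASHIFT
   form is also accepted but makes the return value -1, as noted in
   the comment above the function.  */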
11987 /* Return the cost of the memory address x.
11988 For i386, it is better to use a complex address than let gcc copy
11989 the address into a reg and make a new pseudo. But not if the address
11990 requires two regs - that would mean more pseudos with longer
11991 lifetimes. */
11992 static int
11993 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11994 addr_space_t as ATTRIBUTE_UNUSED,
11995 bool speed ATTRIBUTE_UNUSED)
11997 struct ix86_address parts;
11998 int cost = 1;
11999 int ok = ix86_decompose_address (x, &parts);
12001 gcc_assert (ok);
12003 if (parts.base && GET_CODE (parts.base) == SUBREG)
12004 parts.base = SUBREG_REG (parts.base);
12005 if (parts.index && GET_CODE (parts.index) == SUBREG)
12006 parts.index = SUBREG_REG (parts.index);
12008 /* Attempt to minimize number of registers in the address. */
12009 if ((parts.base
12010 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12011 || (parts.index
12012 && (!REG_P (parts.index)
12013 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12014 cost++;
12016 if (parts.base
12017 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12018 && parts.index
12019 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12020 && parts.base != parts.index)
12021 cost++;
12023 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12024 since its predecode logic can't detect the length of instructions
12025 and it degenerates to vector decoded. Increase the cost of such
12026 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
12027 to split such addresses or even refuse such addresses at all.
12029 The following addressing modes are affected:
12030 [base+scale*index]
12031 [scale*index+disp]
12032 [base+index]
12034 The first and last cases may be avoidable by explicitly coding the zero in
12035 the memory address, but I don't have an AMD-K6 machine handy to check this
12036 theory. */
12038 if (TARGET_K6
12039 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12040 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12041 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12042 cost += 10;
12044 return cost;
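
/* Examples (illustrative): an address built from hard registers, such as
   8(%rbp) or (%rbx,%rcx,4), keeps the base cost of 1; one whose base and
   index are two different pseudos costs 3; and on the K6 the penalized
   forms listed in the comment above add a further 10.  */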
12047 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12048 this is used to form addresses to local data when -fPIC is in
12049 use. */
12051 static bool
12052 darwin_local_data_pic (rtx disp)
12054 return (GET_CODE (disp) == UNSPEC
12055 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12058 /* Determine if a given RTX is a valid constant. We already know this
12059 satisfies CONSTANT_P. */
12061 static bool
12062 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12064 switch (GET_CODE (x))
12066 case CONST:
12067 x = XEXP (x, 0);
12069 if (GET_CODE (x) == PLUS)
12071 if (!CONST_INT_P (XEXP (x, 1)))
12072 return false;
12073 x = XEXP (x, 0);
12076 if (TARGET_MACHO && darwin_local_data_pic (x))
12077 return true;
12079 /* Only some unspecs are valid as "constants". */
12080 if (GET_CODE (x) == UNSPEC)
12081 switch (XINT (x, 1))
12083 case UNSPEC_GOT:
12084 case UNSPEC_GOTOFF:
12085 case UNSPEC_PLTOFF:
12086 return TARGET_64BIT;
12087 case UNSPEC_TPOFF:
12088 case UNSPEC_NTPOFF:
12089 x = XVECEXP (x, 0, 0);
12090 return (GET_CODE (x) == SYMBOL_REF
12091 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12092 case UNSPEC_DTPOFF:
12093 x = XVECEXP (x, 0, 0);
12094 return (GET_CODE (x) == SYMBOL_REF
12095 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12096 default:
12097 return false;
12100 /* We must have drilled down to a symbol. */
12101 if (GET_CODE (x) == LABEL_REF)
12102 return true;
12103 if (GET_CODE (x) != SYMBOL_REF)
12104 return false;
12105 /* FALLTHRU */
12107 case SYMBOL_REF:
12108 /* TLS symbols are never valid. */
12109 if (SYMBOL_REF_TLS_MODEL (x))
12110 return false;
12112 /* DLLIMPORT symbols are never valid. */
12113 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12114 && SYMBOL_REF_DLLIMPORT_P (x))
12115 return false;
12117 #if TARGET_MACHO
12118 /* mdynamic-no-pic */
12119 if (MACHO_DYNAMIC_NO_PIC_P)
12120 return machopic_symbol_defined_p (x);
12121 #endif
12122 break;
12124 case CONST_DOUBLE:
12125 if (GET_MODE (x) == TImode
12126 && x != CONST0_RTX (TImode)
12127 && !TARGET_64BIT)
12128 return false;
12129 break;
12131 case CONST_VECTOR:
12132 if (!standard_sse_constant_p (x))
12133 return false;
12135 default:
12136 break;
12139 /* Otherwise we handle everything else in the move patterns. */
12140 return true;
12143 /* Determine if it's legal to put X into the constant pool. This
12144 is not possible for the address of thread-local symbols, which
12145 is checked above. */
12147 static bool
12148 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12150 /* We can always put integral constants and vectors in memory. */
12151 switch (GET_CODE (x))
12153 case CONST_INT:
12154 case CONST_DOUBLE:
12155 case CONST_VECTOR:
12156 return false;
12158 default:
12159 break;
12161 return !ix86_legitimate_constant_p (mode, x);
12164 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
12165 otherwise zero. */
12167 static bool
12168 is_imported_p (rtx x)
12170 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12171 || GET_CODE (x) != SYMBOL_REF)
12172 return false;
12174 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12178 /* Nonzero if the constant value X is a legitimate general operand
12179 when generating PIC code. It is given that flag_pic is on and
12180 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12182 bool
12183 legitimate_pic_operand_p (rtx x)
12185 rtx inner;
12187 switch (GET_CODE (x))
12189 case CONST:
12190 inner = XEXP (x, 0);
12191 if (GET_CODE (inner) == PLUS
12192 && CONST_INT_P (XEXP (inner, 1)))
12193 inner = XEXP (inner, 0);
12195 /* Only some unspecs are valid as "constants". */
12196 if (GET_CODE (inner) == UNSPEC)
12197 switch (XINT (inner, 1))
12199 case UNSPEC_GOT:
12200 case UNSPEC_GOTOFF:
12201 case UNSPEC_PLTOFF:
12202 return TARGET_64BIT;
12203 case UNSPEC_TPOFF:
12204 x = XVECEXP (inner, 0, 0);
12205 return (GET_CODE (x) == SYMBOL_REF
12206 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12207 case UNSPEC_MACHOPIC_OFFSET:
12208 return legitimate_pic_address_disp_p (x);
12209 default:
12210 return false;
12212 /* FALLTHRU */
12214 case SYMBOL_REF:
12215 case LABEL_REF:
12216 return legitimate_pic_address_disp_p (x);
12218 default:
12219 return true;
12223 /* Determine if a given CONST RTX is a valid memory displacement
12224 in PIC mode. */
12226 bool
12227 legitimate_pic_address_disp_p (rtx disp)
12229 bool saw_plus;
12231 /* In 64bit mode we can allow direct addresses of symbols and labels
12232 when they are not dynamic symbols. */
12233 if (TARGET_64BIT)
12235 rtx op0 = disp, op1;
12237 switch (GET_CODE (disp))
12239 case LABEL_REF:
12240 return true;
12242 case CONST:
12243 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12244 break;
12245 op0 = XEXP (XEXP (disp, 0), 0);
12246 op1 = XEXP (XEXP (disp, 0), 1);
12247 if (!CONST_INT_P (op1)
12248 || INTVAL (op1) >= 16*1024*1024
12249 || INTVAL (op1) < -16*1024*1024)
12250 break;
12251 if (GET_CODE (op0) == LABEL_REF)
12252 return true;
12253 if (GET_CODE (op0) == CONST
12254 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12255 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12256 return true;
12257 if (GET_CODE (op0) == UNSPEC
12258 && XINT (op0, 1) == UNSPEC_PCREL)
12259 return true;
12260 if (GET_CODE (op0) != SYMBOL_REF)
12261 break;
12262 /* FALLTHRU */
12264 case SYMBOL_REF:
12265 /* TLS references should always be enclosed in UNSPEC.
12266 A dllimported symbol always needs to be resolved. */
12267 if (SYMBOL_REF_TLS_MODEL (op0)
12268 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12269 return false;
12271 if (TARGET_PECOFF)
12273 if (is_imported_p (op0))
12274 return true;
12276 if (SYMBOL_REF_FAR_ADDR_P (op0)
12277 || !SYMBOL_REF_LOCAL_P (op0))
12278 break;
12280 /* Function symbols need to be resolved only for
12281 the large model.
12282 For the small model we don't need to resolve anything
12283 here. */
12284 if ((ix86_cmodel != CM_LARGE_PIC
12285 && SYMBOL_REF_FUNCTION_P (op0))
12286 || ix86_cmodel == CM_SMALL_PIC)
12287 return true;
12288 /* Non-external symbols don't need to be resolved for
12289 the large and medium models. */
12290 if ((ix86_cmodel == CM_LARGE_PIC
12291 || ix86_cmodel == CM_MEDIUM_PIC)
12292 && !SYMBOL_REF_EXTERNAL_P (op0))
12293 return true;
12295 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12296 && SYMBOL_REF_LOCAL_P (op0)
12297 && ix86_cmodel != CM_LARGE_PIC)
12298 return true;
12299 break;
12301 default:
12302 break;
12305 if (GET_CODE (disp) != CONST)
12306 return false;
12307 disp = XEXP (disp, 0);
12309 if (TARGET_64BIT)
12311 /* It is not safe to allow PLUS expressions; this limits the allowed
12312 distance of GOT tables. We should not need these anyway. */
12313 if (GET_CODE (disp) != UNSPEC
12314 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12315 && XINT (disp, 1) != UNSPEC_GOTOFF
12316 && XINT (disp, 1) != UNSPEC_PCREL
12317 && XINT (disp, 1) != UNSPEC_PLTOFF))
12318 return false;
12320 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12321 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12322 return false;
12323 return true;
12326 saw_plus = false;
12327 if (GET_CODE (disp) == PLUS)
12329 if (!CONST_INT_P (XEXP (disp, 1)))
12330 return false;
12331 disp = XEXP (disp, 0);
12332 saw_plus = true;
12335 if (TARGET_MACHO && darwin_local_data_pic (disp))
12336 return true;
12338 if (GET_CODE (disp) != UNSPEC)
12339 return false;
12341 switch (XINT (disp, 1))
12343 case UNSPEC_GOT:
12344 if (saw_plus)
12345 return false;
12346 /* We need to check for both symbols and labels because VxWorks loads
12347 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12348 details. */
12349 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12350 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12351 case UNSPEC_GOTOFF:
12352 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12353 While the ABI also specifies a 32bit relocation, we don't produce it in
12354 the small PIC model at all. */
12355 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12356 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12357 && !TARGET_64BIT)
12358 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12359 return false;
12360 case UNSPEC_GOTTPOFF:
12361 case UNSPEC_GOTNTPOFF:
12362 case UNSPEC_INDNTPOFF:
12363 if (saw_plus)
12364 return false;
12365 disp = XVECEXP (disp, 0, 0);
12366 return (GET_CODE (disp) == SYMBOL_REF
12367 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12368 case UNSPEC_NTPOFF:
12369 disp = XVECEXP (disp, 0, 0);
12370 return (GET_CODE (disp) == SYMBOL_REF
12371 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12372 case UNSPEC_DTPOFF:
12373 disp = XVECEXP (disp, 0, 0);
12374 return (GET_CODE (disp) == SYMBOL_REF
12375 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12378 return false;
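
/* Example (illustrative) of a displacement accepted here for 32-bit PIC:

	(const:SI (unspec:SI [(symbol_ref:SI ("x"))] UNSPEC_GOTOFF))

   which ends up printed as "x@GOTOFF" relative to the PIC register,
   whereas a plain TLS SYMBOL_REF outside one of the UNSPECs handled
   above is rejected.  */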
12381 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12382 replace the input X, or the original X if no replacement is called for.
12383 The output parameter *WIN is 1 if the calling macro should goto WIN,
12384 0 if it should not. */
12386 bool
12387 ix86_legitimize_reload_address (rtx x,
12388 enum machine_mode mode ATTRIBUTE_UNUSED,
12389 int opnum, int type,
12390 int ind_levels ATTRIBUTE_UNUSED)
12392 /* Reload can generate:
12394 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12395 (reg:DI 97))
12396 (reg:DI 2 cx))
12398 This RTX is rejected by ix86_legitimate_address_p due to
12399 non-strictness of base register 97. Following this rejection,
12400 reload pushes all three components into separate registers,
12401 creating an invalid memory address RTX.
12403 The following code reloads only the invalid part of the
12404 memory address RTX. */
12406 if (GET_CODE (x) == PLUS
12407 && REG_P (XEXP (x, 1))
12408 && GET_CODE (XEXP (x, 0)) == PLUS
12409 && REG_P (XEXP (XEXP (x, 0), 1)))
12411 rtx base, index;
12412 bool something_reloaded = false;
12414 base = XEXP (XEXP (x, 0), 1);
12415 if (!REG_OK_FOR_BASE_STRICT_P (base))
12417 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12418 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12419 opnum, (enum reload_type) type);
12420 something_reloaded = true;
12423 index = XEXP (x, 1);
12424 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12426 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12427 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12428 opnum, (enum reload_type) type);
12429 something_reloaded = true;
12432 gcc_assert (something_reloaded);
12433 return true;
12436 return false;
12439 /* Recognizes RTL expressions that are valid memory addresses for an
12440 instruction. The MODE argument is the machine mode for the MEM
12441 expression that wants to use this address.
12443 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12444 convert common non-canonical forms to canonical form so that they will
12445 be recognized. */
12447 static bool
12448 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12449 rtx addr, bool strict)
12451 struct ix86_address parts;
12452 rtx base, index, disp;
12453 HOST_WIDE_INT scale;
12455 if (ix86_decompose_address (addr, &parts) <= 0)
12456 /* Decomposition failed. */
12457 return false;
12459 base = parts.base;
12460 index = parts.index;
12461 disp = parts.disp;
12462 scale = parts.scale;
12464 /* Validate base register. */
12465 if (base)
12467 rtx reg;
12469 if (REG_P (base))
12470 reg = base;
12471 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12472 reg = SUBREG_REG (base);
12473 else
12474 /* Base is not a register. */
12475 return false;
12477 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12478 return false;
12480 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12481 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12482 /* Base is not valid. */
12483 return false;
12486 /* Validate index register. */
12487 if (index)
12489 rtx reg;
12491 if (REG_P (index))
12492 reg = index;
12493 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12494 reg = SUBREG_REG (index);
12495 else
12496 /* Index is not a register. */
12497 return false;
12499 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12500 return false;
12502 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12503 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12504 /* Index is not valid. */
12505 return false;
12508 /* Index and base should have the same mode. */
12509 if (base && index
12510 && GET_MODE (base) != GET_MODE (index))
12511 return false;
12513 /* Validate scale factor. */
12514 if (scale != 1)
12516 if (!index)
12517 /* Scale without index. */
12518 return false;
12520 if (scale != 2 && scale != 4 && scale != 8)
12521 /* Scale is not a valid multiplier. */
12522 return false;
12525 /* Validate displacement. */
12526 if (disp)
12528 if (GET_CODE (disp) == CONST
12529 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12530 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12531 switch (XINT (XEXP (disp, 0), 1))
12533 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12534 used. While the ABI also specifies 32bit relocations, we don't produce
12535 them at all and use IP-relative addressing instead. */
12536 case UNSPEC_GOT:
12537 case UNSPEC_GOTOFF:
12538 gcc_assert (flag_pic);
12539 if (!TARGET_64BIT)
12540 goto is_legitimate_pic;
12542 /* 64bit address unspec. */
12543 return false;
12545 case UNSPEC_GOTPCREL:
12546 case UNSPEC_PCREL:
12547 gcc_assert (flag_pic);
12548 goto is_legitimate_pic;
12550 case UNSPEC_GOTTPOFF:
12551 case UNSPEC_GOTNTPOFF:
12552 case UNSPEC_INDNTPOFF:
12553 case UNSPEC_NTPOFF:
12554 case UNSPEC_DTPOFF:
12555 break;
12557 case UNSPEC_STACK_CHECK:
12558 gcc_assert (flag_split_stack);
12559 break;
12561 default:
12562 /* Invalid address unspec. */
12563 return false;
12566 else if (SYMBOLIC_CONST (disp)
12567 && (flag_pic
12568 || (TARGET_MACHO
12569 #if TARGET_MACHO
12570 && MACHOPIC_INDIRECT
12571 && !machopic_operand_p (disp)
12572 #endif
12576 is_legitimate_pic:
12577 if (TARGET_64BIT && (index || base))
12579 /* foo@dtpoff(%rX) is ok. */
12580 if (GET_CODE (disp) != CONST
12581 || GET_CODE (XEXP (disp, 0)) != PLUS
12582 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12583 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12584 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12585 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12586 /* Non-constant pic memory reference. */
12587 return false;
12589 else if ((!TARGET_MACHO || flag_pic)
12590 && ! legitimate_pic_address_disp_p (disp))
12591 /* Displacement is an invalid pic construct. */
12592 return false;
12593 #if TARGET_MACHO
12594 else if (MACHO_DYNAMIC_NO_PIC_P
12595 && !ix86_legitimate_constant_p (Pmode, disp))
12596 /* Displacement must be referenced via non_lazy_pointer. */
12597 return false;
12598 #endif
12600 /* This code used to verify that a symbolic pic displacement
12601 includes the pic_offset_table_rtx register.
12603 While this is a good idea, unfortunately these constructs may
12604 be created by the "adds using lea" optimization for incorrect
12605 code like:
12607 int a;
12608 int foo(int i)
12610 return *(&a+i);
12613 This code is nonsensical, but results in addressing the
12614 GOT table with pic_offset_table_rtx as the base. We can't
12615 just reject it easily, since it gets matched by the
12616 "addsi3" pattern, which is later split into an lea when the
12617 output register differs from the input. While this
12618 could be handled by a separate addsi pattern for this case
12619 that never results in an lea, disabling this test seems to
12620 be the easier and correct fix for the crash. */
12622 else if (GET_CODE (disp) != LABEL_REF
12623 && !CONST_INT_P (disp)
12624 && (GET_CODE (disp) != CONST
12625 || !ix86_legitimate_constant_p (Pmode, disp))
12626 && (GET_CODE (disp) != SYMBOL_REF
12627 || !ix86_legitimate_constant_p (Pmode, disp)))
12628 /* Displacement is not constant. */
12629 return false;
12630 else if (TARGET_64BIT
12631 && !x86_64_immediate_operand (disp, VOIDmode))
12632 /* Displacement is out of range. */
12633 return false;
12636 /* Everything looks valid. */
12637 return true;
12640 /* Determine if a given RTX is a valid constant address. */
12642 bool
12643 constant_address_p (rtx x)
12645 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12648 /* Return a unique alias set for the GOT. */
12650 static alias_set_type
12651 ix86_GOT_alias_set (void)
12653 static alias_set_type set = -1;
12654 if (set == -1)
12655 set = new_alias_set ();
12656 return set;
12659 /* Return a legitimate reference for ORIG (an address) using the
12660 register REG. If REG is 0, a new pseudo is generated.
12662 There are two types of references that must be handled:
12664 1. Global data references must load the address from the GOT, via
12665 the PIC reg. An insn is emitted to do this load, and the reg is
12666 returned.
12668 2. Static data references, constant pool addresses, and code labels
12669 compute the address as an offset from the GOT, whose base is in
12670 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12671 differentiate them from global data objects. The returned
12672 address is the PIC reg + an unspec constant.
12674 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12675 reg also appears in the address. */
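/* For illustration (32-bit ELF, AT&T syntax): a global symbol is
   reached through the GOT,
       movl  foo@GOT(%ebx), %eax     load foo's address from its GOT slot
   while a local symbol is a plain offset from the GOT base,
       leal  foo@GOTOFF(%ebx), %eax
   and 64-bit code uses an IP-relative GOT load, foo@GOTPCREL(%rip).
   The unspecs built below (UNSPEC_GOT, UNSPEC_GOTOFF, UNSPEC_GOTPCREL)
   are what output_pic_addr_const later prints as these relocation
   suffixes.  */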
12677 static rtx
12678 legitimize_pic_address (rtx orig, rtx reg)
12680 rtx addr = orig;
12681 rtx new_rtx = orig;
12683 #if TARGET_MACHO
12684 if (TARGET_MACHO && !TARGET_64BIT)
12686 if (reg == 0)
12687 reg = gen_reg_rtx (Pmode);
12688 /* Use the generic Mach-O PIC machinery. */
12689 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12691 #endif
12693 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12695 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12696 if (tmp)
12697 return tmp;
12700 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12701 new_rtx = addr;
12702 else if (TARGET_64BIT && !TARGET_PECOFF
12703 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12705 rtx tmpreg;
12706 /* This symbol may be referenced via a displacement from the PIC
12707 base address (@GOTOFF). */
12709 if (reload_in_progress)
12710 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12711 if (GET_CODE (addr) == CONST)
12712 addr = XEXP (addr, 0);
12713 if (GET_CODE (addr) == PLUS)
12715 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12716 UNSPEC_GOTOFF);
12717 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12719 else
12720 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12721 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12722 if (!reg)
12723 tmpreg = gen_reg_rtx (Pmode);
12724 else
12725 tmpreg = reg;
12726 emit_move_insn (tmpreg, new_rtx);
12728 if (reg != 0)
12730 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12731 tmpreg, 1, OPTAB_DIRECT);
12732 new_rtx = reg;
12734 else
12735 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12737 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12739 /* This symbol may be referenced via a displacement from the PIC
12740 base address (@GOTOFF). */
12742 if (reload_in_progress)
12743 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12744 if (GET_CODE (addr) == CONST)
12745 addr = XEXP (addr, 0);
12746 if (GET_CODE (addr) == PLUS)
12748 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12749 UNSPEC_GOTOFF);
12750 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12752 else
12753 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12754 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12755 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12757 if (reg != 0)
12759 emit_move_insn (reg, new_rtx);
12760 new_rtx = reg;
12763 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12764 /* We can't use @GOTOFF for text labels on VxWorks;
12765 see gotoff_operand. */
12766 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12768 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12769 if (tmp)
12770 return tmp;
12772 /* For x64 PE-COFF there is no GOT table, so we use the address
12773 directly. */
12774 if (TARGET_64BIT && TARGET_PECOFF)
12776 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12777 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12779 if (reg == 0)
12780 reg = gen_reg_rtx (Pmode);
12781 emit_move_insn (reg, new_rtx);
12782 new_rtx = reg;
12784 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12786 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12787 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12788 new_rtx = gen_const_mem (Pmode, new_rtx);
12789 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12791 if (reg == 0)
12792 reg = gen_reg_rtx (Pmode);
12793 /* Use gen_movsi directly, otherwise the address is loaded
12794 into a register for CSE. We don't want to CSE this address;
12795 instead we CSE addresses loaded from the GOT table, so skip this. */
12796 emit_insn (gen_movsi (reg, new_rtx));
12797 new_rtx = reg;
12799 else
12801 /* This symbol must be referenced via a load from the
12802 Global Offset Table (@GOT). */
12804 if (reload_in_progress)
12805 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12806 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12807 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12808 if (TARGET_64BIT)
12809 new_rtx = force_reg (Pmode, new_rtx);
12810 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12811 new_rtx = gen_const_mem (Pmode, new_rtx);
12812 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12814 if (reg == 0)
12815 reg = gen_reg_rtx (Pmode);
12816 emit_move_insn (reg, new_rtx);
12817 new_rtx = reg;
12820 else
12822 if (CONST_INT_P (addr)
12823 && !x86_64_immediate_operand (addr, VOIDmode))
12825 if (reg)
12827 emit_move_insn (reg, addr);
12828 new_rtx = reg;
12830 else
12831 new_rtx = force_reg (Pmode, addr);
12833 else if (GET_CODE (addr) == CONST)
12835 addr = XEXP (addr, 0);
12837 /* We must match stuff we generated before. Assume the only
12838 unspecs that can get here are ours. Not that we could do
12839 anything with them anyway.... */
12840 if (GET_CODE (addr) == UNSPEC
12841 || (GET_CODE (addr) == PLUS
12842 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12843 return orig;
12844 gcc_assert (GET_CODE (addr) == PLUS);
12846 if (GET_CODE (addr) == PLUS)
12848 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12850 /* Check first to see if this is a constant offset from a @GOTOFF
12851 symbol reference. */
12852 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12853 && CONST_INT_P (op1))
12855 if (!TARGET_64BIT)
12857 if (reload_in_progress)
12858 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12859 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12860 UNSPEC_GOTOFF);
12861 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12862 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12863 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12865 if (reg != 0)
12867 emit_move_insn (reg, new_rtx);
12868 new_rtx = reg;
12871 else
12873 if (INTVAL (op1) < -16*1024*1024
12874 || INTVAL (op1) >= 16*1024*1024)
12876 if (!x86_64_immediate_operand (op1, Pmode))
12877 op1 = force_reg (Pmode, op1);
12878 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12882 else
12884 rtx base = legitimize_pic_address (op0, reg);
12885 enum machine_mode mode = GET_MODE (base);
12886 new_rtx
12887 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12889 if (CONST_INT_P (new_rtx))
12891 if (INTVAL (new_rtx) < -16*1024*1024
12892 || INTVAL (new_rtx) >= 16*1024*1024)
12894 if (!x86_64_immediate_operand (new_rtx, mode))
12895 new_rtx = force_reg (mode, new_rtx);
12896 new_rtx
12897 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12899 else
12900 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12902 else
12904 if (GET_CODE (new_rtx) == PLUS
12905 && CONSTANT_P (XEXP (new_rtx, 1)))
12907 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12908 new_rtx = XEXP (new_rtx, 1);
12910 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12915 return new_rtx;
12918 /* Load the thread pointer. If TO_REG is true, force it into a register. */
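/* The UNSPEC_TP built here stands for the thread base pointer, which
   on this target is reached through a segment register; see the '@'
   operand code in ix86_print_operand below, which prints "fs" or "gs"
   for it as appropriate.  */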
12920 static rtx
12921 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12923 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12925 if (GET_MODE (tp) != tp_mode)
12927 gcc_assert (GET_MODE (tp) == SImode);
12928 gcc_assert (tp_mode == DImode);
12930 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12933 if (to_reg)
12934 tp = copy_to_mode_reg (tp_mode, tp);
12936 return tp;
12939 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12941 static GTY(()) rtx ix86_tls_symbol;
12943 static rtx
12944 ix86_tls_get_addr (void)
12946 if (!ix86_tls_symbol)
12948 const char *sym
12949 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12950 ? "___tls_get_addr" : "__tls_get_addr");
12952 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12955 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
12957 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
12958 UNSPEC_PLTOFF);
12959 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
12960 gen_rtx_CONST (Pmode, unspec));
12963 return ix86_tls_symbol;
12966 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12968 static GTY(()) rtx ix86_tls_module_base_symbol;
12971 ix86_tls_module_base (void)
12973 if (!ix86_tls_module_base_symbol)
12975 ix86_tls_module_base_symbol
12976 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12978 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12979 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12982 return ix86_tls_module_base_symbol;
12985 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12986 false if we expect this to be used for a memory address and true if
12987 we expect to load the address into a register. */
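/* The switch below covers the four TLS access models: global-dynamic
   and local-dynamic call __tls_get_addr (or use the TARGET_GNU2_TLS
   sequence), while initial-exec and local-exec reduce to an offset
   from the thread pointer via @gottpoff/@ntpoff-style relocations.  */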
12989 static rtx
12990 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12992 rtx dest, base, off;
12993 rtx pic = NULL_RTX, tp = NULL_RTX;
12994 enum machine_mode tp_mode = Pmode;
12995 int type;
12997 switch (model)
12999 case TLS_MODEL_GLOBAL_DYNAMIC:
13000 dest = gen_reg_rtx (Pmode);
13002 if (!TARGET_64BIT)
13004 if (flag_pic && !TARGET_PECOFF)
13005 pic = pic_offset_table_rtx;
13006 else
13008 pic = gen_reg_rtx (Pmode);
13009 emit_insn (gen_set_got (pic));
13013 if (TARGET_GNU2_TLS)
13015 if (TARGET_64BIT)
13016 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13017 else
13018 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13020 tp = get_thread_pointer (Pmode, true);
13021 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13023 if (GET_MODE (x) != Pmode)
13024 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13026 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13028 else
13030 rtx caddr = ix86_tls_get_addr ();
13032 if (TARGET_64BIT)
13034 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13035 rtx insns;
13037 start_sequence ();
13038 emit_call_insn
13039 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13040 insns = get_insns ();
13041 end_sequence ();
13043 if (GET_MODE (x) != Pmode)
13044 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13046 RTL_CONST_CALL_P (insns) = 1;
13047 emit_libcall_block (insns, dest, rax, x);
13049 else
13050 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13052 break;
13054 case TLS_MODEL_LOCAL_DYNAMIC:
13055 base = gen_reg_rtx (Pmode);
13057 if (!TARGET_64BIT)
13059 if (flag_pic)
13060 pic = pic_offset_table_rtx;
13061 else
13063 pic = gen_reg_rtx (Pmode);
13064 emit_insn (gen_set_got (pic));
13068 if (TARGET_GNU2_TLS)
13070 rtx tmp = ix86_tls_module_base ();
13072 if (TARGET_64BIT)
13073 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13074 else
13075 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13077 tp = get_thread_pointer (Pmode, true);
13078 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13079 gen_rtx_MINUS (Pmode, tmp, tp));
13081 else
13083 rtx caddr = ix86_tls_get_addr ();
13085 if (TARGET_64BIT)
13087 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13088 rtx insns, eqv;
13090 start_sequence ();
13091 emit_call_insn
13092 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13093 insns = get_insns ();
13094 end_sequence ();
13096 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13097 share the LD_BASE result with other LD model accesses. */
13098 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13099 UNSPEC_TLS_LD_BASE);
13101 RTL_CONST_CALL_P (insns) = 1;
13102 emit_libcall_block (insns, base, rax, eqv);
13104 else
13105 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13108 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13109 off = gen_rtx_CONST (Pmode, off);
13111 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13113 if (TARGET_GNU2_TLS)
13115 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13117 if (GET_MODE (x) != Pmode)
13118 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13120 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13122 break;
13124 case TLS_MODEL_INITIAL_EXEC:
13125 if (TARGET_64BIT)
13127 if (TARGET_SUN_TLS && !TARGET_X32)
13129 /* The Sun linker took the AMD64 TLS spec literally
13130 and can only handle %rax as the destination of the
13131 initial-exec code sequence. */
13133 dest = gen_reg_rtx (DImode);
13134 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13135 return dest;
13138 /* Generate DImode references to avoid %fs:(%reg32)
13139 problems and the linker IE->LE relaxation bug. */
13140 tp_mode = DImode;
13141 pic = NULL;
13142 type = UNSPEC_GOTNTPOFF;
13144 else if (flag_pic)
13146 if (reload_in_progress)
13147 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13148 pic = pic_offset_table_rtx;
13149 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13151 else if (!TARGET_ANY_GNU_TLS)
13153 pic = gen_reg_rtx (Pmode);
13154 emit_insn (gen_set_got (pic));
13155 type = UNSPEC_GOTTPOFF;
13157 else
13159 pic = NULL;
13160 type = UNSPEC_INDNTPOFF;
13163 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13164 off = gen_rtx_CONST (tp_mode, off);
13165 if (pic)
13166 off = gen_rtx_PLUS (tp_mode, pic, off);
13167 off = gen_const_mem (tp_mode, off);
13168 set_mem_alias_set (off, ix86_GOT_alias_set ());
13170 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13172 base = get_thread_pointer (tp_mode,
13173 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13174 off = force_reg (tp_mode, off);
13175 return gen_rtx_PLUS (tp_mode, base, off);
13177 else
13179 base = get_thread_pointer (Pmode, true);
13180 dest = gen_reg_rtx (Pmode);
13181 emit_insn (ix86_gen_sub3 (dest, base, off));
13183 break;
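/* Local-exec: the offset from the thread pointer is known at link
   time, so the access is simply tp + @ntpoff (or tp - @tpoff for the
   non-GNU-TLS variant handled by the subtraction below).  */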
13185 case TLS_MODEL_LOCAL_EXEC:
13186 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13187 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13188 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13189 off = gen_rtx_CONST (Pmode, off);
13191 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13193 base = get_thread_pointer (Pmode,
13194 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13195 return gen_rtx_PLUS (Pmode, base, off);
13197 else
13199 base = get_thread_pointer (Pmode, true);
13200 dest = gen_reg_rtx (Pmode);
13201 emit_insn (ix86_gen_sub3 (dest, base, off));
13203 break;
13205 default:
13206 gcc_unreachable ();
13209 return dest;
13212 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13213 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13214 unique refptr-DECL symbol corresponding to symbol DECL. */
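/* A dllimport'ed DECL is not addressed directly; the import table
   provides a pointer cell named "__imp_DECL" and references become
   loads through that cell.  get_dllimport_decl below caches one
   artificial VAR_DECL per DECL whose DECL_RTL is a MEM of such an
   "__imp_" (or "refptr.") symbol.  */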
13216 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13217 htab_t dllimport_map;
13219 static tree
13220 get_dllimport_decl (tree decl, bool beimport)
13222 struct tree_map *h, in;
13223 void **loc;
13224 const char *name;
13225 const char *prefix;
13226 size_t namelen, prefixlen;
13227 char *imp_name;
13228 tree to;
13229 rtx rtl;
13231 if (!dllimport_map)
13232 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13234 in.hash = htab_hash_pointer (decl);
13235 in.base.from = decl;
13236 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13237 h = (struct tree_map *) *loc;
13238 if (h)
13239 return h->to;
13241 *loc = h = ggc_alloc_tree_map ();
13242 h->hash = in.hash;
13243 h->base.from = decl;
13244 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13245 VAR_DECL, NULL, ptr_type_node);
13246 DECL_ARTIFICIAL (to) = 1;
13247 DECL_IGNORED_P (to) = 1;
13248 DECL_EXTERNAL (to) = 1;
13249 TREE_READONLY (to) = 1;
13251 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13252 name = targetm.strip_name_encoding (name);
13253 if (beimport)
13254 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13255 ? "*__imp_" : "*__imp__";
13256 else
13257 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13258 namelen = strlen (name);
13259 prefixlen = strlen (prefix);
13260 imp_name = (char *) alloca (namelen + prefixlen + 1);
13261 memcpy (imp_name, prefix, prefixlen);
13262 memcpy (imp_name + prefixlen, name, namelen + 1);
13264 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13265 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13266 SET_SYMBOL_REF_DECL (rtl, to);
13267 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13268 if (!beimport)
13270 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13271 #ifdef SUB_TARGET_RECORD_STUB
13272 SUB_TARGET_RECORD_STUB (name);
13273 #endif
13276 rtl = gen_const_mem (Pmode, rtl);
13277 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13279 SET_DECL_RTL (to, rtl);
13280 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13282 return to;
13285 /* Expand SYMBOL into its corresponding far-address symbol.
13286 WANT_REG is true if we require the result to be a register. */
13288 static rtx
13289 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13291 tree imp_decl;
13292 rtx x;
13294 gcc_assert (SYMBOL_REF_DECL (symbol));
13295 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13297 x = DECL_RTL (imp_decl);
13298 if (want_reg)
13299 x = force_reg (Pmode, x);
13300 return x;
13303 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13304 true if we require the result to be a register. */
13306 static rtx
13307 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13309 tree imp_decl;
13310 rtx x;
13312 gcc_assert (SYMBOL_REF_DECL (symbol));
13313 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13315 x = DECL_RTL (imp_decl);
13316 if (want_reg)
13317 x = force_reg (Pmode, x);
13318 return x;
13321 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13322 is true if we require the result to be a register. */
13324 static rtx
13325 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13327 if (!TARGET_PECOFF)
13328 return NULL_RTX;
13330 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13332 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13333 return legitimize_dllimport_symbol (addr, inreg);
13334 if (GET_CODE (addr) == CONST
13335 && GET_CODE (XEXP (addr, 0)) == PLUS
13336 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13337 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13339 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13340 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13344 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13345 return NULL_RTX;
13346 if (GET_CODE (addr) == SYMBOL_REF
13347 && !is_imported_p (addr)
13348 && SYMBOL_REF_EXTERNAL_P (addr)
13349 && SYMBOL_REF_DECL (addr))
13350 return legitimize_pe_coff_extern_decl (addr, inreg);
13352 if (GET_CODE (addr) == CONST
13353 && GET_CODE (XEXP (addr, 0)) == PLUS
13354 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13355 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13356 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13357 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13359 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13360 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13362 return NULL_RTX;
13365 /* Try machine-dependent ways of modifying an illegitimate address
13366 to be legitimate. If we find one, return the new, valid address.
13367 This macro is used in only one place: `memory_address' in explow.c.
13369 OLDX is the address as it was before break_out_memory_refs was called.
13370 In some cases it is useful to look at this to decide what needs to be done.
13372 It is always safe for this macro to do nothing. It exists to recognize
13373 opportunities to optimize the output.
13375 For the 80386, we handle X+REG by loading X into a register R and
13376 using R+REG. R will go in a general reg and indexing will be used.
13377 However, if REG is a broken-out memory address or multiplication,
13378 nothing needs to be done because REG can certainly go in a general reg.
13380 When -fpic is used, special handling is needed for symbolic references.
13381 See comments by legitimize_pic_address in i386.c for details. */
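/* Besides the PIC and TLS cases, the body below mostly canonicalizes
   address arithmetic so that ix86_legitimate_address_p will accept it:
   shifts by 0..3 become multiplies (reg << 2 turns into reg*4), the
   multiply is moved to the first operand of a PLUS, and constants are
   pulled out to the outermost level.  */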
13383 static rtx
13384 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13385 enum machine_mode mode)
13387 int changed = 0;
13388 unsigned log;
13390 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13391 if (log)
13392 return legitimize_tls_address (x, (enum tls_model) log, false);
13393 if (GET_CODE (x) == CONST
13394 && GET_CODE (XEXP (x, 0)) == PLUS
13395 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13396 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13398 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13399 (enum tls_model) log, false);
13400 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13403 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13405 rtx tmp = legitimize_pe_coff_symbol (x, true);
13406 if (tmp)
13407 return tmp;
13410 if (flag_pic && SYMBOLIC_CONST (x))
13411 return legitimize_pic_address (x, 0);
13413 #if TARGET_MACHO
13414 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13415 return machopic_indirect_data_reference (x, 0);
13416 #endif
13418 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13419 if (GET_CODE (x) == ASHIFT
13420 && CONST_INT_P (XEXP (x, 1))
13421 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13423 changed = 1;
13424 log = INTVAL (XEXP (x, 1));
13425 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13426 GEN_INT (1 << log));
13429 if (GET_CODE (x) == PLUS)
13431 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13433 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13434 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13435 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13437 changed = 1;
13438 log = INTVAL (XEXP (XEXP (x, 0), 1));
13439 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13440 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13441 GEN_INT (1 << log));
13444 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13445 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13446 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13448 changed = 1;
13449 log = INTVAL (XEXP (XEXP (x, 1), 1));
13450 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13451 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13452 GEN_INT (1 << log));
13455 /* Put multiply first if it isn't already. */
13456 if (GET_CODE (XEXP (x, 1)) == MULT)
13458 rtx tmp = XEXP (x, 0);
13459 XEXP (x, 0) = XEXP (x, 1);
13460 XEXP (x, 1) = tmp;
13461 changed = 1;
13464 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13465 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13466 created by virtual register instantiation, register elimination, and
13467 similar optimizations. */
13468 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13470 changed = 1;
13471 x = gen_rtx_PLUS (Pmode,
13472 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13473 XEXP (XEXP (x, 1), 0)),
13474 XEXP (XEXP (x, 1), 1));
13477 /* Canonicalize
13478 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13479 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13480 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13481 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13482 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13483 && CONSTANT_P (XEXP (x, 1)))
13485 rtx constant;
13486 rtx other = NULL_RTX;
13488 if (CONST_INT_P (XEXP (x, 1)))
13490 constant = XEXP (x, 1);
13491 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13493 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13495 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13496 other = XEXP (x, 1);
13498 else
13499 constant = 0;
13501 if (constant)
13503 changed = 1;
13504 x = gen_rtx_PLUS (Pmode,
13505 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13506 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13507 plus_constant (Pmode, other,
13508 INTVAL (constant)));
13512 if (changed && ix86_legitimate_address_p (mode, x, false))
13513 return x;
13515 if (GET_CODE (XEXP (x, 0)) == MULT)
13517 changed = 1;
13518 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13521 if (GET_CODE (XEXP (x, 1)) == MULT)
13523 changed = 1;
13524 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13527 if (changed
13528 && REG_P (XEXP (x, 1))
13529 && REG_P (XEXP (x, 0)))
13530 return x;
13532 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13534 changed = 1;
13535 x = legitimize_pic_address (x, 0);
13538 if (changed && ix86_legitimate_address_p (mode, x, false))
13539 return x;
13541 if (REG_P (XEXP (x, 0)))
13543 rtx temp = gen_reg_rtx (Pmode);
13544 rtx val = force_operand (XEXP (x, 1), temp);
13545 if (val != temp)
13547 val = convert_to_mode (Pmode, val, 1);
13548 emit_move_insn (temp, val);
13551 XEXP (x, 1) = temp;
13552 return x;
13555 else if (REG_P (XEXP (x, 1)))
13557 rtx temp = gen_reg_rtx (Pmode);
13558 rtx val = force_operand (XEXP (x, 0), temp);
13559 if (val != temp)
13561 val = convert_to_mode (Pmode, val, 1);
13562 emit_move_insn (temp, val);
13565 XEXP (x, 0) = temp;
13566 return x;
13570 return x;
13573 /* Print an integer constant expression in assembler syntax. Addition
13574 and subtraction are the only arithmetic that may appear in these
13575 expressions. FILE is the stdio stream to write to, X is the rtx, and
13576 CODE is the operand print code from the output string. */
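/* The unspec wrappers created by legitimize_pic_address and
   legitimize_tls_address come back out here as assembler relocation
   suffixes: UNSPEC_GOT -> "@GOT", UNSPEC_GOTOFF -> "@GOTOFF",
   UNSPEC_GOTPCREL -> "@GOTPCREL(%rip)", and the TLS unspecs ->
   "@tpoff", "@ntpoff", "@dtpoff", "@gottpoff" and so on; see the
   switch on XINT (x, 1) below.  */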
13578 static void
13579 output_pic_addr_const (FILE *file, rtx x, int code)
13581 char buf[256];
13583 switch (GET_CODE (x))
13585 case PC:
13586 gcc_assert (flag_pic);
13587 putc ('.', file);
13588 break;
13590 case SYMBOL_REF:
13591 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13592 output_addr_const (file, x);
13593 else
13595 const char *name = XSTR (x, 0);
13597 /* Mark the decl as referenced so that cgraph will
13598 output the function. */
13599 if (SYMBOL_REF_DECL (x))
13600 mark_decl_referenced (SYMBOL_REF_DECL (x));
13602 #if TARGET_MACHO
13603 if (MACHOPIC_INDIRECT
13604 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13605 name = machopic_indirection_name (x, /*stub_p=*/true);
13606 #endif
13607 assemble_name (file, name);
13609 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13610 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13611 fputs ("@PLT", file);
13612 break;
13614 case LABEL_REF:
13615 x = XEXP (x, 0);
13616 /* FALLTHRU */
13617 case CODE_LABEL:
13618 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13619 assemble_name (asm_out_file, buf);
13620 break;
13622 case CONST_INT:
13623 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13624 break;
13626 case CONST:
13627 /* This used to output parentheses around the expression,
13628 but that does not work on the 386 (either ATT or BSD assembler). */
13629 output_pic_addr_const (file, XEXP (x, 0), code);
13630 break;
13632 case CONST_DOUBLE:
13633 if (GET_MODE (x) == VOIDmode)
13635 /* We can use %d if the number is <32 bits and positive. */
13636 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13637 fprintf (file, "0x%lx%08lx",
13638 (unsigned long) CONST_DOUBLE_HIGH (x),
13639 (unsigned long) CONST_DOUBLE_LOW (x));
13640 else
13641 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13643 else
13644 /* We can't handle floating point constants;
13645 TARGET_PRINT_OPERAND must handle them. */
13646 output_operand_lossage ("floating constant misused");
13647 break;
13649 case PLUS:
13650 /* Some assemblers need integer constants to appear first. */
13651 if (CONST_INT_P (XEXP (x, 0)))
13653 output_pic_addr_const (file, XEXP (x, 0), code);
13654 putc ('+', file);
13655 output_pic_addr_const (file, XEXP (x, 1), code);
13657 else
13659 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13660 output_pic_addr_const (file, XEXP (x, 1), code);
13661 putc ('+', file);
13662 output_pic_addr_const (file, XEXP (x, 0), code);
13664 break;
13666 case MINUS:
13667 if (!TARGET_MACHO)
13668 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13669 output_pic_addr_const (file, XEXP (x, 0), code);
13670 putc ('-', file);
13671 output_pic_addr_const (file, XEXP (x, 1), code);
13672 if (!TARGET_MACHO)
13673 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13674 break;
13676 case UNSPEC:
13677 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13679 bool f = i386_asm_output_addr_const_extra (file, x);
13680 gcc_assert (f);
13681 break;
13684 gcc_assert (XVECLEN (x, 0) == 1);
13685 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13686 switch (XINT (x, 1))
13688 case UNSPEC_GOT:
13689 fputs ("@GOT", file);
13690 break;
13691 case UNSPEC_GOTOFF:
13692 fputs ("@GOTOFF", file);
13693 break;
13694 case UNSPEC_PLTOFF:
13695 fputs ("@PLTOFF", file);
13696 break;
13697 case UNSPEC_PCREL:
13698 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13699 "(%rip)" : "[rip]", file);
13700 break;
13701 case UNSPEC_GOTPCREL:
13702 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13703 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13704 break;
13705 case UNSPEC_GOTTPOFF:
13706 /* FIXME: This might be @TPOFF in Sun ld too. */
13707 fputs ("@gottpoff", file);
13708 break;
13709 case UNSPEC_TPOFF:
13710 fputs ("@tpoff", file);
13711 break;
13712 case UNSPEC_NTPOFF:
13713 if (TARGET_64BIT)
13714 fputs ("@tpoff", file);
13715 else
13716 fputs ("@ntpoff", file);
13717 break;
13718 case UNSPEC_DTPOFF:
13719 fputs ("@dtpoff", file);
13720 break;
13721 case UNSPEC_GOTNTPOFF:
13722 if (TARGET_64BIT)
13723 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13724 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13725 else
13726 fputs ("@gotntpoff", file);
13727 break;
13728 case UNSPEC_INDNTPOFF:
13729 fputs ("@indntpoff", file);
13730 break;
13731 #if TARGET_MACHO
13732 case UNSPEC_MACHOPIC_OFFSET:
13733 putc ('-', file);
13734 machopic_output_function_base_name (file);
13735 break;
13736 #endif
13737 default:
13738 output_operand_lossage ("invalid UNSPEC as operand");
13739 break;
13741 break;
13743 default:
13744 output_operand_lossage ("invalid expression as operand");
13748 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13749 We need to emit DTP-relative relocations. */
13751 static void ATTRIBUTE_UNUSED
13752 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13754 fputs (ASM_LONG, file);
13755 output_addr_const (file, x);
13756 fputs ("@dtpoff", file);
13757 switch (size)
13759 case 4:
13760 break;
13761 case 8:
13762 fputs (", 0", file);
13763 break;
13764 default:
13765 gcc_unreachable ();
13769 /* Return true if X is a representation of the PIC register. This copes
13770 with calls from ix86_find_base_term, where the register might have
13771 been replaced by a cselib value. */
13773 static bool
13774 ix86_pic_register_p (rtx x)
13776 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13777 return (pic_offset_table_rtx
13778 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13779 else
13780 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13783 /* Helper function for ix86_delegitimize_address.
13784 Attempt to delegitimize TLS local-exec accesses. */
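/* The access being undone has the shape seg:sym@ntpoff(+base+index*scale),
   i.e. an address whose displacement is a CONST wrapping UNSPEC_NTPOFF;
   the plain SYMBOL_REF is rebuilt, with any remaining base/index
   arithmetic re-attached around it.  */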
13786 static rtx
13787 ix86_delegitimize_tls_address (rtx orig_x)
13789 rtx x = orig_x, unspec;
13790 struct ix86_address addr;
13792 if (!TARGET_TLS_DIRECT_SEG_REFS)
13793 return orig_x;
13794 if (MEM_P (x))
13795 x = XEXP (x, 0);
13796 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13797 return orig_x;
13798 if (ix86_decompose_address (x, &addr) == 0
13799 || addr.seg != DEFAULT_TLS_SEG_REG
13800 || addr.disp == NULL_RTX
13801 || GET_CODE (addr.disp) != CONST)
13802 return orig_x;
13803 unspec = XEXP (addr.disp, 0);
13804 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13805 unspec = XEXP (unspec, 0);
13806 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13807 return orig_x;
13808 x = XVECEXP (unspec, 0, 0);
13809 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13810 if (unspec != XEXP (addr.disp, 0))
13811 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13812 if (addr.index)
13814 rtx idx = addr.index;
13815 if (addr.scale != 1)
13816 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13817 x = gen_rtx_PLUS (Pmode, idx, x);
13819 if (addr.base)
13820 x = gen_rtx_PLUS (Pmode, addr.base, x);
13821 if (MEM_P (orig_x))
13822 x = replace_equiv_address_nv (orig_x, x);
13823 return x;
13826 /* In the name of slightly smaller debug output, and to cater to
13827 general assembler lossage, recognize PIC+GOTOFF and turn it back
13828 into a direct symbol reference.
13830 On Darwin, this is necessary to avoid a crash, because Darwin
13831 has a different PIC label for each routine but the DWARF debugging
13832 information is not associated with any particular routine, so it's
13833 necessary to remove references to the PIC label from RTL stored by
13834 the DWARF output code. */
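/* Typical input is (plus %ebx (const (unspec [foo] UNSPEC_GOTOFF))),
   possibly wrapped in a MEM and possibly with an extra register or
   constant addend; the value handed back is just "foo" plus whatever
   addend was left over, which is what the debug info wants to see.  */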
13836 static rtx
13837 ix86_delegitimize_address (rtx x)
13839 rtx orig_x = delegitimize_mem_from_attrs (x);
13840 /* addend is NULL or some rtx if x is something+GOTOFF where
13841 something doesn't include the PIC register. */
13842 rtx addend = NULL_RTX;
13843 /* reg_addend is NULL or a multiple of some register. */
13844 rtx reg_addend = NULL_RTX;
13845 /* const_addend is NULL or a const_int. */
13846 rtx const_addend = NULL_RTX;
13847 /* This is the result, or NULL. */
13848 rtx result = NULL_RTX;
13850 x = orig_x;
13852 if (MEM_P (x))
13853 x = XEXP (x, 0);
13855 if (TARGET_64BIT)
13857 if (GET_CODE (x) == CONST
13858 && GET_CODE (XEXP (x, 0)) == PLUS
13859 && GET_MODE (XEXP (x, 0)) == Pmode
13860 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13861 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13862 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13864 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13865 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13866 if (MEM_P (orig_x))
13867 x = replace_equiv_address_nv (orig_x, x);
13868 return x;
13871 if (GET_CODE (x) == CONST
13872 && GET_CODE (XEXP (x, 0)) == UNSPEC
13873 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13874 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13875 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13877 x = XVECEXP (XEXP (x, 0), 0, 0);
13878 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13880 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13881 GET_MODE (x), 0);
13882 if (x == NULL_RTX)
13883 return orig_x;
13885 return x;
13888 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13889 return ix86_delegitimize_tls_address (orig_x);
13891 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13892 and -mcmodel=medium -fpic. */
13895 if (GET_CODE (x) != PLUS
13896 || GET_CODE (XEXP (x, 1)) != CONST)
13897 return ix86_delegitimize_tls_address (orig_x);
13899 if (ix86_pic_register_p (XEXP (x, 0)))
13900 /* %ebx + GOT/GOTOFF */
13902 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13904 /* %ebx + %reg * scale + GOT/GOTOFF */
13905 reg_addend = XEXP (x, 0);
13906 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13907 reg_addend = XEXP (reg_addend, 1);
13908 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13909 reg_addend = XEXP (reg_addend, 0);
13910 else
13912 reg_addend = NULL_RTX;
13913 addend = XEXP (x, 0);
13916 else
13917 addend = XEXP (x, 0);
13919 x = XEXP (XEXP (x, 1), 0);
13920 if (GET_CODE (x) == PLUS
13921 && CONST_INT_P (XEXP (x, 1)))
13923 const_addend = XEXP (x, 1);
13924 x = XEXP (x, 0);
13927 if (GET_CODE (x) == UNSPEC
13928 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13929 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13930 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13931 && !MEM_P (orig_x) && !addend)))
13932 result = XVECEXP (x, 0, 0);
13934 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13935 && !MEM_P (orig_x))
13936 result = XVECEXP (x, 0, 0);
13938 if (! result)
13939 return ix86_delegitimize_tls_address (orig_x);
13941 if (const_addend)
13942 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13943 if (reg_addend)
13944 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13945 if (addend)
13947 /* If the rest of the original X doesn't involve the PIC register, add
13948 addend and subtract pic_offset_table_rtx. This can happen e.g.
13949 for code like:
13950 leal (%ebx, %ecx, 4), %ecx
13952 movl foo@GOTOFF(%ecx), %edx
13953 in which case we return (%ecx - %ebx) + foo. */
13954 if (pic_offset_table_rtx)
13955 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13956 pic_offset_table_rtx),
13957 result);
13958 else
13959 return orig_x;
13961 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13963 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13964 if (result == NULL_RTX)
13965 return orig_x;
13967 return result;
13970 /* If X is a machine specific address (i.e. a symbol or label being
13971 referenced as a displacement from the GOT implemented using an
13972 UNSPEC), then return the base term. Otherwise return X. */
13975 ix86_find_base_term (rtx x)
13977 rtx term;
13979 if (TARGET_64BIT)
13981 if (GET_CODE (x) != CONST)
13982 return x;
13983 term = XEXP (x, 0);
13984 if (GET_CODE (term) == PLUS
13985 && (CONST_INT_P (XEXP (term, 1))
13986 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13987 term = XEXP (term, 0);
13988 if (GET_CODE (term) != UNSPEC
13989 || (XINT (term, 1) != UNSPEC_GOTPCREL
13990 && XINT (term, 1) != UNSPEC_PCREL))
13991 return x;
13993 return XVECEXP (term, 0, 0);
13996 return ix86_delegitimize_address (x);
13999 static void
14000 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14001 bool fp, FILE *file)
14003 const char *suffix;
14005 if (mode == CCFPmode || mode == CCFPUmode)
14007 code = ix86_fp_compare_code_to_integer (code);
14008 mode = CCmode;
14010 if (reverse)
14011 code = reverse_condition (code);
14013 switch (code)
14015 case EQ:
14016 switch (mode)
14018 case CCAmode:
14019 suffix = "a";
14020 break;
14022 case CCCmode:
14023 suffix = "c";
14024 break;
14026 case CCOmode:
14027 suffix = "o";
14028 break;
14030 case CCSmode:
14031 suffix = "s";
14032 break;
14034 default:
14035 suffix = "e";
14037 break;
14038 case NE:
14039 switch (mode)
14041 case CCAmode:
14042 suffix = "na";
14043 break;
14045 case CCCmode:
14046 suffix = "nc";
14047 break;
14049 case CCOmode:
14050 suffix = "no";
14051 break;
14053 case CCSmode:
14054 suffix = "ns";
14055 break;
14057 default:
14058 suffix = "ne";
14060 break;
14061 case GT:
14062 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14063 suffix = "g";
14064 break;
14065 case GTU:
14066 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14067 Those same assemblers have the same but opposite lossage on cmov. */
14068 if (mode == CCmode)
14069 suffix = fp ? "nbe" : "a";
14070 else if (mode == CCCmode)
14071 suffix = "b";
14072 else
14073 gcc_unreachable ();
14074 break;
14075 case LT:
14076 switch (mode)
14078 case CCNOmode:
14079 case CCGOCmode:
14080 suffix = "s";
14081 break;
14083 case CCmode:
14084 case CCGCmode:
14085 suffix = "l";
14086 break;
14088 default:
14089 gcc_unreachable ();
14091 break;
14092 case LTU:
14093 gcc_assert (mode == CCmode || mode == CCCmode);
14094 suffix = "b";
14095 break;
14096 case GE:
14097 switch (mode)
14099 case CCNOmode:
14100 case CCGOCmode:
14101 suffix = "ns";
14102 break;
14104 case CCmode:
14105 case CCGCmode:
14106 suffix = "ge";
14107 break;
14109 default:
14110 gcc_unreachable ();
14112 break;
14113 case GEU:
14114 /* ??? As above. */
14115 gcc_assert (mode == CCmode || mode == CCCmode);
14116 suffix = fp ? "nb" : "ae";
14117 break;
14118 case LE:
14119 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14120 suffix = "le";
14121 break;
14122 case LEU:
14123 /* ??? As above. */
14124 if (mode == CCmode)
14125 suffix = "be";
14126 else if (mode == CCCmode)
14127 suffix = fp ? "nb" : "ae";
14128 else
14129 gcc_unreachable ();
14130 break;
14131 case UNORDERED:
14132 suffix = fp ? "u" : "p";
14133 break;
14134 case ORDERED:
14135 suffix = fp ? "nu" : "np";
14136 break;
14137 default:
14138 gcc_unreachable ();
14140 fputs (suffix, file);
14143 /* Print the name of register X to FILE based on its machine mode and number.
14144 If CODE is 'w', pretend the mode is HImode.
14145 If CODE is 'b', pretend the mode is QImode.
14146 If CODE is 'k', pretend the mode is SImode.
14147 If CODE is 'q', pretend the mode is DImode.
14148 If CODE is 'x', pretend the mode is V4SFmode.
14149 If CODE is 't', pretend the mode is V8SFmode.
14150 If CODE is 'g', pretend the mode is V16SFmode.
14151 If CODE is 'h', pretend the reg is the 'high' byte register.
14152 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14153 If CODE is 'd', duplicate the operand for AVX instruction.
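/* For example, with operand (reg:SI 0) (%eax): code 'q' prints "rax",
   'k' prints "eax", 'w' prints "ax", 'b' prints "al" and 'h' prints
   "ah"; this mirrors the %b0/%w0/%k0/%q0 operand modifiers documented
   before ix86_print_operand below.  */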
14156 void
14157 print_reg (rtx x, int code, FILE *file)
14159 const char *reg;
14160 unsigned int regno;
14161 bool duplicated = code == 'd' && TARGET_AVX;
14163 if (ASSEMBLER_DIALECT == ASM_ATT)
14164 putc ('%', file);
14166 if (x == pc_rtx)
14168 gcc_assert (TARGET_64BIT);
14169 fputs ("rip", file);
14170 return;
14173 regno = true_regnum (x);
14174 gcc_assert (regno != ARG_POINTER_REGNUM
14175 && regno != FRAME_POINTER_REGNUM
14176 && regno != FLAGS_REG
14177 && regno != FPSR_REG
14178 && regno != FPCR_REG);
14180 if (code == 'w' || MMX_REG_P (x))
14181 code = 2;
14182 else if (code == 'b')
14183 code = 1;
14184 else if (code == 'k')
14185 code = 4;
14186 else if (code == 'q')
14187 code = 8;
14188 else if (code == 'y')
14189 code = 3;
14190 else if (code == 'h')
14191 code = 0;
14192 else if (code == 'x')
14193 code = 16;
14194 else if (code == 't')
14195 code = 32;
14196 else if (code == 'g')
14197 code = 64;
14198 else
14199 code = GET_MODE_SIZE (GET_MODE (x));
14201 /* Irritatingly, the AMD extended registers use a different naming
14202 convention from the normal registers: "r%d[bwd]". */
14203 if (REX_INT_REGNO_P (regno))
14205 gcc_assert (TARGET_64BIT);
14206 putc ('r', file);
14207 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14208 switch (code)
14210 case 0:
14211 error ("extended registers have no high halves");
14212 break;
14213 case 1:
14214 putc ('b', file);
14215 break;
14216 case 2:
14217 putc ('w', file);
14218 break;
14219 case 4:
14220 putc ('d', file);
14221 break;
14222 case 8:
14223 /* no suffix */
14224 break;
14225 default:
14226 error ("unsupported operand size for extended register");
14227 break;
14229 return;
14232 reg = NULL;
14233 switch (code)
14235 case 3:
14236 if (STACK_TOP_P (x))
14238 reg = "st(0)";
14239 break;
14241 /* FALLTHRU */
14242 case 8:
14243 case 4:
14244 case 12:
14245 if (! ANY_FP_REG_P (x))
14246 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14247 /* FALLTHRU */
14248 case 16:
14249 case 2:
14250 normal:
14251 reg = hi_reg_name[regno];
14252 break;
14253 case 1:
14254 if (regno >= ARRAY_SIZE (qi_reg_name))
14255 goto normal;
14256 reg = qi_reg_name[regno];
14257 break;
14258 case 0:
14259 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14260 goto normal;
14261 reg = qi_high_reg_name[regno];
14262 break;
14263 case 32:
14264 if (SSE_REG_P (x))
14266 gcc_assert (!duplicated);
14267 putc ('y', file);
14268 fputs (hi_reg_name[regno] + 1, file);
14269 return;
14271 case 64:
14272 if (SSE_REG_P (x))
14274 gcc_assert (!duplicated);
14275 putc ('z', file);
14276 fputs (hi_reg_name[REGNO (x)] + 1, file);
14277 return;
14279 break;
14280 default:
14281 gcc_unreachable ();
14284 fputs (reg, file);
14285 if (duplicated)
14287 if (ASSEMBLER_DIALECT == ASM_ATT)
14288 fprintf (file, ", %%%s", reg);
14289 else
14290 fprintf (file, ", %s", reg);
14294 /* Locate some local-dynamic symbol still in use by this function
14295 so that we can print its name in some tls_local_dynamic_base
14296 pattern. */
14298 static int
14299 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14301 rtx x = *px;
14303 if (GET_CODE (x) == SYMBOL_REF
14304 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14306 cfun->machine->some_ld_name = XSTR (x, 0);
14307 return 1;
14310 return 0;
14313 static const char *
14314 get_some_local_dynamic_name (void)
14316 rtx insn;
14318 if (cfun->machine->some_ld_name)
14319 return cfun->machine->some_ld_name;
14321 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14322 if (NONDEBUG_INSN_P (insn)
14323 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14324 return cfun->machine->some_ld_name;
14326 return NULL;
14329 /* Meaning of CODE:
14330 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14331 C -- print opcode suffix for set/cmov insn.
14332 c -- like C, but print reversed condition
14333 F,f -- likewise, but for floating-point.
14334 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14335 otherwise nothing
14336 R -- print the prefix for register names.
14337 z -- print the opcode suffix for the size of the current operand.
14338 Z -- likewise, with special suffixes for x87 instructions.
14339 * -- print a star (in certain assembler syntax)
14340 A -- print an absolute memory reference.
14341 E -- print address with DImode register names if TARGET_64BIT.
14342 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14343 s -- print a shift double count, followed by the assembler's argument
14344 delimiter.
14345 b -- print the QImode name of the register for the indicated operand.
14346 %b0 would print %al if operands[0] is reg 0.
14347 w -- likewise, print the HImode name of the register.
14348 k -- likewise, print the SImode name of the register.
14349 q -- likewise, print the DImode name of the register.
14350 x -- likewise, print the V4SFmode name of the register.
14351 t -- likewise, print the V8SFmode name of the register.
14352 g -- likewise, print the V16SFmode name of the register.
14353 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14354 y -- print "st(0)" instead of "st" as a register.
14355 d -- print duplicated register operand for AVX instruction.
14356 D -- print condition for SSE cmp instruction.
14357 P -- if PIC, print an @PLT suffix.
14358 p -- print raw symbol name.
14359 X -- don't print any sort of PIC '@' suffix for a symbol.
14360 & -- print some in-use local-dynamic symbol name.
14361 H -- print a memory address offset by 8; used for sse high-parts
14362 Y -- print condition for XOP pcom* instruction.
14363 + -- print a branch hint as 'cs' or 'ds' prefix
14364 ; -- print a semicolon (after prefixes due to bug in older gas).
14365 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14366 @ -- print a segment register of thread base pointer load
14367 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14370 void
14371 ix86_print_operand (FILE *file, rtx x, int code)
14373 if (code)
14375 switch (code)
14377 case 'A':
14378 switch (ASSEMBLER_DIALECT)
14380 case ASM_ATT:
14381 putc ('*', file);
14382 break;
14384 case ASM_INTEL:
14385 /* Intel syntax. For absolute addresses, registers should not
14386 be surrounded by brackets. */
14387 if (!REG_P (x))
14389 putc ('[', file);
14390 ix86_print_operand (file, x, 0);
14391 putc (']', file);
14392 return;
14394 break;
14396 default:
14397 gcc_unreachable ();
14400 ix86_print_operand (file, x, 0);
14401 return;
14403 case 'E':
14404 /* Wrap address in an UNSPEC to declare special handling. */
14405 if (TARGET_64BIT)
14406 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14408 output_address (x);
14409 return;
14411 case 'L':
14412 if (ASSEMBLER_DIALECT == ASM_ATT)
14413 putc ('l', file);
14414 return;
14416 case 'W':
14417 if (ASSEMBLER_DIALECT == ASM_ATT)
14418 putc ('w', file);
14419 return;
14421 case 'B':
14422 if (ASSEMBLER_DIALECT == ASM_ATT)
14423 putc ('b', file);
14424 return;
14426 case 'Q':
14427 if (ASSEMBLER_DIALECT == ASM_ATT)
14428 putc ('l', file);
14429 return;
14431 case 'S':
14432 if (ASSEMBLER_DIALECT == ASM_ATT)
14433 putc ('s', file);
14434 return;
14436 case 'T':
14437 if (ASSEMBLER_DIALECT == ASM_ATT)
14438 putc ('t', file);
14439 return;
14441 case 'O':
14442 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14443 if (ASSEMBLER_DIALECT != ASM_ATT)
14444 return;
14446 switch (GET_MODE_SIZE (GET_MODE (x)))
14448 case 2:
14449 putc ('w', file);
14450 break;
14452 case 4:
14453 putc ('l', file);
14454 break;
14456 case 8:
14457 putc ('q', file);
14458 break;
14460 default:
14461 output_operand_lossage
14462 ("invalid operand size for operand code 'O'");
14463 return;
14466 putc ('.', file);
14467 #endif
14468 return;
14470 case 'z':
14471 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14473 /* Opcodes don't get size suffixes in Intel syntax. */
14474 if (ASSEMBLER_DIALECT == ASM_INTEL)
14475 return;
14477 switch (GET_MODE_SIZE (GET_MODE (x)))
14479 case 1:
14480 putc ('b', file);
14481 return;
14483 case 2:
14484 putc ('w', file);
14485 return;
14487 case 4:
14488 putc ('l', file);
14489 return;
14491 case 8:
14492 putc ('q', file);
14493 return;
14495 default:
14496 output_operand_lossage
14497 ("invalid operand size for operand code 'z'");
14498 return;
14502 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14503 warning
14504 (0, "non-integer operand used with operand code 'z'");
14505 /* FALLTHRU */
14507 case 'Z':
14508 /* 387 opcodes don't get size suffixes in Intel syntax. */
14509 if (ASSEMBLER_DIALECT == ASM_INTEL)
14510 return;
14512 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14514 switch (GET_MODE_SIZE (GET_MODE (x)))
14516 case 2:
14517 #ifdef HAVE_AS_IX86_FILDS
14518 putc ('s', file);
14519 #endif
14520 return;
14522 case 4:
14523 putc ('l', file);
14524 return;
14526 case 8:
14527 #ifdef HAVE_AS_IX86_FILDQ
14528 putc ('q', file);
14529 #else
14530 fputs ("ll", file);
14531 #endif
14532 return;
14534 default:
14535 break;
14538 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14540 /* 387 opcodes don't get size suffixes
14541 if the operands are registers. */
14542 if (STACK_REG_P (x))
14543 return;
14545 switch (GET_MODE_SIZE (GET_MODE (x)))
14547 case 4:
14548 putc ('s', file);
14549 return;
14551 case 8:
14552 putc ('l', file);
14553 return;
14555 case 12:
14556 case 16:
14557 putc ('t', file);
14558 return;
14560 default:
14561 break;
14564 else
14566 output_operand_lossage
14567 ("invalid operand type used with operand code 'Z'");
14568 return;
14571 output_operand_lossage
14572 ("invalid operand size for operand code 'Z'");
14573 return;
14575 case 'd':
14576 case 'b':
14577 case 'w':
14578 case 'k':
14579 case 'q':
14580 case 'h':
14581 case 't':
14582 case 'g':
14583 case 'y':
14584 case 'x':
14585 case 'X':
14586 case 'P':
14587 case 'p':
14588 break;
14590 case 's':
14591 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14593 ix86_print_operand (file, x, 0);
14594 fputs (", ", file);
14596 return;
14598 case 'Y':
14599 switch (GET_CODE (x))
14601 case NE:
14602 fputs ("neq", file);
14603 break;
14604 case EQ:
14605 fputs ("eq", file);
14606 break;
14607 case GE:
14608 case GEU:
14609 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14610 break;
14611 case GT:
14612 case GTU:
14613 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14614 break;
14615 case LE:
14616 case LEU:
14617 fputs ("le", file);
14618 break;
14619 case LT:
14620 case LTU:
14621 fputs ("lt", file);
14622 break;
14623 case UNORDERED:
14624 fputs ("unord", file);
14625 break;
14626 case ORDERED:
14627 fputs ("ord", file);
14628 break;
14629 case UNEQ:
14630 fputs ("ueq", file);
14631 break;
14632 case UNGE:
14633 fputs ("nlt", file);
14634 break;
14635 case UNGT:
14636 fputs ("nle", file);
14637 break;
14638 case UNLE:
14639 fputs ("ule", file);
14640 break;
14641 case UNLT:
14642 fputs ("ult", file);
14643 break;
14644 case LTGT:
14645 fputs ("une", file);
14646 break;
14647 default:
14648 output_operand_lossage ("operand is not a condition code, "
14649 "invalid operand code 'Y'");
14650 return;
14652 return;
14654 case 'D':
14655 /* A little bit of brain damage here. The SSE compare instructions
14656 use completely different names for the comparisons than the
14657 fp conditional moves do. */
14658 switch (GET_CODE (x))
14660 case UNEQ:
14661 if (TARGET_AVX)
14663 fputs ("eq_us", file);
14664 break;
14666 case EQ:
14667 fputs ("eq", file);
14668 break;
14669 case UNLT:
14670 if (TARGET_AVX)
14672 fputs ("nge", file);
14673 break;
14675 case LT:
14676 fputs ("lt", file);
14677 break;
14678 case UNLE:
14679 if (TARGET_AVX)
14681 fputs ("ngt", file);
14682 break;
14684 case LE:
14685 fputs ("le", file);
14686 break;
14687 case UNORDERED:
14688 fputs ("unord", file);
14689 break;
14690 case LTGT:
14691 if (TARGET_AVX)
14693 fputs ("neq_oq", file);
14694 break;
14696 case NE:
14697 fputs ("neq", file);
14698 break;
14699 case GE:
14700 if (TARGET_AVX)
14702 fputs ("ge", file);
14703 break;
14705 case UNGE:
14706 fputs ("nlt", file);
14707 break;
14708 case GT:
14709 if (TARGET_AVX)
14711 fputs ("gt", file);
14712 break;
14714 case UNGT:
14715 fputs ("nle", file);
14716 break;
14717 case ORDERED:
14718 fputs ("ord", file);
14719 break;
14720 default:
14721 output_operand_lossage ("operand is not a condition code, "
14722 "invalid operand code 'D'");
14723 return;
14725 return;
14727 case 'F':
14728 case 'f':
14729 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14730 if (ASSEMBLER_DIALECT == ASM_ATT)
14731 putc ('.', file);
14732 #endif
14734 case 'C':
14735 case 'c':
14736 if (!COMPARISON_P (x))
14738 output_operand_lossage ("operand is not a condition code, "
14739 "invalid operand code '%c'", code);
14740 return;
14742 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14743 code == 'c' || code == 'f',
14744 code == 'F' || code == 'f',
14745 file);
14746 return;
14748 case 'H':
14749 if (!offsettable_memref_p (x))
14751 output_operand_lossage ("operand is not an offsettable memory "
14752 "reference, invalid operand code 'H'");
14753 return;
14755 /* It doesn't actually matter what mode we use here, as we're
14756 only going to use this for printing. */
14757 x = adjust_address_nv (x, DImode, 8);
14758 /* Output 'qword ptr' for intel assembler dialect. */
14759 if (ASSEMBLER_DIALECT == ASM_INTEL)
14760 code = 'q';
14761 break;
14763 case 'K':
14764 gcc_assert (CONST_INT_P (x));
14766 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14767 #ifdef HAVE_AS_IX86_HLE
14768 fputs ("xacquire ", file);
14769 #else
14770 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14771 #endif
14772 else if (INTVAL (x) & IX86_HLE_RELEASE)
14773 #ifdef HAVE_AS_IX86_HLE
14774 fputs ("xrelease ", file);
14775 #else
14776 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14777 #endif
14778       /* We do not want to print the value of the operand.  */
14779 return;
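      /* Illustrative note, not part of the original source: the HLE
         "xacquire"/"xrelease" hints are encoded as the 0xf2/0xf3 prefix
         bytes, so the raw ASM_BYTE fallbacks above produce the same
         machine code as the mnemonics emitted when HAVE_AS_IX86_HLE is
         defined.  */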
14781 case '*':
14782 if (ASSEMBLER_DIALECT == ASM_ATT)
14783 putc ('*', file);
14784 return;
14786 case '&':
14788 const char *name = get_some_local_dynamic_name ();
14789 if (name == NULL)
14790 output_operand_lossage ("'%%&' used without any "
14791 "local dynamic TLS references");
14792 else
14793 assemble_name (file, name);
14794 return;
14797 case '+':
14799 rtx x;
14801 if (!optimize
14802 || optimize_function_for_size_p (cfun)
14803 || !TARGET_BRANCH_PREDICTION_HINTS)
14804 return;
14806 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14807 if (x)
14809 int pred_val = INTVAL (XEXP (x, 0));
14811 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14812 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14814 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14815 bool cputaken
14816 = final_forward_branch_p (current_output_insn) == 0;
14818              /* Emit hints only in the case where the default branch
14819                 prediction heuristics would fail.  */
14820 if (taken != cputaken)
14822 /* We use 3e (DS) prefix for taken branches and
14823 2e (CS) prefix for not taken branches. */
14824 if (taken)
14825 fputs ("ds ; ", file);
14826 else
14827 fputs ("cs ; ", file);
14831 return;
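              /* Illustrative example, not part of the original source: a
                 forward conditional branch whose REG_BR_PROB note says it is
                 likely taken would be printed as something like
                 "ds ; jne .L5", overriding the static forward-not-taken
                 assumption, while the "cs ; " prefix marks a branch as
                 not taken.  */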
14834 case ';':
14835 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14836 putc (';', file);
14837 #endif
14838 return;
14840 case '@':
14841 if (ASSEMBLER_DIALECT == ASM_ATT)
14842 putc ('%', file);
14844 /* The kernel uses a different segment register for performance
14845 reasons; a system call would not have to trash the userspace
14846 segment register, which would be expensive. */
14847 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14848 fputs ("fs", file);
14849 else
14850 fputs ("gs", file);
14851 return;
14853 case '~':
14854 putc (TARGET_AVX2 ? 'i' : 'f', file);
14855 return;
14857 case '^':
14858 if (TARGET_64BIT && Pmode != word_mode)
14859 fputs ("addr32 ", file);
14860 return;
14862 default:
14863 output_operand_lossage ("invalid operand code '%c'", code);
14867 if (REG_P (x))
14868 print_reg (x, code, file);
14870 else if (MEM_P (x))
14872 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14873 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14874 && GET_MODE (x) != BLKmode)
14876 const char * size;
14877 switch (GET_MODE_SIZE (GET_MODE (x)))
14879 case 1: size = "BYTE"; break;
14880 case 2: size = "WORD"; break;
14881 case 4: size = "DWORD"; break;
14882 case 8: size = "QWORD"; break;
14883 case 12: size = "TBYTE"; break;
14884 case 16:
14885 if (GET_MODE (x) == XFmode)
14886 size = "TBYTE";
14887 else
14888 size = "XMMWORD";
14889 break;
14890 case 32: size = "YMMWORD"; break;
14891 case 64: size = "ZMMWORD"; break;
14892 default:
14893 gcc_unreachable ();
14896 /* Check for explicit size override (codes 'b', 'w', 'k',
14897 'q' and 'x') */
14898 if (code == 'b')
14899 size = "BYTE";
14900 else if (code == 'w')
14901 size = "WORD";
14902 else if (code == 'k')
14903 size = "DWORD";
14904 else if (code == 'q')
14905 size = "QWORD";
14906 else if (code == 'x')
14907 size = "XMMWORD";
14909 fputs (size, file);
14910 fputs (" PTR ", file);
14913 x = XEXP (x, 0);
14914 /* Avoid (%rip) for call operands. */
14915 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14916 && !CONST_INT_P (x))
14917 output_addr_const (file, x);
14918 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14919 output_operand_lossage ("invalid constraints for operand");
14920 else
14921 output_address (x);
14924 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14926 REAL_VALUE_TYPE r;
14927 long l;
14929 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14930 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14932 if (ASSEMBLER_DIALECT == ASM_ATT)
14933 putc ('$', file);
14934 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14935 if (code == 'q')
14936 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14937 (unsigned long long) (int) l);
14938 else
14939 fprintf (file, "0x%08x", (unsigned int) l);
14942 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14944 REAL_VALUE_TYPE r;
14945 long l[2];
14947 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14948 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14950 if (ASSEMBLER_DIALECT == ASM_ATT)
14951 putc ('$', file);
14952 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14955 /* These float cases don't actually occur as immediate operands. */
14956 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14958 char dstr[30];
14960 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14961 fputs (dstr, file);
14964 else
14966 /* We have patterns that allow zero sets of memory, for instance.
14967 In 64-bit mode, we should probably support all 8-byte vectors,
14968 since we can in fact encode that into an immediate. */
14969 if (GET_CODE (x) == CONST_VECTOR)
14971 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14972 x = const0_rtx;
14975 if (code != 'P' && code != 'p')
14977 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14979 if (ASSEMBLER_DIALECT == ASM_ATT)
14980 putc ('$', file);
14982 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14983 || GET_CODE (x) == LABEL_REF)
14985 if (ASSEMBLER_DIALECT == ASM_ATT)
14986 putc ('$', file);
14987 else
14988 fputs ("OFFSET FLAT:", file);
14991 if (CONST_INT_P (x))
14992 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14993 else if (flag_pic || MACHOPIC_INDIRECT)
14994 output_pic_addr_const (file, x, code);
14995 else
14996 output_addr_const (file, x);
15000 static bool
15001 ix86_print_operand_punct_valid_p (unsigned char code)
15003 return (code == '@' || code == '*' || code == '+' || code == '&'
15004 || code == ';' || code == '~' || code == '^');
15007 /* Print a memory operand whose address is ADDR. */
15009 static void
15010 ix86_print_operand_address (FILE *file, rtx addr)
15012 struct ix86_address parts;
15013 rtx base, index, disp;
15014 int scale;
15015 int ok;
15016 bool vsib = false;
15017 int code = 0;
15019 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15021 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15022 gcc_assert (parts.index == NULL_RTX);
15023 parts.index = XVECEXP (addr, 0, 1);
15024 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15025 addr = XVECEXP (addr, 0, 0);
15026 vsib = true;
15028 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15030 gcc_assert (TARGET_64BIT);
15031 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15032 code = 'q';
15034 else
15035 ok = ix86_decompose_address (addr, &parts);
15037 gcc_assert (ok);
15039 base = parts.base;
15040 index = parts.index;
15041 disp = parts.disp;
15042 scale = parts.scale;
15044 switch (parts.seg)
15046 case SEG_DEFAULT:
15047 break;
15048 case SEG_FS:
15049 case SEG_GS:
15050 if (ASSEMBLER_DIALECT == ASM_ATT)
15051 putc ('%', file);
15052 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15053 break;
15054 default:
15055 gcc_unreachable ();
15058 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15059 if (TARGET_64BIT && !base && !index)
15061 rtx symbol = disp;
15063 if (GET_CODE (disp) == CONST
15064 && GET_CODE (XEXP (disp, 0)) == PLUS
15065 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15066 symbol = XEXP (XEXP (disp, 0), 0);
15068 if (GET_CODE (symbol) == LABEL_REF
15069 || (GET_CODE (symbol) == SYMBOL_REF
15070 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15071 base = pc_rtx;
15073 if (!base && !index)
15075       /* A displacement-only address requires special attention.  */
15077 if (CONST_INT_P (disp))
15079 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15080 fputs ("ds:", file);
15081 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15083 else if (flag_pic)
15084 output_pic_addr_const (file, disp, 0);
15085 else
15086 output_addr_const (file, disp);
15088 else
15090 /* Print SImode register names to force addr32 prefix. */
15091 if (SImode_address_operand (addr, VOIDmode))
15093 #ifdef ENABLE_CHECKING
15094 gcc_assert (TARGET_64BIT);
15095 switch (GET_CODE (addr))
15097 case SUBREG:
15098 gcc_assert (GET_MODE (addr) == SImode);
15099 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15100 break;
15101 case ZERO_EXTEND:
15102 case AND:
15103 gcc_assert (GET_MODE (addr) == DImode);
15104 break;
15105 default:
15106 gcc_unreachable ();
15108 #endif
15109 gcc_assert (!code);
15110 code = 'k';
15112 else if (code == 0
15113 && TARGET_X32
15114 && disp
15115 && CONST_INT_P (disp)
15116 && INTVAL (disp) < -16*1024*1024)
15118 /* X32 runs in 64-bit mode, where displacement, DISP, in
15119 address DISP(%r64), is encoded as 32-bit immediate sign-
15120 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15121 address is %r64 + 0xffffffffbffffd00. When %r64 <
15122 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15123 which is invalid for x32. The correct address is %r64
15124 - 0x40000300 == 0xf7ffdd64. To properly encode
15125 -0x40000300(%r64) for x32, we zero-extend negative
15126 displacement by forcing addr32 prefix which truncates
15127 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15128 zero-extend all negative displacements, including -1(%rsp).
15129 However, for small negative displacements, sign-extension
15130          won't cause overflow.  We only zero-extend negative
15131          displacements if they are < -16*1024*1024, a limit that is also
15132          used to check legitimate address displacements for PIC.  */
15133 code = 'k';
15136 if (ASSEMBLER_DIALECT == ASM_ATT)
15138 if (disp)
15140 if (flag_pic)
15141 output_pic_addr_const (file, disp, 0);
15142 else if (GET_CODE (disp) == LABEL_REF)
15143 output_asm_label (disp);
15144 else
15145 output_addr_const (file, disp);
15148 putc ('(', file);
15149 if (base)
15150 print_reg (base, code, file);
15151 if (index)
15153 putc (',', file);
15154 print_reg (index, vsib ? 0 : code, file);
15155 if (scale != 1 || vsib)
15156 fprintf (file, ",%d", scale);
15158 putc (')', file);
15160 else
15162 rtx offset = NULL_RTX;
15164 if (disp)
15166 /* Pull out the offset of a symbol; print any symbol itself. */
15167 if (GET_CODE (disp) == CONST
15168 && GET_CODE (XEXP (disp, 0)) == PLUS
15169 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15171 offset = XEXP (XEXP (disp, 0), 1);
15172 disp = gen_rtx_CONST (VOIDmode,
15173 XEXP (XEXP (disp, 0), 0));
15176 if (flag_pic)
15177 output_pic_addr_const (file, disp, 0);
15178 else if (GET_CODE (disp) == LABEL_REF)
15179 output_asm_label (disp);
15180 else if (CONST_INT_P (disp))
15181 offset = disp;
15182 else
15183 output_addr_const (file, disp);
15186 putc ('[', file);
15187 if (base)
15189 print_reg (base, code, file);
15190 if (offset)
15192 if (INTVAL (offset) >= 0)
15193 putc ('+', file);
15194 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15197 else if (offset)
15198 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15199 else
15200 putc ('0', file);
15202 if (index)
15204 putc ('+', file);
15205 print_reg (index, vsib ? 0 : code, file);
15206 if (scale != 1 || vsib)
15207 fprintf (file, "*%d", scale);
15209 putc (']', file);
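  /* Illustrative example, not part of the original source: for base %rbp,
     index %rax, scale 4 and displacement -8, the AT&T branch above prints
     "-8(%rbp,%rax,4)" while the Intel branch prints roughly
     "[rbp-8+rax*4]".  */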
15214 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15216 static bool
15217 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15219 rtx op;
15221 if (GET_CODE (x) != UNSPEC)
15222 return false;
15224 op = XVECEXP (x, 0, 0);
15225 switch (XINT (x, 1))
15227 case UNSPEC_GOTTPOFF:
15228 output_addr_const (file, op);
15229 /* FIXME: This might be @TPOFF in Sun ld. */
15230 fputs ("@gottpoff", file);
15231 break;
15232 case UNSPEC_TPOFF:
15233 output_addr_const (file, op);
15234 fputs ("@tpoff", file);
15235 break;
15236 case UNSPEC_NTPOFF:
15237 output_addr_const (file, op);
15238 if (TARGET_64BIT)
15239 fputs ("@tpoff", file);
15240 else
15241 fputs ("@ntpoff", file);
15242 break;
15243 case UNSPEC_DTPOFF:
15244 output_addr_const (file, op);
15245 fputs ("@dtpoff", file);
15246 break;
15247 case UNSPEC_GOTNTPOFF:
15248 output_addr_const (file, op);
15249 if (TARGET_64BIT)
15250 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15251 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15252 else
15253 fputs ("@gotntpoff", file);
15254 break;
15255 case UNSPEC_INDNTPOFF:
15256 output_addr_const (file, op);
15257 fputs ("@indntpoff", file);
15258 break;
15259 #if TARGET_MACHO
15260 case UNSPEC_MACHOPIC_OFFSET:
15261 output_addr_const (file, op);
15262 putc ('-', file);
15263 machopic_output_function_base_name (file);
15264 break;
15265 #endif
15267 case UNSPEC_STACK_CHECK:
15269 int offset;
15271 gcc_assert (flag_split_stack);
15273 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15274 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15275 #else
15276 gcc_unreachable ();
15277 #endif
15279 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15281 break;
15283 default:
15284 return false;
15287 return true;
15290 /* Split one or more double-mode RTL references into pairs of half-mode
15291 references. The RTL can be REG, offsettable MEM, integer constant, or
15292 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15293 split and "num" is its length. lo_half and hi_half are output arrays
15294 that parallel "operands". */
15296 void
15297 split_double_mode (enum machine_mode mode, rtx operands[],
15298 int num, rtx lo_half[], rtx hi_half[])
15300 enum machine_mode half_mode;
15301 unsigned int byte;
15303 switch (mode)
15305 case TImode:
15306 half_mode = DImode;
15307 break;
15308 case DImode:
15309 half_mode = SImode;
15310 break;
15311 default:
15312 gcc_unreachable ();
15315 byte = GET_MODE_SIZE (half_mode);
15317 while (num--)
15319 rtx op = operands[num];
15321       /* simplify_subreg refuses to split volatile memory addresses,
15322          but we still have to handle them.  */
15323 if (MEM_P (op))
15325 lo_half[num] = adjust_address (op, half_mode, 0);
15326 hi_half[num] = adjust_address (op, half_mode, byte);
15328 else
15330 lo_half[num] = simplify_gen_subreg (half_mode, op,
15331 GET_MODE (op) == VOIDmode
15332 ? mode : GET_MODE (op), 0);
15333 hi_half[num] = simplify_gen_subreg (half_mode, op,
15334 GET_MODE (op) == VOIDmode
15335 ? mode : GET_MODE (op), byte);
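/* Illustrative usage, not part of the original source: a DImode splitter on
   a 32-bit target might call

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   and then emit SImode add/adc pairs on the low and high halves.  */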
15340 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15341 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15342 is the expression of the binary operation. The output may either be
15343 emitted here, or returned to the caller, like all output_* functions.
15345 There is no guarantee that the operands are the same mode, as they
15346 might be within FLOAT or FLOAT_EXTEND expressions. */
15348 #ifndef SYSV386_COMPAT
15349 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15350 wants to fix the assemblers because that causes incompatibility
15351 with gcc. No-one wants to fix gcc because that causes
15352 incompatibility with assemblers... You can use the option of
15353 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15354 #define SYSV386_COMPAT 1
15355 #endif
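/* Illustrative note, not part of the original source: the templates built
   below use GCC's "{att|intel}" output syntax, so a fragment such as
   "p\t{%0, %2|%2, %0}" appends the popping "p" suffix and selects the AT&T
   operand order before the '|' or the Intel order after it, depending on
   ASSEMBLER_DIALECT.  */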
15357 const char *
15358 output_387_binary_op (rtx insn, rtx *operands)
15360 static char buf[40];
15361 const char *p;
15362 const char *ssep;
15363 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15365 #ifdef ENABLE_CHECKING
15366   /* Even if we do not want to check the inputs, this documents input
15367      constraints, which helps in understanding the following code.  */
15368 if (STACK_REG_P (operands[0])
15369 && ((REG_P (operands[1])
15370 && REGNO (operands[0]) == REGNO (operands[1])
15371 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15372 || (REG_P (operands[2])
15373 && REGNO (operands[0]) == REGNO (operands[2])
15374 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15375 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15376 ; /* ok */
15377 else
15378 gcc_assert (is_sse);
15379 #endif
15381 switch (GET_CODE (operands[3]))
15383 case PLUS:
15384 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15385 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15386 p = "fiadd";
15387 else
15388 p = "fadd";
15389 ssep = "vadd";
15390 break;
15392 case MINUS:
15393 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15394 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15395 p = "fisub";
15396 else
15397 p = "fsub";
15398 ssep = "vsub";
15399 break;
15401 case MULT:
15402 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15403 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15404 p = "fimul";
15405 else
15406 p = "fmul";
15407 ssep = "vmul";
15408 break;
15410 case DIV:
15411 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15412 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15413 p = "fidiv";
15414 else
15415 p = "fdiv";
15416 ssep = "vdiv";
15417 break;
15419 default:
15420 gcc_unreachable ();
15423 if (is_sse)
15425 if (TARGET_AVX)
15427 strcpy (buf, ssep);
15428 if (GET_MODE (operands[0]) == SFmode)
15429 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15430 else
15431 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15433 else
15435 strcpy (buf, ssep + 1);
15436 if (GET_MODE (operands[0]) == SFmode)
15437 strcat (buf, "ss\t{%2, %0|%0, %2}");
15438 else
15439 strcat (buf, "sd\t{%2, %0|%0, %2}");
15441 return buf;
15443 strcpy (buf, p);
15445 switch (GET_CODE (operands[3]))
15447 case MULT:
15448 case PLUS:
15449 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15451 rtx temp = operands[2];
15452 operands[2] = operands[1];
15453 operands[1] = temp;
15456      /* We now know operands[0] == operands[1].  */
15458 if (MEM_P (operands[2]))
15460 p = "%Z2\t%2";
15461 break;
15464 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15466 if (STACK_TOP_P (operands[0]))
15467 /* How is it that we are storing to a dead operand[2]?
15468 Well, presumably operands[1] is dead too. We can't
15469 store the result to st(0) as st(0) gets popped on this
15470 instruction. Instead store to operands[2] (which I
15471 think has to be st(1)). st(1) will be popped later.
15472 gcc <= 2.8.1 didn't have this check and generated
15473 assembly code that the Unixware assembler rejected. */
15474 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15475 else
15476 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15477 break;
15480 if (STACK_TOP_P (operands[0]))
15481 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15482 else
15483 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15484 break;
15486 case MINUS:
15487 case DIV:
15488 if (MEM_P (operands[1]))
15490 p = "r%Z1\t%1";
15491 break;
15494 if (MEM_P (operands[2]))
15496 p = "%Z2\t%2";
15497 break;
15500 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15502 #if SYSV386_COMPAT
15503 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15504 derived assemblers, confusingly reverse the direction of
15505 the operation for fsub{r} and fdiv{r} when the
15506 destination register is not st(0). The Intel assembler
15507 doesn't have this brain damage. Read !SYSV386_COMPAT to
15508 figure out what the hardware really does. */
15509 if (STACK_TOP_P (operands[0]))
15510 p = "{p\t%0, %2|rp\t%2, %0}";
15511 else
15512 p = "{rp\t%2, %0|p\t%0, %2}";
15513 #else
15514 if (STACK_TOP_P (operands[0]))
15515 /* As above for fmul/fadd, we can't store to st(0). */
15516 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15517 else
15518 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15519 #endif
15520 break;
15523 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15525 #if SYSV386_COMPAT
15526 if (STACK_TOP_P (operands[0]))
15527 p = "{rp\t%0, %1|p\t%1, %0}";
15528 else
15529 p = "{p\t%1, %0|rp\t%0, %1}";
15530 #else
15531 if (STACK_TOP_P (operands[0]))
15532 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15533 else
15534 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15535 #endif
15536 break;
15539 if (STACK_TOP_P (operands[0]))
15541 if (STACK_TOP_P (operands[1]))
15542 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15543 else
15544 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15545 break;
15547 else if (STACK_TOP_P (operands[1]))
15549 #if SYSV386_COMPAT
15550 p = "{\t%1, %0|r\t%0, %1}";
15551 #else
15552 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15553 #endif
15555 else
15557 #if SYSV386_COMPAT
15558 p = "{r\t%2, %0|\t%0, %2}";
15559 #else
15560 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15561 #endif
15563 break;
15565 default:
15566 gcc_unreachable ();
15569 strcat (buf, p);
15570 return buf;
15573 /* Check if a 256bit AVX register is referenced inside of EXP. */
15575 static int
15576 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15578 rtx exp = *pexp;
15580 if (GET_CODE (exp) == SUBREG)
15581 exp = SUBREG_REG (exp);
15583 if (REG_P (exp)
15584 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15585 return 1;
15587 return 0;
15590 /* Return needed mode for entity in optimize_mode_switching pass. */
15592 static int
15593 ix86_avx_u128_mode_needed (rtx insn)
15595 if (CALL_P (insn))
15597 rtx link;
15599 /* Needed mode is set to AVX_U128_CLEAN if there are
15600 no 256bit modes used in function arguments. */
15601 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15602 link;
15603 link = XEXP (link, 1))
15605 if (GET_CODE (XEXP (link, 0)) == USE)
15607 rtx arg = XEXP (XEXP (link, 0), 0);
15609 if (ix86_check_avx256_register (&arg, NULL))
15610 return AVX_U128_ANY;
15614 return AVX_U128_CLEAN;
15617   /* Require DIRTY mode if a 256bit AVX register is referenced.  Hardware
15618      changes state only when a 256bit register is written to, but we need
15619      to prevent the compiler from moving the optimal insertion point above
15620      an eventual read from a 256bit register.  */
15621 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15622 return AVX_U128_DIRTY;
15624 return AVX_U128_ANY;
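/* Illustrative example, not part of the original source: an insn that
   writes a 256bit value to a ymm register makes the needed mode
   AVX_U128_DIRTY, while a later call whose arguments use no 256bit
   registers needs AVX_U128_CLEAN, so the mode-switching pass will place a
   vzeroupper between the two.  */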
15627 /* Return mode that i387 must be switched into
15628 prior to the execution of insn. */
15630 static int
15631 ix86_i387_mode_needed (int entity, rtx insn)
15633 enum attr_i387_cw mode;
15635   /* The mode UNINITIALIZED is used to store the control word after a
15636      function call or ASM pattern.  The mode ANY specifies that the
15637      function has no requirements on the control word and makes no changes
15638      to the bits we are interested in.  */
15640 if (CALL_P (insn)
15641 || (NONJUMP_INSN_P (insn)
15642 && (asm_noperands (PATTERN (insn)) >= 0
15643 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15644 return I387_CW_UNINITIALIZED;
15646 if (recog_memoized (insn) < 0)
15647 return I387_CW_ANY;
15649 mode = get_attr_i387_cw (insn);
15651 switch (entity)
15653 case I387_TRUNC:
15654 if (mode == I387_CW_TRUNC)
15655 return mode;
15656 break;
15658 case I387_FLOOR:
15659 if (mode == I387_CW_FLOOR)
15660 return mode;
15661 break;
15663 case I387_CEIL:
15664 if (mode == I387_CW_CEIL)
15665 return mode;
15666 break;
15668 case I387_MASK_PM:
15669 if (mode == I387_CW_MASK_PM)
15670 return mode;
15671 break;
15673 default:
15674 gcc_unreachable ();
15677 return I387_CW_ANY;
15680 /* Return mode that entity must be switched into
15681 prior to the execution of insn. */
15684 ix86_mode_needed (int entity, rtx insn)
15686 switch (entity)
15688 case AVX_U128:
15689 return ix86_avx_u128_mode_needed (insn);
15690 case I387_TRUNC:
15691 case I387_FLOOR:
15692 case I387_CEIL:
15693 case I387_MASK_PM:
15694 return ix86_i387_mode_needed (entity, insn);
15695 default:
15696 gcc_unreachable ();
15698 return 0;
15701 /* Check if a 256bit AVX register is referenced in stores. */
15703 static void
15704 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15706 if (ix86_check_avx256_register (&dest, NULL))
15708 bool *used = (bool *) data;
15709 *used = true;
15713 /* Calculate mode of upper 128bit AVX registers after the insn. */
15715 static int
15716 ix86_avx_u128_mode_after (int mode, rtx insn)
15718 rtx pat = PATTERN (insn);
15720 if (vzeroupper_operation (pat, VOIDmode)
15721 || vzeroall_operation (pat, VOIDmode))
15722 return AVX_U128_CLEAN;
15724   /* We know that the state is clean after a CALL insn if no 256bit
15725      register is used for the function return value.  */
15726 if (CALL_P (insn))
15728 bool avx_reg256_found = false;
15729 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15730 if (!avx_reg256_found)
15731 return AVX_U128_CLEAN;
15734 /* Otherwise, return current mode. Remember that if insn
15735 references AVX 256bit registers, the mode was already changed
15736 to DIRTY from MODE_NEEDED. */
15737 return mode;
15740 /* Return the mode that an insn results in. */
15743 ix86_mode_after (int entity, int mode, rtx insn)
15745 switch (entity)
15747 case AVX_U128:
15748 return ix86_avx_u128_mode_after (mode, insn);
15749 case I387_TRUNC:
15750 case I387_FLOOR:
15751 case I387_CEIL:
15752 case I387_MASK_PM:
15753 return mode;
15754 default:
15755 gcc_unreachable ();
15759 static int
15760 ix86_avx_u128_mode_entry (void)
15762 tree arg;
15764 /* Entry mode is set to AVX_U128_DIRTY if there are
15765 256bit modes used in function arguments. */
15766 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15767 arg = TREE_CHAIN (arg))
15769 rtx incoming = DECL_INCOMING_RTL (arg);
15771 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15772 return AVX_U128_DIRTY;
15775 return AVX_U128_CLEAN;
15778 /* Return a mode that ENTITY is assumed to be
15779 switched to at function entry. */
15782 ix86_mode_entry (int entity)
15784 switch (entity)
15786 case AVX_U128:
15787 return ix86_avx_u128_mode_entry ();
15788 case I387_TRUNC:
15789 case I387_FLOOR:
15790 case I387_CEIL:
15791 case I387_MASK_PM:
15792 return I387_CW_ANY;
15793 default:
15794 gcc_unreachable ();
15798 static int
15799 ix86_avx_u128_mode_exit (void)
15801 rtx reg = crtl->return_rtx;
15803 /* Exit mode is set to AVX_U128_DIRTY if there are
15804 256bit modes used in the function return register. */
15805 if (reg && ix86_check_avx256_register (&reg, NULL))
15806 return AVX_U128_DIRTY;
15808 return AVX_U128_CLEAN;
15811 /* Return a mode that ENTITY is assumed to be
15812 switched to at function exit. */
15815 ix86_mode_exit (int entity)
15817 switch (entity)
15819 case AVX_U128:
15820 return ix86_avx_u128_mode_exit ();
15821 case I387_TRUNC:
15822 case I387_FLOOR:
15823 case I387_CEIL:
15824 case I387_MASK_PM:
15825 return I387_CW_ANY;
15826 default:
15827 gcc_unreachable ();
15831 /* Output code to initialize control word copies used by trunc?f?i and
15832 rounding patterns. CURRENT_MODE is set to current control word,
15833 while NEW_MODE is set to new control word. */
15835 static void
15836 emit_i387_cw_initialization (int mode)
15838 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15839 rtx new_mode;
15841 enum ix86_stack_slot slot;
15843 rtx reg = gen_reg_rtx (HImode);
15845 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15846 emit_move_insn (reg, copy_rtx (stored_mode));
15848 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15849 || optimize_insn_for_size_p ())
15851 switch (mode)
15853 case I387_CW_TRUNC:
15854 /* round toward zero (truncate) */
15855 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15856 slot = SLOT_CW_TRUNC;
15857 break;
15859 case I387_CW_FLOOR:
15860 /* round down toward -oo */
15861 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15862 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15863 slot = SLOT_CW_FLOOR;
15864 break;
15866 case I387_CW_CEIL:
15867 /* round up toward +oo */
15868 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15869 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15870 slot = SLOT_CW_CEIL;
15871 break;
15873 case I387_CW_MASK_PM:
15874 /* mask precision exception for nearbyint() */
15875 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15876 slot = SLOT_CW_MASK_PM;
15877 break;
15879 default:
15880 gcc_unreachable ();
15883 else
15885 switch (mode)
15887 case I387_CW_TRUNC:
15888 /* round toward zero (truncate) */
15889 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15890 slot = SLOT_CW_TRUNC;
15891 break;
15893 case I387_CW_FLOOR:
15894 /* round down toward -oo */
15895 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15896 slot = SLOT_CW_FLOOR;
15897 break;
15899 case I387_CW_CEIL:
15900 /* round up toward +oo */
15901 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15902 slot = SLOT_CW_CEIL;
15903 break;
15905 case I387_CW_MASK_PM:
15906 /* mask precision exception for nearbyint() */
15907 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15908 slot = SLOT_CW_MASK_PM;
15909 break;
15911 default:
15912 gcc_unreachable ();
15916 gcc_assert (slot < MAX_386_STACK_LOCALS);
15918 new_mode = assign_386_stack_local (HImode, slot);
15919 emit_move_insn (new_mode, reg);
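/* For reference, not part of the original source: bits 10-11 of the x87
   control word form the rounding-control field, so the constants above
   select 0x0400 for round-down, 0x0800 for round-up and 0x0c00 for
   truncation, while bit 5 (0x0020) masks the precision exception used for
   nearbyint().  */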
15922 /* Emit vzeroupper. */
15924 void
15925 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15927 int i;
15929 /* Cancel automatic vzeroupper insertion if there are
15930 live call-saved SSE registers at the insertion point. */
15932 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15933 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15934 return;
15936 if (TARGET_64BIT)
15937 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15938 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15939 return;
15941 emit_insn (gen_avx_vzeroupper ());
15944 /* Generate one or more insns to set ENTITY to MODE. */
15946 void
15947 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15949 switch (entity)
15951 case AVX_U128:
15952 if (mode == AVX_U128_CLEAN)
15953 ix86_avx_emit_vzeroupper (regs_live);
15954 break;
15955 case I387_TRUNC:
15956 case I387_FLOOR:
15957 case I387_CEIL:
15958 case I387_MASK_PM:
15959 if (mode != I387_CW_ANY
15960 && mode != I387_CW_UNINITIALIZED)
15961 emit_i387_cw_initialization (mode);
15962 break;
15963 default:
15964 gcc_unreachable ();
15968 /* Output code for INSN to convert a float to a signed int. OPERANDS
15969 are the insn operands. The output may be [HSD]Imode and the input
15970 operand may be [SDX]Fmode. */
15972 const char *
15973 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15975 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15976 int dimode_p = GET_MODE (operands[0]) == DImode;
15977 int round_mode = get_attr_i387_cw (insn);
15979 /* Jump through a hoop or two for DImode, since the hardware has no
15980 non-popping instruction. We used to do this a different way, but
15981 that was somewhat fragile and broke with post-reload splitters. */
15982 if ((dimode_p || fisttp) && !stack_top_dies)
15983 output_asm_insn ("fld\t%y1", operands);
15985 gcc_assert (STACK_TOP_P (operands[1]));
15986 gcc_assert (MEM_P (operands[0]));
15987 gcc_assert (GET_MODE (operands[1]) != TFmode);
15989 if (fisttp)
15990 output_asm_insn ("fisttp%Z0\t%0", operands);
15991 else
15993 if (round_mode != I387_CW_ANY)
15994 output_asm_insn ("fldcw\t%3", operands);
15995 if (stack_top_dies || dimode_p)
15996 output_asm_insn ("fistp%Z0\t%0", operands);
15997 else
15998 output_asm_insn ("fist%Z0\t%0", operands);
15999 if (round_mode != I387_CW_ANY)
16000 output_asm_insn ("fldcw\t%2", operands);
16003 return "";
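/* Illustrative example, not part of the original source: without SSE3's
   fisttp, a DImode truncation typically prints

     fldcw    %3      # switch to the truncating control word
     fistp%Z0 %0      # popping integer store
     fldcw    %2      # restore the previous control word

   matching the output_asm_insn calls above.  */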
16006 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16007 have the values zero or one, indicates the ffreep insn's operand
16008 from the OPERANDS array. */
16010 static const char *
16011 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16013 if (TARGET_USE_FFREEP)
16014 #ifdef HAVE_AS_IX86_FFREEP
16015 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16016 #else
16018 static char retval[32];
16019 int regno = REGNO (operands[opno]);
16021 gcc_assert (STACK_REGNO_P (regno));
16023 regno -= FIRST_STACK_REG;
16025 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16026 return retval;
16028 #endif
16030 return opno ? "fstp\t%y1" : "fstp\t%y0";
16034 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16035 should be used. UNORDERED_P is true when fucom should be used. */
16037 const char *
16038 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16040 int stack_top_dies;
16041 rtx cmp_op0, cmp_op1;
16042 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16044 if (eflags_p)
16046 cmp_op0 = operands[0];
16047 cmp_op1 = operands[1];
16049 else
16051 cmp_op0 = operands[1];
16052 cmp_op1 = operands[2];
16055 if (is_sse)
16057 if (GET_MODE (operands[0]) == SFmode)
16058 if (unordered_p)
16059 return "%vucomiss\t{%1, %0|%0, %1}";
16060 else
16061 return "%vcomiss\t{%1, %0|%0, %1}";
16062 else
16063 if (unordered_p)
16064 return "%vucomisd\t{%1, %0|%0, %1}";
16065 else
16066 return "%vcomisd\t{%1, %0|%0, %1}";
16069 gcc_assert (STACK_TOP_P (cmp_op0));
16071 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16073 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16075 if (stack_top_dies)
16077 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16078 return output_387_ffreep (operands, 1);
16080 else
16081 return "ftst\n\tfnstsw\t%0";
16084 if (STACK_REG_P (cmp_op1)
16085 && stack_top_dies
16086 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16087 && REGNO (cmp_op1) != FIRST_STACK_REG)
16089       /* If the top of the 387 stack dies and the other operand is also
16090          a stack register that dies, then this must be a `fcompp' float
16091          compare.  */
16093 if (eflags_p)
16095 /* There is no double popping fcomi variant. Fortunately,
16096 eflags is immune from the fstp's cc clobbering. */
16097 if (unordered_p)
16098 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16099 else
16100 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16101 return output_387_ffreep (operands, 0);
16103 else
16105 if (unordered_p)
16106 return "fucompp\n\tfnstsw\t%0";
16107 else
16108 return "fcompp\n\tfnstsw\t%0";
16111 else
16113 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16115 static const char * const alt[16] =
16117 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16118 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16119 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16120 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16122 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16123 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16124 NULL,
16125 NULL,
16127 "fcomi\t{%y1, %0|%0, %y1}",
16128 "fcomip\t{%y1, %0|%0, %y1}",
16129 "fucomi\t{%y1, %0|%0, %y1}",
16130 "fucomip\t{%y1, %0|%0, %y1}",
16132 NULL,
16133 NULL,
16134 NULL,
16135 NULL
16138 int mask;
16139 const char *ret;
16141 mask = eflags_p << 3;
16142 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16143 mask |= unordered_p << 1;
16144 mask |= stack_top_dies;
16146 gcc_assert (mask < 16);
16147 ret = alt[mask];
16148 gcc_assert (ret);
16150 return ret;
16154 void
16155 ix86_output_addr_vec_elt (FILE *file, int value)
16157 const char *directive = ASM_LONG;
16159 #ifdef ASM_QUAD
16160 if (TARGET_LP64)
16161 directive = ASM_QUAD;
16162 #else
16163 gcc_assert (!TARGET_64BIT);
16164 #endif
16166 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16169 void
16170 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16172 const char *directive = ASM_LONG;
16174 #ifdef ASM_QUAD
16175 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16176 directive = ASM_QUAD;
16177 #else
16178 gcc_assert (!TARGET_64BIT);
16179 #endif
16180 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16181 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16182 fprintf (file, "%s%s%d-%s%d\n",
16183 directive, LPREFIX, value, LPREFIX, rel);
16184 else if (HAVE_AS_GOTOFF_IN_DATA)
16185 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16186 #if TARGET_MACHO
16187 else if (TARGET_MACHO)
16189 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16190 machopic_output_function_base_name (file);
16191 putc ('\n', file);
16193 #endif
16194 else
16195 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16196 GOT_SYMBOL_NAME, LPREFIX, value);
16199 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16200 for the target. */
16202 void
16203 ix86_expand_clear (rtx dest)
16205 rtx tmp;
16207 /* We play register width games, which are only valid after reload. */
16208 gcc_assert (reload_completed);
16210 /* Avoid HImode and its attendant prefix byte. */
16211 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16212 dest = gen_rtx_REG (SImode, REGNO (dest));
16213 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16215 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16216 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16218 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16219 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16222 emit_insn (tmp);
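/* Illustrative example, not part of the original source: clearing %eax with
   this function normally produces the flags-clobbering "xorl %eax, %eax"
   parallel; the plain "movl $0, %eax" set survives only when
   TARGET_USE_MOV0 is enabled and the insn is being optimized for size.  */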
16225 /* X is an unchanging MEM. If it is a constant pool reference, return
16226 the constant pool rtx, else NULL. */
16229 maybe_get_pool_constant (rtx x)
16231 x = ix86_delegitimize_address (XEXP (x, 0));
16233 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16234 return get_pool_constant (x);
16236 return NULL_RTX;
16239 void
16240 ix86_expand_move (enum machine_mode mode, rtx operands[])
16242 rtx op0, op1;
16243 enum tls_model model;
16245 op0 = operands[0];
16246 op1 = operands[1];
16248 if (GET_CODE (op1) == SYMBOL_REF)
16250 rtx tmp;
16252 model = SYMBOL_REF_TLS_MODEL (op1);
16253 if (model)
16255 op1 = legitimize_tls_address (op1, model, true);
16256 op1 = force_operand (op1, op0);
16257 if (op1 == op0)
16258 return;
16259 op1 = convert_to_mode (mode, op1, 1);
16261 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16262 op1 = tmp;
16264 else if (GET_CODE (op1) == CONST
16265 && GET_CODE (XEXP (op1, 0)) == PLUS
16266 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16268 rtx addend = XEXP (XEXP (op1, 0), 1);
16269 rtx symbol = XEXP (XEXP (op1, 0), 0);
16270 rtx tmp;
16272 model = SYMBOL_REF_TLS_MODEL (symbol);
16273 if (model)
16274 tmp = legitimize_tls_address (symbol, model, true);
16275 else
16276 tmp = legitimize_pe_coff_symbol (symbol, true);
16278 if (tmp)
16280 tmp = force_operand (tmp, NULL);
16281 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16282 op0, 1, OPTAB_DIRECT);
16283 if (tmp == op0)
16284 return;
16285 op1 = convert_to_mode (mode, tmp, 1);
16289 if ((flag_pic || MACHOPIC_INDIRECT)
16290 && symbolic_operand (op1, mode))
16292 if (TARGET_MACHO && !TARGET_64BIT)
16294 #if TARGET_MACHO
16295 /* dynamic-no-pic */
16296 if (MACHOPIC_INDIRECT)
16298 rtx temp = ((reload_in_progress
16299 || ((op0 && REG_P (op0))
16300 && mode == Pmode))
16301 ? op0 : gen_reg_rtx (Pmode));
16302 op1 = machopic_indirect_data_reference (op1, temp);
16303 if (MACHOPIC_PURE)
16304 op1 = machopic_legitimize_pic_address (op1, mode,
16305 temp == op1 ? 0 : temp);
16307 if (op0 != op1 && GET_CODE (op0) != MEM)
16309 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16310 emit_insn (insn);
16311 return;
16313 if (GET_CODE (op0) == MEM)
16314 op1 = force_reg (Pmode, op1);
16315 else
16317 rtx temp = op0;
16318 if (GET_CODE (temp) != REG)
16319 temp = gen_reg_rtx (Pmode);
16320 temp = legitimize_pic_address (op1, temp);
16321 if (temp == op0)
16322 return;
16323 op1 = temp;
16325 /* dynamic-no-pic */
16326 #endif
16328 else
16330 if (MEM_P (op0))
16331 op1 = force_reg (mode, op1);
16332 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16334 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16335 op1 = legitimize_pic_address (op1, reg);
16336 if (op0 == op1)
16337 return;
16338 op1 = convert_to_mode (mode, op1, 1);
16342 else
16344 if (MEM_P (op0)
16345 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16346 || !push_operand (op0, mode))
16347 && MEM_P (op1))
16348 op1 = force_reg (mode, op1);
16350 if (push_operand (op0, mode)
16351 && ! general_no_elim_operand (op1, mode))
16352 op1 = copy_to_mode_reg (mode, op1);
16354 /* Force large constants in 64bit compilation into register
16355 to get them CSEed. */
16356 if (can_create_pseudo_p ()
16357 && (mode == DImode) && TARGET_64BIT
16358 && immediate_operand (op1, mode)
16359 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16360 && !register_operand (op0, mode)
16361 && optimize)
16362 op1 = copy_to_mode_reg (mode, op1);
16364 if (can_create_pseudo_p ()
16365 && FLOAT_MODE_P (mode)
16366 && GET_CODE (op1) == CONST_DOUBLE)
16368 /* If we are loading a floating point constant to a register,
16369 force the value to memory now, since we'll get better code
16370 out the back end. */
16372 op1 = validize_mem (force_const_mem (mode, op1));
16373 if (!register_operand (op0, mode))
16375 rtx temp = gen_reg_rtx (mode);
16376 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16377 emit_move_insn (op0, temp);
16378 return;
16383 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16386 void
16387 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16389 rtx op0 = operands[0], op1 = operands[1];
16390 unsigned int align = GET_MODE_ALIGNMENT (mode);
16392   /* Force constants other than zero into memory.  We do not know how
16393      the instructions used to build constants modify the upper 64 bits
16394      of the register; once we have that information we may be able
16395      to handle some of them more efficiently.  */
16396 if (can_create_pseudo_p ()
16397 && register_operand (op0, mode)
16398 && (CONSTANT_P (op1)
16399 || (GET_CODE (op1) == SUBREG
16400 && CONSTANT_P (SUBREG_REG (op1))))
16401 && !standard_sse_constant_p (op1))
16402 op1 = validize_mem (force_const_mem (mode, op1));
16404   /* We need to check memory alignment for SSE modes since attributes
16405      can make operands unaligned.  */
16406 if (can_create_pseudo_p ()
16407 && SSE_REG_MODE_P (mode)
16408 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16409 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16411 rtx tmp[2];
16413 /* ix86_expand_vector_move_misalign() does not like constants ... */
16414 if (CONSTANT_P (op1)
16415 || (GET_CODE (op1) == SUBREG
16416 && CONSTANT_P (SUBREG_REG (op1))))
16417 op1 = validize_mem (force_const_mem (mode, op1));
16419 /* ... nor both arguments in memory. */
16420 if (!register_operand (op0, mode)
16421 && !register_operand (op1, mode))
16422 op1 = force_reg (mode, op1);
16424 tmp[0] = op0; tmp[1] = op1;
16425 ix86_expand_vector_move_misalign (mode, tmp);
16426 return;
16429 /* Make operand1 a register if it isn't already. */
16430 if (can_create_pseudo_p ()
16431 && !register_operand (op0, mode)
16432 && !register_operand (op1, mode))
16434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16435 return;
16438 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16441 /* Split 32-byte AVX unaligned load and store if needed. */
16443 static void
16444 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16446 rtx m;
16447 rtx (*extract) (rtx, rtx, rtx);
16448 rtx (*load_unaligned) (rtx, rtx);
16449 rtx (*store_unaligned) (rtx, rtx);
16450 enum machine_mode mode;
16452 switch (GET_MODE (op0))
16454 default:
16455 gcc_unreachable ();
16456 case V32QImode:
16457 extract = gen_avx_vextractf128v32qi;
16458 load_unaligned = gen_avx_loaddqu256;
16459 store_unaligned = gen_avx_storedqu256;
16460 mode = V16QImode;
16461 break;
16462 case V8SFmode:
16463 extract = gen_avx_vextractf128v8sf;
16464 load_unaligned = gen_avx_loadups256;
16465 store_unaligned = gen_avx_storeups256;
16466 mode = V4SFmode;
16467 break;
16468 case V4DFmode:
16469 extract = gen_avx_vextractf128v4df;
16470 load_unaligned = gen_avx_loadupd256;
16471 store_unaligned = gen_avx_storeupd256;
16472 mode = V2DFmode;
16473 break;
16476 if (MEM_P (op1))
16478 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16480 rtx r = gen_reg_rtx (mode);
16481 m = adjust_address (op1, mode, 0);
16482 emit_move_insn (r, m);
16483 m = adjust_address (op1, mode, 16);
16484 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16485 emit_move_insn (op0, r);
16487 else
16488 emit_insn (load_unaligned (op0, op1));
16490 else if (MEM_P (op0))
16492 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16494 m = adjust_address (op0, mode, 0);
16495 emit_insn (extract (m, op1, const0_rtx));
16496 m = adjust_address (op0, mode, 16);
16497 emit_insn (extract (m, op1, const1_rtx));
16499 else
16500 emit_insn (store_unaligned (op0, op1));
16502 else
16503 gcc_unreachable ();
16506 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16507 straight to ix86_expand_vector_move. */
16508 /* Code generation for scalar reg-reg moves of single and double precision data:
16509 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16510 movaps reg, reg
16511 else
16512 movss reg, reg
16513 if (x86_sse_partial_reg_dependency == true)
16514 movapd reg, reg
16515 else
16516 movsd reg, reg
16518 Code generation for scalar loads of double precision data:
16519 if (x86_sse_split_regs == true)
16520 movlpd mem, reg (gas syntax)
16521 else
16522 movsd mem, reg
16524 Code generation for unaligned packed loads of single precision data
16525 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16526 if (x86_sse_unaligned_move_optimal)
16527 movups mem, reg
16529 if (x86_sse_partial_reg_dependency == true)
16531 xorps reg, reg
16532 movlps mem, reg
16533 movhps mem+8, reg
16535 else
16537 movlps mem, reg
16538 movhps mem+8, reg
16541 Code generation for unaligned packed loads of double precision data
16542 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16543 if (x86_sse_unaligned_move_optimal)
16544 movupd mem, reg
16546 if (x86_sse_split_regs == true)
16548 movlpd mem, reg
16549 movhpd mem+8, reg
16551 else
16553 movsd mem, reg
16554 movhpd mem+8, reg
16558 void
16559 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16561 rtx op0, op1, m;
16563 op0 = operands[0];
16564 op1 = operands[1];
16566 if (TARGET_AVX
16567 && GET_MODE_SIZE (mode) == 32)
16569 switch (GET_MODE_CLASS (mode))
16571 case MODE_VECTOR_INT:
16572 case MODE_INT:
16573 op0 = gen_lowpart (V32QImode, op0);
16574 op1 = gen_lowpart (V32QImode, op1);
16575 /* FALLTHRU */
16577 case MODE_VECTOR_FLOAT:
16578 ix86_avx256_split_vector_move_misalign (op0, op1);
16579 break;
16581 default:
16582 gcc_unreachable ();
16585 return;
16588 if (MEM_P (op1))
16590 /* ??? If we have typed data, then it would appear that using
16591 movdqu is the only way to get unaligned data loaded with
16592 integer type. */
16593 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16595 op0 = gen_lowpart (V16QImode, op0);
16596 op1 = gen_lowpart (V16QImode, op1);
16597 /* We will eventually emit movups based on insn attributes. */
16598 emit_insn (gen_sse2_loaddqu (op0, op1));
16600 else if (TARGET_SSE2 && mode == V2DFmode)
16602 rtx zero;
16604 if (TARGET_AVX
16605 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16606 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16607 || optimize_insn_for_size_p ())
16609 /* We will eventually emit movups based on insn attributes. */
16610 emit_insn (gen_sse2_loadupd (op0, op1));
16611 return;
16614 /* When SSE registers are split into halves, we can avoid
16615 writing to the top half twice. */
16616 if (TARGET_SSE_SPLIT_REGS)
16618 emit_clobber (op0);
16619 zero = op0;
16621 else
16623 /* ??? Not sure about the best option for the Intel chips.
16624 The following would seem to satisfy; the register is
16625 entirely cleared, breaking the dependency chain. We
16626 then store to the upper half, with a dependency depth
16627 of one. A rumor has it that Intel recommends two movsd
16628 followed by an unpacklpd, but this is unconfirmed. And
16629 given that the dependency depth of the unpacklpd would
16630 still be one, I'm not sure why this would be better. */
16631 zero = CONST0_RTX (V2DFmode);
16634 m = adjust_address (op1, DFmode, 0);
16635 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16636 m = adjust_address (op1, DFmode, 8);
16637 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16639 else
16641 if (TARGET_AVX
16642 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16643 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16644 || optimize_insn_for_size_p ())
16646 op0 = gen_lowpart (V4SFmode, op0);
16647 op1 = gen_lowpart (V4SFmode, op1);
16648 emit_insn (gen_sse_loadups (op0, op1));
16649 return;
16652 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16653 emit_move_insn (op0, CONST0_RTX (mode));
16654 else
16655 emit_clobber (op0);
16657 if (mode != V4SFmode)
16658 op0 = gen_lowpart (V4SFmode, op0);
16660 m = adjust_address (op1, V2SFmode, 0);
16661 emit_insn (gen_sse_loadlps (op0, op0, m));
16662 m = adjust_address (op1, V2SFmode, 8);
16663 emit_insn (gen_sse_loadhps (op0, op0, m));
16666 else if (MEM_P (op0))
16668 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16670 op0 = gen_lowpart (V16QImode, op0);
16671 op1 = gen_lowpart (V16QImode, op1);
16672 /* We will eventually emit movups based on insn attributes. */
16673 emit_insn (gen_sse2_storedqu (op0, op1));
16675 else if (TARGET_SSE2 && mode == V2DFmode)
16677 if (TARGET_AVX
16678 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16679 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16680 || optimize_insn_for_size_p ())
16681 /* We will eventually emit movups based on insn attributes. */
16682 emit_insn (gen_sse2_storeupd (op0, op1));
16683 else
16685 m = adjust_address (op0, DFmode, 0);
16686 emit_insn (gen_sse2_storelpd (m, op1));
16687 m = adjust_address (op0, DFmode, 8);
16688 emit_insn (gen_sse2_storehpd (m, op1));
16691 else
16693 if (mode != V4SFmode)
16694 op1 = gen_lowpart (V4SFmode, op1);
16696 if (TARGET_AVX
16697 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16698 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16699 || optimize_insn_for_size_p ())
16701 op0 = gen_lowpart (V4SFmode, op0);
16702 emit_insn (gen_sse_storeups (op0, op1));
16704 else
16706 m = adjust_address (op0, V2SFmode, 0);
16707 emit_insn (gen_sse_storelps (m, op1));
16708 m = adjust_address (op0, V2SFmode, 8);
16709 emit_insn (gen_sse_storehps (m, op1));
16713 else
16714 gcc_unreachable ();
16717 /* Expand a push in MODE. This is some mode for which we do not support
16718 proper push instructions, at least from the registers that we expect
16719 the value to live in. */
16721 void
16722 ix86_expand_push (enum machine_mode mode, rtx x)
16724 rtx tmp;
16726 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16727 GEN_INT (-GET_MODE_SIZE (mode)),
16728 stack_pointer_rtx, 1, OPTAB_DIRECT);
16729 if (tmp != stack_pointer_rtx)
16730 emit_move_insn (stack_pointer_rtx, tmp);
16732 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16734   /* When we push an operand onto the stack, it has to be aligned at
16735      least at the function argument boundary.  However, since we don't
16736      have the argument type, we can't determine the actual argument
16737      boundary.  */
16738 emit_move_insn (tmp, x);
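/* Illustrative example, not part of the original source: pushing a value in
   a mode with no native push instruction expands to an explicit stack
   adjustment followed by an ordinary store, e.g. roughly "sub $16, %esp"
   and then a move into the memory at the new stack pointer.  */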
16741 /* Helper function of ix86_fixup_binary_operands to canonicalize
16742 operand order. Returns true if the operands should be swapped. */
16744 static bool
16745 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16746 rtx operands[])
16748 rtx dst = operands[0];
16749 rtx src1 = operands[1];
16750 rtx src2 = operands[2];
16752 /* If the operation is not commutative, we can't do anything. */
16753 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16754 return false;
16756 /* Highest priority is that src1 should match dst. */
16757 if (rtx_equal_p (dst, src1))
16758 return false;
16759 if (rtx_equal_p (dst, src2))
16760 return true;
16762 /* Next highest priority is that immediate constants come second. */
16763 if (immediate_operand (src2, mode))
16764 return false;
16765 if (immediate_operand (src1, mode))
16766 return true;
16768 /* Lowest priority is that memory references should come second. */
16769 if (MEM_P (src2))
16770 return false;
16771 if (MEM_P (src1))
16772 return true;
16774 return false;
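/* Illustrative example, not part of the original source: for a commutative
   PLUS written as r1 = r2 + r1, operands 1 and 2 are swapped so the
   two-address form r1 = r1 + r2 can be matched directly; the same ordering
   rules push immediates and memory references into the second source.  */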
16778 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16779 destination to use for the operation. If different from the true
16780 destination in operands[0], a copy operation will be required. */
16783 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16784 rtx operands[])
16786 rtx dst = operands[0];
16787 rtx src1 = operands[1];
16788 rtx src2 = operands[2];
16790 /* Canonicalize operand order. */
16791 if (ix86_swap_binary_operands_p (code, mode, operands))
16793 rtx temp;
16795 /* It is invalid to swap operands of different modes. */
16796 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16798 temp = src1;
16799 src1 = src2;
16800 src2 = temp;
16803 /* Both source operands cannot be in memory. */
16804 if (MEM_P (src1) && MEM_P (src2))
16806 /* Optimization: Only read from memory once. */
16807 if (rtx_equal_p (src1, src2))
16809 src2 = force_reg (mode, src2);
16810 src1 = src2;
16812 else
16813 src2 = force_reg (mode, src2);
16816 /* If the destination is memory, and we do not have matching source
16817 operands, do things in registers. */
16818 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16819 dst = gen_reg_rtx (mode);
16821 /* Source 1 cannot be a constant. */
16822 if (CONSTANT_P (src1))
16823 src1 = force_reg (mode, src1);
16825 /* Source 1 cannot be a non-matching memory. */
16826 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16827 src1 = force_reg (mode, src1);
16829 /* Improve address combine. */
16830 if (code == PLUS
16831 && GET_MODE_CLASS (mode) == MODE_INT
16832 && MEM_P (src2))
16833 src2 = force_reg (mode, src2);
16835 operands[1] = src1;
16836 operands[2] = src2;
16837 return dst;
16840 /* Similarly, but assume that the destination has already been
16841 set up properly. */
16843 void
16844 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16845 enum machine_mode mode, rtx operands[])
16847 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16848 gcc_assert (dst == operands[0]);
16851 /* Attempt to expand a binary operator.  Make the expansion closer to the
16852    actual machine than just general_operand, which would allow 3 separate
16853    memory references (one output, two inputs) in a single insn.  */
16855 void
16856 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16857 rtx operands[])
16859 rtx src1, src2, dst, op, clob;
16861 dst = ix86_fixup_binary_operands (code, mode, operands);
16862 src1 = operands[1];
16863 src2 = operands[2];
16865 /* Emit the instruction. */
16867 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16868 if (reload_in_progress)
16870 /* Reload doesn't know about the flags register, and doesn't know that
16871 it doesn't want to clobber it. We can only do this with PLUS. */
16872 gcc_assert (code == PLUS);
16873 emit_insn (op);
16875 else if (reload_completed
16876 && code == PLUS
16877 && !rtx_equal_p (dst, src1))
16879 /* This is going to be an LEA; avoid splitting it later. */
16880 emit_insn (op);
16882 else
16884 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16885 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16888 /* Fix up the destination if needed. */
16889 if (dst != operands[0])
16890 emit_move_insn (operands[0], dst);
16893 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16894 the given OPERANDS. */
16896 void
16897 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16898 rtx operands[])
16900 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16901 if (GET_CODE (operands[1]) == SUBREG)
16903 op1 = operands[1];
16904 op2 = operands[2];
16906 else if (GET_CODE (operands[2]) == SUBREG)
16908 op1 = operands[2];
16909 op2 = operands[1];
16911   /* Optimize (__m128i) d | (__m128i) e and similar code, where d and e
16912      are float vectors, into a float vector logical insn.  In C/C++,
16913      without using intrinsics there is no other way to express a vector
16914      logical operation on float vectors than to cast them temporarily to
16915      integer vectors.  */
16916 if (op1
16917 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16918 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16919 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16920 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16921 && SUBREG_BYTE (op1) == 0
16922 && (GET_CODE (op2) == CONST_VECTOR
16923 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16924 && SUBREG_BYTE (op2) == 0))
16925 && can_create_pseudo_p ())
16927 rtx dst;
16928 switch (GET_MODE (SUBREG_REG (op1)))
16930 case V4SFmode:
16931 case V8SFmode:
16932 case V2DFmode:
16933 case V4DFmode:
16934 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16935 if (GET_CODE (op2) == CONST_VECTOR)
16937 op2 = gen_lowpart (GET_MODE (dst), op2);
16938 op2 = force_reg (GET_MODE (dst), op2);
16940 else
16942 op1 = operands[1];
16943 op2 = SUBREG_REG (operands[2]);
16944 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16945 op2 = force_reg (GET_MODE (dst), op2);
16947 op1 = SUBREG_REG (op1);
16948 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16949 op1 = force_reg (GET_MODE (dst), op1);
16950 emit_insn (gen_rtx_SET (VOIDmode, dst,
16951 gen_rtx_fmt_ee (code, GET_MODE (dst),
16952 op1, op2)));
16953 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16954 return;
16955 default:
16956 break;
16959 if (!nonimmediate_operand (operands[1], mode))
16960 operands[1] = force_reg (mode, operands[1]);
16961 if (!nonimmediate_operand (operands[2], mode))
16962 operands[2] = force_reg (mode, operands[2]);
16963 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16964 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16965 gen_rtx_fmt_ee (code, mode, operands[1],
16966 operands[2])));
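/* As an illustrative example of the transformation above (using GCC's
   vector extensions):

     __m128 a, b;
     __m128i r = (__m128i) a | (__m128i) b;

   Here the IOR can be performed directly on the V4SFmode values, so a
   float-domain logical instruction such as orps can be emitted instead
   of an integer-domain por on the bitcast operands, provided the
   SUBREG/CONST_VECTOR conditions checked above hold.  */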
16969 /* Return TRUE or FALSE depending on whether the binary operator meets the
16970 appropriate constraints. */
16972 bool
16973 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16974 rtx operands[3])
16976 rtx dst = operands[0];
16977 rtx src1 = operands[1];
16978 rtx src2 = operands[2];
16980 /* Both source operands cannot be in memory. */
16981 if (MEM_P (src1) && MEM_P (src2))
16982 return false;
16984 /* Canonicalize operand order for commutative operators. */
16985 if (ix86_swap_binary_operands_p (code, mode, operands))
16987 rtx temp = src1;
16988 src1 = src2;
16989 src2 = temp;
16992 /* If the destination is memory, we must have a matching source operand. */
16993 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16994 return false;
16996 /* Source 1 cannot be a constant. */
16997 if (CONSTANT_P (src1))
16998 return false;
17000 /* Source 1 cannot be a non-matching memory. */
17001 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17002 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17003 return (code == AND
17004 && (mode == HImode
17005 || mode == SImode
17006 || (TARGET_64BIT && mode == DImode))
17007 && satisfies_constraint_L (src2));
17009 return true;
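/* Illustrative instance of the AND special case above: an insn such as

     (set (reg:SI 0) (and:SI (mem:SI ...) (const_int 0xffff)))

   is accepted even though the memory source does not match the
   destination, because with a constraint-L mask (0xff, 0xffff, ...)
   it is really a zero-extending load rather than a read-modify-write
   AND.  */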
17012 /* Attempt to expand a unary operator. Make the expansion closer to the
17013 actual machine than just general_operand, which will allow 2 separate
17014 memory references (one output, one input) in a single insn. */
17016 void
17017 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17018 rtx operands[])
17020 int matching_memory;
17021 rtx src, dst, op, clob;
17023 dst = operands[0];
17024 src = operands[1];
17026 /* If the destination is memory, and we do not have matching source
17027 operands, do things in registers. */
17028 matching_memory = 0;
17029 if (MEM_P (dst))
17031 if (rtx_equal_p (dst, src))
17032 matching_memory = 1;
17033 else
17034 dst = gen_reg_rtx (mode);
17037 /* When source operand is memory, destination must match. */
17038 if (MEM_P (src) && !matching_memory)
17039 src = force_reg (mode, src);
17041 /* Emit the instruction. */
17043 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17044 if (reload_in_progress || code == NOT)
17046 /* Reload doesn't know about the flags register, and doesn't know that
17047 it doesn't want to clobber it. */
17048 gcc_assert (code == NOT);
17049 emit_insn (op);
17051 else
17053 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17054 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17057 /* Fix up the destination if needed. */
17058 if (dst != operands[0])
17059 emit_move_insn (operands[0], dst);
17062 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17063 divisor are within the range [0-255]. */
17065 void
17066 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17067 bool signed_p)
17069 rtx end_label, qimode_label;
17070 rtx insn, div, mod;
17071 rtx scratch, tmp0, tmp1, tmp2;
17072 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17073 rtx (*gen_zero_extend) (rtx, rtx);
17074 rtx (*gen_test_ccno_1) (rtx, rtx);
17076 switch (mode)
17078 case SImode:
17079 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17080 gen_test_ccno_1 = gen_testsi_ccno_1;
17081 gen_zero_extend = gen_zero_extendqisi2;
17082 break;
17083 case DImode:
17084 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17085 gen_test_ccno_1 = gen_testdi_ccno_1;
17086 gen_zero_extend = gen_zero_extendqidi2;
17087 break;
17088 default:
17089 gcc_unreachable ();
17092 end_label = gen_label_rtx ();
17093 qimode_label = gen_label_rtx ();
17095 scratch = gen_reg_rtx (mode);
17097 /* Use 8bit unsigned divmod if dividend and divisor are within
17098 the range [0-255]. */
17099 emit_move_insn (scratch, operands[2]);
17100 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17101 scratch, 1, OPTAB_DIRECT);
17102 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17103 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17104 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17105 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17106 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17107 pc_rtx);
17108 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17109 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17110 JUMP_LABEL (insn) = qimode_label;
17112 /* Generate original signed/unsigned divmod. */
17113 div = gen_divmod4_1 (operands[0], operands[1],
17114 operands[2], operands[3]);
17115 emit_insn (div);
17117 /* Branch to the end. */
17118 emit_jump_insn (gen_jump (end_label));
17119 emit_barrier ();
17121 /* Generate 8bit unsigned divide. */
17122 emit_label (qimode_label);
17123 /* Don't use operands[0] for result of 8bit divide since not all
17124 registers support QImode ZERO_EXTRACT. */
17125 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17126 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17127 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17128 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17130 if (signed_p)
17132 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17133 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17135 else
17137 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17138 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17141 /* Extract remainder from AH. */
17142 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17143 if (REG_P (operands[1]))
17144 insn = emit_move_insn (operands[1], tmp1);
17145 else
17147 /* Need a new scratch register since the old one has result
17148 of 8bit divide. */
17149 scratch = gen_reg_rtx (mode);
17150 emit_move_insn (scratch, tmp1);
17151 insn = emit_move_insn (operands[1], scratch);
17153 set_unique_reg_note (insn, REG_EQUAL, mod);
17155 /* Zero extend quotient from AL. */
17156 tmp1 = gen_lowpart (QImode, tmp0);
17157 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17158 set_unique_reg_note (insn, REG_EQUAL, div);
17160 emit_label (end_label);
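/* Rough shape of the code emitted above for a 32-bit unsigned
   division (illustrative, not the exact RTL):

       mov     dividend, %scratch
       or      divisor, %scratch
       test    $0xffffff00, %scratch   # both operands fit in 8 bits?
       je      .Lqimode
       ...full 32-bit div/idiv...
       jmp     .Ldone
   .Lqimode:
       ...divb...                      # quotient in %al, remainder in %ah
   .Ldone:

   with the quotient zero-extended from AL, the remainder extracted
   from AH, and a 50% branch probability recorded for the QImode path.  */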
17163 #define LEA_MAX_STALL (3)
17164 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17166 /* Increase given DISTANCE in half-cycles according to
17167 dependencies between PREV and NEXT instructions.
17168 Add 1 half-cycle if there is no dependency and
17169 go to the next cycle if there is some dependency. */
17171 static unsigned int
17172 increase_distance (rtx prev, rtx next, unsigned int distance)
17174 df_ref *use_rec;
17175 df_ref *def_rec;
17177 if (!prev || !next)
17178 return distance + (distance & 1) + 2;
17180 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17181 return distance + 1;
17183 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17184 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17185 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17186 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17187 return distance + (distance & 1) + 2;
17189 return distance + 1;
17192 /* Function checks if instruction INSN defines register number
17193 REGNO1 or REGNO2. */
17195 static bool
17196 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17197 rtx insn)
17199 df_ref *def_rec;
17201 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17202 if (DF_REF_REG_DEF_P (*def_rec)
17203 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17204 && (regno1 == DF_REF_REGNO (*def_rec)
17205 || regno2 == DF_REF_REGNO (*def_rec)))
17207 return true;
17210 return false;
17213 /* Function checks if instruction INSN uses register number
17214 REGNO as a part of address expression. */
17216 static bool
17217 insn_uses_reg_mem (unsigned int regno, rtx insn)
17219 df_ref *use_rec;
17221 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17222 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17223 return true;
17225 return false;
17228 /* Search backward for non-agu definition of register number REGNO1
17229 or register number REGNO2 in basic block starting from instruction
17230 START up to head of basic block or instruction INSN.
17232 Set *FOUND to true if a definition was found and to false
17233 otherwise.
17235 Distance in half-cycles between START and found instruction or head
17236 of BB is added to DISTANCE and returned. */
17238 static int
17239 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17240 rtx insn, int distance,
17241 rtx start, bool *found)
17243 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17244 rtx prev = start;
17245 rtx next = NULL;
17247 *found = false;
17249 while (prev
17250 && prev != insn
17251 && distance < LEA_SEARCH_THRESHOLD)
17253 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17255 distance = increase_distance (prev, next, distance);
17256 if (insn_defines_reg (regno1, regno2, prev))
17258 if (recog_memoized (prev) < 0
17259 || get_attr_type (prev) != TYPE_LEA)
17261 *found = true;
17262 return distance;
17266 next = prev;
17268 if (prev == BB_HEAD (bb))
17269 break;
17271 prev = PREV_INSN (prev);
17274 return distance;
17277 /* Search backward for non-agu definition of register number REGNO1
17278 or register number REGNO2 in INSN's basic block until
17279 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17280 2. Reach the neighbour BB's boundary, or
17281 3. Reach agu definition.
17282 Returns the distance between the non-agu definition point and INSN.
17283 If no definition point, returns -1. */
17285 static int
17286 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17287 rtx insn)
17289 basic_block bb = BLOCK_FOR_INSN (insn);
17290 int distance = 0;
17291 bool found = false;
17293 if (insn != BB_HEAD (bb))
17294 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17295 distance, PREV_INSN (insn),
17296 &found);
17298 if (!found && distance < LEA_SEARCH_THRESHOLD)
17300 edge e;
17301 edge_iterator ei;
17302 bool simple_loop = false;
17304 FOR_EACH_EDGE (e, ei, bb->preds)
17305 if (e->src == bb)
17307 simple_loop = true;
17308 break;
17311 if (simple_loop)
17312 distance = distance_non_agu_define_in_bb (regno1, regno2,
17313 insn, distance,
17314 BB_END (bb), &found);
17315 else
17317 int shortest_dist = -1;
17318 bool found_in_bb = false;
17320 FOR_EACH_EDGE (e, ei, bb->preds)
17322 int bb_dist
17323 = distance_non_agu_define_in_bb (regno1, regno2,
17324 insn, distance,
17325 BB_END (e->src),
17326 &found_in_bb);
17327 if (found_in_bb)
17329 if (shortest_dist < 0)
17330 shortest_dist = bb_dist;
17331 else if (bb_dist > 0)
17332 shortest_dist = MIN (bb_dist, shortest_dist);
17334 found = true;
17338 distance = shortest_dist;
17342 /* get_attr_type may modify recog data. We want to make sure
17343 that recog data is valid for instruction INSN, on which
17344 distance_non_agu_define is called. INSN is unchanged here. */
17345 extract_insn_cached (insn);
17347 if (!found)
17348 return -1;
17350 return distance >> 1;
17353 /* Return the distance in half-cycles between INSN and the next
17354 insn that uses register number REGNO in a memory address, added
17355 to DISTANCE. Return -1 if REGNO is set.
17357 Set *FOUND to true if a register use was found and to false
17358 otherwise.
17359 Set *REDEFINED to true if a register redefinition was found
17360 and to false otherwise. */
17362 static int
17363 distance_agu_use_in_bb (unsigned int regno,
17364 rtx insn, int distance, rtx start,
17365 bool *found, bool *redefined)
17367 basic_block bb = NULL;
17368 rtx next = start;
17369 rtx prev = NULL;
17371 *found = false;
17372 *redefined = false;
17374 if (start != NULL_RTX)
17376 bb = BLOCK_FOR_INSN (start);
17377 if (start != BB_HEAD (bb))
17378 /* If insn and start belong to the same bb, set prev to insn,
17379 so the call to increase_distance will increase the distance
17380 between insns by 1. */
17381 prev = insn;
17384 while (next
17385 && next != insn
17386 && distance < LEA_SEARCH_THRESHOLD)
17388 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17390 distance = increase_distance(prev, next, distance);
17391 if (insn_uses_reg_mem (regno, next))
17393 /* Return DISTANCE if OP0 is used in memory
17394 address in NEXT. */
17395 *found = true;
17396 return distance;
17399 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17401 /* Return -1 if OP0 is set in NEXT. */
17402 *redefined = true;
17403 return -1;
17406 prev = next;
17409 if (next == BB_END (bb))
17410 break;
17412 next = NEXT_INSN (next);
17415 return distance;
17418 /* Return the distance between INSN and the next insn that uses
17419 register number REGNO0 in a memory address. Return -1 if no such
17420 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17422 static int
17423 distance_agu_use (unsigned int regno0, rtx insn)
17425 basic_block bb = BLOCK_FOR_INSN (insn);
17426 int distance = 0;
17427 bool found = false;
17428 bool redefined = false;
17430 if (insn != BB_END (bb))
17431 distance = distance_agu_use_in_bb (regno0, insn, distance,
17432 NEXT_INSN (insn),
17433 &found, &redefined);
17435 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17437 edge e;
17438 edge_iterator ei;
17439 bool simple_loop = false;
17441 FOR_EACH_EDGE (e, ei, bb->succs)
17442 if (e->dest == bb)
17444 simple_loop = true;
17445 break;
17448 if (simple_loop)
17449 distance = distance_agu_use_in_bb (regno0, insn,
17450 distance, BB_HEAD (bb),
17451 &found, &redefined);
17452 else
17454 int shortest_dist = -1;
17455 bool found_in_bb = false;
17456 bool redefined_in_bb = false;
17458 FOR_EACH_EDGE (e, ei, bb->succs)
17460 int bb_dist
17461 = distance_agu_use_in_bb (regno0, insn,
17462 distance, BB_HEAD (e->dest),
17463 &found_in_bb, &redefined_in_bb);
17464 if (found_in_bb)
17466 if (shortest_dist < 0)
17467 shortest_dist = bb_dist;
17468 else if (bb_dist > 0)
17469 shortest_dist = MIN (bb_dist, shortest_dist);
17471 found = true;
17475 distance = shortest_dist;
17479 if (!found || redefined)
17480 return -1;
17482 return distance >> 1;
17485 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17486 there is a dilemma of choosing LEA or ADD.
17487 Negative value: ADD is preferred over LEA
17488 Zero: Neutral
17489 Positive value: LEA is preferred over ADD. */
17490 #define IX86_LEA_PRIORITY 0
17492 /* Return true if use of lea INSN has a performance advantage
17493 over a sequence of instructions. The instruction sequence has
17494 SPLIT_COST cycles higher latency than the lea latency. */
17496 static bool
17497 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17498 unsigned int regno2, int split_cost, bool has_scale)
17500 int dist_define, dist_use;
17502 /* For Silvermont, if a 2-source or 3-source LEA is used for a
17503 non-destructive destination, or in order to be able to use
17504 SCALE, the use of LEA is justified. */
17505 if (ix86_tune == PROCESSOR_SLM)
17507 if (has_scale)
17508 return true;
17509 if (split_cost < 1)
17510 return false;
17511 if (regno0 == regno1 || regno0 == regno2)
17512 return false;
17513 return true;
17516 dist_define = distance_non_agu_define (regno1, regno2, insn);
17517 dist_use = distance_agu_use (regno0, insn);
17519 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17521 /* If there is no non-AGU operand definition, no AGU operand
17522 usage and the split cost is 0, then both the lea and non-lea
17523 variants have the same priority. Currently we prefer lea
17524 for 64-bit code and non-lea for 32-bit code. */
17526 if (dist_use < 0 && split_cost == 0)
17527 return TARGET_64BIT || IX86_LEA_PRIORITY;
17528 else
17529 return true;
17532 /* With a longer definition distance, lea is preferable.
17533 Here we change it to take into account splitting cost and
17534 lea priority. */
17535 dist_define += split_cost + IX86_LEA_PRIORITY;
17537 /* If there is no use in a memory address then we just check
17538 that split cost exceeds AGU stall. */
17539 if (dist_use < 0)
17540 return dist_define > LEA_MAX_STALL;
17542 /* If this insn has both backward non-agu dependence and forward
17543 agu dependence, the one with the shorter distance takes effect. */
17544 return dist_define >= dist_use;
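/* Illustrative numbers for the heuristic above: with split_cost 1 and
   IX86_LEA_PRIORITY 0, a non-AGU definition one cycle before the lea
   gives an adjusted dist_define of 2.  If the lea result feeds an
   address only one cycle later (dist_use 1), 2 >= 1 holds and the lea
   is kept; if the first address use is three cycles away (dist_use 3),
   the lea is split instead.  */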
17547 /* Return true if it is legal to clobber flags by INSN and
17548 false otherwise. */
17550 static bool
17551 ix86_ok_to_clobber_flags (rtx insn)
17553 basic_block bb = BLOCK_FOR_INSN (insn);
17554 df_ref *use;
17555 bitmap live;
17557 while (insn)
17559 if (NONDEBUG_INSN_P (insn))
17561 for (use = DF_INSN_USES (insn); *use; use++)
17562 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17563 return false;
17565 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17566 return true;
17569 if (insn == BB_END (bb))
17570 break;
17572 insn = NEXT_INSN (insn);
17575 live = df_get_live_out(bb);
17576 return !REGNO_REG_SET_P (live, FLAGS_REG);
17579 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17580 move and add to avoid AGU stalls. */
17582 bool
17583 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17585 unsigned int regno0, regno1, regno2;
17587 /* Check if we need to optimize. */
17588 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17589 return false;
17591 /* Check it is correct to split here. */
17592 if (!ix86_ok_to_clobber_flags(insn))
17593 return false;
17595 regno0 = true_regnum (operands[0]);
17596 regno1 = true_regnum (operands[1]);
17597 regno2 = true_regnum (operands[2]);
17599 /* We need to split only adds with a non-destructive
17600 destination operand. */
17601 if (regno0 == regno1 || regno0 == regno2)
17602 return false;
17603 else
17604 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17607 /* Return true if we should emit lea instruction instead of mov
17608 instruction. */
17610 bool
17611 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17613 unsigned int regno0, regno1;
17615 /* Check if we need to optimize. */
17616 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17617 return false;
17619 /* Use lea for reg to reg moves only. */
17620 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17621 return false;
17623 regno0 = true_regnum (operands[0]);
17624 regno1 = true_regnum (operands[1]);
17626 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17629 /* Return true if we need to split lea into a sequence of
17630 instructions to avoid AGU stalls. */
17632 bool
17633 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17635 unsigned int regno0, regno1, regno2;
17636 int split_cost;
17637 struct ix86_address parts;
17638 int ok;
17640 /* Check we need to optimize. */
17641 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17642 return false;
17644 /* Check it is correct to split here. */
17645 if (!ix86_ok_to_clobber_flags(insn))
17646 return false;
17648 ok = ix86_decompose_address (operands[1], &parts);
17649 gcc_assert (ok);
17651 /* There should be at least two components in the address. */
17652 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17653 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17654 return false;
17656 /* We should not split into add if a non-legitimate PIC
17657 operand is used as the displacement. */
17658 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17659 return false;
17661 regno0 = true_regnum (operands[0]);
17662 regno1 = INVALID_REGNUM;
17663 regno2 = INVALID_REGNUM;
17665 if (parts.base)
17666 regno1 = true_regnum (parts.base);
17667 if (parts.index)
17668 regno2 = true_regnum (parts.index);
17670 split_cost = 0;
17672 /* Compute how many cycles we will add to execution time
17673 if we split the lea into a sequence of instructions. */
17674 if (parts.base || parts.index)
17676 /* Have to use a mov instruction if the non-destructive
17677 destination form is used. */
17678 if (regno1 != regno0 && regno2 != regno0)
17679 split_cost += 1;
17681 /* Have to add index to base if both exist. */
17682 if (parts.base && parts.index)
17683 split_cost += 1;
17685 /* Have to use shift and adds if scale is 2 or greater. */
17686 if (parts.scale > 1)
17688 if (regno0 != regno1)
17689 split_cost += 1;
17690 else if (regno2 == regno0)
17691 split_cost += 4;
17692 else
17693 split_cost += parts.scale;
17696 /* Have to use add instruction with immediate if
17697 disp is nonzero. */
17698 if (parts.disp && parts.disp != const0_rtx)
17699 split_cost += 1;
17701 /* Subtract the price of lea. */
17702 split_cost -= 1;
17705 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17706 parts.scale > 1);
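/* Worked example of the cost accounting above (register choices purely
   illustrative): for "lea 4(%rbx,%rcx,2), %rax" the split needs a mov
   because the destination matches neither source (+1), an add of the
   index to the base (+1), a shift for the scale (+1) and an add of the
   displacement (+1); subtracting 1 for the dropped lea, split_cost
   is 3.  */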
17709 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17710 matches destination. RTX includes clobber of FLAGS_REG. */
17712 static void
17713 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17714 rtx dst, rtx src)
17716 rtx op, clob;
17718 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17719 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17721 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17724 /* Return true if the definition of regno1 is nearer to INSN than that of regno2. */
17726 static bool
17727 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17729 rtx prev = insn;
17730 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17732 if (insn == start)
17733 return false;
17734 while (prev && prev != start)
17736 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17738 prev = PREV_INSN (prev);
17739 continue;
17741 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17742 return true;
17743 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17744 return false;
17745 prev = PREV_INSN (prev);
17748 /* None of the regs is defined in the bb. */
17749 return false;
17752 /* Split lea instructions into a sequence of instructions
17753 which are executed on ALU to avoid AGU stalls.
17754 It is assumed that it is allowed to clobber flags register
17755 at lea position. */
17757 void
17758 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17760 unsigned int regno0, regno1, regno2;
17761 struct ix86_address parts;
17762 rtx target, tmp;
17763 int ok, adds;
17765 ok = ix86_decompose_address (operands[1], &parts);
17766 gcc_assert (ok);
17768 target = gen_lowpart (mode, operands[0]);
17770 regno0 = true_regnum (target);
17771 regno1 = INVALID_REGNUM;
17772 regno2 = INVALID_REGNUM;
17774 if (parts.base)
17776 parts.base = gen_lowpart (mode, parts.base);
17777 regno1 = true_regnum (parts.base);
17780 if (parts.index)
17782 parts.index = gen_lowpart (mode, parts.index);
17783 regno2 = true_regnum (parts.index);
17786 if (parts.disp)
17787 parts.disp = gen_lowpart (mode, parts.disp);
17789 if (parts.scale > 1)
17791 /* Case r1 = r1 + ... */
17792 if (regno1 == regno0)
17794 /* If we have a case r1 = r1 + C * r1 then we
17795 should use multiplication which is very
17796 expensive. Assume cost model is wrong if we
17797 have such case here. */
17798 gcc_assert (regno2 != regno0);
17800 for (adds = parts.scale; adds > 0; adds--)
17801 ix86_emit_binop (PLUS, mode, target, parts.index);
17803 else
17805 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17806 if (regno0 != regno2)
17807 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17809 /* Use shift for scaling. */
17810 ix86_emit_binop (ASHIFT, mode, target,
17811 GEN_INT (exact_log2 (parts.scale)));
17813 if (parts.base)
17814 ix86_emit_binop (PLUS, mode, target, parts.base);
17816 if (parts.disp && parts.disp != const0_rtx)
17817 ix86_emit_binop (PLUS, mode, target, parts.disp);
17820 else if (!parts.base && !parts.index)
17822 gcc_assert(parts.disp);
17823 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17825 else
17827 if (!parts.base)
17829 if (regno0 != regno2)
17830 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17832 else if (!parts.index)
17834 if (regno0 != regno1)
17835 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17837 else
17839 if (regno0 == regno1)
17840 tmp = parts.index;
17841 else if (regno0 == regno2)
17842 tmp = parts.base;
17843 else
17845 rtx tmp1;
17847 /* Find better operand for SET instruction, depending
17848 on which definition is farther from the insn. */
17849 if (find_nearest_reg_def (insn, regno1, regno2))
17850 tmp = parts.index, tmp1 = parts.base;
17851 else
17852 tmp = parts.base, tmp1 = parts.index;
17854 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17856 if (parts.disp && parts.disp != const0_rtx)
17857 ix86_emit_binop (PLUS, mode, target, parts.disp);
17859 ix86_emit_binop (PLUS, mode, target, tmp1);
17860 return;
17863 ix86_emit_binop (PLUS, mode, target, tmp);
17866 if (parts.disp && parts.disp != const0_rtx)
17867 ix86_emit_binop (PLUS, mode, target, parts.disp);
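/* For instance (register names purely illustrative), the splitter above
   turns

       lea     (%rbx,%rcx,4), %rax

   into roughly

       mov     %rcx, %rax
       sal     $2, %rax
       add     %rbx, %rax

   i.e. copy the index, shift by log2 of the scale, then add the base
   and, if present, the displacement.  */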
17871 /* Return true if it is ok to optimize an ADD operation to LEA
17872 operation to avoid flag register consumption. For most processors,
17873 ADD is faster than LEA. For processors like ATOM, if the
17874 destination register of LEA holds an actual address which will be
17875 used soon, LEA is better and otherwise ADD is better. */
17877 bool
17878 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17880 unsigned int regno0 = true_regnum (operands[0]);
17881 unsigned int regno1 = true_regnum (operands[1]);
17882 unsigned int regno2 = true_regnum (operands[2]);
17884 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17885 if (regno0 != regno1 && regno0 != regno2)
17886 return true;
17888 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17889 return false;
17891 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
17894 /* Return true if destination reg of SET_BODY is shift count of
17895 USE_BODY. */
17897 static bool
17898 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17900 rtx set_dest;
17901 rtx shift_rtx;
17902 int i;
17904 /* Retrieve destination of SET_BODY. */
17905 switch (GET_CODE (set_body))
17907 case SET:
17908 set_dest = SET_DEST (set_body);
17909 if (!set_dest || !REG_P (set_dest))
17910 return false;
17911 break;
17912 case PARALLEL:
17913 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17914 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17915 use_body))
17916 return true;
17917 default:
17918 return false;
17919 break;
17922 /* Retrieve shift count of USE_BODY. */
17923 switch (GET_CODE (use_body))
17925 case SET:
17926 shift_rtx = XEXP (use_body, 1);
17927 break;
17928 case PARALLEL:
17929 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17930 if (ix86_dep_by_shift_count_body (set_body,
17931 XVECEXP (use_body, 0, i)))
17932 return true;
17933 default:
17934 return false;
17935 break;
17938 if (shift_rtx
17939 && (GET_CODE (shift_rtx) == ASHIFT
17940 || GET_CODE (shift_rtx) == LSHIFTRT
17941 || GET_CODE (shift_rtx) == ASHIFTRT
17942 || GET_CODE (shift_rtx) == ROTATE
17943 || GET_CODE (shift_rtx) == ROTATERT))
17945 rtx shift_count = XEXP (shift_rtx, 1);
17947 /* Return true if shift count is dest of SET_BODY. */
17948 if (REG_P (shift_count))
17950 /* Add check since it can be invoked before register
17951 allocation in pre-reload schedule. */
17952 if (reload_completed
17953 && true_regnum (set_dest) == true_regnum (shift_count))
17954 return true;
17955 else if (REGNO (set_dest) == REGNO (shift_count))
17956 return true;
17960 return false;
17963 /* Return true if destination reg of SET_INSN is shift count of
17964 USE_INSN. */
17966 bool
17967 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17969 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17970 PATTERN (use_insn));
17973 /* Return TRUE or FALSE depending on whether the unary operator meets the
17974 appropriate constraints. */
17976 bool
17977 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17978 enum machine_mode mode ATTRIBUTE_UNUSED,
17979 rtx operands[2])
17981 /* If one of operands is memory, source and destination must match. */
17982 if ((MEM_P (operands[0])
17983 || MEM_P (operands[1]))
17984 && ! rtx_equal_p (operands[0], operands[1]))
17985 return false;
17986 return true;
17989 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17990 are ok, keeping in mind the possible movddup alternative. */
17992 bool
17993 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17995 if (MEM_P (operands[0]))
17996 return rtx_equal_p (operands[0], operands[1 + high]);
17997 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17998 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17999 return true;
18002 /* Post-reload splitter for converting an SFmode or DFmode value in an
18003 SSE register into an unsigned SImode. */
18005 void
18006 ix86_split_convert_uns_si_sse (rtx operands[])
18008 enum machine_mode vecmode;
18009 rtx value, large, zero_or_two31, input, two31, x;
18011 large = operands[1];
18012 zero_or_two31 = operands[2];
18013 input = operands[3];
18014 two31 = operands[4];
18015 vecmode = GET_MODE (large);
18016 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18018 /* Load up the value into the low element. We must ensure that the other
18019 elements are valid floats -- zero is the easiest such value. */
18020 if (MEM_P (input))
18022 if (vecmode == V4SFmode)
18023 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18024 else
18025 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18027 else
18029 input = gen_rtx_REG (vecmode, REGNO (input));
18030 emit_move_insn (value, CONST0_RTX (vecmode));
18031 if (vecmode == V4SFmode)
18032 emit_insn (gen_sse_movss (value, value, input));
18033 else
18034 emit_insn (gen_sse2_movsd (value, value, input));
18037 emit_move_insn (large, two31);
18038 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18040 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18041 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18043 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18044 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18046 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18047 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18049 large = gen_rtx_REG (V4SImode, REGNO (large));
18050 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18052 x = gen_rtx_REG (V4SImode, REGNO (value));
18053 if (vecmode == V4SFmode)
18054 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18055 else
18056 emit_insn (gen_sse2_cvttpd2dq (x, value));
18057 value = x;
18059 emit_insn (gen_xorv4si3 (value, value, large));
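/* Numeric illustration of the sequence above: converting 3e9 (which is
   >= 2^31), zero_or_two31 ends up holding 2^31, the subtraction leaves
   852516352.0, the signed truncation gives 852516352, and xoring with
   the 0x80000000 produced from the shifted mask restores 3000000000.  */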
18062 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18063 Expects the 64-bit DImode to be supplied in a pair of integral
18064 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18065 -mfpmath=sse, !optimize_size only. */
18067 void
18068 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18070 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18071 rtx int_xmm, fp_xmm;
18072 rtx biases, exponents;
18073 rtx x;
18075 int_xmm = gen_reg_rtx (V4SImode);
18076 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18077 emit_insn (gen_movdi_to_sse (int_xmm, input));
18078 else if (TARGET_SSE_SPLIT_REGS)
18080 emit_clobber (int_xmm);
18081 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18083 else
18085 x = gen_reg_rtx (V2DImode);
18086 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18087 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18090 x = gen_rtx_CONST_VECTOR (V4SImode,
18091 gen_rtvec (4, GEN_INT (0x43300000UL),
18092 GEN_INT (0x45300000UL),
18093 const0_rtx, const0_rtx));
18094 exponents = validize_mem (force_const_mem (V4SImode, x));
18096 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18097 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18099 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18100 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18101 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18102 (0x1.0p84 + double(fp_value_hi_xmm)).
18103 Note these exponents differ by 32. */
18105 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18107 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18108 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18109 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18110 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18111 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18112 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18113 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18114 biases = validize_mem (force_const_mem (V2DFmode, biases));
18115 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18117 /* Add the upper and lower DFmode values together. */
18118 if (TARGET_SSE3)
18119 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18120 else
18122 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18123 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18124 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18127 ix86_expand_vector_extract (false, target, fp_xmm, 0);
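/* Concrete instance of the bias trick above: for the DImode input
   2^32 + 5 (hi = 1, lo = 5) the interleave builds the doubles
   0x43300000_00000005 = 0x1.0p52 + 5 and 0x45300000_00000001 =
   0x1.0p84 + 2^32; subtracting the two biases leaves 5.0 and
   4294967296.0, and the final add produces 4294967301.0.  */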
18130 /* Not used, but eases macroization of patterns. */
18131 void
18132 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18133 rtx input ATTRIBUTE_UNUSED)
18135 gcc_unreachable ();
18138 /* Convert an unsigned SImode value into a DFmode. Only currently used
18139 for SSE, but applicable anywhere. */
18141 void
18142 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18144 REAL_VALUE_TYPE TWO31r;
18145 rtx x, fp;
18147 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18148 NULL, 1, OPTAB_DIRECT);
18150 fp = gen_reg_rtx (DFmode);
18151 emit_insn (gen_floatsidf2 (fp, x));
18153 real_ldexp (&TWO31r, &dconst1, 31);
18154 x = const_double_from_real_value (TWO31r, DFmode);
18156 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18157 if (x != target)
18158 emit_move_insn (target, x);
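/* Small worked example of the trick above: for the unsigned input
   3000000000, adding -2^31 wraps to the signed value 852516352, which
   floatsidf converts exactly; adding back the 2^31 DFmode constant
   yields 3000000000.0.  */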
18161 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18162 32-bit mode; otherwise we have a direct convert instruction. */
18164 void
18165 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18167 REAL_VALUE_TYPE TWO32r;
18168 rtx fp_lo, fp_hi, x;
18170 fp_lo = gen_reg_rtx (DFmode);
18171 fp_hi = gen_reg_rtx (DFmode);
18173 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18175 real_ldexp (&TWO32r, &dconst1, 32);
18176 x = const_double_from_real_value (TWO32r, DFmode);
18177 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18179 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18181 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18182 0, OPTAB_DIRECT);
18183 if (x != target)
18184 emit_move_insn (target, x);
18187 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18188 For x86_32, -mfpmath=sse, !optimize_size only. */
18189 void
18190 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18192 REAL_VALUE_TYPE ONE16r;
18193 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18195 real_ldexp (&ONE16r, &dconst1, 16);
18196 x = const_double_from_real_value (ONE16r, SFmode);
18197 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18198 NULL, 0, OPTAB_DIRECT);
18199 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18200 NULL, 0, OPTAB_DIRECT);
18201 fp_hi = gen_reg_rtx (SFmode);
18202 fp_lo = gen_reg_rtx (SFmode);
18203 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18204 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18205 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18206 0, OPTAB_DIRECT);
18207 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18208 0, OPTAB_DIRECT);
18209 if (!rtx_equal_p (target, fp_hi))
18210 emit_move_insn (target, fp_hi);
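/* Worked example of the 16/16 split above: for the input 100000,
   int_hi is 1 and int_lo is 34464; converting each half and computing
   1.0f * 65536.0f + 34464.0f reproduces 100000.0f exactly (for larger
   inputs the final addition may round, which is the best SFmode can
   do).  */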
18213 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18214 a vector of unsigned ints VAL to vector of floats TARGET. */
18216 void
18217 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18219 rtx tmp[8];
18220 REAL_VALUE_TYPE TWO16r;
18221 enum machine_mode intmode = GET_MODE (val);
18222 enum machine_mode fltmode = GET_MODE (target);
18223 rtx (*cvt) (rtx, rtx);
18225 if (intmode == V4SImode)
18226 cvt = gen_floatv4siv4sf2;
18227 else
18228 cvt = gen_floatv8siv8sf2;
18229 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18230 tmp[0] = force_reg (intmode, tmp[0]);
18231 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18232 OPTAB_DIRECT);
18233 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18234 NULL_RTX, 1, OPTAB_DIRECT);
18235 tmp[3] = gen_reg_rtx (fltmode);
18236 emit_insn (cvt (tmp[3], tmp[1]));
18237 tmp[4] = gen_reg_rtx (fltmode);
18238 emit_insn (cvt (tmp[4], tmp[2]));
18239 real_ldexp (&TWO16r, &dconst1, 16);
18240 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18241 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18242 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18243 OPTAB_DIRECT);
18244 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18245 OPTAB_DIRECT);
18246 if (tmp[7] != target)
18247 emit_move_insn (target, tmp[7]);
18250 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18251 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18252 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18253 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18256 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18258 REAL_VALUE_TYPE TWO31r;
18259 rtx two31r, tmp[4];
18260 enum machine_mode mode = GET_MODE (val);
18261 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18262 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18263 rtx (*cmp) (rtx, rtx, rtx, rtx);
18264 int i;
18266 for (i = 0; i < 3; i++)
18267 tmp[i] = gen_reg_rtx (mode);
18268 real_ldexp (&TWO31r, &dconst1, 31);
18269 two31r = const_double_from_real_value (TWO31r, scalarmode);
18270 two31r = ix86_build_const_vector (mode, 1, two31r);
18271 two31r = force_reg (mode, two31r);
18272 switch (mode)
18274 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18275 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18276 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18277 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18278 default: gcc_unreachable ();
18280 tmp[3] = gen_rtx_LE (mode, two31r, val);
18281 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18282 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18283 0, OPTAB_DIRECT);
18284 if (intmode == V4SImode || TARGET_AVX2)
18285 *xorp = expand_simple_binop (intmode, ASHIFT,
18286 gen_lowpart (intmode, tmp[0]),
18287 GEN_INT (31), NULL_RTX, 0,
18288 OPTAB_DIRECT);
18289 else
18291 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18292 two31 = ix86_build_const_vector (intmode, 1, two31);
18293 *xorp = expand_simple_binop (intmode, AND,
18294 gen_lowpart (intmode, tmp[0]),
18295 two31, NULL_RTX, 0,
18296 OPTAB_DIRECT);
18298 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18299 0, OPTAB_DIRECT);
18302 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18303 then replicate the value for all elements of the vector
18304 register. */
18307 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18309 int i, n_elt;
18310 rtvec v;
18311 enum machine_mode scalar_mode;
18313 switch (mode)
18315 case V32QImode:
18316 case V16QImode:
18317 case V16HImode:
18318 case V8HImode:
18319 case V8SImode:
18320 case V4SImode:
18321 case V4DImode:
18322 case V2DImode:
18323 gcc_assert (vect);
18324 case V8SFmode:
18325 case V4SFmode:
18326 case V4DFmode:
18327 case V2DFmode:
18328 n_elt = GET_MODE_NUNITS (mode);
18329 v = rtvec_alloc (n_elt);
18330 scalar_mode = GET_MODE_INNER (mode);
18332 RTVEC_ELT (v, 0) = value;
18334 for (i = 1; i < n_elt; ++i)
18335 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18337 return gen_rtx_CONST_VECTOR (mode, v);
18339 default:
18340 gcc_unreachable ();
18344 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18345 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18346 for an SSE register. If VECT is true, then replicate the mask for
18347 all elements of the vector register. If INVERT is true, then create
18348 a mask excluding the sign bit. */
18351 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18353 enum machine_mode vec_mode, imode;
18354 HOST_WIDE_INT hi, lo;
18355 int shift = 63;
18356 rtx v;
18357 rtx mask;
18359 /* Find the sign bit, sign extended to 2*HWI. */
18360 switch (mode)
18362 case V8SImode:
18363 case V4SImode:
18364 case V8SFmode:
18365 case V4SFmode:
18366 vec_mode = mode;
18367 mode = GET_MODE_INNER (mode);
18368 imode = SImode;
18369 lo = 0x80000000, hi = lo < 0;
18370 break;
18372 case V4DImode:
18373 case V2DImode:
18374 case V4DFmode:
18375 case V2DFmode:
18376 vec_mode = mode;
18377 mode = GET_MODE_INNER (mode);
18378 imode = DImode;
18379 if (HOST_BITS_PER_WIDE_INT >= 64)
18380 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18381 else
18382 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18383 break;
18385 case TImode:
18386 case TFmode:
18387 vec_mode = VOIDmode;
18388 if (HOST_BITS_PER_WIDE_INT >= 64)
18390 imode = TImode;
18391 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18393 else
18395 rtvec vec;
18397 imode = DImode;
18398 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18400 if (invert)
18402 lo = ~lo, hi = ~hi;
18403 v = constm1_rtx;
18405 else
18406 v = const0_rtx;
18408 mask = immed_double_const (lo, hi, imode);
18410 vec = gen_rtvec (2, v, mask);
18411 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18412 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18414 return v;
18416 break;
18418 default:
18419 gcc_unreachable ();
18422 if (invert)
18423 lo = ~lo, hi = ~hi;
18425 /* Force this value into the low part of a fp vector constant. */
18426 mask = immed_double_const (lo, hi, imode);
18427 mask = gen_lowpart (mode, mask);
18429 if (vec_mode == VOIDmode)
18430 return force_reg (mode, mask);
18432 v = ix86_build_const_vector (vec_mode, vect, mask);
18433 return force_reg (vec_mode, v);
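/* For example, ix86_build_signbit_mask (V4SFmode, true, false) yields a
   vector whose four elements have only bit 31 set (the bit pattern of
   -0.0f), while invert = true yields the complement 0x7fffffff in each
   element.  */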
18436 /* Generate code for floating point ABS or NEG. */
18438 void
18439 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18440 rtx operands[])
18442 rtx mask, set, dst, src;
18443 bool use_sse = false;
18444 bool vector_mode = VECTOR_MODE_P (mode);
18445 enum machine_mode vmode = mode;
18447 if (vector_mode)
18448 use_sse = true;
18449 else if (mode == TFmode)
18450 use_sse = true;
18451 else if (TARGET_SSE_MATH)
18453 use_sse = SSE_FLOAT_MODE_P (mode);
18454 if (mode == SFmode)
18455 vmode = V4SFmode;
18456 else if (mode == DFmode)
18457 vmode = V2DFmode;
18460 /* NEG and ABS performed with SSE use bitwise mask operations.
18461 Create the appropriate mask now. */
18462 if (use_sse)
18463 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18464 else
18465 mask = NULL_RTX;
18467 dst = operands[0];
18468 src = operands[1];
18470 set = gen_rtx_fmt_e (code, mode, src);
18471 set = gen_rtx_SET (VOIDmode, dst, set);
18473 if (mask)
18475 rtx use, clob;
18476 rtvec par;
18478 use = gen_rtx_USE (VOIDmode, mask);
18479 if (vector_mode)
18480 par = gen_rtvec (2, set, use);
18481 else
18483 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18484 par = gen_rtvec (3, set, use, clob);
18486 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18488 else
18489 emit_insn (set);
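/* Conceptually, the mask chosen above makes NEG a sign-bit flip and ABS
   a sign-bit clear; e.g. for DFmode with SSE math the eventual code is
   typically an xorpd with the 0x8000000000000000 mask for NEG, or an
   andpd with its complement for ABS.  */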
18492 /* Expand a copysign operation. Special case operand 0 being a constant. */
18494 void
18495 ix86_expand_copysign (rtx operands[])
18497 enum machine_mode mode, vmode;
18498 rtx dest, op0, op1, mask, nmask;
18500 dest = operands[0];
18501 op0 = operands[1];
18502 op1 = operands[2];
18504 mode = GET_MODE (dest);
18506 if (mode == SFmode)
18507 vmode = V4SFmode;
18508 else if (mode == DFmode)
18509 vmode = V2DFmode;
18510 else
18511 vmode = mode;
18513 if (GET_CODE (op0) == CONST_DOUBLE)
18515 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18517 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18518 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18520 if (mode == SFmode || mode == DFmode)
18522 if (op0 == CONST0_RTX (mode))
18523 op0 = CONST0_RTX (vmode);
18524 else
18526 rtx v = ix86_build_const_vector (vmode, false, op0);
18528 op0 = force_reg (vmode, v);
18531 else if (op0 != CONST0_RTX (mode))
18532 op0 = force_reg (mode, op0);
18534 mask = ix86_build_signbit_mask (vmode, 0, 0);
18536 if (mode == SFmode)
18537 copysign_insn = gen_copysignsf3_const;
18538 else if (mode == DFmode)
18539 copysign_insn = gen_copysigndf3_const;
18540 else
18541 copysign_insn = gen_copysigntf3_const;
18543 emit_insn (copysign_insn (dest, op0, op1, mask));
18545 else
18547 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18549 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18550 mask = ix86_build_signbit_mask (vmode, 0, 0);
18552 if (mode == SFmode)
18553 copysign_insn = gen_copysignsf3_var;
18554 else if (mode == DFmode)
18555 copysign_insn = gen_copysigndf3_var;
18556 else
18557 copysign_insn = gen_copysigntf3_var;
18559 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18563 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18564 be a constant, and so has already been expanded into a vector constant. */
18566 void
18567 ix86_split_copysign_const (rtx operands[])
18569 enum machine_mode mode, vmode;
18570 rtx dest, op0, mask, x;
18572 dest = operands[0];
18573 op0 = operands[1];
18574 mask = operands[3];
18576 mode = GET_MODE (dest);
18577 vmode = GET_MODE (mask);
18579 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18580 x = gen_rtx_AND (vmode, dest, mask);
18581 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18583 if (op0 != CONST0_RTX (vmode))
18585 x = gen_rtx_IOR (vmode, dest, op0);
18586 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18590 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18591 so we have to do two masks. */
18593 void
18594 ix86_split_copysign_var (rtx operands[])
18596 enum machine_mode mode, vmode;
18597 rtx dest, scratch, op0, op1, mask, nmask, x;
18599 dest = operands[0];
18600 scratch = operands[1];
18601 op0 = operands[2];
18602 op1 = operands[3];
18603 nmask = operands[4];
18604 mask = operands[5];
18606 mode = GET_MODE (dest);
18607 vmode = GET_MODE (mask);
18609 if (rtx_equal_p (op0, op1))
18611 /* Shouldn't happen often (it's useless, obviously), but when it does
18612 we'd generate incorrect code if we continue below. */
18613 emit_move_insn (dest, op0);
18614 return;
18617 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18619 gcc_assert (REGNO (op1) == REGNO (scratch));
18621 x = gen_rtx_AND (vmode, scratch, mask);
18622 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18624 dest = mask;
18625 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18626 x = gen_rtx_NOT (vmode, dest);
18627 x = gen_rtx_AND (vmode, x, op0);
18628 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18630 else
18632 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18634 x = gen_rtx_AND (vmode, scratch, mask);
18636 else /* alternative 2,4 */
18638 gcc_assert (REGNO (mask) == REGNO (scratch));
18639 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18640 x = gen_rtx_AND (vmode, scratch, op1);
18642 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18644 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18646 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18647 x = gen_rtx_AND (vmode, dest, nmask);
18649 else /* alternative 3,4 */
18651 gcc_assert (REGNO (nmask) == REGNO (dest));
18652 dest = nmask;
18653 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18654 x = gen_rtx_AND (vmode, dest, op0);
18656 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18659 x = gen_rtx_IOR (vmode, dest, scratch);
18660 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
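/* Both copysign splitters implement the usual bit-level identity
   result = (op0 & ~signmask) | (op1 & signmask); e.g. copysign (3.0,
   -5.0) keeps the magnitude bits of 3.0 and the sign bit of -5.0,
   giving -3.0.  */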
18663 /* Return TRUE or FALSE depending on whether the first SET in INSN
18664 has source and destination with matching CC modes, and whether the
18665 CC mode is at least as constrained as REQ_MODE. */
18667 bool
18668 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18670 rtx set;
18671 enum machine_mode set_mode;
18673 set = PATTERN (insn);
18674 if (GET_CODE (set) == PARALLEL)
18675 set = XVECEXP (set, 0, 0);
18676 gcc_assert (GET_CODE (set) == SET);
18677 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18679 set_mode = GET_MODE (SET_DEST (set));
18680 switch (set_mode)
18682 case CCNOmode:
18683 if (req_mode != CCNOmode
18684 && (req_mode != CCmode
18685 || XEXP (SET_SRC (set), 1) != const0_rtx))
18686 return false;
18687 break;
18688 case CCmode:
18689 if (req_mode == CCGCmode)
18690 return false;
18691 /* FALLTHRU */
18692 case CCGCmode:
18693 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18694 return false;
18695 /* FALLTHRU */
18696 case CCGOCmode:
18697 if (req_mode == CCZmode)
18698 return false;
18699 /* FALLTHRU */
18700 case CCZmode:
18701 break;
18703 case CCAmode:
18704 case CCCmode:
18705 case CCOmode:
18706 case CCSmode:
18707 if (set_mode != req_mode)
18708 return false;
18709 break;
18711 default:
18712 gcc_unreachable ();
18715 return GET_MODE (SET_SRC (set)) == set_mode;
18718 /* Generate insn patterns to do an integer compare of OPERANDS. */
18720 static rtx
18721 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18723 enum machine_mode cmpmode;
18724 rtx tmp, flags;
18726 cmpmode = SELECT_CC_MODE (code, op0, op1);
18727 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18729 /* This is very simple, but making the interface the same as in the
18730 FP case makes the rest of the code easier. */
18731 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18732 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18734 /* Return the test that should be put into the flags user, i.e.
18735 the bcc, scc, or cmov instruction. */
18736 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18739 /* Figure out whether to use ordered or unordered fp comparisons.
18740 Return the appropriate mode to use. */
18742 enum machine_mode
18743 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18745 /* ??? In order to make all comparisons reversible, we do all comparisons
18746 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18747 all forms of trapping and nontrapping comparisons, we can make inequality
18748 comparisons trapping again, since it results in better code when using
18749 FCOM based compares. */
18750 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18753 enum machine_mode
18754 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18756 enum machine_mode mode = GET_MODE (op0);
18758 if (SCALAR_FLOAT_MODE_P (mode))
18760 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18761 return ix86_fp_compare_mode (code);
18764 switch (code)
18766 /* Only zero flag is needed. */
18767 case EQ: /* ZF=0 */
18768 case NE: /* ZF!=0 */
18769 return CCZmode;
18770 /* Codes needing carry flag. */
18771 case GEU: /* CF=0 */
18772 case LTU: /* CF=1 */
18773 /* Detect overflow checks. They need just the carry flag. */
18774 if (GET_CODE (op0) == PLUS
18775 && rtx_equal_p (op1, XEXP (op0, 0)))
18776 return CCCmode;
18777 else
18778 return CCmode;
18779 case GTU: /* CF=0 & ZF=0 */
18780 case LEU: /* CF=1 | ZF=1 */
18781 /* Detect overflow checks. They need just the carry flag. */
18782 if (GET_CODE (op0) == MINUS
18783 && rtx_equal_p (op1, XEXP (op0, 0)))
18784 return CCCmode;
18785 else
18786 return CCmode;
18787 /* Codes possibly doable only with sign flag when
18788 comparing against zero. */
18789 case GE: /* SF=OF or SF=0 */
18790 case LT: /* SF<>OF or SF=1 */
18791 if (op1 == const0_rtx)
18792 return CCGOCmode;
18793 else
18794 /* For other cases the carry flag is not required. */
18795 return CCGCmode;
18796 /* Codes doable only with the sign flag when comparing
18797 against zero, but we lack a jump instruction for that,
18798 so we need to use relational tests against overflow,
18799 which thus needs to be zero. */
18800 case GT: /* ZF=0 & SF=OF */
18801 case LE: /* ZF=1 | SF<>OF */
18802 if (op1 == const0_rtx)
18803 return CCNOmode;
18804 else
18805 return CCGCmode;
18806 /* The strcmp pattern does (use flags), and combine may ask us for the
18807 proper mode. */
18808 case USE:
18809 return CCmode;
18810 default:
18811 gcc_unreachable ();
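/* Example of the overflow-check cases above: for code like
   "if (a + b < a)" the comparison reaching here is
   (ltu (plus a b) a), so op0 is a PLUS whose first operand equals op1;
   CCCmode is chosen because only the carry flag of the addition is
   needed and no separate compare has to be generated.  */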
18815 /* Return the fixed registers used for condition codes. */
18817 static bool
18818 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18820 *p1 = FLAGS_REG;
18821 *p2 = FPSR_REG;
18822 return true;
18825 /* If two condition code modes are compatible, return a condition code
18826 mode which is compatible with both. Otherwise, return
18827 VOIDmode. */
18829 static enum machine_mode
18830 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18832 if (m1 == m2)
18833 return m1;
18835 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18836 return VOIDmode;
18838 if ((m1 == CCGCmode && m2 == CCGOCmode)
18839 || (m1 == CCGOCmode && m2 == CCGCmode))
18840 return CCGCmode;
18842 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18843 return m2;
18844 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18845 return m1;
18847 switch (m1)
18849 default:
18850 gcc_unreachable ();
18852 case CCmode:
18853 case CCGCmode:
18854 case CCGOCmode:
18855 case CCNOmode:
18856 case CCAmode:
18857 case CCCmode:
18858 case CCOmode:
18859 case CCSmode:
18860 case CCZmode:
18861 switch (m2)
18863 default:
18864 return VOIDmode;
18866 case CCmode:
18867 case CCGCmode:
18868 case CCGOCmode:
18869 case CCNOmode:
18870 case CCAmode:
18871 case CCCmode:
18872 case CCOmode:
18873 case CCSmode:
18874 case CCZmode:
18875 return CCmode;
18878 case CCFPmode:
18879 case CCFPUmode:
18880 /* These are only compatible with themselves, which we already
18881 checked above. */
18882 return VOIDmode;
18887 /* Return a comparison we can do that is equivalent to
18888 swap_condition (code), apart possibly from orderedness.
18889 But, never change orderedness if TARGET_IEEE_FP, returning
18890 UNKNOWN in that case if necessary. */
18892 static enum rtx_code
18893 ix86_fp_swap_condition (enum rtx_code code)
18895 switch (code)
18897 case GT: /* GTU - CF=0 & ZF=0 */
18898 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18899 case GE: /* GEU - CF=0 */
18900 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18901 case UNLT: /* LTU - CF=1 */
18902 return TARGET_IEEE_FP ? UNKNOWN : GT;
18903 case UNLE: /* LEU - CF=1 | ZF=1 */
18904 return TARGET_IEEE_FP ? UNKNOWN : GE;
18905 default:
18906 return swap_condition (code);
18910 /* Return cost of comparison CODE using the best strategy for performance.
18911 All following functions use the number of instructions as the cost metric.
18912 In future this should be tweaked to compute bytes for optimize_size and
18913 take into account performance of various instructions on various CPUs. */
18915 static int
18916 ix86_fp_comparison_cost (enum rtx_code code)
18918 int arith_cost;
18920 /* The cost of code using bit-twiddling on %ah. */
18921 switch (code)
18923 case UNLE:
18924 case UNLT:
18925 case LTGT:
18926 case GT:
18927 case GE:
18928 case UNORDERED:
18929 case ORDERED:
18930 case UNEQ:
18931 arith_cost = 4;
18932 break;
18933 case LT:
18934 case NE:
18935 case EQ:
18936 case UNGE:
18937 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18938 break;
18939 case LE:
18940 case UNGT:
18941 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18942 break;
18943 default:
18944 gcc_unreachable ();
18947 switch (ix86_fp_comparison_strategy (code))
18949 case IX86_FPCMP_COMI:
18950 return arith_cost > 4 ? 3 : 2;
18951 case IX86_FPCMP_SAHF:
18952 return arith_cost > 4 ? 4 : 3;
18953 default:
18954 return arith_cost;
18958 /* Return strategy to use for floating-point. We assume that fcomi is always
18959 preferable where available, since that is also true when looking at size
18960 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18962 enum ix86_fpcmp_strategy
18963 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18965 /* Do fcomi/sahf based test when profitable. */
18967 if (TARGET_CMOVE)
18968 return IX86_FPCMP_COMI;
18970 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
18971 return IX86_FPCMP_SAHF;
18973 return IX86_FPCMP_ARITH;
18976 /* Swap, force into registers, or otherwise massage the two operands
18977 to a fp comparison. The operands are updated in place; the new
18978 comparison code is returned. */
18980 static enum rtx_code
18981 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18983 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18984 rtx op0 = *pop0, op1 = *pop1;
18985 enum machine_mode op_mode = GET_MODE (op0);
18986 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18988 /* All of the unordered compare instructions only work on registers.
18989 The same is true of the fcomi compare instructions. The XFmode
18990 compare instructions require registers except when comparing
18991 against zero or when converting operand 1 from fixed point to
18992 floating point. */
18994 if (!is_sse
18995 && (fpcmp_mode == CCFPUmode
18996 || (op_mode == XFmode
18997 && ! (standard_80387_constant_p (op0) == 1
18998 || standard_80387_constant_p (op1) == 1)
18999 && GET_CODE (op1) != FLOAT)
19000 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19002 op0 = force_reg (op_mode, op0);
19003 op1 = force_reg (op_mode, op1);
19005 else
19007 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19008 things around if they appear profitable, otherwise force op0
19009 into a register. */
19011 if (standard_80387_constant_p (op0) == 0
19012 || (MEM_P (op0)
19013 && ! (standard_80387_constant_p (op1) == 0
19014 || MEM_P (op1))))
19016 enum rtx_code new_code = ix86_fp_swap_condition (code);
19017 if (new_code != UNKNOWN)
19019 rtx tmp;
19020 tmp = op0, op0 = op1, op1 = tmp;
19021 code = new_code;
19025 if (!REG_P (op0))
19026 op0 = force_reg (op_mode, op0);
19028 if (CONSTANT_P (op1))
19030 int tmp = standard_80387_constant_p (op1);
19031 if (tmp == 0)
19032 op1 = validize_mem (force_const_mem (op_mode, op1));
19033 else if (tmp == 1)
19035 if (TARGET_CMOVE)
19036 op1 = force_reg (op_mode, op1);
19038 else
19039 op1 = force_reg (op_mode, op1);
19043 /* Try to rearrange the comparison to make it cheaper. */
19044 if (ix86_fp_comparison_cost (code)
19045 > ix86_fp_comparison_cost (swap_condition (code))
19046 && (REG_P (op1) || can_create_pseudo_p ()))
19048 rtx tmp;
19049 tmp = op0, op0 = op1, op1 = tmp;
19050 code = swap_condition (code);
19051 if (!REG_P (op0))
19052 op0 = force_reg (op_mode, op0);
19055 *pop0 = op0;
19056 *pop1 = op1;
19057 return code;
19060 /* Convert the comparison codes we use to represent an FP comparison into an
19061 integer code that will result in a proper branch. Return UNKNOWN if no such
19062 code is available. */
19064 enum rtx_code
19065 ix86_fp_compare_code_to_integer (enum rtx_code code)
19067 switch (code)
19069 case GT:
19070 return GTU;
19071 case GE:
19072 return GEU;
19073 case ORDERED:
19074 case UNORDERED:
19075 return code;
19076 break;
19077 case UNEQ:
19078 return EQ;
19079 break;
19080 case UNLT:
19081 return LTU;
19082 break;
19083 case UNLE:
19084 return LEU;
19085 break;
19086 case LTGT:
19087 return NE;
19088 break;
19089 default:
19090 return UNKNOWN;
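/* Added commentary: fcomi and fnstsw+sahf leave the FP result in CF
   and ZF exactly the way an unsigned integer compare would, which is
   why the mapping above turns the FP codes into their unsigned
   counterparts: GT -> GTU (CF=0 && ZF=0), GE -> GEU (CF=0),
   UNLT -> LTU (CF=1), UNLE -> LEU (CF=1 || ZF=1), LTGT -> NE and
   UNEQ -> EQ.  */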
19094 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19096 static rtx
19097 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19099 enum machine_mode fpcmp_mode, intcmp_mode;
19100 rtx tmp, tmp2;
19102 fpcmp_mode = ix86_fp_compare_mode (code);
19103 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19105 /* Do fcomi/sahf based test when profitable. */
19106 switch (ix86_fp_comparison_strategy (code))
19108 case IX86_FPCMP_COMI:
19109 intcmp_mode = fpcmp_mode;
19110 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19111 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19112 tmp);
19113 emit_insn (tmp);
19114 break;
19116 case IX86_FPCMP_SAHF:
19117 intcmp_mode = fpcmp_mode;
19118 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19119 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19120 tmp);
19122 if (!scratch)
19123 scratch = gen_reg_rtx (HImode);
19124 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19125 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19126 break;
19128 case IX86_FPCMP_ARITH:
19129 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19130 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19131 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19132 if (!scratch)
19133 scratch = gen_reg_rtx (HImode);
19134 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19136 /* In the unordered case, we have to check C2 for NaN's, which
19137 doesn't happen to work out to anything nice combination-wise.
19138 So do some bit twiddling on the value we've got in AH to come
19139 up with an appropriate set of condition codes. */
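/* For reference (added commentary): after fnstsw the FPU condition
   bits appear in AH as C0 = 0x01 (set for "less"), C2 = 0x04 (set for
   "unordered") and C3 = 0x40 (set for "equal").  The masks used below
   follow from that layout: e.g. 0x45 selects C0|C2|C3, so
   "test $0x45, %ah" is zero exactly for "greater", and 0x05 selects
   C0|C2 and is zero for "greater or equal".  */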
19141 intcmp_mode = CCNOmode;
19142 switch (code)
19144 case GT:
19145 case UNGT:
19146 if (code == GT || !TARGET_IEEE_FP)
19148 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19149 code = EQ;
19151 else
19153 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19154 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19155 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19156 intcmp_mode = CCmode;
19157 code = GEU;
19159 break;
19160 case LT:
19161 case UNLT:
19162 if (code == LT && TARGET_IEEE_FP)
19164 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19165 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19166 intcmp_mode = CCmode;
19167 code = EQ;
19169 else
19171 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19172 code = NE;
19174 break;
19175 case GE:
19176 case UNGE:
19177 if (code == GE || !TARGET_IEEE_FP)
19179 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19180 code = EQ;
19182 else
19184 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19185 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19186 code = NE;
19188 break;
19189 case LE:
19190 case UNLE:
19191 if (code == LE && TARGET_IEEE_FP)
19193 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19194 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19195 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19196 intcmp_mode = CCmode;
19197 code = LTU;
19199 else
19201 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19202 code = NE;
19204 break;
19205 case EQ:
19206 case UNEQ:
19207 if (code == EQ && TARGET_IEEE_FP)
19209 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19210 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19211 intcmp_mode = CCmode;
19212 code = EQ;
19214 else
19216 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19217 code = NE;
19219 break;
19220 case NE:
19221 case LTGT:
19222 if (code == NE && TARGET_IEEE_FP)
19224 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19225 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19226 GEN_INT (0x40)));
19227 code = NE;
19229 else
19231 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19232 code = EQ;
19234 break;
19236 case UNORDERED:
19237 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19238 code = NE;
19239 break;
19240 case ORDERED:
19241 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19242 code = EQ;
19243 break;
19245 default:
19246 gcc_unreachable ();
19248 break;
19250 default:
19251 gcc_unreachable();
19254 /* Return the test that should be put into the flags user, i.e.
19255 the bcc, scc, or cmov instruction. */
19256 return gen_rtx_fmt_ee (code, VOIDmode,
19257 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19258 const0_rtx);
19261 static rtx
19262 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19264 rtx ret;
19266 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19267 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19269 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19271 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19272 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19274 else
19275 ret = ix86_expand_int_compare (code, op0, op1);
19277 return ret;
19280 void
19281 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19283 enum machine_mode mode = GET_MODE (op0);
19284 rtx tmp;
19286 switch (mode)
19288 case SFmode:
19289 case DFmode:
19290 case XFmode:
19291 case QImode:
19292 case HImode:
19293 case SImode:
19294 simple:
19295 tmp = ix86_expand_compare (code, op0, op1);
19296 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19297 gen_rtx_LABEL_REF (VOIDmode, label),
19298 pc_rtx);
19299 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19300 return;
19302 case DImode:
19303 if (TARGET_64BIT)
19304 goto simple;
19305 case TImode:
19306 /* Expand DImode branch into multiple compare+branch. */
19308 rtx lo[2], hi[2], label2;
19309 enum rtx_code code1, code2, code3;
19310 enum machine_mode submode;
19312 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19314 tmp = op0, op0 = op1, op1 = tmp;
19315 code = swap_condition (code);
19318 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19319 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19321 submode = mode == DImode ? SImode : DImode;
19323 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19324 avoid two branches. This costs one extra insn, so disable when
19325 optimizing for size. */
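/* Worked example (added commentary): when splitting a DImode equality
   on a 32-bit target, a == b <=> ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0,
   so one SImode compare of the OR result against zero replaces two
   compare-and-branch sequences.  */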
19327 if ((code == EQ || code == NE)
19328 && (!optimize_insn_for_size_p ()
19329 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19331 rtx xor0, xor1;
19333 xor1 = hi[0];
19334 if (hi[1] != const0_rtx)
19335 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19336 NULL_RTX, 0, OPTAB_WIDEN);
19338 xor0 = lo[0];
19339 if (lo[1] != const0_rtx)
19340 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19341 NULL_RTX, 0, OPTAB_WIDEN);
19343 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19344 NULL_RTX, 0, OPTAB_WIDEN);
19346 ix86_expand_branch (code, tmp, const0_rtx, label);
19347 return;
19350 /* Otherwise, if we are doing less-than or greater-than-or-equal,
19351 op1 is a constant and its low word is zero, then we can just
19352 examine the high word. Similarly for a low word of -1 and
19353 less-than-or-equal or greater-than. */
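/* Worked example (added commentary): on a 32-bit target the DImode
   test "a < 0x500000000LL" has a zero low word in the constant, so it
   reduces to the SImode test "hi(a) < 5" with the same signedness;
   likewise "a <= 0x4FFFFFFFFLL" has a low word of -1 and reduces to
   "hi(a) <= 4".  */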
19355 if (CONST_INT_P (hi[1]))
19356 switch (code)
19358 case LT: case LTU: case GE: case GEU:
19359 if (lo[1] == const0_rtx)
19361 ix86_expand_branch (code, hi[0], hi[1], label);
19362 return;
19364 break;
19365 case LE: case LEU: case GT: case GTU:
19366 if (lo[1] == constm1_rtx)
19368 ix86_expand_branch (code, hi[0], hi[1], label);
19369 return;
19371 break;
19372 default:
19373 break;
19376 /* Otherwise, we need two or three jumps. */
19378 label2 = gen_label_rtx ();
19380 code1 = code;
19381 code2 = swap_condition (code);
19382 code3 = unsigned_condition (code);
19384 switch (code)
19386 case LT: case GT: case LTU: case GTU:
19387 break;
19389 case LE: code1 = LT; code2 = GT; break;
19390 case GE: code1 = GT; code2 = LT; break;
19391 case LEU: code1 = LTU; code2 = GTU; break;
19392 case GEU: code1 = GTU; code2 = LTU; break;
19394 case EQ: code1 = UNKNOWN; code2 = NE; break;
19395 case NE: code2 = UNKNOWN; break;
19397 default:
19398 gcc_unreachable ();
19402 * a < b =>
19403 * if (hi(a) < hi(b)) goto true;
19404 * if (hi(a) > hi(b)) goto false;
19405 * if (lo(a) < lo(b)) goto true;
19406 * false:
19409 if (code1 != UNKNOWN)
19410 ix86_expand_branch (code1, hi[0], hi[1], label);
19411 if (code2 != UNKNOWN)
19412 ix86_expand_branch (code2, hi[0], hi[1], label2);
19414 ix86_expand_branch (code3, lo[0], lo[1], label);
19416 if (code2 != UNKNOWN)
19417 emit_label (label2);
19418 return;
19421 default:
19422 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19423 goto simple;
19427 /* Split branch based on floating point condition. */
19428 void
19429 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19430 rtx target1, rtx target2, rtx tmp, rtx pushed)
19432 rtx condition;
19433 rtx i;
19435 if (target2 != pc_rtx)
19437 rtx tmp = target2;
19438 code = reverse_condition_maybe_unordered (code);
19439 target2 = target1;
19440 target1 = tmp;
19443 condition = ix86_expand_fp_compare (code, op1, op2,
19444 tmp);
19446 /* Remove pushed operand from stack. */
19447 if (pushed)
19448 ix86_free_from_memory (GET_MODE (pushed));
19450 i = emit_jump_insn (gen_rtx_SET
19451 (VOIDmode, pc_rtx,
19452 gen_rtx_IF_THEN_ELSE (VOIDmode,
19453 condition, target1, target2)));
19454 if (split_branch_probability >= 0)
19455 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19458 void
19459 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19461 rtx ret;
19463 gcc_assert (GET_MODE (dest) == QImode);
19465 ret = ix86_expand_compare (code, op0, op1);
19466 PUT_MODE (ret, QImode);
19467 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19470 /* Expand comparison setting or clearing carry flag. Return true when
19471 successful and set pop for the operation. */
19472 static bool
19473 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19475 enum machine_mode mode =
19476 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19478 /* Do not handle double-mode compares that go through the special path. */
19479 if (mode == (TARGET_64BIT ? TImode : DImode))
19480 return false;
19482 if (SCALAR_FLOAT_MODE_P (mode))
19484 rtx compare_op, compare_seq;
19486 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19488 /* Shortcut: following common codes never translate
19489 into carry flag compares. */
19490 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19491 || code == ORDERED || code == UNORDERED)
19492 return false;
19494 /* These comparisons require zero flag; swap operands so they won't. */
19495 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19496 && !TARGET_IEEE_FP)
19498 rtx tmp = op0;
19499 op0 = op1;
19500 op1 = tmp;
19501 code = swap_condition (code);
19504 /* Try to expand the comparison and verify that we end up with
19505 a carry-flag-based comparison. This fails only when we decide
19506 to expand the comparison using arithmetic, which is not a
19507 common scenario. */
19508 start_sequence ();
19509 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19510 compare_seq = get_insns ();
19511 end_sequence ();
19513 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19514 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19515 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19516 else
19517 code = GET_CODE (compare_op);
19519 if (code != LTU && code != GEU)
19520 return false;
19522 emit_insn (compare_seq);
19523 *pop = compare_op;
19524 return true;
19527 if (!INTEGRAL_MODE_P (mode))
19528 return false;
19530 switch (code)
19532 case LTU:
19533 case GEU:
19534 break;
19536 /* Convert a==0 into (unsigned)a<1. */
19537 case EQ:
19538 case NE:
19539 if (op1 != const0_rtx)
19540 return false;
19541 op1 = const1_rtx;
19542 code = (code == EQ ? LTU : GEU);
19543 break;
19545 /* Convert a>b into b<a or a>=b+1. */
19546 case GTU:
19547 case LEU:
19548 if (CONST_INT_P (op1))
19550 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19551 /* Bail out on overflow. We could still swap the operands, but that
19552 would force loading of the constant into a register. */
19553 if (op1 == const0_rtx
19554 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19555 return false;
19556 code = (code == GTU ? GEU : LTU);
19558 else
19560 rtx tmp = op1;
19561 op1 = op0;
19562 op0 = tmp;
19563 code = (code == GTU ? LTU : GEU);
19565 break;
19567 /* Convert a>=0 into (unsigned)a<0x80000000. */
19568 case LT:
19569 case GE:
19570 if (mode == DImode || op1 != const0_rtx)
19571 return false;
19572 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19573 code = (code == LT ? GEU : LTU);
19574 break;
19575 case LE:
19576 case GT:
19577 if (mode == DImode || op1 != constm1_rtx)
19578 return false;
19579 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19580 code = (code == LE ? GEU : LTU);
19581 break;
19583 default:
19584 return false;
19586 /* Swapping operands may cause a constant to appear as the first operand. */
19587 if (!nonimmediate_operand (op0, VOIDmode))
19589 if (!can_create_pseudo_p ())
19590 return false;
19591 op0 = force_reg (mode, op0);
19593 *pop = ix86_expand_compare (code, op0, op1);
19594 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19595 return true;
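/* Illustrative summary (added commentary): the conversions above rely
   on the following identities, each of which leaves a comparison whose
   result lives entirely in the carry flag:

     a == 0                  <=>  (unsigned) a < 1
     a > b, b a constant     <=>  a >= b + 1   (unless b + 1 overflows)
     a >= 0 (signed)         <=>  (unsigned) a < 0x80000000

   A following sbb can then turn CF into a 0/-1 mask without a branch.  */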
19598 bool
19599 ix86_expand_int_movcc (rtx operands[])
19601 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19602 rtx compare_seq, compare_op;
19603 enum machine_mode mode = GET_MODE (operands[0]);
19604 bool sign_bit_compare_p = false;
19605 rtx op0 = XEXP (operands[1], 0);
19606 rtx op1 = XEXP (operands[1], 1);
19608 if (GET_MODE (op0) == TImode
19609 || (GET_MODE (op0) == DImode
19610 && !TARGET_64BIT))
19611 return false;
19613 start_sequence ();
19614 compare_op = ix86_expand_compare (code, op0, op1);
19615 compare_seq = get_insns ();
19616 end_sequence ();
19618 compare_code = GET_CODE (compare_op);
19620 if ((op1 == const0_rtx && (code == GE || code == LT))
19621 || (op1 == constm1_rtx && (code == GT || code == LE)))
19622 sign_bit_compare_p = true;
19624 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19625 HImode insns, we'd be swallowed in word prefix ops. */
19627 if ((mode != HImode || TARGET_FAST_PREFIX)
19628 && (mode != (TARGET_64BIT ? TImode : DImode))
19629 && CONST_INT_P (operands[2])
19630 && CONST_INT_P (operands[3]))
19632 rtx out = operands[0];
19633 HOST_WIDE_INT ct = INTVAL (operands[2]);
19634 HOST_WIDE_INT cf = INTVAL (operands[3]);
19635 HOST_WIDE_INT diff;
19637 diff = ct - cf;
19638 /* Sign bit compares are better done using shifts than by using
19639 sbb. */
19640 if (sign_bit_compare_p
19641 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19643 /* Detect overlap between destination and compare sources. */
19644 rtx tmp = out;
19646 if (!sign_bit_compare_p)
19648 rtx flags;
19649 bool fpcmp = false;
19651 compare_code = GET_CODE (compare_op);
19653 flags = XEXP (compare_op, 0);
19655 if (GET_MODE (flags) == CCFPmode
19656 || GET_MODE (flags) == CCFPUmode)
19658 fpcmp = true;
19659 compare_code
19660 = ix86_fp_compare_code_to_integer (compare_code);
19663 /* To simplify rest of code, restrict to the GEU case. */
19664 if (compare_code == LTU)
19666 HOST_WIDE_INT tmp = ct;
19667 ct = cf;
19668 cf = tmp;
19669 compare_code = reverse_condition (compare_code);
19670 code = reverse_condition (code);
19672 else
19674 if (fpcmp)
19675 PUT_CODE (compare_op,
19676 reverse_condition_maybe_unordered
19677 (GET_CODE (compare_op)));
19678 else
19679 PUT_CODE (compare_op,
19680 reverse_condition (GET_CODE (compare_op)));
19682 diff = ct - cf;
19684 if (reg_overlap_mentioned_p (out, op0)
19685 || reg_overlap_mentioned_p (out, op1))
19686 tmp = gen_reg_rtx (mode);
19688 if (mode == DImode)
19689 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19690 else
19691 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19692 flags, compare_op));
19694 else
19696 if (code == GT || code == GE)
19697 code = reverse_condition (code);
19698 else
19700 HOST_WIDE_INT tmp = ct;
19701 ct = cf;
19702 cf = tmp;
19703 diff = ct - cf;
19705 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19708 if (diff == 1)
19711 * cmpl op0,op1
19712 * sbbl dest,dest
19713 * [addl dest, ct]
19715 * Size 5 - 8.
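/* Concrete example (added commentary): with diff == 1 the pair
   "cmp op0,op1; sbb dest,dest" leaves dest = -1 if CF is set and 0
   otherwise; the optional "add $ct, dest" then produces either
   ct - 1 (== cf) or ct, i.e. exactly the two requested constants.  */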
19717 if (ct)
19718 tmp = expand_simple_binop (mode, PLUS,
19719 tmp, GEN_INT (ct),
19720 copy_rtx (tmp), 1, OPTAB_DIRECT);
19722 else if (cf == -1)
19725 * cmpl op0,op1
19726 * sbbl dest,dest
19727 * orl $ct, dest
19729 * Size 8.
19731 tmp = expand_simple_binop (mode, IOR,
19732 tmp, GEN_INT (ct),
19733 copy_rtx (tmp), 1, OPTAB_DIRECT);
19735 else if (diff == -1 && ct)
19738 * cmpl op0,op1
19739 * sbbl dest,dest
19740 * notl dest
19741 * [addl dest, cf]
19743 * Size 8 - 11.
19745 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19746 if (cf)
19747 tmp = expand_simple_binop (mode, PLUS,
19748 copy_rtx (tmp), GEN_INT (cf),
19749 copy_rtx (tmp), 1, OPTAB_DIRECT);
19751 else
19754 * cmpl op0,op1
19755 * sbbl dest,dest
19756 * [notl dest]
19757 * andl cf - ct, dest
19758 * [addl dest, ct]
19760 * Size 8 - 11.
19763 if (cf == 0)
19765 cf = ct;
19766 ct = 0;
19767 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19770 tmp = expand_simple_binop (mode, AND,
19771 copy_rtx (tmp),
19772 gen_int_mode (cf - ct, mode),
19773 copy_rtx (tmp), 1, OPTAB_DIRECT);
19774 if (ct)
19775 tmp = expand_simple_binop (mode, PLUS,
19776 copy_rtx (tmp), GEN_INT (ct),
19777 copy_rtx (tmp), 1, OPTAB_DIRECT);
19780 if (!rtx_equal_p (tmp, out))
19781 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19783 return true;
19786 if (diff < 0)
19788 enum machine_mode cmp_mode = GET_MODE (op0);
19790 HOST_WIDE_INT tmp;
19791 tmp = ct, ct = cf, cf = tmp;
19792 diff = -diff;
19794 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19796 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19798 /* We may be reversing an unordered compare to a normal compare, which
19799 is not valid in general (we may convert a non-trapping condition
19800 to a trapping one); however, on i386 we currently emit all
19801 comparisons unordered. */
19802 compare_code = reverse_condition_maybe_unordered (compare_code);
19803 code = reverse_condition_maybe_unordered (code);
19805 else
19807 compare_code = reverse_condition (compare_code);
19808 code = reverse_condition (code);
19812 compare_code = UNKNOWN;
19813 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19814 && CONST_INT_P (op1))
19816 if (op1 == const0_rtx
19817 && (code == LT || code == GE))
19818 compare_code = code;
19819 else if (op1 == constm1_rtx)
19821 if (code == LE)
19822 compare_code = LT;
19823 else if (code == GT)
19824 compare_code = GE;
19828 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19829 if (compare_code != UNKNOWN
19830 && GET_MODE (op0) == GET_MODE (out)
19831 && (cf == -1 || ct == -1))
19833 /* If lea code below could be used, only optimize
19834 if it results in a 2 insn sequence. */
19836 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19837 || diff == 3 || diff == 5 || diff == 9)
19838 || (compare_code == LT && ct == -1)
19839 || (compare_code == GE && cf == -1))
19842 * notl op1 (if necessary)
19843 * sarl $31, op1
19844 * orl cf, op1
19846 if (ct != -1)
19848 cf = ct;
19849 ct = -1;
19850 code = reverse_condition (code);
19853 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19855 out = expand_simple_binop (mode, IOR,
19856 out, GEN_INT (cf),
19857 out, 1, OPTAB_DIRECT);
19858 if (out != operands[0])
19859 emit_move_insn (operands[0], out);
19861 return true;
19866 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19867 || diff == 3 || diff == 5 || diff == 9)
19868 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19869 && (mode != DImode
19870 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19873 * xorl dest,dest
19874 * cmpl op1,op2
19875 * setcc dest
19876 * lea cf(dest*(ct-cf)),dest
19878 * Size 14.
19880 * This also catches the degenerate setcc-only case.
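/* Worked example (added commentary): for ct = 5 and cf = 2, diff is 3,
   so after setcc leaves dest as 0 or 1 the single
   "lea 2(dest,dest,2), dest" computes 2 + dest*3, i.e. 2 or 5.  */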
19883 rtx tmp;
19884 int nops;
19886 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19888 nops = 0;
19889 /* On x86_64 the lea instruction operates on Pmode, so we need
19890 to get the arithmetic done in the proper mode to match. */
19891 if (diff == 1)
19892 tmp = copy_rtx (out);
19893 else
19895 rtx out1;
19896 out1 = copy_rtx (out);
19897 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19898 nops++;
19899 if (diff & 1)
19901 tmp = gen_rtx_PLUS (mode, tmp, out1);
19902 nops++;
19905 if (cf != 0)
19907 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19908 nops++;
19910 if (!rtx_equal_p (tmp, out))
19912 if (nops == 1)
19913 out = force_operand (tmp, copy_rtx (out));
19914 else
19915 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19917 if (!rtx_equal_p (out, operands[0]))
19918 emit_move_insn (operands[0], copy_rtx (out));
19920 return true;
19924 * General case: Jumpful:
19925 * xorl dest,dest cmpl op1, op2
19926 * cmpl op1, op2 movl ct, dest
19927 * setcc dest jcc 1f
19928 * decl dest movl cf, dest
19929 * andl (cf-ct),dest 1:
19930 * addl ct,dest
19932 * Size 20. Size 14.
19934 * This is reasonably steep, but branch mispredict costs are
19935 * high on modern cpus, so consider failing only if optimizing
19936 * for space.
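/* Worked example (added commentary): for ct = 7 and cf = 3 the
   branchless sequence computes setcc (0 or 1), decl (-1 or 0),
   andl $-4 (cf - ct == -4, giving -4 or 0) and addl $7, ending up
   with 3 when the condition is false and 7 when it is true.  */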
19939 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19940 && BRANCH_COST (optimize_insn_for_speed_p (),
19941 false) >= 2)
19943 if (cf == 0)
19945 enum machine_mode cmp_mode = GET_MODE (op0);
19947 cf = ct;
19948 ct = 0;
19950 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19952 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19954 /* We may be reversing an unordered compare to a normal compare,
19955 which is not valid in general (we may convert a non-trapping
19956 condition to a trapping one); however, on i386 we currently
19957 emit all comparisons unordered. */
19958 code = reverse_condition_maybe_unordered (code);
19960 else
19962 code = reverse_condition (code);
19963 if (compare_code != UNKNOWN)
19964 compare_code = reverse_condition (compare_code);
19968 if (compare_code != UNKNOWN)
19970 /* notl op1 (if needed)
19971 sarl $31, op1
19972 andl (cf-ct), op1
19973 addl ct, op1
19975 For x < 0 (resp. x <= -1) there will be no notl,
19976 so if possible swap the constants to get rid of the
19977 complement.
19978 True/false will be -1/0 while code below (store flag
19979 followed by decrement) is 0/-1, so the constants need
19980 to be exchanged once more. */
19982 if (compare_code == GE || !cf)
19984 code = reverse_condition (code);
19985 compare_code = LT;
19987 else
19989 HOST_WIDE_INT tmp = cf;
19990 cf = ct;
19991 ct = tmp;
19994 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19996 else
19998 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20000 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20001 constm1_rtx,
20002 copy_rtx (out), 1, OPTAB_DIRECT);
20005 out = expand_simple_binop (mode, AND, copy_rtx (out),
20006 gen_int_mode (cf - ct, mode),
20007 copy_rtx (out), 1, OPTAB_DIRECT);
20008 if (ct)
20009 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20010 copy_rtx (out), 1, OPTAB_DIRECT);
20011 if (!rtx_equal_p (out, operands[0]))
20012 emit_move_insn (operands[0], copy_rtx (out));
20014 return true;
20018 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20020 /* Try a few more things with specific constants and a variable. */
20022 optab op;
20023 rtx var, orig_out, out, tmp;
20025 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20026 return false;
20028 /* If one of the two operands is an interesting constant, load that constant
20029 via the recursion below and mask the variable in with a logical operation. */
20031 if (CONST_INT_P (operands[2]))
20033 var = operands[3];
20034 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20035 operands[3] = constm1_rtx, op = and_optab;
20036 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20037 operands[3] = const0_rtx, op = ior_optab;
20038 else
20039 return false;
20041 else if (CONST_INT_P (operands[3]))
20043 var = operands[2];
20044 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20045 operands[2] = constm1_rtx, op = and_optab;
20046 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
20047 operands[2] = const0_rtx, op = ior_optab;
20048 else
20049 return false;
20051 else
20052 return false;
20054 orig_out = operands[0];
20055 tmp = gen_reg_rtx (mode);
20056 operands[0] = tmp;
20058 /* Recurse to get the constant loaded. */
20059 if (ix86_expand_int_movcc (operands) == 0)
20060 return false;
20062 /* Mask in the interesting variable. */
20063 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20064 OPTAB_WIDEN);
20065 if (!rtx_equal_p (out, orig_out))
20066 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20068 return true;
20072 * For comparison with above,
20074 * movl cf,dest
20075 * movl ct,tmp
20076 * cmpl op1,op2
20077 * cmovcc tmp,dest
20079 * Size 15.
20082 if (! nonimmediate_operand (operands[2], mode))
20083 operands[2] = force_reg (mode, operands[2]);
20084 if (! nonimmediate_operand (operands[3], mode))
20085 operands[3] = force_reg (mode, operands[3]);
20087 if (! register_operand (operands[2], VOIDmode)
20088 && (mode == QImode
20089 || ! register_operand (operands[3], VOIDmode)))
20090 operands[2] = force_reg (mode, operands[2]);
20092 if (mode == QImode
20093 && ! register_operand (operands[3], VOIDmode))
20094 operands[3] = force_reg (mode, operands[3]);
20096 emit_insn (compare_seq);
20097 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20098 gen_rtx_IF_THEN_ELSE (mode,
20099 compare_op, operands[2],
20100 operands[3])));
20101 return true;
20104 /* Swap, force into registers, or otherwise massage the two operands
20105 to an sse comparison with a mask result. Thus we differ a bit from
20106 ix86_prepare_fp_compare_args which expects to produce a flags result.
20108 The DEST operand exists to help determine whether to commute commutative
20109 operators. The POP0/POP1 operands are updated in place. The new
20110 comparison code is returned, or UNKNOWN if not implementable. */
20112 static enum rtx_code
20113 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20114 rtx *pop0, rtx *pop1)
20116 rtx tmp;
20118 switch (code)
20120 case LTGT:
20121 case UNEQ:
20122 /* AVX supports all the needed comparisons. */
20123 if (TARGET_AVX)
20124 break;
20125 /* We have no LTGT as an operator. We could implement it with
20126 NE & ORDERED, but this requires an extra temporary. It's
20127 not clear that it's worth it. */
20128 return UNKNOWN;
20130 case LT:
20131 case LE:
20132 case UNGT:
20133 case UNGE:
20134 /* These are supported directly. */
20135 break;
20137 case EQ:
20138 case NE:
20139 case UNORDERED:
20140 case ORDERED:
20141 /* AVX has 3 operand comparisons, no need to swap anything. */
20142 if (TARGET_AVX)
20143 break;
20144 /* For commutative operators, try to canonicalize the destination
20145 operand to be first in the comparison - this helps reload to
20146 avoid extra moves. */
20147 if (!dest || !rtx_equal_p (dest, *pop1))
20148 break;
20149 /* FALLTHRU */
20151 case GE:
20152 case GT:
20153 case UNLE:
20154 case UNLT:
20155 /* These are not supported directly before AVX, and furthermore
20156 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20157 comparison operands to transform into something that is
20158 supported. */
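/* Illustrative example (added commentary): before AVX the cmpps/cmppd
   predicates only cover EQ, LT, LE, UNORD and their negations, so a
   comparison such as "a GE b" is rewritten here as "b LE a" and can
   then be emitted as a single cmpleps with the operands swapped.  */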
20159 tmp = *pop0;
20160 *pop0 = *pop1;
20161 *pop1 = tmp;
20162 code = swap_condition (code);
20163 break;
20165 default:
20166 gcc_unreachable ();
20169 return code;
20172 /* Detect conditional moves that exactly match min/max operational
20173 semantics. Note that this is IEEE safe, as long as we don't
20174 interchange the operands.
20176 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20177 and TRUE if the operation is successful and instructions are emitted. */
20179 static bool
20180 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20181 rtx cmp_op1, rtx if_true, rtx if_false)
20183 enum machine_mode mode;
20184 bool is_min;
20185 rtx tmp;
20187 if (code == LT)
20189 else if (code == UNGE)
20191 tmp = if_true;
20192 if_true = if_false;
20193 if_false = tmp;
20195 else
20196 return false;
20198 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20199 is_min = true;
20200 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20201 is_min = false;
20202 else
20203 return false;
20205 mode = GET_MODE (dest);
20207 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20208 but MODE may be a vector mode and thus not appropriate. */
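/* Added commentary: the underlying minps/minss (and maxps/maxss)
   operations are not commutative -- when the comparison is unordered,
   or when comparing +0.0 with -0.0, they return their second operand --
   so the operand order established above must be preserved whenever
   IEEE semantics matter.  */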
20209 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20211 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20212 rtvec v;
20214 if_true = force_reg (mode, if_true);
20215 v = gen_rtvec (2, if_true, if_false);
20216 tmp = gen_rtx_UNSPEC (mode, v, u);
20218 else
20220 code = is_min ? SMIN : SMAX;
20221 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20224 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20225 return true;
20228 /* Expand an sse vector comparison. Return the register with the result. */
20230 static rtx
20231 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20232 rtx op_true, rtx op_false)
20234 enum machine_mode mode = GET_MODE (dest);
20235 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20236 rtx x;
20238 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20239 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20240 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20242 if (optimize
20243 || reg_overlap_mentioned_p (dest, op_true)
20244 || reg_overlap_mentioned_p (dest, op_false))
20245 dest = gen_reg_rtx (mode);
20247 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20248 if (cmp_mode != mode)
20250 x = force_reg (cmp_mode, x);
20251 convert_move (dest, x, false);
20253 else
20254 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20256 return dest;
20259 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20260 operations. This is used for both scalar and vector conditional moves. */
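/* Added commentary: the general fallback at the end of this function
   expands to the classic mask-blend idiom, roughly
     dest = (cmp & op_true) | (~cmp & op_false);
   the earlier special cases skip one or both logical operations when an
   arm is all-zeros or all-ones, and the XOP/SSE4.1/AVX blend patterns
   replace the whole sequence when available.  */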
20262 static void
20263 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20265 enum machine_mode mode = GET_MODE (dest);
20266 rtx t2, t3, x;
20268 if (vector_all_ones_operand (op_true, mode)
20269 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20271 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20273 else if (op_false == CONST0_RTX (mode))
20275 op_true = force_reg (mode, op_true);
20276 x = gen_rtx_AND (mode, cmp, op_true);
20277 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20279 else if (op_true == CONST0_RTX (mode))
20281 op_false = force_reg (mode, op_false);
20282 x = gen_rtx_NOT (mode, cmp);
20283 x = gen_rtx_AND (mode, x, op_false);
20284 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20286 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20288 op_false = force_reg (mode, op_false);
20289 x = gen_rtx_IOR (mode, cmp, op_false);
20290 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20292 else if (TARGET_XOP)
20294 op_true = force_reg (mode, op_true);
20296 if (!nonimmediate_operand (op_false, mode))
20297 op_false = force_reg (mode, op_false);
20299 emit_insn (gen_rtx_SET (mode, dest,
20300 gen_rtx_IF_THEN_ELSE (mode, cmp,
20301 op_true,
20302 op_false)));
20304 else
20306 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20308 if (!nonimmediate_operand (op_true, mode))
20309 op_true = force_reg (mode, op_true);
20311 op_false = force_reg (mode, op_false);
20313 switch (mode)
20315 case V4SFmode:
20316 if (TARGET_SSE4_1)
20317 gen = gen_sse4_1_blendvps;
20318 break;
20319 case V2DFmode:
20320 if (TARGET_SSE4_1)
20321 gen = gen_sse4_1_blendvpd;
20322 break;
20323 case V16QImode:
20324 case V8HImode:
20325 case V4SImode:
20326 case V2DImode:
20327 if (TARGET_SSE4_1)
20329 gen = gen_sse4_1_pblendvb;
20330 dest = gen_lowpart (V16QImode, dest);
20331 op_false = gen_lowpart (V16QImode, op_false);
20332 op_true = gen_lowpart (V16QImode, op_true);
20333 cmp = gen_lowpart (V16QImode, cmp);
20335 break;
20336 case V8SFmode:
20337 if (TARGET_AVX)
20338 gen = gen_avx_blendvps256;
20339 break;
20340 case V4DFmode:
20341 if (TARGET_AVX)
20342 gen = gen_avx_blendvpd256;
20343 break;
20344 case V32QImode:
20345 case V16HImode:
20346 case V8SImode:
20347 case V4DImode:
20348 if (TARGET_AVX2)
20350 gen = gen_avx2_pblendvb;
20351 dest = gen_lowpart (V32QImode, dest);
20352 op_false = gen_lowpart (V32QImode, op_false);
20353 op_true = gen_lowpart (V32QImode, op_true);
20354 cmp = gen_lowpart (V32QImode, cmp);
20356 break;
20357 default:
20358 break;
20361 if (gen != NULL)
20362 emit_insn (gen (dest, op_false, op_true, cmp));
20363 else
20365 op_true = force_reg (mode, op_true);
20367 t2 = gen_reg_rtx (mode);
20368 if (optimize)
20369 t3 = gen_reg_rtx (mode);
20370 else
20371 t3 = dest;
20373 x = gen_rtx_AND (mode, op_true, cmp);
20374 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20376 x = gen_rtx_NOT (mode, cmp);
20377 x = gen_rtx_AND (mode, x, op_false);
20378 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20380 x = gen_rtx_IOR (mode, t3, t2);
20381 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20386 /* Expand a floating-point conditional move. Return true if successful. */
20388 bool
20389 ix86_expand_fp_movcc (rtx operands[])
20391 enum machine_mode mode = GET_MODE (operands[0]);
20392 enum rtx_code code = GET_CODE (operands[1]);
20393 rtx tmp, compare_op;
20394 rtx op0 = XEXP (operands[1], 0);
20395 rtx op1 = XEXP (operands[1], 1);
20397 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20399 enum machine_mode cmode;
20401 /* Since we've no cmove for sse registers, don't force bad register
20402 allocation just to gain access to it. Deny movcc when the
20403 comparison mode doesn't match the move mode. */
20404 cmode = GET_MODE (op0);
20405 if (cmode == VOIDmode)
20406 cmode = GET_MODE (op1);
20407 if (cmode != mode)
20408 return false;
20410 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20411 if (code == UNKNOWN)
20412 return false;
20414 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20415 operands[2], operands[3]))
20416 return true;
20418 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20419 operands[2], operands[3]);
20420 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20421 return true;
20424 if (GET_MODE (op0) == TImode
20425 || (GET_MODE (op0) == DImode
20426 && !TARGET_64BIT))
20427 return false;
20429 /* The floating point conditional move instructions don't directly
20430 support conditions resulting from a signed integer comparison. */
20432 compare_op = ix86_expand_compare (code, op0, op1);
20433 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20435 tmp = gen_reg_rtx (QImode);
20436 ix86_expand_setcc (tmp, code, op0, op1);
20438 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20441 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20442 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20443 operands[2], operands[3])));
20445 return true;
20448 /* Expand a floating-point vector conditional move; a vcond operation
20449 rather than a movcc operation. */
20451 bool
20452 ix86_expand_fp_vcond (rtx operands[])
20454 enum rtx_code code = GET_CODE (operands[3]);
20455 rtx cmp;
20457 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20458 &operands[4], &operands[5]);
20459 if (code == UNKNOWN)
20461 rtx temp;
20462 switch (GET_CODE (operands[3]))
20464 case LTGT:
20465 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20466 operands[5], operands[0], operands[0]);
20467 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20468 operands[5], operands[1], operands[2]);
20469 code = AND;
20470 break;
20471 case UNEQ:
20472 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20473 operands[5], operands[0], operands[0]);
20474 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20475 operands[5], operands[1], operands[2]);
20476 code = IOR;
20477 break;
20478 default:
20479 gcc_unreachable ();
20481 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20482 OPTAB_DIRECT);
20483 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20484 return true;
20487 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20488 operands[5], operands[1], operands[2]))
20489 return true;
20491 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20492 operands[1], operands[2]);
20493 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20494 return true;
20497 /* Expand a signed/unsigned integral vector conditional move. */
20499 bool
20500 ix86_expand_int_vcond (rtx operands[])
20502 enum machine_mode data_mode = GET_MODE (operands[0]);
20503 enum machine_mode mode = GET_MODE (operands[4]);
20504 enum rtx_code code = GET_CODE (operands[3]);
20505 bool negate = false;
20506 rtx x, cop0, cop1;
20508 cop0 = operands[4];
20509 cop1 = operands[5];
20511 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20512 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20513 if ((code == LT || code == GE)
20514 && data_mode == mode
20515 && cop1 == CONST0_RTX (mode)
20516 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20517 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20518 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20519 && (GET_MODE_SIZE (data_mode) == 16
20520 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20522 rtx negop = operands[2 - (code == LT)];
20523 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20524 if (negop == CONST1_RTX (data_mode))
20526 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20527 operands[0], 1, OPTAB_DIRECT);
20528 if (res != operands[0])
20529 emit_move_insn (operands[0], res);
20530 return true;
20532 else if (GET_MODE_INNER (data_mode) != DImode
20533 && vector_all_ones_operand (negop, data_mode))
20535 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20536 operands[0], 0, OPTAB_DIRECT);
20537 if (res != operands[0])
20538 emit_move_insn (operands[0], res);
20539 return true;
20543 if (!nonimmediate_operand (cop1, mode))
20544 cop1 = force_reg (mode, cop1);
20545 if (!general_operand (operands[1], data_mode))
20546 operands[1] = force_reg (data_mode, operands[1]);
20547 if (!general_operand (operands[2], data_mode))
20548 operands[2] = force_reg (data_mode, operands[2]);
20550 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20551 if (TARGET_XOP
20552 && (mode == V16QImode || mode == V8HImode
20553 || mode == V4SImode || mode == V2DImode))
20555 else
20557 /* Canonicalize the comparison to EQ, GT, GTU. */
20558 switch (code)
20560 case EQ:
20561 case GT:
20562 case GTU:
20563 break;
20565 case NE:
20566 case LE:
20567 case LEU:
20568 code = reverse_condition (code);
20569 negate = true;
20570 break;
20572 case GE:
20573 case GEU:
20574 code = reverse_condition (code);
20575 negate = true;
20576 /* FALLTHRU */
20578 case LT:
20579 case LTU:
20580 code = swap_condition (code);
20581 x = cop0, cop0 = cop1, cop1 = x;
20582 break;
20584 default:
20585 gcc_unreachable ();
20588 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20589 if (mode == V2DImode)
20591 switch (code)
20593 case EQ:
20594 /* SSE4.1 supports EQ. */
20595 if (!TARGET_SSE4_1)
20596 return false;
20597 break;
20599 case GT:
20600 case GTU:
20601 /* SSE4.2 supports GT/GTU. */
20602 if (!TARGET_SSE4_2)
20603 return false;
20604 break;
20606 default:
20607 gcc_unreachable ();
20611 /* Unsigned parallel compare is not supported by the hardware.
20612 Play some tricks to turn this into a signed comparison
20613 against 0. */
20614 if (code == GTU)
20616 cop0 = force_reg (mode, cop0);
20618 switch (mode)
20620 case V8SImode:
20621 case V4DImode:
20622 case V4SImode:
20623 case V2DImode:
20625 rtx t1, t2, mask;
20626 rtx (*gen_sub3) (rtx, rtx, rtx);
20628 switch (mode)
20630 case V8SImode: gen_sub3 = gen_subv8si3; break;
20631 case V4DImode: gen_sub3 = gen_subv4di3; break;
20632 case V4SImode: gen_sub3 = gen_subv4si3; break;
20633 case V2DImode: gen_sub3 = gen_subv2di3; break;
20634 default:
20635 gcc_unreachable ();
20637 /* Subtract (-(INT MAX) - 1) from both operands to make
20638 them signed. */
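/* Worked example (added commentary): flipping the sign bit maps the
   unsigned range onto the signed range monotonically.  For 32-bit
   elements, 0xFFFFFFFF >u 0x00000001 becomes, after subtracting
   0x80000000 from both sides, 0x7FFFFFFF >s 0x80000001, which
   pcmpgtd can evaluate directly.  */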
20639 mask = ix86_build_signbit_mask (mode, true, false);
20640 t1 = gen_reg_rtx (mode);
20641 emit_insn (gen_sub3 (t1, cop0, mask));
20643 t2 = gen_reg_rtx (mode);
20644 emit_insn (gen_sub3 (t2, cop1, mask));
20646 cop0 = t1;
20647 cop1 = t2;
20648 code = GT;
20650 break;
20652 case V32QImode:
20653 case V16HImode:
20654 case V16QImode:
20655 case V8HImode:
20656 /* Perform a parallel unsigned saturating subtraction. */
20657 x = gen_reg_rtx (mode);
20658 emit_insn (gen_rtx_SET (VOIDmode, x,
20659 gen_rtx_US_MINUS (mode, cop0, cop1)));
20661 cop0 = x;
20662 cop1 = CONST0_RTX (mode);
20663 code = EQ;
20664 negate = !negate;
20665 break;
20667 default:
20668 gcc_unreachable ();
20673 /* Allow the comparison to be done in one mode, but the movcc to
20674 happen in another mode. */
20675 if (data_mode == mode)
20677 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20678 operands[1+negate], operands[2-negate]);
20680 else
20682 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20683 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20684 code, cop0, cop1,
20685 operands[1+negate], operands[2-negate]);
20686 x = gen_lowpart (data_mode, x);
20689 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20690 operands[2-negate]);
20691 return true;
20694 /* Expand a variable vector permutation. */
20696 void
20697 ix86_expand_vec_perm (rtx operands[])
20699 rtx target = operands[0];
20700 rtx op0 = operands[1];
20701 rtx op1 = operands[2];
20702 rtx mask = operands[3];
20703 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20704 enum machine_mode mode = GET_MODE (op0);
20705 enum machine_mode maskmode = GET_MODE (mask);
20706 int w, e, i;
20707 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20709 /* Number of elements in the vector. */
20710 w = GET_MODE_NUNITS (mode);
20711 e = GET_MODE_UNIT_SIZE (mode);
20712 gcc_assert (w <= 32);
20714 if (TARGET_AVX2)
20716 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20718 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20719 a constant shuffle operand. With a tiny bit of effort we can
20720 use VPERMD instead. A re-interpretation stall for V4DFmode is
20721 unfortunate but there's no avoiding it.
20722 Similarly, for V16HImode we don't have instructions for variable
20723 shuffling, while for V32QImode we can use vpshufb; vpshufb;
20724 vpermq; vpor after preparing suitable masks. */
20726 if (mode == V16HImode)
20728 maskmode = mode = V32QImode;
20729 w = 32;
20730 e = 1;
20732 else
20734 maskmode = mode = V8SImode;
20735 w = 8;
20736 e = 4;
20738 t1 = gen_reg_rtx (maskmode);
20740 /* Replicate the low bits of the V4DImode mask into V8SImode:
20741 mask = { A B C D }
20742 t1 = { A A B B C C D D }. */
20743 for (i = 0; i < w / 2; ++i)
20744 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20745 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20746 vt = force_reg (maskmode, vt);
20747 mask = gen_lowpart (maskmode, mask);
20748 if (maskmode == V8SImode)
20749 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20750 else
20751 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20753 /* Multiply the shuffle indices by two. */
20754 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20755 OPTAB_DIRECT);
20757 /* Add one to the odd shuffle indices:
20758 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20759 for (i = 0; i < w / 2; ++i)
20761 vec[i * 2] = const0_rtx;
20762 vec[i * 2 + 1] = const1_rtx;
20764 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20765 vt = validize_mem (force_const_mem (maskmode, vt));
20766 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20767 OPTAB_DIRECT);
20769 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20770 operands[3] = mask = t1;
20771 target = gen_lowpart (mode, target);
20772 op0 = gen_lowpart (mode, op0);
20773 op1 = gen_lowpart (mode, op1);
20776 switch (mode)
20778 case V8SImode:
20779 /* The VPERMD and VPERMPS instructions already properly ignore
20780 the high bits of the shuffle elements. No need for us to
20781 perform an AND ourselves. */
20782 if (one_operand_shuffle)
20783 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20784 else
20786 t1 = gen_reg_rtx (V8SImode);
20787 t2 = gen_reg_rtx (V8SImode);
20788 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20789 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20790 goto merge_two;
20792 return;
20794 case V8SFmode:
20795 mask = gen_lowpart (V8SFmode, mask);
20796 if (one_operand_shuffle)
20797 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20798 else
20800 t1 = gen_reg_rtx (V8SFmode);
20801 t2 = gen_reg_rtx (V8SFmode);
20802 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20803 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20804 goto merge_two;
20806 return;
20808 case V4SImode:
20809 /* By combining the two 128-bit input vectors into one 256-bit
20810 input vector, we can use VPERMD and VPERMPS for the full
20811 two-operand shuffle. */
20812 t1 = gen_reg_rtx (V8SImode);
20813 t2 = gen_reg_rtx (V8SImode);
20814 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20815 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20816 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20817 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20818 return;
20820 case V4SFmode:
20821 t1 = gen_reg_rtx (V8SFmode);
20822 t2 = gen_reg_rtx (V8SImode);
20823 mask = gen_lowpart (V4SImode, mask);
20824 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20825 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20826 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20827 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20828 return;
20830 case V32QImode:
20831 t1 = gen_reg_rtx (V32QImode);
20832 t2 = gen_reg_rtx (V32QImode);
20833 t3 = gen_reg_rtx (V32QImode);
20834 vt2 = GEN_INT (128);
20835 for (i = 0; i < 32; i++)
20836 vec[i] = vt2;
20837 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20838 vt = force_reg (V32QImode, vt);
20839 for (i = 0; i < 32; i++)
20840 vec[i] = i < 16 ? vt2 : const0_rtx;
20841 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20842 vt2 = force_reg (V32QImode, vt2);
20843 /* From mask create two adjusted masks, which contain the same
20844 bits as mask in the low 7 bits of each vector element.
20845 The first mask will have the most significant bit clear
20846 if it requests element from the same 128-bit lane
20847 and MSB set if it requests element from the other 128-bit lane.
20848 The second mask will have the opposite values of the MSB,
20849 and additionally will have its 128-bit lanes swapped.
20850 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20851 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20852 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20853 stands for the other 12 bytes. */
20854 /* The bit that says whether an element comes from the same lane or the
20855 other lane is bit 4, so shift it up by 3 to the MSB position. */
20856 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20857 gen_lowpart (V4DImode, mask),
20858 GEN_INT (3)));
20859 /* Clear MSB bits from the mask just in case it had them set. */
20860 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20861 /* After this t1 will have MSB set for elements from other lane. */
20862 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20863 /* Clear bits other than MSB. */
20864 emit_insn (gen_andv32qi3 (t1, t1, vt));
20865 /* Or in the lower bits from mask into t3. */
20866 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20867 /* And invert MSB bits in t1, so MSB is set for elements from the same
20868 lane. */
20869 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20870 /* Swap 128-bit lanes in t3. */
20871 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20872 gen_lowpart (V4DImode, t3),
20873 const2_rtx, GEN_INT (3),
20874 const0_rtx, const1_rtx));
20875 /* And or in the lower bits from mask into t1. */
20876 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20877 if (one_operand_shuffle)
20879 /* Each of these shuffles will put 0s in places where an
20880 element from the other 128-bit lane is needed; otherwise
20881 it will shuffle in the requested value. */
20882 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20883 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20884 /* For t3 the 128-bit lanes are swapped again. */
20885 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20886 gen_lowpart (V4DImode, t3),
20887 const2_rtx, GEN_INT (3),
20888 const0_rtx, const1_rtx));
20889 /* And oring both together leads to the result. */
20890 emit_insn (gen_iorv32qi3 (target, t1, t3));
20891 return;
20894 t4 = gen_reg_rtx (V32QImode);
20895 /* Similar to the one_operand_shuffle code above,
20896 just repeated twice, once for each operand. The merge_two:
20897 code will merge the two results together. */
20898 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20899 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20900 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20901 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20902 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20903 gen_lowpart (V4DImode, t4),
20904 const2_rtx, GEN_INT (3),
20905 const0_rtx, const1_rtx));
20906 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20907 gen_lowpart (V4DImode, t3),
20908 const2_rtx, GEN_INT (3),
20909 const0_rtx, const1_rtx));
20910 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20911 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20912 t1 = t4;
20913 t2 = t3;
20914 goto merge_two;
20916 default:
20917 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20918 break;
20922 if (TARGET_XOP)
20924 /* The XOP VPPERM insn supports three inputs. By ignoring the
20925 one_operand_shuffle special case, we avoid creating another
20926 set of constant vectors in memory. */
20927 one_operand_shuffle = false;
20929 /* mask = mask & {2*w-1, ...} */
20930 vt = GEN_INT (2*w - 1);
20932 else
20934 /* mask = mask & {w-1, ...} */
20935 vt = GEN_INT (w - 1);
20938 for (i = 0; i < w; i++)
20939 vec[i] = vt;
20940 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20941 mask = expand_simple_binop (maskmode, AND, mask, vt,
20942 NULL_RTX, 0, OPTAB_DIRECT);
20944 /* For non-QImode operations, convert the word permutation control
20945 into a byte permutation control. */
20946 if (mode != V16QImode)
20948 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20949 GEN_INT (exact_log2 (e)),
20950 NULL_RTX, 0, OPTAB_DIRECT);
20952 /* Convert mask to vector of chars. */
20953 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20955 /* Replicate each of the input bytes into byte positions:
20956 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20957 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20958 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20959 for (i = 0; i < 16; ++i)
20960 vec[i] = GEN_INT (i/e * e);
20961 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20962 vt = validize_mem (force_const_mem (V16QImode, vt));
20963 if (TARGET_XOP)
20964 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20965 else
20966 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20968 /* Convert it into the byte positions by doing
20969 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20970 for (i = 0; i < 16; ++i)
20971 vec[i] = GEN_INT (i % e);
20972 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20973 vt = validize_mem (force_const_mem (V16QImode, vt));
20974 emit_insn (gen_addv16qi3 (mask, mask, vt));
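/* Worked example (added commentary): for a V4SImode shuffle (e == 4) a
   word index of 2 is shifted left by log2(4) to the byte offset 8;
   replicating that value into every byte of its element and adding
   { 0,1,2,3 } then yields the byte-level control { 8,9,10,11 }, i.e.
   the four bytes of source element 2, ready for pshufb.  */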
20977 /* The actual shuffle operations all operate on V16QImode. */
20978 op0 = gen_lowpart (V16QImode, op0);
20979 op1 = gen_lowpart (V16QImode, op1);
20980 target = gen_lowpart (V16QImode, target);
20982 if (TARGET_XOP)
20984 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20986 else if (one_operand_shuffle)
20988 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20990 else
20992 rtx xops[6];
20993 bool ok;
20995 /* Shuffle the two input vectors independently. */
20996 t1 = gen_reg_rtx (V16QImode);
20997 t2 = gen_reg_rtx (V16QImode);
20998 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20999 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21001 merge_two:
21002 /* Then merge them together. The key is whether any given control
21003 element contained a bit set that indicates the second word. */
21004 mask = operands[3];
21005 vt = GEN_INT (w);
21006 if (maskmode == V2DImode && !TARGET_SSE4_1)
21008 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21009 more shuffle to convert the V2DI input mask into a V4SI
21010 input mask. At that point the masking that expand_int_vcond
21011 performs will work as desired. */
21012 rtx t3 = gen_reg_rtx (V4SImode);
21013 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21014 const0_rtx, const0_rtx,
21015 const2_rtx, const2_rtx));
21016 mask = t3;
21017 maskmode = V4SImode;
21018 e = w = 4;
21021 for (i = 0; i < w; i++)
21022 vec[i] = vt;
21023 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21024 vt = force_reg (maskmode, vt);
21025 mask = expand_simple_binop (maskmode, AND, mask, vt,
21026 NULL_RTX, 0, OPTAB_DIRECT);
21028 xops[0] = gen_lowpart (mode, operands[0]);
21029 xops[1] = gen_lowpart (mode, t2);
21030 xops[2] = gen_lowpart (mode, t1);
21031 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21032 xops[4] = mask;
21033 xops[5] = vt;
21034 ok = ix86_expand_int_vcond (xops);
21035 gcc_assert (ok);
21039 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
21040 true if we should do zero extension, else sign extension. HIGH_P is
21041 true if we want the N/2 high elements, else the low elements. */
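/* Illustrative example (annotation added here, not part of the original
   comment): unpacking the low half of a V8HImode SRC with UNSIGNED_P set
   produces a V4SImode DEST holding the zero-extended low four elements;
   with SSE4.1 this is a single pmovzxwd, while the fallback interleaves
   SRC with zero (or with a computed sign mask for signed extension). */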
21043 void
21044 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21046 enum machine_mode imode = GET_MODE (src);
21047 rtx tmp;
21049 if (TARGET_SSE4_1)
21051 rtx (*unpack)(rtx, rtx);
21052 rtx (*extract)(rtx, rtx) = NULL;
21053 enum machine_mode halfmode = BLKmode;
21055 switch (imode)
21057 case V32QImode:
21058 if (unsigned_p)
21059 unpack = gen_avx2_zero_extendv16qiv16hi2;
21060 else
21061 unpack = gen_avx2_sign_extendv16qiv16hi2;
21062 halfmode = V16QImode;
21063 extract
21064 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21065 break;
21066 case V16HImode:
21067 if (unsigned_p)
21068 unpack = gen_avx2_zero_extendv8hiv8si2;
21069 else
21070 unpack = gen_avx2_sign_extendv8hiv8si2;
21071 halfmode = V8HImode;
21072 extract
21073 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21074 break;
21075 case V8SImode:
21076 if (unsigned_p)
21077 unpack = gen_avx2_zero_extendv4siv4di2;
21078 else
21079 unpack = gen_avx2_sign_extendv4siv4di2;
21080 halfmode = V4SImode;
21081 extract
21082 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21083 break;
21084 case V16QImode:
21085 if (unsigned_p)
21086 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21087 else
21088 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21089 break;
21090 case V8HImode:
21091 if (unsigned_p)
21092 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21093 else
21094 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21095 break;
21096 case V4SImode:
21097 if (unsigned_p)
21098 unpack = gen_sse4_1_zero_extendv2siv2di2;
21099 else
21100 unpack = gen_sse4_1_sign_extendv2siv2di2;
21101 break;
21102 default:
21103 gcc_unreachable ();
21106 if (GET_MODE_SIZE (imode) == 32)
21108 tmp = gen_reg_rtx (halfmode);
21109 emit_insn (extract (tmp, src));
21111 else if (high_p)
21113 /* Shift higher 8 bytes to lower 8 bytes. */
21114 tmp = gen_reg_rtx (imode);
21115 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
21116 gen_lowpart (V1TImode, src),
21117 GEN_INT (64)));
21119 else
21120 tmp = src;
21122 emit_insn (unpack (dest, tmp));
21124 else
21126 rtx (*unpack)(rtx, rtx, rtx);
21128 switch (imode)
21130 case V16QImode:
21131 if (high_p)
21132 unpack = gen_vec_interleave_highv16qi;
21133 else
21134 unpack = gen_vec_interleave_lowv16qi;
21135 break;
21136 case V8HImode:
21137 if (high_p)
21138 unpack = gen_vec_interleave_highv8hi;
21139 else
21140 unpack = gen_vec_interleave_lowv8hi;
21141 break;
21142 case V4SImode:
21143 if (high_p)
21144 unpack = gen_vec_interleave_highv4si;
21145 else
21146 unpack = gen_vec_interleave_lowv4si;
21147 break;
21148 default:
21149 gcc_unreachable ();
21152 if (unsigned_p)
21153 tmp = force_reg (imode, CONST0_RTX (imode));
21154 else
21155 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21156 src, pc_rtx, pc_rtx);
21158 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
21162 /* Expand conditional increment or decrement using adc/sbb instructions.
21163 The default case using setcc followed by the conditional move can be
21164 done by generic code. */
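/* Illustrative sketch (an assumption about a typical use, not from the
   original source): for "if (a < b) x++;" with an unsigned comparison the
   expander emits roughly

       cmp a, b        ; sets the carry flag when a < b
       adc x, 0        ; x += carry

   and the mirrored sbb form is used for decrements or reversed conditions,
   so no setcc, cmov or branch is needed. */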
21165 bool
21166 ix86_expand_int_addcc (rtx operands[])
21168 enum rtx_code code = GET_CODE (operands[1]);
21169 rtx flags;
21170 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21171 rtx compare_op;
21172 rtx val = const0_rtx;
21173 bool fpcmp = false;
21174 enum machine_mode mode;
21175 rtx op0 = XEXP (operands[1], 0);
21176 rtx op1 = XEXP (operands[1], 1);
21178 if (operands[3] != const1_rtx
21179 && operands[3] != constm1_rtx)
21180 return false;
21181 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21182 return false;
21183 code = GET_CODE (compare_op);
21185 flags = XEXP (compare_op, 0);
21187 if (GET_MODE (flags) == CCFPmode
21188 || GET_MODE (flags) == CCFPUmode)
21190 fpcmp = true;
21191 code = ix86_fp_compare_code_to_integer (code);
21194 if (code != LTU)
21196 val = constm1_rtx;
21197 if (fpcmp)
21198 PUT_CODE (compare_op,
21199 reverse_condition_maybe_unordered
21200 (GET_CODE (compare_op)));
21201 else
21202 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21205 mode = GET_MODE (operands[0]);
21207 /* Construct either adc or sbb insn. */
21208 if ((code == LTU) == (operands[3] == constm1_rtx))
21210 switch (mode)
21212 case QImode:
21213 insn = gen_subqi3_carry;
21214 break;
21215 case HImode:
21216 insn = gen_subhi3_carry;
21217 break;
21218 case SImode:
21219 insn = gen_subsi3_carry;
21220 break;
21221 case DImode:
21222 insn = gen_subdi3_carry;
21223 break;
21224 default:
21225 gcc_unreachable ();
21228 else
21230 switch (mode)
21232 case QImode:
21233 insn = gen_addqi3_carry;
21234 break;
21235 case HImode:
21236 insn = gen_addhi3_carry;
21237 break;
21238 case SImode:
21239 insn = gen_addsi3_carry;
21240 break;
21241 case DImode:
21242 insn = gen_adddi3_carry;
21243 break;
21244 default:
21245 gcc_unreachable ();
21248 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21250 return true;
21254 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21255 but works for floating point parameters and non-offsettable memories.
21256 For pushes, it returns just stack offsets; the values will be saved
21257 in the right order. Maximally four parts are generated. */
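/* Example (illustrative annotation): on a 32-bit target a DFmode operand
   splits into two SImode parts, an XFmode operand into three and a TFmode
   operand into four, while on a 64-bit target an XFmode or TFmode operand
   splits into two parts (a DImode part plus an SImode or DImode part
   respectively). */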
21259 static int
21260 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21262 int size;
21264 if (!TARGET_64BIT)
21265 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21266 else
21267 size = (GET_MODE_SIZE (mode) + 4) / 8;
21269 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21270 gcc_assert (size >= 2 && size <= 4);
21272 /* Optimize constant pool reference to immediates. This is used by fp
21273 moves, that force all constants to memory to allow combining. */
21274 if (MEM_P (operand) && MEM_READONLY_P (operand))
21276 rtx tmp = maybe_get_pool_constant (operand);
21277 if (tmp)
21278 operand = tmp;
21281 if (MEM_P (operand) && !offsettable_memref_p (operand))
21283 /* The only non-offsettable memories we handle are pushes. */
21284 int ok = push_operand (operand, VOIDmode);
21286 gcc_assert (ok);
21288 operand = copy_rtx (operand);
21289 PUT_MODE (operand, word_mode);
21290 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21291 return size;
21294 if (GET_CODE (operand) == CONST_VECTOR)
21296 enum machine_mode imode = int_mode_for_mode (mode);
21297 /* Caution: if we looked through a constant pool memory above,
21298 the operand may actually have a different mode now. That's
21299 ok, since we want to pun this all the way back to an integer. */
21300 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21301 gcc_assert (operand != NULL);
21302 mode = imode;
21305 if (!TARGET_64BIT)
21307 if (mode == DImode)
21308 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21309 else
21311 int i;
21313 if (REG_P (operand))
21315 gcc_assert (reload_completed);
21316 for (i = 0; i < size; i++)
21317 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21319 else if (offsettable_memref_p (operand))
21321 operand = adjust_address (operand, SImode, 0);
21322 parts[0] = operand;
21323 for (i = 1; i < size; i++)
21324 parts[i] = adjust_address (operand, SImode, 4 * i);
21326 else if (GET_CODE (operand) == CONST_DOUBLE)
21328 REAL_VALUE_TYPE r;
21329 long l[4];
21331 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21332 switch (mode)
21334 case TFmode:
21335 real_to_target (l, &r, mode);
21336 parts[3] = gen_int_mode (l[3], SImode);
21337 parts[2] = gen_int_mode (l[2], SImode);
21338 break;
21339 case XFmode:
21340 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21341 long double may not be 80-bit. */
21342 real_to_target (l, &r, mode);
21343 parts[2] = gen_int_mode (l[2], SImode);
21344 break;
21345 case DFmode:
21346 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21347 break;
21348 default:
21349 gcc_unreachable ();
21351 parts[1] = gen_int_mode (l[1], SImode);
21352 parts[0] = gen_int_mode (l[0], SImode);
21354 else
21355 gcc_unreachable ();
21358 else
21360 if (mode == TImode)
21361 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21362 if (mode == XFmode || mode == TFmode)
21364 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21365 if (REG_P (operand))
21367 gcc_assert (reload_completed);
21368 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21369 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21371 else if (offsettable_memref_p (operand))
21373 operand = adjust_address (operand, DImode, 0);
21374 parts[0] = operand;
21375 parts[1] = adjust_address (operand, upper_mode, 8);
21377 else if (GET_CODE (operand) == CONST_DOUBLE)
21379 REAL_VALUE_TYPE r;
21380 long l[4];
21382 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21383 real_to_target (l, &r, mode);
21385 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21386 if (HOST_BITS_PER_WIDE_INT >= 64)
21387 parts[0]
21388 = gen_int_mode
21389 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21390 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21391 DImode);
21392 else
21393 parts[0] = immed_double_const (l[0], l[1], DImode);
21395 if (upper_mode == SImode)
21396 parts[1] = gen_int_mode (l[2], SImode);
21397 else if (HOST_BITS_PER_WIDE_INT >= 64)
21398 parts[1]
21399 = gen_int_mode
21400 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21401 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21402 DImode);
21403 else
21404 parts[1] = immed_double_const (l[2], l[3], DImode);
21406 else
21407 gcc_unreachable ();
21411 return size;
21414 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21415 Return false when normal moves are needed; true when all required
21416 insns have been emitted. Operands 2-4 contain the input values
21417 in the correct order; operands 5-7 contain the output values. */
21419 void
21420 ix86_split_long_move (rtx operands[])
21422 rtx part[2][4];
21423 int nparts, i, j;
21424 int push = 0;
21425 int collisions = 0;
21426 enum machine_mode mode = GET_MODE (operands[0]);
21427 bool collisionparts[4];
21429 /* The DFmode expanders may ask us to move a double.
21430 For a 64-bit target this is a single move. By hiding this fact
21431 here we simplify the i386.md splitters. */
21432 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21434 /* Optimize constant pool reference to immediates. This is used by
21435 fp moves, that force all constants to memory to allow combining. */
21437 if (MEM_P (operands[1])
21438 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21439 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21440 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21441 if (push_operand (operands[0], VOIDmode))
21443 operands[0] = copy_rtx (operands[0]);
21444 PUT_MODE (operands[0], word_mode);
21446 else
21447 operands[0] = gen_lowpart (DImode, operands[0]);
21448 operands[1] = gen_lowpart (DImode, operands[1]);
21449 emit_move_insn (operands[0], operands[1]);
21450 return;
21453 /* The only non-offsettable memory we handle is push. */
21454 if (push_operand (operands[0], VOIDmode))
21455 push = 1;
21456 else
21457 gcc_assert (!MEM_P (operands[0])
21458 || offsettable_memref_p (operands[0]));
21460 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21461 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21463 /* When emitting a push, take care of source operands on the stack. */
21464 if (push && MEM_P (operands[1])
21465 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21467 rtx src_base = XEXP (part[1][nparts - 1], 0);
21469 /* Compensate for the stack decrement by 4. */
21470 if (!TARGET_64BIT && nparts == 3
21471 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21472 src_base = plus_constant (Pmode, src_base, 4);
21474 /* src_base refers to the stack pointer and is
21475 automatically decreased by emitted push. */
21476 for (i = 0; i < nparts; i++)
21477 part[1][i] = change_address (part[1][i],
21478 GET_MODE (part[1][i]), src_base);
21481 /* We need to do the copy in the right order in case an address register
21482 of the source overlaps the destination. */
21483 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21485 rtx tmp;
21487 for (i = 0; i < nparts; i++)
21489 collisionparts[i]
21490 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21491 if (collisionparts[i])
21492 collisions++;
21495 /* Collision in the middle part can be handled by reordering. */
21496 if (collisions == 1 && nparts == 3 && collisionparts [1])
21498 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21499 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21501 else if (collisions == 1
21502 && nparts == 4
21503 && (collisionparts [1] || collisionparts [2]))
21505 if (collisionparts [1])
21507 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21508 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21510 else
21512 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21513 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21517 /* If there are more collisions, we can't handle it by reordering.
21518 Do an lea to the last part and use only one colliding move. */
21519 else if (collisions > 1)
21521 rtx base;
21523 collisions = 1;
21525 base = part[0][nparts - 1];
21527 /* Handle the case when the last part isn't valid for lea.
21528 Happens in 64-bit mode storing the 12-byte XFmode. */
21529 if (GET_MODE (base) != Pmode)
21530 base = gen_rtx_REG (Pmode, REGNO (base));
21532 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21533 part[1][0] = replace_equiv_address (part[1][0], base);
21534 for (i = 1; i < nparts; i++)
21536 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21537 part[1][i] = replace_equiv_address (part[1][i], tmp);
21542 if (push)
21544 if (!TARGET_64BIT)
21546 if (nparts == 3)
21548 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21549 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21550 stack_pointer_rtx, GEN_INT (-4)));
21551 emit_move_insn (part[0][2], part[1][2]);
21553 else if (nparts == 4)
21555 emit_move_insn (part[0][3], part[1][3]);
21556 emit_move_insn (part[0][2], part[1][2]);
21559 else
21561 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21562 a register, it is OK - we will just use the larger counterpart. We also
21563 retype memory - this comes from an attempt to avoid the REX prefix on
21564 moving the second half of a TFmode value. */
21565 if (GET_MODE (part[1][1]) == SImode)
21567 switch (GET_CODE (part[1][1]))
21569 case MEM:
21570 part[1][1] = adjust_address (part[1][1], DImode, 0);
21571 break;
21573 case REG:
21574 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21575 break;
21577 default:
21578 gcc_unreachable ();
21581 if (GET_MODE (part[1][0]) == SImode)
21582 part[1][0] = part[1][1];
21585 emit_move_insn (part[0][1], part[1][1]);
21586 emit_move_insn (part[0][0], part[1][0]);
21587 return;
21590 /* Choose the correct order so as not to overwrite the source before it is copied. */
21591 if ((REG_P (part[0][0])
21592 && REG_P (part[1][1])
21593 && (REGNO (part[0][0]) == REGNO (part[1][1])
21594 || (nparts == 3
21595 && REGNO (part[0][0]) == REGNO (part[1][2]))
21596 || (nparts == 4
21597 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21598 || (collisions > 0
21599 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21601 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21603 operands[2 + i] = part[0][j];
21604 operands[6 + i] = part[1][j];
21607 else
21609 for (i = 0; i < nparts; i++)
21611 operands[2 + i] = part[0][i];
21612 operands[6 + i] = part[1][i];
21616 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21617 if (optimize_insn_for_size_p ())
21619 for (j = 0; j < nparts - 1; j++)
21620 if (CONST_INT_P (operands[6 + j])
21621 && operands[6 + j] != const0_rtx
21622 && REG_P (operands[2 + j]))
21623 for (i = j; i < nparts - 1; i++)
21624 if (CONST_INT_P (operands[7 + i])
21625 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21626 operands[7 + i] = operands[2 + j];
21629 for (i = 0; i < nparts; i++)
21630 emit_move_insn (operands[2 + i], operands[6 + i]);
21632 return;
21635 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21636 left shift by a constant, either using a single shift or
21637 a sequence of add instructions. */
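/* Illustrative example (an assumption about typical cost tables, not from
   the original source): a left shift by 1 is always emitted as a single
   "add reg, reg"; a shift by 2 becomes two such adds when the cost tables
   rate two adds cheaper than a constant shift and we are not optimizing
   for size, and otherwise a single shl-by-2 instruction is emitted. */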
21639 static void
21640 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21642 rtx (*insn)(rtx, rtx, rtx);
21644 if (count == 1
21645 || (count * ix86_cost->add <= ix86_cost->shift_const
21646 && !optimize_insn_for_size_p ()))
21648 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21649 while (count-- > 0)
21650 emit_insn (insn (operand, operand, operand));
21652 else
21654 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21655 emit_insn (insn (operand, operand, GEN_INT (count)));
21659 void
21660 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21662 rtx (*gen_ashl3)(rtx, rtx, rtx);
21663 rtx (*gen_shld)(rtx, rtx, rtx);
21664 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21666 rtx low[2], high[2];
21667 int count;
21669 if (CONST_INT_P (operands[2]))
21671 split_double_mode (mode, operands, 2, low, high);
21672 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21674 if (count >= half_width)
21676 emit_move_insn (high[0], low[1]);
21677 emit_move_insn (low[0], const0_rtx);
21679 if (count > half_width)
21680 ix86_expand_ashl_const (high[0], count - half_width, mode);
21682 else
21684 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21686 if (!rtx_equal_p (operands[0], operands[1]))
21687 emit_move_insn (operands[0], operands[1]);
21689 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21690 ix86_expand_ashl_const (low[0], count, mode);
21692 return;
21695 split_double_mode (mode, operands, 1, low, high);
21697 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21699 if (operands[1] == const1_rtx)
21701 /* Assuming we've chosen QImode-capable registers, 1 << N
21702 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21703 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21705 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21707 ix86_expand_clear (low[0]);
21708 ix86_expand_clear (high[0]);
21709 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21711 d = gen_lowpart (QImode, low[0]);
21712 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21713 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21714 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21716 d = gen_lowpart (QImode, high[0]);
21717 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21718 s = gen_rtx_NE (QImode, flags, const0_rtx);
21719 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21722 /* Otherwise, we can get the same results by manually performing
21723 a bit extract operation on bit 5/6, and then performing the two
21724 shifts. The two methods of getting 0/1 into low/high are exactly
21725 the same size. Avoiding the shift in the bit extract case helps
21726 pentium4 a bit; no one else seems to care much either way. */
21727 else
21729 enum machine_mode half_mode;
21730 rtx (*gen_lshr3)(rtx, rtx, rtx);
21731 rtx (*gen_and3)(rtx, rtx, rtx);
21732 rtx (*gen_xor3)(rtx, rtx, rtx);
21733 HOST_WIDE_INT bits;
21734 rtx x;
21736 if (mode == DImode)
21738 half_mode = SImode;
21739 gen_lshr3 = gen_lshrsi3;
21740 gen_and3 = gen_andsi3;
21741 gen_xor3 = gen_xorsi3;
21742 bits = 5;
21744 else
21746 half_mode = DImode;
21747 gen_lshr3 = gen_lshrdi3;
21748 gen_and3 = gen_anddi3;
21749 gen_xor3 = gen_xordi3;
21750 bits = 6;
21753 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21754 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21755 else
21756 x = gen_lowpart (half_mode, operands[2]);
21757 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21759 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21760 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21761 emit_move_insn (low[0], high[0]);
21762 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21765 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21766 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21767 return;
21770 if (operands[1] == constm1_rtx)
21772 /* For -1 << N, we can avoid the shld instruction, because we
21773 know that we're shifting 0...31/63 ones into a -1. */
21774 emit_move_insn (low[0], constm1_rtx);
21775 if (optimize_insn_for_size_p ())
21776 emit_move_insn (high[0], low[0]);
21777 else
21778 emit_move_insn (high[0], constm1_rtx);
21780 else
21782 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21784 if (!rtx_equal_p (operands[0], operands[1]))
21785 emit_move_insn (operands[0], operands[1]);
21787 split_double_mode (mode, operands, 1, low, high);
21788 emit_insn (gen_shld (high[0], low[0], operands[2]));
21791 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21793 if (TARGET_CMOVE && scratch)
21795 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21796 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21798 ix86_expand_clear (scratch);
21799 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21801 else
21803 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21804 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21806 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21810 void
21811 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21813 rtx (*gen_ashr3)(rtx, rtx, rtx)
21814 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21815 rtx (*gen_shrd)(rtx, rtx, rtx);
21816 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21818 rtx low[2], high[2];
21819 int count;
21821 if (CONST_INT_P (operands[2]))
21823 split_double_mode (mode, operands, 2, low, high);
21824 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21826 if (count == GET_MODE_BITSIZE (mode) - 1)
21828 emit_move_insn (high[0], high[1]);
21829 emit_insn (gen_ashr3 (high[0], high[0],
21830 GEN_INT (half_width - 1)));
21831 emit_move_insn (low[0], high[0]);
21834 else if (count >= half_width)
21836 emit_move_insn (low[0], high[1]);
21837 emit_move_insn (high[0], low[0]);
21838 emit_insn (gen_ashr3 (high[0], high[0],
21839 GEN_INT (half_width - 1)));
21841 if (count > half_width)
21842 emit_insn (gen_ashr3 (low[0], low[0],
21843 GEN_INT (count - half_width)));
21845 else
21847 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21849 if (!rtx_equal_p (operands[0], operands[1]))
21850 emit_move_insn (operands[0], operands[1]);
21852 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21853 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21856 else
21858 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21860 if (!rtx_equal_p (operands[0], operands[1]))
21861 emit_move_insn (operands[0], operands[1]);
21863 split_double_mode (mode, operands, 1, low, high);
21865 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21866 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21868 if (TARGET_CMOVE && scratch)
21870 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21871 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21873 emit_move_insn (scratch, high[0]);
21874 emit_insn (gen_ashr3 (scratch, scratch,
21875 GEN_INT (half_width - 1)));
21876 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21877 scratch));
21879 else
21881 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21882 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21884 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21889 void
21890 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21892 rtx (*gen_lshr3)(rtx, rtx, rtx)
21893 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21894 rtx (*gen_shrd)(rtx, rtx, rtx);
21895 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21897 rtx low[2], high[2];
21898 int count;
21900 if (CONST_INT_P (operands[2]))
21902 split_double_mode (mode, operands, 2, low, high);
21903 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21905 if (count >= half_width)
21907 emit_move_insn (low[0], high[1]);
21908 ix86_expand_clear (high[0]);
21910 if (count > half_width)
21911 emit_insn (gen_lshr3 (low[0], low[0],
21912 GEN_INT (count - half_width)));
21914 else
21916 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21918 if (!rtx_equal_p (operands[0], operands[1]))
21919 emit_move_insn (operands[0], operands[1]);
21921 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21922 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21925 else
21927 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21929 if (!rtx_equal_p (operands[0], operands[1]))
21930 emit_move_insn (operands[0], operands[1]);
21932 split_double_mode (mode, operands, 1, low, high);
21934 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21935 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21937 if (TARGET_CMOVE && scratch)
21939 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21940 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21942 ix86_expand_clear (scratch);
21943 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21944 scratch));
21946 else
21948 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21949 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21951 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21956 /* Predict just emitted jump instruction to be taken with probability PROB. */
21957 static void
21958 predict_jump (int prob)
21960 rtx insn = get_last_insn ();
21961 gcc_assert (JUMP_P (insn));
21962 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21965 /* Helper function for the string operations below. Test VARIABLE for whether
21966 it is aligned to VALUE bytes. If it is, jump to the label. */
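/* Usage sketch (illustrative): ix86_expand_aligntest (destptr, 4, false)
   masks DESTPTR with 4 into a scratch register and emits a jump to the
   returned label taken when that bit is clear, so a caller can place a
   4-byte copy or store before emitting the label and have it executed
   only when the alignment bit is set. */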
21967 static rtx
21968 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21970 rtx label = gen_label_rtx ();
21971 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21972 if (GET_MODE (variable) == DImode)
21973 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21974 else
21975 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21976 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21977 1, label);
21978 if (epilogue)
21979 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21980 else
21981 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21982 return label;
21985 /* Decrement COUNTREG by VALUE. */
21986 static void
21987 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21989 rtx (*gen_add)(rtx, rtx, rtx)
21990 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21992 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21995 /* Zero extend a possibly SImode EXP to a Pmode register. */
21996 rtx
21997 ix86_zero_extend_to_Pmode (rtx exp)
21999 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22002 /* Divide COUNTREG by SCALE. */
22003 static rtx
22004 scale_counter (rtx countreg, int scale)
22006 rtx sc;
22008 if (scale == 1)
22009 return countreg;
22010 if (CONST_INT_P (countreg))
22011 return GEN_INT (INTVAL (countreg) / scale);
22012 gcc_assert (REG_P (countreg));
22014 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22015 GEN_INT (exact_log2 (scale)),
22016 NULL, 1, OPTAB_DIRECT);
22017 return sc;
22020 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22021 DImode for constant loop counts. */
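/* Example (illustrative): a count already held in a DImode register keeps
   DImode, a constant count that fits in 32 bits uses SImode, and larger
   constants on a 64-bit target use DImode. */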
22023 static enum machine_mode
22024 counter_mode (rtx count_exp)
22026 if (GET_MODE (count_exp) != VOIDmode)
22027 return GET_MODE (count_exp);
22028 if (!CONST_INT_P (count_exp))
22029 return Pmode;
22030 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22031 return DImode;
22032 return SImode;
22035 /* When SRCPTR is non-NULL, output a simple loop to move memory
22036 pointed to by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times;
22037 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
22038 the equivalent loop to set memory to VALUE (assumed to be in MODE).
22040 The size is rounded down to a whole number of chunks moved at once.
22041 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
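/* Rough shape of the emitted code (an illustrative sketch, assuming a plain
   copy; the set case stores VALUE instead of loading from SRCMEM):

       size = count & -chunk;      with chunk = GET_MODE_SIZE (mode) * unroll
       iter = 0;
     top:
       UNROLL loads into temporaries followed by UNROLL stores
       iter += chunk;
       if (iter < size) goto top;
       destptr += iter;            and likewise srcptr when copying   */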
22044 static void
22045 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22046 rtx destptr, rtx srcptr, rtx value,
22047 rtx count, enum machine_mode mode, int unroll,
22048 int expected_size)
22050 rtx out_label, top_label, iter, tmp;
22051 enum machine_mode iter_mode = counter_mode (count);
22052 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22053 rtx piece_size = GEN_INT (piece_size_n);
22054 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22055 rtx size;
22056 int i;
22058 top_label = gen_label_rtx ();
22059 out_label = gen_label_rtx ();
22060 iter = gen_reg_rtx (iter_mode);
22062 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22063 NULL, 1, OPTAB_DIRECT);
22064 /* Those two should combine. */
22065 if (piece_size == const1_rtx)
22067 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22068 true, out_label);
22069 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22071 emit_move_insn (iter, const0_rtx);
22073 emit_label (top_label);
22075 tmp = convert_modes (Pmode, iter_mode, iter, true);
22077 /* This assert could be relaxed - in that case we'd need to compute the
22078 smallest power of two containing PIECE_SIZE_N and pass it to
22079 offset_address. */
22080 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22081 destmem = offset_address (destmem, tmp, piece_size_n);
22082 destmem = adjust_address (destmem, mode, 0);
22084 if (srcmem)
22086 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22087 srcmem = adjust_address (srcmem, mode, 0);
22089 /* When unrolling for chips that reorder memory reads and writes,
22090 we can save registers by using a single temporary.
22091 Using 4 temporaries is also overkill in 32-bit mode. */
22092 if (!TARGET_64BIT && 0)
22094 for (i = 0; i < unroll; i++)
22096 if (i)
22098 destmem =
22099 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22100 srcmem =
22101 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22103 emit_move_insn (destmem, srcmem);
22106 else
22108 rtx tmpreg[4];
22109 gcc_assert (unroll <= 4);
22110 for (i = 0; i < unroll; i++)
22112 tmpreg[i] = gen_reg_rtx (mode);
22113 if (i)
22115 srcmem =
22116 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22118 emit_move_insn (tmpreg[i], srcmem);
22120 for (i = 0; i < unroll; i++)
22122 if (i)
22124 destmem =
22125 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22127 emit_move_insn (destmem, tmpreg[i]);
22131 else
22132 for (i = 0; i < unroll; i++)
22134 if (i)
22135 destmem =
22136 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22137 emit_move_insn (destmem, value);
22140 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22141 true, OPTAB_LIB_WIDEN);
22142 if (tmp != iter)
22143 emit_move_insn (iter, tmp);
22145 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22146 true, top_label);
22147 if (expected_size != -1)
22149 expected_size /= GET_MODE_SIZE (mode) * unroll;
22150 if (expected_size == 0)
22151 predict_jump (0);
22152 else if (expected_size > REG_BR_PROB_BASE)
22153 predict_jump (REG_BR_PROB_BASE - 1);
22154 else
22155 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22157 else
22158 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22159 iter = ix86_zero_extend_to_Pmode (iter);
22160 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22161 true, OPTAB_LIB_WIDEN);
22162 if (tmp != destptr)
22163 emit_move_insn (destptr, tmp);
22164 if (srcptr)
22166 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22167 true, OPTAB_LIB_WIDEN);
22168 if (tmp != srcptr)
22169 emit_move_insn (srcptr, tmp);
22171 emit_label (out_label);
22174 /* Output "rep; mov" instruction.
22175 Arguments have same meaning as for previous function */
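/* Illustrative example (not from the original source): copying a known
   64-byte block with SImode chunks scales COUNT down to 16 and emits the
   equivalent of "mov ecx, 16; rep movsd", with DESTEXP and SRCEXP
   describing the final pointer values for the benefit of dataflow. */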
22176 static void
22177 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
22178 rtx destptr, rtx srcptr,
22179 rtx count,
22180 enum machine_mode mode)
22182 rtx destexp;
22183 rtx srcexp;
22184 rtx countreg;
22185 HOST_WIDE_INT rounded_count;
22187 /* If the size is known, it is shorter to use rep movs. */
22188 if (mode == QImode && CONST_INT_P (count)
22189 && !(INTVAL (count) & 3))
22190 mode = SImode;
22192 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22193 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22194 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22195 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22196 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22197 if (mode != QImode)
22199 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22200 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22201 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22202 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22203 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22204 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22206 else
22208 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22209 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22211 if (CONST_INT_P (count))
22213 rounded_count = (INTVAL (count)
22214 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22215 destmem = shallow_copy_rtx (destmem);
22216 srcmem = shallow_copy_rtx (srcmem);
22217 set_mem_size (destmem, rounded_count);
22218 set_mem_size (srcmem, rounded_count);
22220 else
22222 if (MEM_SIZE_KNOWN_P (destmem))
22223 clear_mem_size (destmem);
22224 if (MEM_SIZE_KNOWN_P (srcmem))
22225 clear_mem_size (srcmem);
22227 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22228 destexp, srcexp));
22231 /* Output "rep; stos" instruction.
22232 Arguments have same meaning as for previous function */
22233 static void
22234 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22235 rtx count, enum machine_mode mode,
22236 rtx orig_value)
22238 rtx destexp;
22239 rtx countreg;
22240 HOST_WIDE_INT rounded_count;
22242 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22243 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22244 value = force_reg (mode, gen_lowpart (mode, value));
22245 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22246 if (mode != QImode)
22248 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22249 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22250 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22252 else
22253 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22254 if (orig_value == const0_rtx && CONST_INT_P (count))
22256 rounded_count = (INTVAL (count)
22257 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22258 destmem = shallow_copy_rtx (destmem);
22259 set_mem_size (destmem, rounded_count);
22261 else if (MEM_SIZE_KNOWN_P (destmem))
22262 clear_mem_size (destmem);
22263 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22266 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22267 DESTMEM.
22268 SRC is passed by pointer to be updated on return.
22269 Return value is updated DST. */
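/* Illustrative example (an assumption about a typical 64-bit SSE target):
   for SIZE_TO_MOVE == 16 the widest integer candidate is TImode, which is
   then replaced by the equally wide V2DImode vector move when that mov
   pattern exists; otherwise the code falls back to word_mode and emits two
   8-byte register-buffered load/store pairs instead of one 16-byte pair,
   bumping DESTPTR and SRCPTR after each piece. */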
22270 static rtx
22271 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22272 HOST_WIDE_INT size_to_move)
22274 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22275 enum insn_code code;
22276 enum machine_mode move_mode;
22277 int piece_size, i;
22279 /* Find the widest mode in which we could perform moves.
22280 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
22281 halve it until a move of that size is supported. */
22282 piece_size = 1 << floor_log2 (size_to_move);
22283 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22284 code = optab_handler (mov_optab, move_mode);
22285 while (code == CODE_FOR_nothing && piece_size > 1)
22287 piece_size >>= 1;
22288 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22289 code = optab_handler (mov_optab, move_mode);
22292 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22293 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22294 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22296 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22297 move_mode = mode_for_vector (word_mode, nunits);
22298 code = optab_handler (mov_optab, move_mode);
22299 if (code == CODE_FOR_nothing)
22301 move_mode = word_mode;
22302 piece_size = GET_MODE_SIZE (move_mode);
22303 code = optab_handler (mov_optab, move_mode);
22306 gcc_assert (code != CODE_FOR_nothing);
22308 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22309 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22311 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22312 gcc_assert (size_to_move % piece_size == 0);
22313 adjust = GEN_INT (piece_size);
22314 for (i = 0; i < size_to_move; i += piece_size)
22316 /* We move from memory to memory, so we'll need to do it via
22317 a temporary register. */
22318 tempreg = gen_reg_rtx (move_mode);
22319 emit_insn (GEN_FCN (code) (tempreg, src));
22320 emit_insn (GEN_FCN (code) (dst, tempreg));
22322 emit_move_insn (destptr,
22323 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22324 emit_move_insn (srcptr,
22325 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22327 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22328 piece_size);
22329 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22330 piece_size);
22333 /* Update DST and SRC rtx. */
22334 *srcmem = src;
22335 return dst;
22338 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
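/* Illustrative example (not part of the original comment): with
   MAX_SIZE == 16 and a known COUNT of 29 the epilogue must copy
   29 % 16 == 13 bytes, which the constant path below emits as an 8-byte,
   a 4-byte and a 1-byte move. */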
22339 static void
22340 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22341 rtx destptr, rtx srcptr, rtx count, int max_size)
22343 rtx src, dest;
22344 if (CONST_INT_P (count))
22346 HOST_WIDE_INT countval = INTVAL (count);
22347 HOST_WIDE_INT epilogue_size = countval % max_size;
22348 int i;
22350 /* For now MAX_SIZE should be a power of 2. This assert could be
22351 relaxed, but it would require a somewhat more complicated epilogue
22352 expansion. */
22353 gcc_assert ((max_size & (max_size - 1)) == 0);
22354 for (i = max_size; i >= 1; i >>= 1)
22356 if (epilogue_size & i)
22357 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22359 return;
22361 if (max_size > 8)
22363 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22364 count, 1, OPTAB_DIRECT);
22365 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22366 count, QImode, 1, 4);
22367 return;
22370 /* When there are stringops, we can cheaply increase dest and src pointers.
22371 Otherwise we save code size by maintaining offset (zero is readily
22372 available from preceding rep operation) and using x86 addressing modes. */
22374 if (TARGET_SINGLE_STRINGOP)
22376 if (max_size > 4)
22378 rtx label = ix86_expand_aligntest (count, 4, true);
22379 src = change_address (srcmem, SImode, srcptr);
22380 dest = change_address (destmem, SImode, destptr);
22381 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22382 emit_label (label);
22383 LABEL_NUSES (label) = 1;
22385 if (max_size > 2)
22387 rtx label = ix86_expand_aligntest (count, 2, true);
22388 src = change_address (srcmem, HImode, srcptr);
22389 dest = change_address (destmem, HImode, destptr);
22390 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22391 emit_label (label);
22392 LABEL_NUSES (label) = 1;
22394 if (max_size > 1)
22396 rtx label = ix86_expand_aligntest (count, 1, true);
22397 src = change_address (srcmem, QImode, srcptr);
22398 dest = change_address (destmem, QImode, destptr);
22399 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22400 emit_label (label);
22401 LABEL_NUSES (label) = 1;
22404 else
22406 rtx offset = force_reg (Pmode, const0_rtx);
22407 rtx tmp;
22409 if (max_size > 4)
22411 rtx label = ix86_expand_aligntest (count, 4, true);
22412 src = change_address (srcmem, SImode, srcptr);
22413 dest = change_address (destmem, SImode, destptr);
22414 emit_move_insn (dest, src);
22415 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22416 true, OPTAB_LIB_WIDEN);
22417 if (tmp != offset)
22418 emit_move_insn (offset, tmp);
22419 emit_label (label);
22420 LABEL_NUSES (label) = 1;
22422 if (max_size > 2)
22424 rtx label = ix86_expand_aligntest (count, 2, true);
22425 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22426 src = change_address (srcmem, HImode, tmp);
22427 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22428 dest = change_address (destmem, HImode, tmp);
22429 emit_move_insn (dest, src);
22430 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22431 true, OPTAB_LIB_WIDEN);
22432 if (tmp != offset)
22433 emit_move_insn (offset, tmp);
22434 emit_label (label);
22435 LABEL_NUSES (label) = 1;
22437 if (max_size > 1)
22439 rtx label = ix86_expand_aligntest (count, 1, true);
22440 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22441 src = change_address (srcmem, QImode, tmp);
22442 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22443 dest = change_address (destmem, QImode, tmp);
22444 emit_move_insn (dest, src);
22445 emit_label (label);
22446 LABEL_NUSES (label) = 1;
22451 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22452 static void
22453 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22454 rtx count, int max_size)
22456 count =
22457 expand_simple_binop (counter_mode (count), AND, count,
22458 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22459 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22460 gen_lowpart (QImode, value), count, QImode,
22461 1, max_size / 2);
22464 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22465 static void
22466 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22468 rtx dest;
22470 if (CONST_INT_P (count))
22472 HOST_WIDE_INT countval = INTVAL (count);
22473 int offset = 0;
22475 if ((countval & 0x10) && max_size > 16)
22477 if (TARGET_64BIT)
22479 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22480 emit_insn (gen_strset (destptr, dest, value));
22481 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22482 emit_insn (gen_strset (destptr, dest, value));
22484 else
22485 gcc_unreachable ();
22486 offset += 16;
22488 if ((countval & 0x08) && max_size > 8)
22490 if (TARGET_64BIT)
22492 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22493 emit_insn (gen_strset (destptr, dest, value));
22495 else
22497 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22498 emit_insn (gen_strset (destptr, dest, value));
22499 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22500 emit_insn (gen_strset (destptr, dest, value));
22502 offset += 8;
22504 if ((countval & 0x04) && max_size > 4)
22506 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22507 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22508 offset += 4;
22510 if ((countval & 0x02) && max_size > 2)
22512 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22513 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22514 offset += 2;
22516 if ((countval & 0x01) && max_size > 1)
22518 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22519 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22520 offset += 1;
22522 return;
22524 if (max_size > 32)
22526 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22527 return;
22529 if (max_size > 16)
22531 rtx label = ix86_expand_aligntest (count, 16, true);
22532 if (TARGET_64BIT)
22534 dest = change_address (destmem, DImode, destptr);
22535 emit_insn (gen_strset (destptr, dest, value));
22536 emit_insn (gen_strset (destptr, dest, value));
22538 else
22540 dest = change_address (destmem, SImode, destptr);
22541 emit_insn (gen_strset (destptr, dest, value));
22542 emit_insn (gen_strset (destptr, dest, value));
22543 emit_insn (gen_strset (destptr, dest, value));
22544 emit_insn (gen_strset (destptr, dest, value));
22546 emit_label (label);
22547 LABEL_NUSES (label) = 1;
22549 if (max_size > 8)
22551 rtx label = ix86_expand_aligntest (count, 8, true);
22552 if (TARGET_64BIT)
22554 dest = change_address (destmem, DImode, destptr);
22555 emit_insn (gen_strset (destptr, dest, value));
22557 else
22559 dest = change_address (destmem, SImode, destptr);
22560 emit_insn (gen_strset (destptr, dest, value));
22561 emit_insn (gen_strset (destptr, dest, value));
22563 emit_label (label);
22564 LABEL_NUSES (label) = 1;
22566 if (max_size > 4)
22568 rtx label = ix86_expand_aligntest (count, 4, true);
22569 dest = change_address (destmem, SImode, destptr);
22570 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22571 emit_label (label);
22572 LABEL_NUSES (label) = 1;
22574 if (max_size > 2)
22576 rtx label = ix86_expand_aligntest (count, 2, true);
22577 dest = change_address (destmem, HImode, destptr);
22578 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22579 emit_label (label);
22580 LABEL_NUSES (label) = 1;
22582 if (max_size > 1)
22584 rtx label = ix86_expand_aligntest (count, 1, true);
22585 dest = change_address (destmem, QImode, destptr);
22586 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22587 emit_label (label);
22588 LABEL_NUSES (label) = 1;
22592 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN, to
22593 DESIRED_ALIGNMENT.
22594 The return value is the updated DESTMEM. */
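/* Illustrative example (not from the original source): raising a destination
   of unknown 1-byte alignment to an 8-byte DESIRED_ALIGNMENT emits three
   guarded copies (1 byte when bit 0 of DESTPTR is set, 2 bytes for bit 1,
   4 bytes for bit 2), each followed by the matching adjustment of COUNT. */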
22595 static rtx
22596 expand_movmem_prologue (rtx destmem, rtx srcmem,
22597 rtx destptr, rtx srcptr, rtx count,
22598 int align, int desired_alignment)
22600 int i;
22601 for (i = 1; i < desired_alignment; i <<= 1)
22603 if (align <= i)
22605 rtx label = ix86_expand_aligntest (destptr, i, false);
22606 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22607 ix86_adjust_counter (count, i);
22608 emit_label (label);
22609 LABEL_NUSES (label) = 1;
22610 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22613 return destmem;
22616 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22617 ALIGN_BYTES is how many bytes need to be copied.
22618 The function updates DST and SRC, namely, it sets the proper alignment on them.
22619 DST is returned via the return value, SRC is updated via the pointer SRCP. */
22620 static rtx
22621 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22622 int desired_align, int align_bytes)
22624 rtx src = *srcp;
22625 rtx orig_dst = dst;
22626 rtx orig_src = src;
22627 int piece_size = 1;
22628 int copied_bytes = 0;
22629 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22630 if (src_align_bytes >= 0)
22631 src_align_bytes = desired_align - src_align_bytes;
22633 for (piece_size = 1;
22634 piece_size <= desired_align && copied_bytes < align_bytes;
22635 piece_size <<= 1)
22637 if (align_bytes & piece_size)
22639 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
22640 copied_bytes += piece_size;
22644 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22645 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22646 if (src_align_bytes >= 0)
22648 unsigned int src_align;
22649 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
22651 if ((src_align_bytes & (src_align - 1))
22652 == (align_bytes & (src_align - 1)))
22653 break;
22655 if (src_align > (unsigned int) desired_align)
22656 src_align = desired_align;
22657 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22658 set_mem_align (src, src_align * BITS_PER_UNIT);
22660 if (MEM_SIZE_KNOWN_P (orig_dst))
22661 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22662 if (MEM_SIZE_KNOWN_P (orig_src))
22663 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22664 *srcp = src;
22665 return dst;
22668 /* Store enough into DEST to align DEST, known to be aligned by ALIGN, to
22669 DESIRED_ALIGNMENT. */
22670 static void
22671 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22672 int align, int desired_alignment)
22674 if (align <= 1 && desired_alignment > 1)
22676 rtx label = ix86_expand_aligntest (destptr, 1, false);
22677 destmem = change_address (destmem, QImode, destptr);
22678 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22679 ix86_adjust_counter (count, 1);
22680 emit_label (label);
22681 LABEL_NUSES (label) = 1;
22683 if (align <= 2 && desired_alignment > 2)
22685 rtx label = ix86_expand_aligntest (destptr, 2, false);
22686 destmem = change_address (destmem, HImode, destptr);
22687 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22688 ix86_adjust_counter (count, 2);
22689 emit_label (label);
22690 LABEL_NUSES (label) = 1;
22692 if (align <= 4 && desired_alignment > 4)
22694 rtx label = ix86_expand_aligntest (destptr, 4, false);
22695 destmem = change_address (destmem, SImode, destptr);
22696 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22697 ix86_adjust_counter (count, 4);
22698 emit_label (label);
22699 LABEL_NUSES (label) = 1;
22701 gcc_assert (desired_alignment <= 8);
22704 /* Store enough into DST to align DST, known to be aligned by ALIGN, to
22705 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22706 static rtx
22707 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22708 int desired_align, int align_bytes)
22710 int off = 0;
22711 rtx orig_dst = dst;
22712 if (align_bytes & 1)
22714 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22715 off = 1;
22716 emit_insn (gen_strset (destreg, dst,
22717 gen_lowpart (QImode, value)));
22719 if (align_bytes & 2)
22721 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22722 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22723 set_mem_align (dst, 2 * BITS_PER_UNIT);
22724 off = 2;
22725 emit_insn (gen_strset (destreg, dst,
22726 gen_lowpart (HImode, value)));
22728 if (align_bytes & 4)
22730 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22731 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22732 set_mem_align (dst, 4 * BITS_PER_UNIT);
22733 off = 4;
22734 emit_insn (gen_strset (destreg, dst,
22735 gen_lowpart (SImode, value)));
22737 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22738 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22739 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22740 if (MEM_SIZE_KNOWN_P (orig_dst))
22741 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22742 return dst;
22745 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
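/* Illustrative example (an assumption about typical settings, not a
   statement about any particular CPU): when optimizing for size, a constant
   COUNT that is a multiple of 4 selects rep_prefix_4_byte and any other
   count selects rep_prefix_1_byte, provided the registers the rep prefix
   needs have not been fixed by the user; when optimizing for speed the
   per-size table of the active processor_costs entry decides. */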
22746 static enum stringop_alg
22747 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22748 int *dynamic_check, bool *noalign)
22750 const struct stringop_algs * algs;
22751 bool optimize_for_speed;
22752 /* Algorithms using the rep prefix want at least edi and ecx;
22753 additionally, memset wants eax and memcpy wants esi. Don't
22754 consider such algorithms if the user has appropriated those
22755 registers for their own purposes. */
22756 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22757 || (memset
22758 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22759 *noalign = false;
22761 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22762 || (alg != rep_prefix_1_byte \
22763 && alg != rep_prefix_4_byte \
22764 && alg != rep_prefix_8_byte))
22765 const struct processor_costs *cost;
22767 /* Even if the string operation call is cold, we still might spend a lot
22768 of time processing large blocks. */
22769 if (optimize_function_for_size_p (cfun)
22770 || (optimize_insn_for_size_p ()
22771 && expected_size != -1 && expected_size < 256))
22772 optimize_for_speed = false;
22773 else
22774 optimize_for_speed = true;
22776 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22778 *dynamic_check = -1;
22779 if (memset)
22780 algs = &cost->memset[TARGET_64BIT != 0];
22781 else
22782 algs = &cost->memcpy[TARGET_64BIT != 0];
22783 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22784 return ix86_stringop_alg;
22785 /* rep; movq or rep; movl is the smallest variant. */
22786 else if (!optimize_for_speed)
22788 if (!count || (count & 3))
22789 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22790 else
22791 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22793 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22795 else if (expected_size != -1 && expected_size < 4)
22796 return loop_1_byte;
22797 else if (expected_size != -1)
22799 unsigned int i;
22800 enum stringop_alg alg = libcall;
22801 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22803 /* We get here if the algorithms that were not libcall-based
22804 were rep-prefix based and we are unable to use rep prefixes
22805 based on global register usage. Break out of the loop and
22806 use the heuristic below. */
22807 if (algs->size[i].max == 0)
22808 break;
22809 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22811 enum stringop_alg candidate = algs->size[i].alg;
22813 if (candidate != libcall && ALG_USABLE_P (candidate))
22814 alg = candidate;
22815 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22816 last non-libcall inline algorithm. */
22817 if (TARGET_INLINE_ALL_STRINGOPS)
22819 /* When the current size is best to be copied by a libcall,
22820 but we are still forced to inline, run the heuristic below
22821 that will pick code for medium sized blocks. */
22822 if (alg != libcall)
22823 return alg;
22824 break;
22826 else if (ALG_USABLE_P (candidate))
22828 *noalign = algs->size[i].noalign;
22829 return candidate;
22833 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22835 /* When asked to inline the call anyway, try to pick a meaningful choice.
22836 We look for the maximal size of block that is faster to copy by hand and
22837 take blocks of at most that size, guessing that the average size will
22838 be roughly half of the block.
22840 If this turns out to be bad, we might simply specify the preferred
22841 choice in ix86_costs. */
22842 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22843 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22845 int max = -1;
22846 enum stringop_alg alg;
22847 int i;
22848 bool any_alg_usable_p = true;
22850 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22852 enum stringop_alg candidate = algs->size[i].alg;
22853 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22855 if (candidate != libcall && candidate
22856 && ALG_USABLE_P (candidate))
22857 max = algs->size[i].max;
22859 /* If there aren't any usable algorithms, then recursing on
22860 smaller sizes isn't going to find anything. Just return the
22861 simple byte-at-a-time copy loop. */
22862 if (!any_alg_usable_p)
22864 /* Pick something reasonable. */
22865 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22866 *dynamic_check = 128;
22867 return loop_1_byte;
22869 if (max == -1)
22870 max = 4096;
22871 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22872 gcc_assert (*dynamic_check == -1);
22873 gcc_assert (alg != libcall);
22874 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22875 *dynamic_check = max;
22876 return alg;
22878 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22879 #undef ALG_USABLE_P
22882 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22883 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22884 static int
22885 decide_alignment (int align,
22886 enum stringop_alg alg,
22887 int expected_size,
22888 enum machine_mode move_mode)
22890 int desired_align = 0;
22892 gcc_assert (alg != no_stringop);
22894 if (alg == libcall)
22895 return 0;
22896 if (move_mode == VOIDmode)
22897 return 0;
22899 desired_align = GET_MODE_SIZE (move_mode);
22900 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22901 copying a whole cache line at once. */
22902 if (TARGET_PENTIUMPRO
22903 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
22904 desired_align = 8;
22906 if (optimize_size)
22907 desired_align = 1;
22908 if (desired_align < align)
22909 desired_align = align;
22910 if (expected_size != -1 && expected_size < 4)
22911 desired_align = align;
22913 return desired_align;
22916 /* Expand string move (memcpy) operation. Use i386 string operations
22917 when profitable. expand_setmem contains similar code. The code
22918 depends upon architecture, block size and alignment, but always has
22919 the same overall structure:
22921 1) Prologue guard: Conditional that jumps up to epilogues for small
22922 blocks that can be handled by the epilogue alone. This is faster
22923 but also needed for correctness, since the prologue assumes the block
22924 is larger than the desired alignment.
22926 An optional dynamic check for the size, with a libcall for large
22927 blocks, is emitted here too when -minline-stringops-dynamically is used.
22929 2) Prologue: copy first few bytes in order to get destination
22930 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22931 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22932 copied. We emit either a jump tree, handling power-of-two sized
22933 blocks, or a byte loop.
22935 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22936 with specified algorithm.
22938 4) Epilogue: code copying tail of the block that is too small to be
22939 handled by main body (or up to size guarded by prologue guard). */
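/* Added illustration, not part of the original source: for a memcpy with an
   unknown count, ALIGN == 1 and ALG == rep_prefix_8_byte, both SIZE_NEEDED
   and EPILOGUE_SIZE_NEEDED end up as 8, so the expansion is (1) a guard that
   jumps to the epilogue when the count is below 8, (2) a prologue copying up
   to 7 bytes to align the destination to 8, (3) a rep movsq main body, and
   (4) an epilogue copying the remaining count & 7 bytes.  */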
22941 bool
22942 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22943 rtx expected_align_exp, rtx expected_size_exp)
22945 rtx destreg;
22946 rtx srcreg;
22947 rtx label = NULL;
22948 rtx tmp;
22949 rtx jump_around_label = NULL;
22950 HOST_WIDE_INT align = 1;
22951 unsigned HOST_WIDE_INT count = 0;
22952 HOST_WIDE_INT expected_size = -1;
22953 int size_needed = 0, epilogue_size_needed;
22954 int desired_align = 0, align_bytes = 0;
22955 enum stringop_alg alg;
22956 int dynamic_check;
22957 bool need_zero_guard = false;
22958 bool noalign;
22959 enum machine_mode move_mode = VOIDmode;
22960 int unroll_factor = 1;
22962 if (CONST_INT_P (align_exp))
22963 align = INTVAL (align_exp);
22964 /* i386 can do misaligned access at a reasonably increased cost. */
22965 if (CONST_INT_P (expected_align_exp)
22966 && INTVAL (expected_align_exp) > align)
22967 align = INTVAL (expected_align_exp);
22968 /* ALIGN is the minimum of destination and source alignment, but we care here
22969 just about destination alignment. */
22970 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22971 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22973 if (CONST_INT_P (count_exp))
22974 count = expected_size = INTVAL (count_exp);
22975 if (CONST_INT_P (expected_size_exp) && count == 0)
22976 expected_size = INTVAL (expected_size_exp);
22978 /* Make sure we don't need to care about overflow later on. */
22979 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22980 return false;
22982 /* Step 0: Decide on preferred algorithm, desired alignment and
22983 size of chunks to be copied by main loop. */
22984 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22985 if (alg == libcall)
22986 return false;
22987 gcc_assert (alg != no_stringop);
22989 if (!count)
22990 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22991 destreg = copy_addr_to_reg (XEXP (dst, 0));
22992 srcreg = copy_addr_to_reg (XEXP (src, 0));
22994 unroll_factor = 1;
22995 move_mode = word_mode;
22996 switch (alg)
22998 case libcall:
22999 case no_stringop:
23000 case last_alg:
23001 gcc_unreachable ();
23002 case loop_1_byte:
23003 need_zero_guard = true;
23004 move_mode = QImode;
23005 break;
23006 case loop:
23007 need_zero_guard = true;
23008 break;
23009 case unrolled_loop:
23010 need_zero_guard = true;
23011 unroll_factor = (TARGET_64BIT ? 4 : 2);
23012 break;
23013 case vector_loop:
23014 need_zero_guard = true;
23015 unroll_factor = 4;
23016 /* Find the widest supported mode. */
23017 move_mode = word_mode;
23018 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23019 != CODE_FOR_nothing)
23020 move_mode = GET_MODE_WIDER_MODE (move_mode);
23022 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23023 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23024 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23026 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23027 move_mode = mode_for_vector (word_mode, nunits);
23028 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23029 move_mode = word_mode;
23031 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23032 break;
23033 case rep_prefix_8_byte:
23034 move_mode = DImode;
23035 break;
23036 case rep_prefix_4_byte:
23037 move_mode = SImode;
23038 break;
23039 case rep_prefix_1_byte:
23040 move_mode = QImode;
23041 break;
23043 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23044 epilogue_size_needed = size_needed;
23046 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23047 if (!TARGET_ALIGN_STRINGOPS || noalign)
23048 align = desired_align;
23050 /* Step 1: Prologue guard. */
23052 /* Alignment code needs count to be in register. */
23053 if (CONST_INT_P (count_exp) && desired_align > align)
23055 if (INTVAL (count_exp) > desired_align
23056 && INTVAL (count_exp) > size_needed)
23058 align_bytes
23059 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23060 if (align_bytes <= 0)
23061 align_bytes = 0;
23062 else
23063 align_bytes = desired_align - align_bytes;
23065 if (align_bytes == 0)
23066 count_exp = force_reg (counter_mode (count_exp), count_exp);
23068 gcc_assert (desired_align >= 1 && align >= 1);
23070 /* Ensure that alignment prologue won't copy past end of block. */
23071 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23073 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23074 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23075 Make sure it is a power of 2. */
23076 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
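/* Added illustration, not part of the original source: with SIZE_NEEDED == 8,
   DESIRED_ALIGN == 8 and ALIGN == 1 the MAX above is 7, and the shift rounds
   it up to the next power of two, 8, so the epilogue later masks the count
   with EPILOGUE_SIZE_NEEDED - 1 == 7.  */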
23078 if (count)
23080 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23082 /* If main algorithm works on QImode, no epilogue is needed.
23083 For small sizes just don't align anything. */
23084 if (size_needed == 1)
23085 desired_align = align;
23086 else
23087 goto epilogue;
23090 else
23092 label = gen_label_rtx ();
23093 emit_cmp_and_jump_insns (count_exp,
23094 GEN_INT (epilogue_size_needed),
23095 LTU, 0, counter_mode (count_exp), 1, label);
23096 if (expected_size == -1 || expected_size < epilogue_size_needed)
23097 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23098 else
23099 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23103 /* Emit code to decide at runtime whether a library call or inline code should
23104 be used. */
23105 if (dynamic_check != -1)
23107 if (CONST_INT_P (count_exp))
23109 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23111 emit_block_move_via_libcall (dst, src, count_exp, false);
23112 count_exp = const0_rtx;
23113 goto epilogue;
23116 else
23118 rtx hot_label = gen_label_rtx ();
23119 jump_around_label = gen_label_rtx ();
23120 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23121 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23122 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23123 emit_block_move_via_libcall (dst, src, count_exp, false);
23124 emit_jump (jump_around_label);
23125 emit_label (hot_label);
23129 /* Step 2: Alignment prologue. */
23131 if (desired_align > align)
23133 if (align_bytes == 0)
23135 /* Except for the first move in the epilogue, we no longer know
23136 the constant offset in the aliasing info. It does not seem worth
23137 the pain to maintain it for the first move, so throw away
23138 the info early. */
23139 src = change_address (src, BLKmode, srcreg);
23140 dst = change_address (dst, BLKmode, destreg);
23141 dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
23142 desired_align);
23144 else
23146 /* If we know how many bytes need to be stored before dst is
23147 sufficiently aligned, maintain aliasing info accurately. */
23148 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
23149 desired_align, align_bytes);
23150 count_exp = plus_constant (counter_mode (count_exp),
23151 count_exp, -align_bytes);
23152 count -= align_bytes;
23154 if (need_zero_guard
23155 && (count < (unsigned HOST_WIDE_INT) size_needed
23156 || (align_bytes == 0
23157 && count < ((unsigned HOST_WIDE_INT) size_needed
23158 + desired_align - align))))
23160 /* It is possible that we copied enough so the main loop will not
23161 execute. */
23162 gcc_assert (size_needed > 1);
23163 if (label == NULL_RTX)
23164 label = gen_label_rtx ();
23165 emit_cmp_and_jump_insns (count_exp,
23166 GEN_INT (size_needed),
23167 LTU, 0, counter_mode (count_exp), 1, label);
23168 if (expected_size == -1
23169 || expected_size < (desired_align - align) / 2 + size_needed)
23170 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23171 else
23172 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23175 if (label && size_needed == 1)
23177 emit_label (label);
23178 LABEL_NUSES (label) = 1;
23179 label = NULL;
23180 epilogue_size_needed = 1;
23182 else if (label == NULL_RTX)
23183 epilogue_size_needed = size_needed;
23185 /* Step 3: Main loop. */
23187 switch (alg)
23189 case libcall:
23190 case no_stringop:
23191 case last_alg:
23192 gcc_unreachable ();
23193 case loop_1_byte:
23194 case loop:
23195 case unrolled_loop:
23196 case vector_loop:
23197 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23198 count_exp, move_mode, unroll_factor,
23199 expected_size);
23200 break;
23201 case rep_prefix_8_byte:
23202 case rep_prefix_4_byte:
23203 case rep_prefix_1_byte:
23204 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23205 move_mode);
23206 break;
23208 /* Properly adjust the offsets of src and dest memory for aliasing. */
23209 if (CONST_INT_P (count_exp))
23211 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23212 (count / size_needed) * size_needed);
23213 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23214 (count / size_needed) * size_needed);
23216 else
23218 src = change_address (src, BLKmode, srcreg);
23219 dst = change_address (dst, BLKmode, destreg);
23222 /* Step 4: Epilogue to copy the remaining bytes. */
23223 epilogue:
23224 if (label)
23226 /* When the main loop is done, COUNT_EXP might hold the original count,
23227 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23228 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23229 bytes. Compensate if needed. */
23231 if (size_needed < epilogue_size_needed)
23233 tmp =
23234 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23235 GEN_INT (size_needed - 1), count_exp, 1,
23236 OPTAB_DIRECT);
23237 if (tmp != count_exp)
23238 emit_move_insn (count_exp, tmp);
23240 emit_label (label);
23241 LABEL_NUSES (label) = 1;
23244 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23245 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23246 epilogue_size_needed);
23247 if (jump_around_label)
23248 emit_label (jump_around_label);
23249 return true;
23252 /* Helper function for memset (setmem). For a QImode value 0xXY produce
23253 0xXYXYXYXY of the width specified by MODE. This is essentially
23254 a multiplication by 0x01010101, but we can do slightly better than
23255 synth_mult by unwinding the sequence by hand on CPUs with
23256 slow multiply. */
23257 static rtx
23258 promote_duplicated_reg (enum machine_mode mode, rtx val)
23260 enum machine_mode valmode = GET_MODE (val);
23261 rtx tmp;
23262 int nops = mode == DImode ? 3 : 2;
23264 gcc_assert (mode == SImode || mode == DImode);
23265 if (val == const0_rtx)
23266 return copy_to_mode_reg (mode, const0_rtx);
23267 if (CONST_INT_P (val))
23269 HOST_WIDE_INT v = INTVAL (val) & 255;
23271 v |= v << 8;
23272 v |= v << 16;
23273 if (mode == DImode)
23274 v |= (v << 16) << 16;
23275 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
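/* Added illustration, not part of the original source: for VAL == 0xAB the
   shifts above give 0xAB -> 0xABAB -> 0xABABABAB, and the extra DImode step
   yields 0xABABABABABABABAB.  */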
23278 if (valmode == VOIDmode)
23279 valmode = QImode;
23280 if (valmode != QImode)
23281 val = gen_lowpart (QImode, val);
23282 if (mode == QImode)
23283 return val;
23284 if (!TARGET_PARTIAL_REG_STALL)
23285 nops--;
23286 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23287 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23288 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23289 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23291 rtx reg = convert_modes (mode, QImode, val, true);
23292 tmp = promote_duplicated_reg (mode, const1_rtx);
23293 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23294 OPTAB_DIRECT);
23296 else
23298 rtx reg = convert_modes (mode, QImode, val, true);
23300 if (!TARGET_PARTIAL_REG_STALL)
23301 if (mode == SImode)
23302 emit_insn (gen_movsi_insv_1 (reg, reg));
23303 else
23304 emit_insn (gen_movdi_insv_1 (reg, reg));
23305 else
23307 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23308 NULL, 1, OPTAB_DIRECT);
23309 reg =
23310 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23312 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23313 NULL, 1, OPTAB_DIRECT);
23314 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23315 if (mode == SImode)
23316 return reg;
23317 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23318 NULL, 1, OPTAB_DIRECT);
23319 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23320 return reg;
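/* Added illustration, not part of the original source: the shift/or sequence
   emitted above performs the same byte replication that the following plain C
   helper (a sketch with a name of our choosing) does for the 32-bit case; the
   64-bit case adds one more shift/or step.  */

static unsigned int
replicate_byte_32 (unsigned char byte)
{
  unsigned int x = byte;   /* 0x000000XY */
  x |= x << 8;             /* 0x0000XYXY */
  x |= x << 16;            /* 0xXYXYXYXY */
  return x;                /* same value as byte * 0x01010101U */
}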
23324 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23325 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23326 getting alignment from ALIGN to DESIRED_ALIGN. */
23327 static rtx
23328 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23330 rtx promoted_val;
23332 if (TARGET_64BIT
23333 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23334 promoted_val = promote_duplicated_reg (DImode, val);
23335 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23336 promoted_val = promote_duplicated_reg (SImode, val);
23337 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23338 promoted_val = promote_duplicated_reg (HImode, val);
23339 else
23340 promoted_val = val;
23342 return promoted_val;
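/* Added illustration, not part of the original source: a 64-bit memset whose
   main loop stores DImode chunks (SIZE_NEEDED == 8) promotes VAL to a DImode
   register holding 0xXYXYXYXYXYXYXYXY, while a block that only ever needs
   2-byte stores gets just the HImode promotion.  */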
23345 /* Expand string clear operation (bzero). Use i386 string operations when
23346 profitable. See expand_movmem comment for explanation of individual
23347 steps performed. */
23348 bool
23349 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23350 rtx expected_align_exp, rtx expected_size_exp)
23352 rtx destreg;
23353 rtx label = NULL;
23354 rtx tmp;
23355 rtx jump_around_label = NULL;
23356 HOST_WIDE_INT align = 1;
23357 unsigned HOST_WIDE_INT count = 0;
23358 HOST_WIDE_INT expected_size = -1;
23359 int size_needed = 0, epilogue_size_needed;
23360 int desired_align = 0, align_bytes = 0;
23361 enum stringop_alg alg;
23362 rtx promoted_val = NULL;
23363 bool force_loopy_epilogue = false;
23364 int dynamic_check;
23365 bool need_zero_guard = false;
23366 bool noalign;
23367 enum machine_mode move_mode = VOIDmode;
23368 int unroll_factor;
23370 if (CONST_INT_P (align_exp))
23371 align = INTVAL (align_exp);
23372 /* i386 can do misaligned access at a reasonably increased cost. */
23373 if (CONST_INT_P (expected_align_exp)
23374 && INTVAL (expected_align_exp) > align)
23375 align = INTVAL (expected_align_exp);
23376 if (CONST_INT_P (count_exp))
23377 count = expected_size = INTVAL (count_exp);
23378 if (CONST_INT_P (expected_size_exp) && count == 0)
23379 expected_size = INTVAL (expected_size_exp);
23381 /* Make sure we don't need to care about overflow later on. */
23382 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23383 return false;
23385 /* Step 0: Decide on preferred algorithm, desired alignment and
23386 size of chunks to be copied by main loop. */
23388 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23389 if (alg == libcall)
23390 return false;
23391 gcc_assert (alg != no_stringop);
23393 if (!count)
23394 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23395 destreg = copy_addr_to_reg (XEXP (dst, 0));
23397 move_mode = word_mode;
23398 unroll_factor = 1;
23399 switch (alg)
23401 case libcall:
23402 case no_stringop:
23403 case last_alg:
23404 gcc_unreachable ();
23405 case loop:
23406 need_zero_guard = true;
23407 break;
23408 case vector_loop:
23409 case unrolled_loop:
23410 need_zero_guard = true;
23411 unroll_factor = 4;
23412 break;
23413 case rep_prefix_8_byte:
23414 move_mode = DImode;
23415 break;
23416 case rep_prefix_4_byte:
23417 move_mode = SImode;
23418 break;
23419 case rep_prefix_1_byte:
23420 move_mode = QImode;
23421 break;
23422 case loop_1_byte:
23423 need_zero_guard = true;
23424 move_mode = QImode;
23425 break;
23427 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23428 epilogue_size_needed = size_needed;
23430 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23431 if (!TARGET_ALIGN_STRINGOPS || noalign)
23432 align = desired_align;
23434 /* Step 1: Prologue guard. */
23436 /* Alignment code needs count to be in register. */
23437 if (CONST_INT_P (count_exp) && desired_align > align)
23439 if (INTVAL (count_exp) > desired_align
23440 && INTVAL (count_exp) > size_needed)
23442 align_bytes
23443 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23444 if (align_bytes <= 0)
23445 align_bytes = 0;
23446 else
23447 align_bytes = desired_align - align_bytes;
23449 if (align_bytes == 0)
23451 enum machine_mode mode = SImode;
23452 if (TARGET_64BIT && (count & ~0xffffffff))
23453 mode = DImode;
23454 count_exp = force_reg (mode, count_exp);
23457 /* Do the cheap promotion to allow better CSE across the
23458 main loop and epilogue (i.e., one load of the big constant in
23459 front of all the code). */
23460 if (CONST_INT_P (val_exp))
23461 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23462 desired_align, align);
23463 /* Ensure that alignment prologue won't copy past end of block. */
23464 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23466 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23467 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23468 Make sure it is a power of 2. */
23469 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23471 /* To improve performance of small blocks, we jump around the VAL
23472 promotion. This means that if the promoted VAL is not constant,
23473 we might not use it in the epilogue and have to use the byte
23474 loop variant. */
23475 if (epilogue_size_needed > 2 && !promoted_val)
23476 force_loopy_epilogue = true;
23477 if (count)
23479 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23481 /* If main algorithm works on QImode, no epilogue is needed.
23482 For small sizes just don't align anything. */
23483 if (size_needed == 1)
23484 desired_align = align;
23485 else
23486 goto epilogue;
23489 else
23491 label = gen_label_rtx ();
23492 emit_cmp_and_jump_insns (count_exp,
23493 GEN_INT (epilogue_size_needed),
23494 LTU, 0, counter_mode (count_exp), 1, label);
23495 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23496 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23497 else
23498 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23501 if (dynamic_check != -1)
23503 rtx hot_label = gen_label_rtx ();
23504 jump_around_label = gen_label_rtx ();
23505 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23506 LEU, 0, counter_mode (count_exp), 1, hot_label);
23507 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23508 set_storage_via_libcall (dst, count_exp, val_exp, false);
23509 emit_jump (jump_around_label);
23510 emit_label (hot_label);
23513 /* Step 2: Alignment prologue. */
23515 /* Do the expensive promotion once we have branched off the small blocks. */
23516 if (!promoted_val)
23517 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23518 desired_align, align);
23519 gcc_assert (desired_align >= 1 && align >= 1);
23521 if (desired_align > align)
23523 if (align_bytes == 0)
23525 /* Except for the first move in the epilogue, we no longer know
23526 the constant offset in the aliasing info. It does not seem worth
23527 the pain to maintain it for the first move, so throw away
23528 the info early. */
23529 dst = change_address (dst, BLKmode, destreg);
23530 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23531 desired_align);
23533 else
23535 /* If we know how many bytes need to be stored before dst is
23536 sufficiently aligned, maintain aliasing info accurately. */
23537 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23538 desired_align, align_bytes);
23539 count_exp = plus_constant (counter_mode (count_exp),
23540 count_exp, -align_bytes);
23541 count -= align_bytes;
23543 if (need_zero_guard
23544 && (count < (unsigned HOST_WIDE_INT) size_needed
23545 || (align_bytes == 0
23546 && count < ((unsigned HOST_WIDE_INT) size_needed
23547 + desired_align - align))))
23549 /* It is possible that we copied enough so the main loop will not
23550 execute. */
23551 gcc_assert (size_needed > 1);
23552 if (label == NULL_RTX)
23553 label = gen_label_rtx ();
23554 emit_cmp_and_jump_insns (count_exp,
23555 GEN_INT (size_needed),
23556 LTU, 0, counter_mode (count_exp), 1, label);
23557 if (expected_size == -1
23558 || expected_size < (desired_align - align) / 2 + size_needed)
23559 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23560 else
23561 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23564 if (label && size_needed == 1)
23566 emit_label (label);
23567 LABEL_NUSES (label) = 1;
23568 label = NULL;
23569 promoted_val = val_exp;
23570 epilogue_size_needed = 1;
23572 else if (label == NULL_RTX)
23573 epilogue_size_needed = size_needed;
23575 /* Step 3: Main loop. */
23577 switch (alg)
23579 case libcall:
23580 case no_stringop:
23581 case last_alg:
23582 gcc_unreachable ();
23583 case loop_1_byte:
23584 case loop:
23585 case vector_loop:
23586 case unrolled_loop:
23587 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23588 count_exp, move_mode, unroll_factor,
23589 expected_size);
23590 break;
23591 case rep_prefix_8_byte:
23592 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23593 DImode, val_exp);
23594 break;
23595 case rep_prefix_4_byte:
23596 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23597 SImode, val_exp);
23598 break;
23599 case rep_prefix_1_byte:
23600 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23601 QImode, val_exp);
23602 break;
23604 /* Properly adjust the offset of the dst memory for aliasing. */
23605 if (CONST_INT_P (count_exp))
23606 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23607 (count / size_needed) * size_needed);
23608 else
23609 dst = change_address (dst, BLKmode, destreg);
23611 /* Step 4: Epilogue to copy the remaining bytes. */
23613 if (label)
23615 /* When the main loop is done, COUNT_EXP might hold the original count,
23616 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23617 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23618 bytes. Compensate if needed. */
23620 if (size_needed < epilogue_size_needed)
23622 tmp =
23623 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23624 GEN_INT (size_needed - 1), count_exp, 1,
23625 OPTAB_DIRECT);
23626 if (tmp != count_exp)
23627 emit_move_insn (count_exp, tmp);
23629 emit_label (label);
23630 LABEL_NUSES (label) = 1;
23632 epilogue:
23633 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23635 if (force_loopy_epilogue)
23636 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23637 epilogue_size_needed);
23638 else
23639 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23640 epilogue_size_needed);
23642 if (jump_around_label)
23643 emit_label (jump_around_label);
23644 return true;
23647 /* Expand the appropriate insns for doing strlen if not just doing
23648 repnz; scasb
23650 out = result, initialized with the start address
23651 align_rtx = alignment of the address.
23652 scratch = scratch register, initialized with the start address when
23653 not aligned, otherwise undefined
23655 This is just the body. It needs the initializations mentioned above and
23656 some address computing at the end. These things are done in i386.md. */
23658 static void
23659 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23661 int align;
23662 rtx tmp;
23663 rtx align_2_label = NULL_RTX;
23664 rtx align_3_label = NULL_RTX;
23665 rtx align_4_label = gen_label_rtx ();
23666 rtx end_0_label = gen_label_rtx ();
23667 rtx mem;
23668 rtx tmpreg = gen_reg_rtx (SImode);
23669 rtx scratch = gen_reg_rtx (SImode);
23670 rtx cmp;
23672 align = 0;
23673 if (CONST_INT_P (align_rtx))
23674 align = INTVAL (align_rtx);
23676 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23678 /* Is there a known alignment and is it less than 4? */
23679 if (align < 4)
23681 rtx scratch1 = gen_reg_rtx (Pmode);
23682 emit_move_insn (scratch1, out);
23683 /* Is there a known alignment and is it not 2? */
23684 if (align != 2)
23686 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23687 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23689 /* Leave just the 3 lower bits. */
23690 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23691 NULL_RTX, 0, OPTAB_WIDEN);
23693 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23694 Pmode, 1, align_4_label);
23695 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23696 Pmode, 1, align_2_label);
23697 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23698 Pmode, 1, align_3_label);
23700 else
23702 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23703 check whether it is aligned to 4 bytes. */
23705 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23706 NULL_RTX, 0, OPTAB_WIDEN);
23708 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23709 Pmode, 1, align_4_label);
23712 mem = change_address (src, QImode, out);
23714 /* Now compare the bytes. */
23716 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23717 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23718 QImode, 1, end_0_label);
23720 /* Increment the address. */
23721 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23723 /* Not needed with an alignment of 2 */
23724 if (align != 2)
23726 emit_label (align_2_label);
23728 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23729 end_0_label);
23731 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23733 emit_label (align_3_label);
23736 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23737 end_0_label);
23739 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23742 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23743 align this loop: it only makes the program larger and does not help
23744 to speed it up. */
23745 emit_label (align_4_label);
23747 mem = change_address (src, SImode, out);
23748 emit_move_insn (scratch, mem);
23749 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23751 /* This formula yields a nonzero result iff one of the bytes is zero.
23752 This saves three branches inside the loop and many cycles. */
23754 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23755 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23756 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23757 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23758 gen_int_mode (0x80808080, SImode)));
23759 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23760 align_4_label);
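/* Added illustration, not part of the original source: the add/not/and
   sequence above leaves ((X - 0x01010101) & ~X & 0x80808080) in TMPREG,
   where X is the word loaded into SCRATCH; this is nonzero iff some byte of
   X is zero.  E.g. X == 0x11223300 gives 0x00000080, while X == 0x11223344
   gives 0 and the loop repeats.  */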
23762 if (TARGET_CMOVE)
23764 rtx reg = gen_reg_rtx (SImode);
23765 rtx reg2 = gen_reg_rtx (Pmode);
23766 emit_move_insn (reg, tmpreg);
23767 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23769 /* If zero is not in the first two bytes, move two bytes forward. */
23770 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23771 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23772 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23773 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23774 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23775 reg,
23776 tmpreg)));
23777 /* Emit lea manually to avoid clobbering of flags. */
23778 emit_insn (gen_rtx_SET (SImode, reg2,
23779 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23781 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23782 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23783 emit_insn (gen_rtx_SET (VOIDmode, out,
23784 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23785 reg2,
23786 out)));
23788 else
23790 rtx end_2_label = gen_label_rtx ();
23791 /* Is zero in the first two bytes? */
23793 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23794 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23795 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23796 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23797 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23798 pc_rtx);
23799 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23800 JUMP_LABEL (tmp) = end_2_label;
23802 /* Not in the first two. Move two bytes forward. */
23803 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23804 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23806 emit_label (end_2_label);
23810 /* Avoid branch in fixing the byte. */
23811 tmpreg = gen_lowpart (QImode, tmpreg);
23812 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23813 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23814 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23815 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23817 emit_label (end_0_label);
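/* Added illustration, not part of the original source: the RTL emitted by
   ix86_expand_strlensi_unroll_1 corresponds roughly to the plain C sketch
   below (helper name ours), minus the alignment prologue and the branchless
   tail fixup, and assuming it is safe to read a whole 4-byte word that may
   extend past the terminating zero.  */

#include <stddef.h>
#include <string.h>

static size_t
strlen_by_words (const char *s)
{
  size_t i = 0;
  for (;;)
    {
      unsigned int x;
      memcpy (&x, s + i, sizeof x);              /* load 4 bytes at a time */
      if ((x - 0x01010101U) & ~x & 0x80808080U)  /* some byte of X is zero */
        break;
      i += 4;
    }
  while (s[i])                                   /* locate the zero byte */
    i++;
  return i;
}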
23820 /* Expand strlen. */
23822 bool
23823 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23825 rtx addr, scratch1, scratch2, scratch3, scratch4;
23827 /* The generic case of the strlen expander is long. Avoid expanding
23828 it unless TARGET_INLINE_ALL_STRINGOPS. */
23830 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23831 && !TARGET_INLINE_ALL_STRINGOPS
23832 && !optimize_insn_for_size_p ()
23833 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23834 return false;
23836 addr = force_reg (Pmode, XEXP (src, 0));
23837 scratch1 = gen_reg_rtx (Pmode);
23839 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23840 && !optimize_insn_for_size_p ())
23842 /* Well it seems that some optimizer does not combine a call like
23843 foo(strlen(bar), strlen(bar));
23844 when the move and the subtraction are done here. It does calculate
23845 the length just once when these instructions are done inside
23846 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23847 often used and I use one fewer register for the lifetime of
23848 output_strlen_unroll(), this is better. */
23850 emit_move_insn (out, addr);
23852 ix86_expand_strlensi_unroll_1 (out, src, align);
23854 /* strlensi_unroll_1 returns the address of the zero at the end of
23855 the string, like memchr(), so compute the length by subtracting
23856 the start address. */
23857 emit_insn (ix86_gen_sub3 (out, out, addr));
23859 else
23861 rtx unspec;
23863 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23864 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23865 return false;
23867 scratch2 = gen_reg_rtx (Pmode);
23868 scratch3 = gen_reg_rtx (Pmode);
23869 scratch4 = force_reg (Pmode, constm1_rtx);
23871 emit_move_insn (scratch3, addr);
23872 eoschar = force_reg (QImode, eoschar);
23874 src = replace_equiv_address_nv (src, scratch3);
23876 /* If .md starts supporting :P, this can be done in .md. */
23877 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23878 scratch4), UNSPEC_SCAS);
23879 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23880 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23881 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23883 return true;
23886 /* For a given symbol (function), construct code to compute the address of its
23887 PLT entry in the large x86-64 PIC model. */
23888 static rtx
23889 construct_plt_address (rtx symbol)
23891 rtx tmp, unspec;
23893 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23894 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
23895 gcc_assert (Pmode == DImode);
23897 tmp = gen_reg_rtx (Pmode);
23898 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23900 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23901 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23902 return tmp;
23906 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23907 rtx callarg2,
23908 rtx pop, bool sibcall)
23910 unsigned int const cregs_size
23911 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
23912 rtx vec[3 + cregs_size];
23913 rtx use = NULL, call;
23914 unsigned int vec_len = 0;
23916 if (pop == const0_rtx)
23917 pop = NULL;
23918 gcc_assert (!TARGET_64BIT || !pop);
23920 if (TARGET_MACHO && !TARGET_64BIT)
23922 #if TARGET_MACHO
23923 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23924 fnaddr = machopic_indirect_call_target (fnaddr);
23925 #endif
23927 else
23929 /* Static functions and indirect calls don't need the pic register. */
23930 if (flag_pic
23931 && (!TARGET_64BIT
23932 || (ix86_cmodel == CM_LARGE_PIC
23933 && DEFAULT_ABI != MS_ABI))
23934 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23935 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23936 use_reg (&use, pic_offset_table_rtx);
23939 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23941 rtx al = gen_rtx_REG (QImode, AX_REG);
23942 emit_move_insn (al, callarg2);
23943 use_reg (&use, al);
23946 if (ix86_cmodel == CM_LARGE_PIC
23947 && !TARGET_PECOFF
23948 && MEM_P (fnaddr)
23949 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23950 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23951 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23952 else if (sibcall
23953 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23954 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23956 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23957 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23960 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23961 if (retval)
23962 call = gen_rtx_SET (VOIDmode, retval, call);
23963 vec[vec_len++] = call;
23965 if (pop)
23967 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23968 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23969 vec[vec_len++] = pop;
23972 if (TARGET_64BIT_MS_ABI
23973 && (!callarg2 || INTVAL (callarg2) != -2))
23975 unsigned i;
23977 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23978 UNSPEC_MS_TO_SYSV_CALL);
23980 for (i = 0; i < cregs_size; i++)
23982 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
23983 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
23985 vec[vec_len++]
23986 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
23990 if (vec_len > 1)
23991 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23992 call = emit_call_insn (call);
23993 if (use)
23994 CALL_INSN_FUNCTION_USAGE (call) = use;
23996 return call;
23999 /* Output the assembly for a call instruction. */
24001 const char *
24002 ix86_output_call_insn (rtx insn, rtx call_op)
24004 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24005 bool seh_nop_p = false;
24006 const char *xasm;
24008 if (SIBLING_CALL_P (insn))
24010 if (direct_p)
24011 xasm = "jmp\t%P0";
24012 /* SEH epilogue detection requires the indirect branch case
24013 to include REX.W. */
24014 else if (TARGET_SEH)
24015 xasm = "rex.W jmp %A0";
24016 else
24017 xasm = "jmp\t%A0";
24019 output_asm_insn (xasm, &call_op);
24020 return "";
24023 /* SEH unwinding can require an extra nop to be emitted in several
24024 circumstances. Determine if we have one of those. */
24025 if (TARGET_SEH)
24027 rtx i;
24029 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24031 /* If we get to another real insn, we don't need the nop. */
24032 if (INSN_P (i))
24033 break;
24035 /* If we get to the epilogue note, prevent a catch region from
24036 being adjacent to the standard epilogue sequence. With non-call
24037 exceptions, we'll have done this during epilogue emission. */
24038 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24039 && !flag_non_call_exceptions
24040 && !can_throw_internal (insn))
24042 seh_nop_p = true;
24043 break;
24047 /* If we didn't find a real insn following the call, prevent the
24048 unwinder from looking into the next function. */
24049 if (i == NULL)
24050 seh_nop_p = true;
24053 if (direct_p)
24054 xasm = "call\t%P0";
24055 else
24056 xasm = "call\t%A0";
24058 output_asm_insn (xasm, &call_op);
24060 if (seh_nop_p)
24061 return "nop";
24063 return "";
24066 /* Clear stack slot assignments remembered from previous functions.
24067 This is called from INIT_EXPANDERS once before RTL is emitted for each
24068 function. */
24070 static struct machine_function *
24071 ix86_init_machine_status (void)
24073 struct machine_function *f;
24075 f = ggc_alloc_cleared_machine_function ();
24076 f->use_fast_prologue_epilogue_nregs = -1;
24077 f->call_abi = ix86_abi;
24079 return f;
24082 /* Return a MEM corresponding to a stack slot with mode MODE.
24083 Allocate a new slot if necessary.
24085 The RTL for a function can have several slots available: N is
24086 which slot to use. */
24089 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24091 struct stack_local_entry *s;
24093 gcc_assert (n < MAX_386_STACK_LOCALS);
24095 for (s = ix86_stack_locals; s; s = s->next)
24096 if (s->mode == mode && s->n == n)
24097 return validize_mem (copy_rtx (s->rtl));
24099 s = ggc_alloc_stack_local_entry ();
24100 s->n = n;
24101 s->mode = mode;
24102 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24104 s->next = ix86_stack_locals;
24105 ix86_stack_locals = s;
24106 return validize_mem (s->rtl);
24109 static void
24110 ix86_instantiate_decls (void)
24112 struct stack_local_entry *s;
24114 for (s = ix86_stack_locals; s; s = s->next)
24115 if (s->rtl != NULL_RTX)
24116 instantiate_decl_rtl (s->rtl);
24119 /* Calculate the length of the memory address in the instruction encoding.
24120 Includes the addr32 prefix, but does not include the one-byte modrm, opcode,
24121 or other prefixes. We never generate an addr32 prefix for the LEA insn. */
24124 memory_address_length (rtx addr, bool lea)
24126 struct ix86_address parts;
24127 rtx base, index, disp;
24128 int len;
24129 int ok;
24131 if (GET_CODE (addr) == PRE_DEC
24132 || GET_CODE (addr) == POST_INC
24133 || GET_CODE (addr) == PRE_MODIFY
24134 || GET_CODE (addr) == POST_MODIFY)
24135 return 0;
24137 ok = ix86_decompose_address (addr, &parts);
24138 gcc_assert (ok);
24140 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24142 /* If this is not LEA instruction, add the length of addr32 prefix. */
24143 if (TARGET_64BIT && !lea
24144 && (SImode_address_operand (addr, VOIDmode)
24145 || (parts.base && GET_MODE (parts.base) == SImode)
24146 || (parts.index && GET_MODE (parts.index) == SImode)))
24147 len++;
24149 base = parts.base;
24150 index = parts.index;
24151 disp = parts.disp;
24153 if (base && GET_CODE (base) == SUBREG)
24154 base = SUBREG_REG (base);
24155 if (index && GET_CODE (index) == SUBREG)
24156 index = SUBREG_REG (index);
24158 gcc_assert (base == NULL_RTX || REG_P (base));
24159 gcc_assert (index == NULL_RTX || REG_P (index));
24161 /* Rule of thumb:
24162 - esp as the base always wants an index,
24163 - ebp as the base always wants a displacement,
24164 - r12 as the base always wants an index,
24165 - r13 as the base always wants a displacement. */
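/* Added illustration, not part of the original source: a plain (%esp) or
   (%r12) base cannot be encoded without a SIB byte, and a plain (%ebp) or
   (%r13) base cannot be encoded without a displacement (a zero disp8 is
   used), which is why the cases below count one extra byte for these
   bases.  */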
24167 /* Register Indirect. */
24168 if (base && !index && !disp)
24170 /* esp (for its index) and ebp (for its displacement) need
24171 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24172 code. */
24173 if (base == arg_pointer_rtx
24174 || base == frame_pointer_rtx
24175 || REGNO (base) == SP_REG
24176 || REGNO (base) == BP_REG
24177 || REGNO (base) == R12_REG
24178 || REGNO (base) == R13_REG)
24179 len++;
24182 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24183 is not disp32, but disp32(%rip), so for disp32
24184 SIB byte is needed, unless print_operand_address
24185 optimizes it into disp32(%rip) or (%rip) is implied
24186 by UNSPEC. */
24187 else if (disp && !base && !index)
24189 len += 4;
24190 if (TARGET_64BIT)
24192 rtx symbol = disp;
24194 if (GET_CODE (disp) == CONST)
24195 symbol = XEXP (disp, 0);
24196 if (GET_CODE (symbol) == PLUS
24197 && CONST_INT_P (XEXP (symbol, 1)))
24198 symbol = XEXP (symbol, 0);
24200 if (GET_CODE (symbol) != LABEL_REF
24201 && (GET_CODE (symbol) != SYMBOL_REF
24202 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24203 && (GET_CODE (symbol) != UNSPEC
24204 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24205 && XINT (symbol, 1) != UNSPEC_PCREL
24206 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24207 len++;
24210 else
24212 /* Find the length of the displacement constant. */
24213 if (disp)
24215 if (base && satisfies_constraint_K (disp))
24216 len += 1;
24217 else
24218 len += 4;
24220 /* ebp always wants a displacement. Similarly r13. */
24221 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24222 len++;
24224 /* An index requires the two-byte modrm form.... */
24225 if (index
24226 /* ...like esp (or r12), which always wants an index. */
24227 || base == arg_pointer_rtx
24228 || base == frame_pointer_rtx
24229 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24230 len++;
24233 return len;
24236 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
24237 is set, expect that the insn has an 8-bit immediate alternative. */
24239 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24241 int len = 0;
24242 int i;
24243 extract_insn_cached (insn);
24244 for (i = recog_data.n_operands - 1; i >= 0; --i)
24245 if (CONSTANT_P (recog_data.operand[i]))
24247 enum attr_mode mode = get_attr_mode (insn);
24249 gcc_assert (!len);
24250 if (shortform && CONST_INT_P (recog_data.operand[i]))
24252 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24253 switch (mode)
24255 case MODE_QI:
24256 len = 1;
24257 continue;
24258 case MODE_HI:
24259 ival = trunc_int_for_mode (ival, HImode);
24260 break;
24261 case MODE_SI:
24262 ival = trunc_int_for_mode (ival, SImode);
24263 break;
24264 default:
24265 break;
24267 if (IN_RANGE (ival, -128, 127))
24269 len = 1;
24270 continue;
24273 switch (mode)
24275 case MODE_QI:
24276 len = 1;
24277 break;
24278 case MODE_HI:
24279 len = 2;
24280 break;
24281 case MODE_SI:
24282 len = 4;
24283 break;
24284 /* Immediates for DImode instructions are encoded
24285 as 32bit sign extended values. */
24286 case MODE_DI:
24287 len = 4;
24288 break;
24289 default:
24290 fatal_insn ("unknown insn mode", insn);
24293 return len;
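/* Added illustration, not part of the original source: with SHORTFORM set,
   "add $100, %eax" fits the sign-extended 8-bit immediate alternative and
   counts 1 byte, while "add $200, %eax" is outside [-128, 127] and counts a
   full 4-byte SImode immediate.  */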
24296 /* Compute default value for "length_address" attribute. */
24298 ix86_attr_length_address_default (rtx insn)
24300 int i;
24302 if (get_attr_type (insn) == TYPE_LEA)
24304 rtx set = PATTERN (insn), addr;
24306 if (GET_CODE (set) == PARALLEL)
24307 set = XVECEXP (set, 0, 0);
24309 gcc_assert (GET_CODE (set) == SET);
24311 addr = SET_SRC (set);
24313 return memory_address_length (addr, true);
24316 extract_insn_cached (insn);
24317 for (i = recog_data.n_operands - 1; i >= 0; --i)
24318 if (MEM_P (recog_data.operand[i]))
24320 constrain_operands_cached (reload_completed);
24321 if (which_alternative != -1)
24323 const char *constraints = recog_data.constraints[i];
24324 int alt = which_alternative;
24326 while (*constraints == '=' || *constraints == '+')
24327 constraints++;
24328 while (alt-- > 0)
24329 while (*constraints++ != ',')
24331 /* Skip ignored operands. */
24332 if (*constraints == 'X')
24333 continue;
24335 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24337 return 0;
24340 /* Compute the default value for the "length_vex" attribute. It includes
24341 the 2- or 3-byte VEX prefix and 1 opcode byte. */
24344 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24346 int i;
24348 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
24349 requires the 3-byte VEX prefix. */
24350 if (!has_0f_opcode || has_vex_w)
24351 return 3 + 1;
24353 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24354 if (!TARGET_64BIT)
24355 return 2 + 1;
24357 extract_insn_cached (insn);
24359 for (i = recog_data.n_operands - 1; i >= 0; --i)
24360 if (REG_P (recog_data.operand[i]))
24362 /* REX.W bit uses 3 byte VEX prefix. */
24363 if (GET_MODE (recog_data.operand[i]) == DImode
24364 && GENERAL_REG_P (recog_data.operand[i]))
24365 return 3 + 1;
24367 else
24369 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24370 if (MEM_P (recog_data.operand[i])
24371 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24372 return 3 + 1;
24375 return 2 + 1;
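/* Added illustration, not part of the original source: a 0f-map insn such as
   vaddps with no DImode general-register operand and no extended register in
   its memory address can use the 2-byte (C5) VEX prefix, giving 2 + 1, while
   VEX.W or an extended base/index register forces the 3-byte (C4) form,
   giving 3 + 1.  */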
24378 /* Return the maximum number of instructions a cpu can issue. */
24380 static int
24381 ix86_issue_rate (void)
24383 switch (ix86_tune)
24385 case PROCESSOR_PENTIUM:
24386 case PROCESSOR_ATOM:
24387 case PROCESSOR_SLM:
24388 case PROCESSOR_K6:
24389 case PROCESSOR_BTVER2:
24390 return 2;
24392 case PROCESSOR_PENTIUMPRO:
24393 case PROCESSOR_PENTIUM4:
24394 case PROCESSOR_CORE2:
24395 case PROCESSOR_COREI7:
24396 case PROCESSOR_HASWELL:
24397 case PROCESSOR_ATHLON:
24398 case PROCESSOR_K8:
24399 case PROCESSOR_AMDFAM10:
24400 case PROCESSOR_NOCONA:
24401 case PROCESSOR_GENERIC:
24402 case PROCESSOR_BDVER1:
24403 case PROCESSOR_BDVER2:
24404 case PROCESSOR_BDVER3:
24405 case PROCESSOR_BTVER1:
24406 return 3;
24408 default:
24409 return 1;
24413 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24414 by DEP_INSN and nothing else set by DEP_INSN. */
24416 static bool
24417 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24419 rtx set, set2;
24421 /* Simplify the test for uninteresting insns. */
24422 if (insn_type != TYPE_SETCC
24423 && insn_type != TYPE_ICMOV
24424 && insn_type != TYPE_FCMOV
24425 && insn_type != TYPE_IBR)
24426 return false;
24428 if ((set = single_set (dep_insn)) != 0)
24430 set = SET_DEST (set);
24431 set2 = NULL_RTX;
24433 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24434 && XVECLEN (PATTERN (dep_insn), 0) == 2
24435 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24436 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24438 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24439 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24441 else
24442 return false;
24444 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24445 return false;
24447 /* This test is true if the dependent insn reads the flags but
24448 not any other potentially set register. */
24449 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24450 return false;
24452 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24453 return false;
24455 return true;
24458 /* Return true iff USE_INSN has a memory address with operands set by
24459 SET_INSN. */
24461 bool
24462 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24464 int i;
24465 extract_insn_cached (use_insn);
24466 for (i = recog_data.n_operands - 1; i >= 0; --i)
24467 if (MEM_P (recog_data.operand[i]))
24469 rtx addr = XEXP (recog_data.operand[i], 0);
24470 return modified_in_p (addr, set_insn) != 0;
24472 return false;
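/* Added illustration, not part of the original source: for SET_INSN
   "add $4, %ebx" and USE_INSN "mov (%ebx), %eax" the load's address is
   modified by SET_INSN, so this returns true; on Pentium this is the classic
   AGI case for which ix86_adjust_cost charges an extra cycle.  */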
24475 /* Helper function for exact_store_load_dependency.
24476 Return true if addr is found in insn. */
24477 static bool
24478 exact_dependency_1 (rtx addr, rtx insn)
24480 enum rtx_code code;
24481 const char *format_ptr;
24482 int i, j;
24484 code = GET_CODE (insn);
24485 switch (code)
24487 case MEM:
24488 if (rtx_equal_p (addr, insn))
24489 return true;
24490 break;
24491 case REG:
24492 CASE_CONST_ANY:
24493 case SYMBOL_REF:
24494 case CODE_LABEL:
24495 case PC:
24496 case CC0:
24497 case EXPR_LIST:
24498 return false;
24499 default:
24500 break;
24503 format_ptr = GET_RTX_FORMAT (code);
24504 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24506 switch (*format_ptr++)
24508 case 'e':
24509 if (exact_dependency_1 (addr, XEXP (insn, i)))
24510 return true;
24511 break;
24512 case 'E':
24513 for (j = 0; j < XVECLEN (insn, i); j++)
24514 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24515 return true;
24516 break;
24519 return false;
24522 /* Return true if there exists exact dependency for store & load, i.e.
24523 the same memory address is used in them. */
24524 static bool
24525 exact_store_load_dependency (rtx store, rtx load)
24527 rtx set1, set2;
24529 set1 = single_set (store);
24530 if (!set1)
24531 return false;
24532 if (!MEM_P (SET_DEST (set1)))
24533 return false;
24534 set2 = single_set (load);
24535 if (!set2)
24536 return false;
24537 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24538 return true;
24539 return false;
24542 static int
24543 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24545 enum attr_type insn_type, dep_insn_type;
24546 enum attr_memory memory;
24547 rtx set, set2;
24548 int dep_insn_code_number;
24550 /* Anti and output dependencies have zero cost on all CPUs. */
24551 if (REG_NOTE_KIND (link) != 0)
24552 return 0;
24554 dep_insn_code_number = recog_memoized (dep_insn);
24556 /* If we can't recognize the insns, we can't really do anything. */
24557 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24558 return cost;
24560 insn_type = get_attr_type (insn);
24561 dep_insn_type = get_attr_type (dep_insn);
24563 switch (ix86_tune)
24565 case PROCESSOR_PENTIUM:
24566 /* Address Generation Interlock adds a cycle of latency. */
24567 if (insn_type == TYPE_LEA)
24569 rtx addr = PATTERN (insn);
24571 if (GET_CODE (addr) == PARALLEL)
24572 addr = XVECEXP (addr, 0, 0);
24574 gcc_assert (GET_CODE (addr) == SET);
24576 addr = SET_SRC (addr);
24577 if (modified_in_p (addr, dep_insn))
24578 cost += 1;
24580 else if (ix86_agi_dependent (dep_insn, insn))
24581 cost += 1;
24583 /* ??? Compares pair with jump/setcc. */
24584 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24585 cost = 0;
24587 /* Floating point stores require value to be ready one cycle earlier. */
24588 if (insn_type == TYPE_FMOV
24589 && get_attr_memory (insn) == MEMORY_STORE
24590 && !ix86_agi_dependent (dep_insn, insn))
24591 cost += 1;
24592 break;
24594 case PROCESSOR_PENTIUMPRO:
24595 memory = get_attr_memory (insn);
24597 /* INT->FP conversion is expensive. */
24598 if (get_attr_fp_int_src (dep_insn))
24599 cost += 5;
24601 /* There is one cycle extra latency between an FP op and a store. */
24602 if (insn_type == TYPE_FMOV
24603 && (set = single_set (dep_insn)) != NULL_RTX
24604 && (set2 = single_set (insn)) != NULL_RTX
24605 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24606 && MEM_P (SET_DEST (set2)))
24607 cost += 1;
24609 /* Show the ability of the reorder buffer to hide the latency of a load by
24610 executing it in parallel with the previous instruction, in case the
24611 previous instruction is not needed to compute the address. */
24612 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24613 && !ix86_agi_dependent (dep_insn, insn))
24615 /* Claim moves to take one cycle, as the core can issue one load
24616 at a time and the next load can start a cycle later. */
24617 if (dep_insn_type == TYPE_IMOV
24618 || dep_insn_type == TYPE_FMOV)
24619 cost = 1;
24620 else if (cost > 1)
24621 cost--;
24623 break;
24625 case PROCESSOR_K6:
24626 memory = get_attr_memory (insn);
24628 /* The esp dependency is resolved before the instruction is really
24629 finished. */
24630 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24631 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24632 return 1;
24634 /* INT->FP conversion is expensive. */
24635 if (get_attr_fp_int_src (dep_insn))
24636 cost += 5;
24638 /* Show the ability of the reorder buffer to hide the latency of a load by
24639 executing it in parallel with the previous instruction, in case the
24640 previous instruction is not needed to compute the address. */
24641 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24642 && !ix86_agi_dependent (dep_insn, insn))
24644 /* Claim moves to take one cycle, as the core can issue one load
24645 at a time and the next load can start a cycle later. */
24646 if (dep_insn_type == TYPE_IMOV
24647 || dep_insn_type == TYPE_FMOV)
24648 cost = 1;
24649 else if (cost > 2)
24650 cost -= 2;
24651 else
24652 cost = 1;
24654 break;
24656 case PROCESSOR_ATHLON:
24657 case PROCESSOR_K8:
24658 case PROCESSOR_AMDFAM10:
24659 case PROCESSOR_BDVER1:
24660 case PROCESSOR_BDVER2:
24661 case PROCESSOR_BDVER3:
24662 case PROCESSOR_BTVER1:
24663 case PROCESSOR_BTVER2:
24664 case PROCESSOR_ATOM:
24665 case PROCESSOR_GENERIC:
24666 memory = get_attr_memory (insn);
24668 /* Show the ability of the reorder buffer to hide the latency of a load by
24669 executing it in parallel with the previous instruction, in case the
24670 previous instruction is not needed to compute the address. */
24671 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24672 && !ix86_agi_dependent (dep_insn, insn))
24674 enum attr_unit unit = get_attr_unit (insn);
24675 int loadcost = 3;
24677 /* Because of the difference between the lengths of the integer and
24678 floating unit pipeline preparation stages, the memory operands
24679 for floating point are cheaper.
24681 ??? For Athlon the difference is most probably 2. */
24682 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24683 loadcost = 3;
24684 else
24685 loadcost = TARGET_ATHLON ? 2 : 0;
24687 if (cost >= loadcost)
24688 cost -= loadcost;
24689 else
24690 cost = 0;
24692 break;
24694 case PROCESSOR_SLM:
24695 if (!reload_completed)
24696 return cost;
24698 /* Increase cost of integer loads. */
24699 memory = get_attr_memory (dep_insn);
24700 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24702 enum attr_unit unit = get_attr_unit (dep_insn);
24703 if (unit == UNIT_INTEGER && cost == 1)
24705 if (memory == MEMORY_LOAD)
24706 cost = 3;
24707 else
24709 /* Increase the cost of ld/st for short int types only,
24710 because of the store-forwarding issue. */
24711 rtx set = single_set (dep_insn);
24712 if (set && (GET_MODE (SET_DEST (set)) == QImode
24713 || GET_MODE (SET_DEST (set)) == HImode))
24715 /* Increase the cost if an exact store/load
24716 dependence exists and this insn is a load. */
24717 enum attr_memory insn_memory = get_attr_memory (insn);
24718 if (insn_memory == MEMORY_LOAD
24719 && exact_store_load_dependency (dep_insn, insn))
24720 cost = 3;
24726 default:
24727 break;
24730 return cost;
24733 /* How many alternative schedules to try. This should be as wide as the
24734 scheduling freedom in the DFA, but no wider. Making this value too
24735 large results in extra work for the scheduler. */
24737 static int
24738 ia32_multipass_dfa_lookahead (void)
24740 switch (ix86_tune)
24742 case PROCESSOR_PENTIUM:
24743 return 2;
24745 case PROCESSOR_PENTIUMPRO:
24746 case PROCESSOR_K6:
24747 return 1;
24749 case PROCESSOR_CORE2:
24750 case PROCESSOR_COREI7:
24751 case PROCESSOR_HASWELL:
24752 case PROCESSOR_ATOM:
24753 case PROCESSOR_SLM:
24754 /* Generally, we want haifa-sched:max_issue() to look ahead by as
24755 many instructions as can be executed in a cycle, i.e.,
24756 issue_rate. I wonder why tuning for many CPUs does not do this. */
24757 if (reload_completed)
24758 return ix86_issue_rate ();
24759 /* Don't use lookahead for pre-reload schedule to save compile time. */
24760 return 0;
24762 default:
24763 return 0;
24767 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24768 execution. It is applied if
24769 (1) an IMUL instruction is on the top of the list;
24770 (2) there is exactly one producer of an independent IMUL instruction in
24771 the ready list.
24772 Return the index of the IMUL producer if it was found, and -1 otherwise. */
24773 static int
24774 do_reorder_for_imul (rtx *ready, int n_ready)
24776 rtx insn, set, insn1, insn2;
24777 sd_iterator_def sd_it;
24778 dep_t dep;
24779 int index = -1;
24780 int i;
24782 if (ix86_tune != PROCESSOR_ATOM)
24783 return index;
24785 /* Check that IMUL instruction is on the top of ready list. */
24786 insn = ready[n_ready - 1];
24787 set = single_set (insn);
24788 if (!set)
24789 return index;
24790 if (!(GET_CODE (SET_SRC (set)) == MULT
24791 && GET_MODE (SET_SRC (set)) == SImode))
24792 return index;
24794 /* Search for producer of independent IMUL instruction. */
24795 for (i = n_ready - 2; i >= 0; i--)
24797 insn = ready[i];
24798 if (!NONDEBUG_INSN_P (insn))
24799 continue;
24800 /* Skip IMUL instruction. */
24801 insn2 = PATTERN (insn);
24802 if (GET_CODE (insn2) == PARALLEL)
24803 insn2 = XVECEXP (insn2, 0, 0);
24804 if (GET_CODE (insn2) == SET
24805 && GET_CODE (SET_SRC (insn2)) == MULT
24806 && GET_MODE (SET_SRC (insn2)) == SImode)
24807 continue;
24809 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24811 rtx con;
24812 con = DEP_CON (dep);
24813 if (!NONDEBUG_INSN_P (con))
24814 continue;
24815 insn1 = PATTERN (con);
24816 if (GET_CODE (insn1) == PARALLEL)
24817 insn1 = XVECEXP (insn1, 0, 0);
24819 if (GET_CODE (insn1) == SET
24820 && GET_CODE (SET_SRC (insn1)) == MULT
24821 && GET_MODE (SET_SRC (insn1)) == SImode)
24823 sd_iterator_def sd_it1;
24824 dep_t dep1;
24825 /* Check that INSN is the only producer of the IMUL. */
24826 index = i;
24827 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24829 rtx pro;
24830 pro = DEP_PRO (dep1);
24831 if (!NONDEBUG_INSN_P (pro))
24832 continue;
24833 if (pro != insn)
24834 index = -1;
24836 if (index >= 0)
24837 break;
24840 if (index >= 0)
24841 break;
24843 return index;
24846 /* Try to find the best candidate at the top of the ready list when two insns
24847 have the same priority - the better candidate is the one whose producers
24848 were scheduled earlier. Applied to Silvermont only.
24849 Return true if the top 2 insns must be interchanged. */
24850 static bool
24851 swap_top_of_ready_list (rtx *ready, int n_ready)
24853 rtx top = ready[n_ready - 1];
24854 rtx next = ready[n_ready - 2];
24855 rtx set;
24856 sd_iterator_def sd_it;
24857 dep_t dep;
24858 int clock1 = -1;
24859 int clock2 = -1;
24860 #define INSN_TICK(INSN) (HID (INSN)->tick)
24862 if (ix86_tune != PROCESSOR_SLM)
24863 return false;
24865 if (!NONDEBUG_INSN_P (top))
24866 return false;
24867 if (!NONJUMP_INSN_P (top))
24868 return false;
24869 if (!NONDEBUG_INSN_P (next))
24870 return false;
24871 if (!NONJUMP_INSN_P (next))
24872 return false;
24873 set = single_set (top);
24874 if (!set)
24875 return false;
24876 set = single_set (next);
24877 if (!set)
24878 return false;
24880 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
24882 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
24883 return false;
24884 /* Determine the winner more precisely. */
24885 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
24887 rtx pro;
24888 pro = DEP_PRO (dep);
24889 if (!NONDEBUG_INSN_P (pro))
24890 continue;
24891 if (INSN_TICK (pro) > clock1)
24892 clock1 = INSN_TICK (pro);
24894 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
24896 rtx pro;
24897 pro = DEP_PRO (dep);
24898 if (!NONDEBUG_INSN_P (pro))
24899 continue;
24900 if (INSN_TICK (pro) > clock2)
24901 clock2 = INSN_TICK (pro);
24904 if (clock1 == clock2)
24906 /* Determine the winner - a load must win. */
24907 enum attr_memory memory1, memory2;
24908 memory1 = get_attr_memory (top);
24909 memory2 = get_attr_memory (next);
24910 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
24911 return true;
24913 return (bool) (clock2 < clock1);
24915 return false;
24916 #undef INSN_TICK
24919 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
24920 Return issue rate. */
24921 static int
24922 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24923 int clock_var)
24925 int issue_rate = -1;
24926 int n_ready = *pn_ready;
24927 int i;
24928 rtx insn;
24929 int index = -1;
24931 /* Set up issue rate. */
24932 issue_rate = ix86_issue_rate ();
24934 /* Do reordering for Atom/SLM only. */
24935 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
24936 return issue_rate;
24938 /* Nothing to do if ready list contains only 1 instruction. */
24939 if (n_ready <= 1)
24940 return issue_rate;
24942 /* Do reordering for the post-reload scheduler only. */
24943 if (!reload_completed)
24944 return issue_rate;
24946 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
24948 if (sched_verbose > 1)
24949 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
24950 INSN_UID (ready[index]));
24952 /* Put IMUL producer (ready[index]) at the top of ready list. */
24953 insn = ready[index];
24954 for (i = index; i < n_ready - 1; i++)
24955 ready[i] = ready[i + 1];
24956 ready[n_ready - 1] = insn;
24957 return issue_rate;
24959 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
24961 if (sched_verbose > 1)
24962 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
24963 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
24964 /* Swap 2 top elements of ready list. */
24965 insn = ready[n_ready - 1];
24966 ready[n_ready - 1] = ready[n_ready - 2];
24967 ready[n_ready - 2] = insn;
24969 return issue_rate;
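/* Worked example for the Atom path above (hypothetical ready list, top
   element last): { ..., P, ..., I1 }, where I1 is an SImode IMUL and P is
   the only producer of a second, independent SImode IMUL I2 that is not
   ready yet.  do_reorder_for_imul returns the index of P, and the loop
   above rotates P to the top so that I2 becomes ready right after I1 and
   the pipelined IMUL unit stays busy.  */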
24972 static bool
24973 ix86_class_likely_spilled_p (reg_class_t);
24975 /* Return true if the lhs of INSN is a HW function argument register, and set
24976 IS_SPILLED to true if it is a likely-spilled HW register. */
24977 static bool
24978 insn_is_function_arg (rtx insn, bool* is_spilled)
24980 rtx dst;
24982 if (!NONDEBUG_INSN_P (insn))
24983 return false;
24984 /* Call instructions are not movable, ignore them. */
24985 if (CALL_P (insn))
24986 return false;
24987 insn = PATTERN (insn);
24988 if (GET_CODE (insn) == PARALLEL)
24989 insn = XVECEXP (insn, 0, 0);
24990 if (GET_CODE (insn) != SET)
24991 return false;
24992 dst = SET_DEST (insn);
24993 if (REG_P (dst) && HARD_REGISTER_P (dst)
24994 && ix86_function_arg_regno_p (REGNO (dst)))
24996 /* Is it a likely-spilled HW register? */
24997 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24998 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24999 *is_spilled = true;
25000 return true;
25002 return false;
25005 /* Add output dependencies for a chain of adjacent function arguments, but only
25006 if there is a move to a likely-spilled HW register. Return the first argument
25007 if at least one dependence was added, or NULL otherwise. */
25008 static rtx
25009 add_parameter_dependencies (rtx call, rtx head)
25011 rtx insn;
25012 rtx last = call;
25013 rtx first_arg = NULL;
25014 bool is_spilled = false;
25016 head = PREV_INSN (head);
25018 /* Find the argument-passing instruction nearest to the call. */
25019 while (true)
25021 last = PREV_INSN (last);
25022 if (last == head)
25023 return NULL;
25024 if (!NONDEBUG_INSN_P (last))
25025 continue;
25026 if (insn_is_function_arg (last, &is_spilled))
25027 break;
25028 return NULL;
25031 first_arg = last;
25032 while (true)
25034 insn = PREV_INSN (last);
25035 if (!INSN_P (insn))
25036 break;
25037 if (insn == head)
25038 break;
25039 if (!NONDEBUG_INSN_P (insn))
25041 last = insn;
25042 continue;
25044 if (insn_is_function_arg (insn, &is_spilled))
25046 /* Add an output dependence between two function arguments if the chain
25047 of output arguments contains likely-spilled HW registers. */
25048 if (is_spilled)
25049 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25050 first_arg = last = insn;
25052 else
25053 break;
25055 if (!is_spilled)
25056 return NULL;
25057 return first_arg;
25060 /* Add output or anti dependency from insn to first_arg to restrict its code
25061 motion. */
25062 static void
25063 avoid_func_arg_motion (rtx first_arg, rtx insn)
25065 rtx set;
25066 rtx tmp;
25068 set = single_set (insn);
25069 if (!set)
25070 return;
25071 tmp = SET_DEST (set);
25072 if (REG_P (tmp))
25074 /* Add output dependency to the first function argument. */
25075 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25076 return;
25078 /* Add anti dependency. */
25079 add_dependence (first_arg, insn, REG_DEP_ANTI);
25082 /* Avoid cross-block motion of a function argument by adding a dependency
25083 from the first non-jump instruction in bb. */
25084 static void
25085 add_dependee_for_func_arg (rtx arg, basic_block bb)
25087 rtx insn = BB_END (bb);
25089 while (insn)
25091 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25093 rtx set = single_set (insn);
25094 if (set)
25096 avoid_func_arg_motion (arg, insn);
25097 return;
25100 if (insn == BB_HEAD (bb))
25101 return;
25102 insn = PREV_INSN (insn);
25106 /* Hook for pre-reload schedule - avoid motion of function arguments
25107 passed in likely spilled HW registers. */
25108 static void
25109 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25111 rtx insn;
25112 rtx first_arg = NULL;
25113 if (reload_completed)
25114 return;
25115 while (head != tail && DEBUG_INSN_P (head))
25116 head = NEXT_INSN (head);
25117 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25118 if (INSN_P (insn) && CALL_P (insn))
25120 first_arg = add_parameter_dependencies (insn, head);
25121 if (first_arg)
25123 /* Add a dependee for the first argument to predecessors, but only
25124 if the region contains more than one block. */
25125 basic_block bb = BLOCK_FOR_INSN (insn);
25126 int rgn = CONTAINING_RGN (bb->index);
25127 int nr_blks = RGN_NR_BLOCKS (rgn);
25128 /* Skip trivial regions and region head blocks that can have
25129 predecessors outside of region. */
25130 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25132 edge e;
25133 edge_iterator ei;
25134 /* Assume that the region is an SCC, i.e. all immediate predecessors
25135 of a non-head block are in the same region. */
25136 FOR_EACH_EDGE (e, ei, bb->preds)
25138 /* Avoid creating loop-carried dependencies by using
25139 the topological ordering in the region. */
25140 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25141 add_dependee_for_func_arg (first_arg, e->src);
25144 insn = first_arg;
25145 if (insn == head)
25146 break;
25149 else if (first_arg)
25150 avoid_func_arg_motion (first_arg, insn);
25153 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
25154 HW registers to maximum, to schedule them as soon as possible. These are
25155 moves from function argument registers at the top of the function entry
25156 and moves from function return value registers after call. */
25157 static int
25158 ix86_adjust_priority (rtx insn, int priority)
25160 rtx set;
25162 if (reload_completed)
25163 return priority;
25165 if (!NONDEBUG_INSN_P (insn))
25166 return priority;
25168 set = single_set (insn);
25169 if (set)
25171 rtx tmp = SET_SRC (set);
25172 if (REG_P (tmp)
25173 && HARD_REGISTER_P (tmp)
25174 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25175 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25176 return current_sched_info->sched_max_insns_priority;
25179 return priority;
25182 /* Model decoder of Core 2/i7.
25183 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25184 track the instruction fetch block boundaries and make sure that long
25185 (9+ bytes) instructions are assigned to D0. */
25187 /* Maximum length of an insn that can be handled by
25188 a secondary decoder unit. '8' for Core 2/i7. */
25189 static int core2i7_secondary_decoder_max_insn_size;
25191 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25192 '16' for Core 2/i7. */
25193 static int core2i7_ifetch_block_size;
25195 /* Maximum number of instructions decoder can handle per cycle.
25196 '6' for Core 2/i7. */
25197 static int core2i7_ifetch_block_max_insns;
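/* Illustration of the model (using the Core 2/i7 values set in
   ix86_sched_init_global below: 8, 16 and 6): an insn of size S may be
   added to the current ifetch block only if ifetch_block_len + S <= 16
   and fewer than 6 insns have already been issued in the block; if it is
   not the first insn of the cycle it must also satisfy S <= 8, the
   secondary decoder limit.  */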
25199 typedef struct ix86_first_cycle_multipass_data_ *
25200 ix86_first_cycle_multipass_data_t;
25201 typedef const struct ix86_first_cycle_multipass_data_ *
25202 const_ix86_first_cycle_multipass_data_t;
25204 /* A variable to store target state across calls to max_issue within
25205 one cycle. */
25206 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25207 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25209 /* Initialize DATA. */
25210 static void
25211 core2i7_first_cycle_multipass_init (void *_data)
25213 ix86_first_cycle_multipass_data_t data
25214 = (ix86_first_cycle_multipass_data_t) _data;
25216 data->ifetch_block_len = 0;
25217 data->ifetch_block_n_insns = 0;
25218 data->ready_try_change = NULL;
25219 data->ready_try_change_size = 0;
25222 /* Advancing the cycle; reset ifetch block counts. */
25223 static void
25224 core2i7_dfa_post_advance_cycle (void)
25226 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25228 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25230 data->ifetch_block_len = 0;
25231 data->ifetch_block_n_insns = 0;
25234 static int min_insn_size (rtx);
25236 /* Filter out insns from ready_try that the core will not be able to issue
25237 on the current cycle due to decoder restrictions. */
25238 static void
25239 core2i7_first_cycle_multipass_filter_ready_try
25240 (const_ix86_first_cycle_multipass_data_t data,
25241 char *ready_try, int n_ready, bool first_cycle_insn_p)
25243 while (n_ready--)
25245 rtx insn;
25246 int insn_size;
25248 if (ready_try[n_ready])
25249 continue;
25251 insn = get_ready_element (n_ready);
25252 insn_size = min_insn_size (insn);
25254 if (/* If this is too long an insn for a secondary decoder ... */
25255 (!first_cycle_insn_p
25256 && insn_size > core2i7_secondary_decoder_max_insn_size)
25257 /* ... or it would not fit into the ifetch block ... */
25258 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25259 /* ... or the decoder is full already ... */
25260 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25261 /* ... mask the insn out. */
25263 ready_try[n_ready] = 1;
25265 if (data->ready_try_change)
25266 bitmap_set_bit (data->ready_try_change, n_ready);
25271 /* Prepare for a new round of multipass lookahead scheduling. */
25272 static void
25273 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25274 bool first_cycle_insn_p)
25276 ix86_first_cycle_multipass_data_t data
25277 = (ix86_first_cycle_multipass_data_t) _data;
25278 const_ix86_first_cycle_multipass_data_t prev_data
25279 = ix86_first_cycle_multipass_data;
25281 /* Restore the state from the end of the previous round. */
25282 data->ifetch_block_len = prev_data->ifetch_block_len;
25283 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25285 /* Filter instructions that cannot be issued on current cycle due to
25286 decoder restrictions. */
25287 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25288 first_cycle_insn_p);
25291 /* INSN is being issued in current solution. Account for its impact on
25292 the decoder model. */
25293 static void
25294 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25295 rtx insn, const void *_prev_data)
25297 ix86_first_cycle_multipass_data_t data
25298 = (ix86_first_cycle_multipass_data_t) _data;
25299 const_ix86_first_cycle_multipass_data_t prev_data
25300 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25302 int insn_size = min_insn_size (insn);
25304 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25305 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25306 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25307 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25309 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25310 if (!data->ready_try_change)
25312 data->ready_try_change = sbitmap_alloc (n_ready);
25313 data->ready_try_change_size = n_ready;
25315 else if (data->ready_try_change_size < n_ready)
25317 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25318 n_ready, 0);
25319 data->ready_try_change_size = n_ready;
25321 bitmap_clear (data->ready_try_change);
25323 /* Filter out insns from ready_try that the core will not be able to issue
25324 on the current cycle due to decoder restrictions. */
25325 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25326 false);
25329 /* Revert the effect on ready_try. */
25330 static void
25331 core2i7_first_cycle_multipass_backtrack (const void *_data,
25332 char *ready_try,
25333 int n_ready ATTRIBUTE_UNUSED)
25335 const_ix86_first_cycle_multipass_data_t data
25336 = (const_ix86_first_cycle_multipass_data_t) _data;
25337 unsigned int i = 0;
25338 sbitmap_iterator sbi;
25340 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25341 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25343 ready_try[i] = 0;
25347 /* Save the result of multipass lookahead scheduling for the next round. */
25348 static void
25349 core2i7_first_cycle_multipass_end (const void *_data)
25351 const_ix86_first_cycle_multipass_data_t data
25352 = (const_ix86_first_cycle_multipass_data_t) _data;
25353 ix86_first_cycle_multipass_data_t next_data
25354 = ix86_first_cycle_multipass_data;
25356 if (data != NULL)
25358 next_data->ifetch_block_len = data->ifetch_block_len;
25359 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25363 /* Deallocate target data. */
25364 static void
25365 core2i7_first_cycle_multipass_fini (void *_data)
25367 ix86_first_cycle_multipass_data_t data
25368 = (ix86_first_cycle_multipass_data_t) _data;
25370 if (data->ready_try_change)
25372 sbitmap_free (data->ready_try_change);
25373 data->ready_try_change = NULL;
25374 data->ready_try_change_size = 0;
25378 /* Prepare for scheduling pass. */
25379 static void
25380 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25381 int verbose ATTRIBUTE_UNUSED,
25382 int max_uid ATTRIBUTE_UNUSED)
25384 /* Install scheduling hooks for current CPU. Some of these hooks are used
25385 in time-critical parts of the scheduler, so we only set them up when
25386 they are actually used. */
25387 switch (ix86_tune)
25389 case PROCESSOR_CORE2:
25390 case PROCESSOR_COREI7:
25391 case PROCESSOR_HASWELL:
25392 /* Do not perform multipass scheduling for pre-reload schedule
25393 to save compile time. */
25394 if (reload_completed)
25396 targetm.sched.dfa_post_advance_cycle
25397 = core2i7_dfa_post_advance_cycle;
25398 targetm.sched.first_cycle_multipass_init
25399 = core2i7_first_cycle_multipass_init;
25400 targetm.sched.first_cycle_multipass_begin
25401 = core2i7_first_cycle_multipass_begin;
25402 targetm.sched.first_cycle_multipass_issue
25403 = core2i7_first_cycle_multipass_issue;
25404 targetm.sched.first_cycle_multipass_backtrack
25405 = core2i7_first_cycle_multipass_backtrack;
25406 targetm.sched.first_cycle_multipass_end
25407 = core2i7_first_cycle_multipass_end;
25408 targetm.sched.first_cycle_multipass_fini
25409 = core2i7_first_cycle_multipass_fini;
25411 /* Set decoder parameters. */
25412 core2i7_secondary_decoder_max_insn_size = 8;
25413 core2i7_ifetch_block_size = 16;
25414 core2i7_ifetch_block_max_insns = 6;
25415 break;
25417 /* ... Fall through ... */
25418 default:
25419 targetm.sched.dfa_post_advance_cycle = NULL;
25420 targetm.sched.first_cycle_multipass_init = NULL;
25421 targetm.sched.first_cycle_multipass_begin = NULL;
25422 targetm.sched.first_cycle_multipass_issue = NULL;
25423 targetm.sched.first_cycle_multipass_backtrack = NULL;
25424 targetm.sched.first_cycle_multipass_end = NULL;
25425 targetm.sched.first_cycle_multipass_fini = NULL;
25426 break;
25431 /* Compute the alignment given to a constant that is being placed in memory.
25432 EXP is the constant and ALIGN is the alignment that the object would
25433 ordinarily have.
25434 The value of this function is used instead of that alignment to align
25435 the object. */
25438 ix86_constant_alignment (tree exp, int align)
25440 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25441 || TREE_CODE (exp) == INTEGER_CST)
25443 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25444 return 64;
25445 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25446 return 128;
25448 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25449 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25450 return BITS_PER_WORD;
25452 return align;
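/* For illustration (hypothetical constants, per the rules above):
     a double (DFmode) constant            -> at least 64-bit alignment;
     a constant in a 128-bit mode          -> at least 128-bit alignment;
     a string constant of length >= 31,
     when not optimizing for size          -> at least BITS_PER_WORD.  */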
25455 /* Compute the alignment for a static variable.
25456 TYPE is the data type, and ALIGN is the alignment that
25457 the object would ordinarily have. The value of this function is used
25458 instead of that alignment to align the object. */
25461 ix86_data_alignment (tree type, int align, bool opt)
25463 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25465 if (opt
25466 && AGGREGATE_TYPE_P (type)
25467 && TYPE_SIZE (type)
25468 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25469 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25470 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25471 && align < max_align)
25472 align = max_align;
25474 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
25475 to a 16-byte boundary. */
25476 if (TARGET_64BIT)
25478 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
25479 && TYPE_SIZE (type)
25480 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25481 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25482 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25483 return 128;
25486 if (!opt)
25487 return align;
25489 if (TREE_CODE (type) == ARRAY_TYPE)
25491 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25492 return 64;
25493 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25494 return 128;
25496 else if (TREE_CODE (type) == COMPLEX_TYPE)
25499 if (TYPE_MODE (type) == DCmode && align < 64)
25500 return 64;
25501 if ((TYPE_MODE (type) == XCmode
25502 || TYPE_MODE (type) == TCmode) && align < 128)
25503 return 128;
25505 else if ((TREE_CODE (type) == RECORD_TYPE
25506 || TREE_CODE (type) == UNION_TYPE
25507 || TREE_CODE (type) == QUAL_UNION_TYPE)
25508 && TYPE_FIELDS (type))
25510 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25511 return 64;
25512 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25513 return 128;
25515 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25516 || TREE_CODE (type) == INTEGER_TYPE)
25518 if (TYPE_MODE (type) == DFmode && align < 64)
25519 return 64;
25520 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25521 return 128;
25524 return align;
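/* For illustration (hypothetical declarations, following the rules above
   on x86-64 with OPT set):
     static double d;           -> at least 64-bit alignment (DFmode case);
     static char buf[32];       -> at least 128-bit alignment (array of 16+ bytes);
     static _Complex double c;  -> at least 64-bit alignment (DCmode case).  */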
25527 /* Compute the alignment for a local variable or a stack slot. EXP is
25528 the data type or decl itself, MODE is the widest mode available and
25529 ALIGN is the alignment that the object would ordinarily have. The
25530 value of this macro is used instead of that alignment to align the
25531 object. */
25533 unsigned int
25534 ix86_local_alignment (tree exp, enum machine_mode mode,
25535 unsigned int align)
25537 tree type, decl;
25539 if (exp && DECL_P (exp))
25541 type = TREE_TYPE (exp);
25542 decl = exp;
25544 else
25546 type = exp;
25547 decl = NULL;
25550 /* Don't do dynamic stack realignment for long long objects with
25551 -mpreferred-stack-boundary=2. */
25552 if (!TARGET_64BIT
25553 && align == 64
25554 && ix86_preferred_stack_boundary < 64
25555 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25556 && (!type || !TYPE_USER_ALIGN (type))
25557 && (!decl || !DECL_USER_ALIGN (decl)))
25558 align = 32;
25560 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25561 register in MODE. We will return the largest alignment of XF
25562 and DF. */
25563 if (!type)
25565 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25566 align = GET_MODE_ALIGNMENT (DFmode);
25567 return align;
25570 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
25571 to a 16-byte boundary. Exact wording is:
25573 An array uses the same alignment as its elements, except that a local or
25574 global array variable of length at least 16 bytes or
25575 a C99 variable-length array variable always has alignment of at least 16 bytes.
25577 This was added to allow use of aligned SSE instructions on arrays. This
25578 rule is meant for static storage (where the compiler cannot do the analysis
25579 by itself). We follow it for automatic variables only when convenient.
25580 We fully control everything in the compiled function, and functions from
25581 other units cannot rely on the alignment.
25583 Exclude the va_list type. It is the common case of a local array where
25584 we cannot benefit from the alignment.
25586 TODO: Probably one should optimize for size only when the var is not escaping. */
25587 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25588 && TARGET_SSE)
25590 if (AGGREGATE_TYPE_P (type)
25591 && (va_list_type_node == NULL_TREE
25592 || (TYPE_MAIN_VARIANT (type)
25593 != TYPE_MAIN_VARIANT (va_list_type_node)))
25594 && TYPE_SIZE (type)
25595 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25596 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25597 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25598 return 128;
25600 if (TREE_CODE (type) == ARRAY_TYPE)
25602 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25603 return 64;
25604 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25605 return 128;
25607 else if (TREE_CODE (type) == COMPLEX_TYPE)
25609 if (TYPE_MODE (type) == DCmode && align < 64)
25610 return 64;
25611 if ((TYPE_MODE (type) == XCmode
25612 || TYPE_MODE (type) == TCmode) && align < 128)
25613 return 128;
25615 else if ((TREE_CODE (type) == RECORD_TYPE
25616 || TREE_CODE (type) == UNION_TYPE
25617 || TREE_CODE (type) == QUAL_UNION_TYPE)
25618 && TYPE_FIELDS (type))
25620 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25621 return 64;
25622 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25623 return 128;
25625 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25626 || TREE_CODE (type) == INTEGER_TYPE)
25629 if (TYPE_MODE (type) == DFmode && align < 64)
25630 return 64;
25631 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25632 return 128;
25634 return align;
25637 /* Compute the minimum required alignment for dynamic stack realignment
25638 purposes for a local variable, parameter or a stack slot. EXP is
25639 the data type or decl itself, MODE is its mode and ALIGN is the
25640 alignment that the object would ordinarily have. */
25642 unsigned int
25643 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25644 unsigned int align)
25646 tree type, decl;
25648 if (exp && DECL_P (exp))
25650 type = TREE_TYPE (exp);
25651 decl = exp;
25653 else
25655 type = exp;
25656 decl = NULL;
25659 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25660 return align;
25662 /* Don't do dynamic stack realignment for long long objects with
25663 -mpreferred-stack-boundary=2. */
25664 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25665 && (!type || !TYPE_USER_ALIGN (type))
25666 && (!decl || !DECL_USER_ALIGN (decl)))
25667 return 32;
25669 return align;
25672 /* Find a location for the static chain incoming to a nested function.
25673 This is a register, unless all free registers are used by arguments. */
25675 static rtx
25676 ix86_static_chain (const_tree fndecl, bool incoming_p)
25678 unsigned regno;
25680 if (!DECL_STATIC_CHAIN (fndecl))
25681 return NULL;
25683 if (TARGET_64BIT)
25685 /* We always use R10 in 64-bit mode. */
25686 regno = R10_REG;
25688 else
25690 tree fntype;
25691 unsigned int ccvt;
25693 /* By default in 32-bit mode we use ECX to pass the static chain. */
25694 regno = CX_REG;
25696 fntype = TREE_TYPE (fndecl);
25697 ccvt = ix86_get_callcvt (fntype);
25698 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25700 /* Fastcall functions use ecx/edx for arguments, which leaves
25701 us with EAX for the static chain.
25702 Thiscall functions use ecx for arguments, which also
25703 leaves us with EAX for the static chain. */
25704 regno = AX_REG;
25706 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25708 /* Thiscall functions use ecx for arguments, which leaves
25709 us with EAX and EDX for the static chain.
25710 For ABI compatibility we use EAX. */
25711 regno = AX_REG;
25713 else if (ix86_function_regparm (fntype, fndecl) == 3)
25715 /* For regparm 3, we have no free call-clobbered registers in
25716 which to store the static chain. In order to implement this,
25717 we have the trampoline push the static chain to the stack.
25718 However, we can't push a value below the return address when
25719 we call the nested function directly, so we have to use an
25720 alternate entry point. For this we use ESI, and have the
25721 alternate entry point push ESI, so that things appear the
25722 same once we're executing the nested function. */
25723 if (incoming_p)
25725 if (fndecl == current_function_decl)
25726 ix86_static_chain_on_stack = true;
25727 return gen_frame_mem (SImode,
25728 plus_constant (Pmode,
25729 arg_pointer_rtx, -8));
25731 regno = SI_REG;
25735 return gen_rtx_REG (Pmode, regno);
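/* Summary of the choices above (32-bit register names, for illustration):
     64-bit code                     -> %r10
     32-bit, default calling conv.   -> %ecx
     32-bit, fastcall or thiscall    -> %eax
     32-bit, regparm(3)              -> pushed on the stack by the trampoline;
                                        %esi at the alternate entry point for
                                        direct calls.  */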
25738 /* Emit RTL insns to initialize the variable parts of a trampoline.
25739 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25740 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25741 to be passed to the target function. */
25743 static void
25744 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25746 rtx mem, fnaddr;
25747 int opcode;
25748 int offset = 0;
25750 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25752 if (TARGET_64BIT)
25754 int size;
25756 /* Load the function address into r11. Try to load the address using
25757 the shorter movl instead of movabs. We may want to support
25758 movq for kernel mode, but the kernel does not use trampolines at
25759 the moment. FNADDR is a 32-bit address and may not be in
25760 DImode when ptr_mode == SImode. Always use movl in this
25761 case. */
25762 if (ptr_mode == SImode
25763 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25765 fnaddr = copy_addr_to_reg (fnaddr);
25767 mem = adjust_address (m_tramp, HImode, offset);
25768 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25770 mem = adjust_address (m_tramp, SImode, offset + 2);
25771 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25772 offset += 6;
25774 else
25776 mem = adjust_address (m_tramp, HImode, offset);
25777 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25779 mem = adjust_address (m_tramp, DImode, offset + 2);
25780 emit_move_insn (mem, fnaddr);
25781 offset += 10;
25784 /* Load static chain using movabs to r10. Use the shorter movl
25785 instead of movabs when ptr_mode == SImode. */
25786 if (ptr_mode == SImode)
25788 opcode = 0xba41;
25789 size = 6;
25791 else
25793 opcode = 0xba49;
25794 size = 10;
25797 mem = adjust_address (m_tramp, HImode, offset);
25798 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25800 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25801 emit_move_insn (mem, chain_value);
25802 offset += size;
25804 /* Jump to r11; the last (unused) byte is a nop, only there to
25805 pad the write out to a single 32-bit store. */
25806 mem = adjust_address (m_tramp, SImode, offset);
25807 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25808 offset += 4;
25810 else
25812 rtx disp, chain;
25814 /* Depending on the static chain location, either load a register
25815 with a constant, or push the constant to the stack. All of the
25816 instructions are the same size. */
25817 chain = ix86_static_chain (fndecl, true);
25818 if (REG_P (chain))
25820 switch (REGNO (chain))
25822 case AX_REG:
25823 opcode = 0xb8; break;
25824 case CX_REG:
25825 opcode = 0xb9; break;
25826 default:
25827 gcc_unreachable ();
25830 else
25831 opcode = 0x68;
25833 mem = adjust_address (m_tramp, QImode, offset);
25834 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25836 mem = adjust_address (m_tramp, SImode, offset + 1);
25837 emit_move_insn (mem, chain_value);
25838 offset += 5;
25840 mem = adjust_address (m_tramp, QImode, offset);
25841 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25843 mem = adjust_address (m_tramp, SImode, offset + 1);
25845 /* Compute offset from the end of the jmp to the target function.
25846 In the case in which the trampoline stores the static chain on
25847 the stack, we need to skip the first insn which pushes the
25848 (call-saved) register static chain; this push is 1 byte. */
25849 offset += 5;
25850 disp = expand_binop (SImode, sub_optab, fnaddr,
25851 plus_constant (Pmode, XEXP (m_tramp, 0),
25852 offset - (MEM_P (chain) ? 1 : 0)),
25853 NULL_RTX, 1, OPTAB_DIRECT);
25854 emit_move_insn (mem, disp);
25857 gcc_assert (offset <= TRAMPOLINE_SIZE);
25859 #ifdef HAVE_ENABLE_EXECUTE_STACK
25860 #ifdef CHECK_EXECUTE_STACK_ENABLED
25861 if (CHECK_EXECUTE_STACK_ENABLED)
25862 #endif
25863 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25864 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25865 #endif
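/* For illustration, the 64-bit trampoline emitted above, when ptr_mode is
   DImode (byte values as written by the HImode/DImode/SImode stores):
     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3                jmp    *%r11
     90                      nop (pads the last write to 32 bits)  */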
25868 /* The following file contains several enumerations and data structures
25869 built from the definitions in i386-builtin-types.def. */
25871 #include "i386-builtin-types.inc"
25873 /* Table for the ix86 builtin non-function types. */
25874 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25876 /* Retrieve an element from the above table, building some of
25877 the types lazily. */
25879 static tree
25880 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25882 unsigned int index;
25883 tree type, itype;
25885 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25887 type = ix86_builtin_type_tab[(int) tcode];
25888 if (type != NULL)
25889 return type;
25891 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25892 if (tcode <= IX86_BT_LAST_VECT)
25894 enum machine_mode mode;
25896 index = tcode - IX86_BT_LAST_PRIM - 1;
25897 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25898 mode = ix86_builtin_type_vect_mode[index];
25900 type = build_vector_type_for_mode (itype, mode);
25902 else
25904 int quals;
25906 index = tcode - IX86_BT_LAST_VECT - 1;
25907 if (tcode <= IX86_BT_LAST_PTR)
25908 quals = TYPE_UNQUALIFIED;
25909 else
25910 quals = TYPE_QUAL_CONST;
25912 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25913 if (quals != TYPE_UNQUALIFIED)
25914 itype = build_qualified_type (itype, quals);
25916 type = build_pointer_type (itype);
25919 ix86_builtin_type_tab[(int) tcode] = type;
25920 return type;
25923 /* Table for the ix86 builtin function types. */
25924 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25926 /* Retrieve an element from the above table, building some of
25927 the types lazily. */
25929 static tree
25930 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25932 tree type;
25934 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25936 type = ix86_builtin_func_type_tab[(int) tcode];
25937 if (type != NULL)
25938 return type;
25940 if (tcode <= IX86_BT_LAST_FUNC)
25942 unsigned start = ix86_builtin_func_start[(int) tcode];
25943 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25944 tree rtype, atype, args = void_list_node;
25945 unsigned i;
25947 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25948 for (i = after - 1; i > start; --i)
25950 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25951 args = tree_cons (NULL, atype, args);
25954 type = build_function_type (rtype, args);
25956 else
25958 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25959 enum ix86_builtin_func_type icode;
25961 icode = ix86_builtin_func_alias_base[index];
25962 type = ix86_get_builtin_func_type (icode);
25965 ix86_builtin_func_type_tab[(int) tcode] = type;
25966 return type;
25970 /* Codes for all the SSE/MMX builtins. */
25971 enum ix86_builtins
25973 IX86_BUILTIN_ADDPS,
25974 IX86_BUILTIN_ADDSS,
25975 IX86_BUILTIN_DIVPS,
25976 IX86_BUILTIN_DIVSS,
25977 IX86_BUILTIN_MULPS,
25978 IX86_BUILTIN_MULSS,
25979 IX86_BUILTIN_SUBPS,
25980 IX86_BUILTIN_SUBSS,
25982 IX86_BUILTIN_CMPEQPS,
25983 IX86_BUILTIN_CMPLTPS,
25984 IX86_BUILTIN_CMPLEPS,
25985 IX86_BUILTIN_CMPGTPS,
25986 IX86_BUILTIN_CMPGEPS,
25987 IX86_BUILTIN_CMPNEQPS,
25988 IX86_BUILTIN_CMPNLTPS,
25989 IX86_BUILTIN_CMPNLEPS,
25990 IX86_BUILTIN_CMPNGTPS,
25991 IX86_BUILTIN_CMPNGEPS,
25992 IX86_BUILTIN_CMPORDPS,
25993 IX86_BUILTIN_CMPUNORDPS,
25994 IX86_BUILTIN_CMPEQSS,
25995 IX86_BUILTIN_CMPLTSS,
25996 IX86_BUILTIN_CMPLESS,
25997 IX86_BUILTIN_CMPNEQSS,
25998 IX86_BUILTIN_CMPNLTSS,
25999 IX86_BUILTIN_CMPNLESS,
26000 IX86_BUILTIN_CMPORDSS,
26001 IX86_BUILTIN_CMPUNORDSS,
26003 IX86_BUILTIN_COMIEQSS,
26004 IX86_BUILTIN_COMILTSS,
26005 IX86_BUILTIN_COMILESS,
26006 IX86_BUILTIN_COMIGTSS,
26007 IX86_BUILTIN_COMIGESS,
26008 IX86_BUILTIN_COMINEQSS,
26009 IX86_BUILTIN_UCOMIEQSS,
26010 IX86_BUILTIN_UCOMILTSS,
26011 IX86_BUILTIN_UCOMILESS,
26012 IX86_BUILTIN_UCOMIGTSS,
26013 IX86_BUILTIN_UCOMIGESS,
26014 IX86_BUILTIN_UCOMINEQSS,
26016 IX86_BUILTIN_CVTPI2PS,
26017 IX86_BUILTIN_CVTPS2PI,
26018 IX86_BUILTIN_CVTSI2SS,
26019 IX86_BUILTIN_CVTSI642SS,
26020 IX86_BUILTIN_CVTSS2SI,
26021 IX86_BUILTIN_CVTSS2SI64,
26022 IX86_BUILTIN_CVTTPS2PI,
26023 IX86_BUILTIN_CVTTSS2SI,
26024 IX86_BUILTIN_CVTTSS2SI64,
26026 IX86_BUILTIN_MAXPS,
26027 IX86_BUILTIN_MAXSS,
26028 IX86_BUILTIN_MINPS,
26029 IX86_BUILTIN_MINSS,
26031 IX86_BUILTIN_LOADUPS,
26032 IX86_BUILTIN_STOREUPS,
26033 IX86_BUILTIN_MOVSS,
26035 IX86_BUILTIN_MOVHLPS,
26036 IX86_BUILTIN_MOVLHPS,
26037 IX86_BUILTIN_LOADHPS,
26038 IX86_BUILTIN_LOADLPS,
26039 IX86_BUILTIN_STOREHPS,
26040 IX86_BUILTIN_STORELPS,
26042 IX86_BUILTIN_MASKMOVQ,
26043 IX86_BUILTIN_MOVMSKPS,
26044 IX86_BUILTIN_PMOVMSKB,
26046 IX86_BUILTIN_MOVNTPS,
26047 IX86_BUILTIN_MOVNTQ,
26049 IX86_BUILTIN_LOADDQU,
26050 IX86_BUILTIN_STOREDQU,
26052 IX86_BUILTIN_PACKSSWB,
26053 IX86_BUILTIN_PACKSSDW,
26054 IX86_BUILTIN_PACKUSWB,
26056 IX86_BUILTIN_PADDB,
26057 IX86_BUILTIN_PADDW,
26058 IX86_BUILTIN_PADDD,
26059 IX86_BUILTIN_PADDQ,
26060 IX86_BUILTIN_PADDSB,
26061 IX86_BUILTIN_PADDSW,
26062 IX86_BUILTIN_PADDUSB,
26063 IX86_BUILTIN_PADDUSW,
26064 IX86_BUILTIN_PSUBB,
26065 IX86_BUILTIN_PSUBW,
26066 IX86_BUILTIN_PSUBD,
26067 IX86_BUILTIN_PSUBQ,
26068 IX86_BUILTIN_PSUBSB,
26069 IX86_BUILTIN_PSUBSW,
26070 IX86_BUILTIN_PSUBUSB,
26071 IX86_BUILTIN_PSUBUSW,
26073 IX86_BUILTIN_PAND,
26074 IX86_BUILTIN_PANDN,
26075 IX86_BUILTIN_POR,
26076 IX86_BUILTIN_PXOR,
26078 IX86_BUILTIN_PAVGB,
26079 IX86_BUILTIN_PAVGW,
26081 IX86_BUILTIN_PCMPEQB,
26082 IX86_BUILTIN_PCMPEQW,
26083 IX86_BUILTIN_PCMPEQD,
26084 IX86_BUILTIN_PCMPGTB,
26085 IX86_BUILTIN_PCMPGTW,
26086 IX86_BUILTIN_PCMPGTD,
26088 IX86_BUILTIN_PMADDWD,
26090 IX86_BUILTIN_PMAXSW,
26091 IX86_BUILTIN_PMAXUB,
26092 IX86_BUILTIN_PMINSW,
26093 IX86_BUILTIN_PMINUB,
26095 IX86_BUILTIN_PMULHUW,
26096 IX86_BUILTIN_PMULHW,
26097 IX86_BUILTIN_PMULLW,
26099 IX86_BUILTIN_PSADBW,
26100 IX86_BUILTIN_PSHUFW,
26102 IX86_BUILTIN_PSLLW,
26103 IX86_BUILTIN_PSLLD,
26104 IX86_BUILTIN_PSLLQ,
26105 IX86_BUILTIN_PSRAW,
26106 IX86_BUILTIN_PSRAD,
26107 IX86_BUILTIN_PSRLW,
26108 IX86_BUILTIN_PSRLD,
26109 IX86_BUILTIN_PSRLQ,
26110 IX86_BUILTIN_PSLLWI,
26111 IX86_BUILTIN_PSLLDI,
26112 IX86_BUILTIN_PSLLQI,
26113 IX86_BUILTIN_PSRAWI,
26114 IX86_BUILTIN_PSRADI,
26115 IX86_BUILTIN_PSRLWI,
26116 IX86_BUILTIN_PSRLDI,
26117 IX86_BUILTIN_PSRLQI,
26119 IX86_BUILTIN_PUNPCKHBW,
26120 IX86_BUILTIN_PUNPCKHWD,
26121 IX86_BUILTIN_PUNPCKHDQ,
26122 IX86_BUILTIN_PUNPCKLBW,
26123 IX86_BUILTIN_PUNPCKLWD,
26124 IX86_BUILTIN_PUNPCKLDQ,
26126 IX86_BUILTIN_SHUFPS,
26128 IX86_BUILTIN_RCPPS,
26129 IX86_BUILTIN_RCPSS,
26130 IX86_BUILTIN_RSQRTPS,
26131 IX86_BUILTIN_RSQRTPS_NR,
26132 IX86_BUILTIN_RSQRTSS,
26133 IX86_BUILTIN_RSQRTF,
26134 IX86_BUILTIN_SQRTPS,
26135 IX86_BUILTIN_SQRTPS_NR,
26136 IX86_BUILTIN_SQRTSS,
26138 IX86_BUILTIN_UNPCKHPS,
26139 IX86_BUILTIN_UNPCKLPS,
26141 IX86_BUILTIN_ANDPS,
26142 IX86_BUILTIN_ANDNPS,
26143 IX86_BUILTIN_ORPS,
26144 IX86_BUILTIN_XORPS,
26146 IX86_BUILTIN_EMMS,
26147 IX86_BUILTIN_LDMXCSR,
26148 IX86_BUILTIN_STMXCSR,
26149 IX86_BUILTIN_SFENCE,
26151 IX86_BUILTIN_FXSAVE,
26152 IX86_BUILTIN_FXRSTOR,
26153 IX86_BUILTIN_FXSAVE64,
26154 IX86_BUILTIN_FXRSTOR64,
26156 IX86_BUILTIN_XSAVE,
26157 IX86_BUILTIN_XRSTOR,
26158 IX86_BUILTIN_XSAVE64,
26159 IX86_BUILTIN_XRSTOR64,
26161 IX86_BUILTIN_XSAVEOPT,
26162 IX86_BUILTIN_XSAVEOPT64,
26164 /* 3DNow! Original */
26165 IX86_BUILTIN_FEMMS,
26166 IX86_BUILTIN_PAVGUSB,
26167 IX86_BUILTIN_PF2ID,
26168 IX86_BUILTIN_PFACC,
26169 IX86_BUILTIN_PFADD,
26170 IX86_BUILTIN_PFCMPEQ,
26171 IX86_BUILTIN_PFCMPGE,
26172 IX86_BUILTIN_PFCMPGT,
26173 IX86_BUILTIN_PFMAX,
26174 IX86_BUILTIN_PFMIN,
26175 IX86_BUILTIN_PFMUL,
26176 IX86_BUILTIN_PFRCP,
26177 IX86_BUILTIN_PFRCPIT1,
26178 IX86_BUILTIN_PFRCPIT2,
26179 IX86_BUILTIN_PFRSQIT1,
26180 IX86_BUILTIN_PFRSQRT,
26181 IX86_BUILTIN_PFSUB,
26182 IX86_BUILTIN_PFSUBR,
26183 IX86_BUILTIN_PI2FD,
26184 IX86_BUILTIN_PMULHRW,
26186 /* 3DNow! Athlon Extensions */
26187 IX86_BUILTIN_PF2IW,
26188 IX86_BUILTIN_PFNACC,
26189 IX86_BUILTIN_PFPNACC,
26190 IX86_BUILTIN_PI2FW,
26191 IX86_BUILTIN_PSWAPDSI,
26192 IX86_BUILTIN_PSWAPDSF,
26194 /* SSE2 */
26195 IX86_BUILTIN_ADDPD,
26196 IX86_BUILTIN_ADDSD,
26197 IX86_BUILTIN_DIVPD,
26198 IX86_BUILTIN_DIVSD,
26199 IX86_BUILTIN_MULPD,
26200 IX86_BUILTIN_MULSD,
26201 IX86_BUILTIN_SUBPD,
26202 IX86_BUILTIN_SUBSD,
26204 IX86_BUILTIN_CMPEQPD,
26205 IX86_BUILTIN_CMPLTPD,
26206 IX86_BUILTIN_CMPLEPD,
26207 IX86_BUILTIN_CMPGTPD,
26208 IX86_BUILTIN_CMPGEPD,
26209 IX86_BUILTIN_CMPNEQPD,
26210 IX86_BUILTIN_CMPNLTPD,
26211 IX86_BUILTIN_CMPNLEPD,
26212 IX86_BUILTIN_CMPNGTPD,
26213 IX86_BUILTIN_CMPNGEPD,
26214 IX86_BUILTIN_CMPORDPD,
26215 IX86_BUILTIN_CMPUNORDPD,
26216 IX86_BUILTIN_CMPEQSD,
26217 IX86_BUILTIN_CMPLTSD,
26218 IX86_BUILTIN_CMPLESD,
26219 IX86_BUILTIN_CMPNEQSD,
26220 IX86_BUILTIN_CMPNLTSD,
26221 IX86_BUILTIN_CMPNLESD,
26222 IX86_BUILTIN_CMPORDSD,
26223 IX86_BUILTIN_CMPUNORDSD,
26225 IX86_BUILTIN_COMIEQSD,
26226 IX86_BUILTIN_COMILTSD,
26227 IX86_BUILTIN_COMILESD,
26228 IX86_BUILTIN_COMIGTSD,
26229 IX86_BUILTIN_COMIGESD,
26230 IX86_BUILTIN_COMINEQSD,
26231 IX86_BUILTIN_UCOMIEQSD,
26232 IX86_BUILTIN_UCOMILTSD,
26233 IX86_BUILTIN_UCOMILESD,
26234 IX86_BUILTIN_UCOMIGTSD,
26235 IX86_BUILTIN_UCOMIGESD,
26236 IX86_BUILTIN_UCOMINEQSD,
26238 IX86_BUILTIN_MAXPD,
26239 IX86_BUILTIN_MAXSD,
26240 IX86_BUILTIN_MINPD,
26241 IX86_BUILTIN_MINSD,
26243 IX86_BUILTIN_ANDPD,
26244 IX86_BUILTIN_ANDNPD,
26245 IX86_BUILTIN_ORPD,
26246 IX86_BUILTIN_XORPD,
26248 IX86_BUILTIN_SQRTPD,
26249 IX86_BUILTIN_SQRTSD,
26251 IX86_BUILTIN_UNPCKHPD,
26252 IX86_BUILTIN_UNPCKLPD,
26254 IX86_BUILTIN_SHUFPD,
26256 IX86_BUILTIN_LOADUPD,
26257 IX86_BUILTIN_STOREUPD,
26258 IX86_BUILTIN_MOVSD,
26260 IX86_BUILTIN_LOADHPD,
26261 IX86_BUILTIN_LOADLPD,
26263 IX86_BUILTIN_CVTDQ2PD,
26264 IX86_BUILTIN_CVTDQ2PS,
26266 IX86_BUILTIN_CVTPD2DQ,
26267 IX86_BUILTIN_CVTPD2PI,
26268 IX86_BUILTIN_CVTPD2PS,
26269 IX86_BUILTIN_CVTTPD2DQ,
26270 IX86_BUILTIN_CVTTPD2PI,
26272 IX86_BUILTIN_CVTPI2PD,
26273 IX86_BUILTIN_CVTSI2SD,
26274 IX86_BUILTIN_CVTSI642SD,
26276 IX86_BUILTIN_CVTSD2SI,
26277 IX86_BUILTIN_CVTSD2SI64,
26278 IX86_BUILTIN_CVTSD2SS,
26279 IX86_BUILTIN_CVTSS2SD,
26280 IX86_BUILTIN_CVTTSD2SI,
26281 IX86_BUILTIN_CVTTSD2SI64,
26283 IX86_BUILTIN_CVTPS2DQ,
26284 IX86_BUILTIN_CVTPS2PD,
26285 IX86_BUILTIN_CVTTPS2DQ,
26287 IX86_BUILTIN_MOVNTI,
26288 IX86_BUILTIN_MOVNTI64,
26289 IX86_BUILTIN_MOVNTPD,
26290 IX86_BUILTIN_MOVNTDQ,
26292 IX86_BUILTIN_MOVQ128,
26294 /* SSE2 MMX */
26295 IX86_BUILTIN_MASKMOVDQU,
26296 IX86_BUILTIN_MOVMSKPD,
26297 IX86_BUILTIN_PMOVMSKB128,
26299 IX86_BUILTIN_PACKSSWB128,
26300 IX86_BUILTIN_PACKSSDW128,
26301 IX86_BUILTIN_PACKUSWB128,
26303 IX86_BUILTIN_PADDB128,
26304 IX86_BUILTIN_PADDW128,
26305 IX86_BUILTIN_PADDD128,
26306 IX86_BUILTIN_PADDQ128,
26307 IX86_BUILTIN_PADDSB128,
26308 IX86_BUILTIN_PADDSW128,
26309 IX86_BUILTIN_PADDUSB128,
26310 IX86_BUILTIN_PADDUSW128,
26311 IX86_BUILTIN_PSUBB128,
26312 IX86_BUILTIN_PSUBW128,
26313 IX86_BUILTIN_PSUBD128,
26314 IX86_BUILTIN_PSUBQ128,
26315 IX86_BUILTIN_PSUBSB128,
26316 IX86_BUILTIN_PSUBSW128,
26317 IX86_BUILTIN_PSUBUSB128,
26318 IX86_BUILTIN_PSUBUSW128,
26320 IX86_BUILTIN_PAND128,
26321 IX86_BUILTIN_PANDN128,
26322 IX86_BUILTIN_POR128,
26323 IX86_BUILTIN_PXOR128,
26325 IX86_BUILTIN_PAVGB128,
26326 IX86_BUILTIN_PAVGW128,
26328 IX86_BUILTIN_PCMPEQB128,
26329 IX86_BUILTIN_PCMPEQW128,
26330 IX86_BUILTIN_PCMPEQD128,
26331 IX86_BUILTIN_PCMPGTB128,
26332 IX86_BUILTIN_PCMPGTW128,
26333 IX86_BUILTIN_PCMPGTD128,
26335 IX86_BUILTIN_PMADDWD128,
26337 IX86_BUILTIN_PMAXSW128,
26338 IX86_BUILTIN_PMAXUB128,
26339 IX86_BUILTIN_PMINSW128,
26340 IX86_BUILTIN_PMINUB128,
26342 IX86_BUILTIN_PMULUDQ,
26343 IX86_BUILTIN_PMULUDQ128,
26344 IX86_BUILTIN_PMULHUW128,
26345 IX86_BUILTIN_PMULHW128,
26346 IX86_BUILTIN_PMULLW128,
26348 IX86_BUILTIN_PSADBW128,
26349 IX86_BUILTIN_PSHUFHW,
26350 IX86_BUILTIN_PSHUFLW,
26351 IX86_BUILTIN_PSHUFD,
26353 IX86_BUILTIN_PSLLDQI128,
26354 IX86_BUILTIN_PSLLWI128,
26355 IX86_BUILTIN_PSLLDI128,
26356 IX86_BUILTIN_PSLLQI128,
26357 IX86_BUILTIN_PSRAWI128,
26358 IX86_BUILTIN_PSRADI128,
26359 IX86_BUILTIN_PSRLDQI128,
26360 IX86_BUILTIN_PSRLWI128,
26361 IX86_BUILTIN_PSRLDI128,
26362 IX86_BUILTIN_PSRLQI128,
26364 IX86_BUILTIN_PSLLDQ128,
26365 IX86_BUILTIN_PSLLW128,
26366 IX86_BUILTIN_PSLLD128,
26367 IX86_BUILTIN_PSLLQ128,
26368 IX86_BUILTIN_PSRAW128,
26369 IX86_BUILTIN_PSRAD128,
26370 IX86_BUILTIN_PSRLW128,
26371 IX86_BUILTIN_PSRLD128,
26372 IX86_BUILTIN_PSRLQ128,
26374 IX86_BUILTIN_PUNPCKHBW128,
26375 IX86_BUILTIN_PUNPCKHWD128,
26376 IX86_BUILTIN_PUNPCKHDQ128,
26377 IX86_BUILTIN_PUNPCKHQDQ128,
26378 IX86_BUILTIN_PUNPCKLBW128,
26379 IX86_BUILTIN_PUNPCKLWD128,
26380 IX86_BUILTIN_PUNPCKLDQ128,
26381 IX86_BUILTIN_PUNPCKLQDQ128,
26383 IX86_BUILTIN_CLFLUSH,
26384 IX86_BUILTIN_MFENCE,
26385 IX86_BUILTIN_LFENCE,
26386 IX86_BUILTIN_PAUSE,
26388 IX86_BUILTIN_BSRSI,
26389 IX86_BUILTIN_BSRDI,
26390 IX86_BUILTIN_RDPMC,
26391 IX86_BUILTIN_RDTSC,
26392 IX86_BUILTIN_RDTSCP,
26393 IX86_BUILTIN_ROLQI,
26394 IX86_BUILTIN_ROLHI,
26395 IX86_BUILTIN_RORQI,
26396 IX86_BUILTIN_RORHI,
26398 /* SSE3. */
26399 IX86_BUILTIN_ADDSUBPS,
26400 IX86_BUILTIN_HADDPS,
26401 IX86_BUILTIN_HSUBPS,
26402 IX86_BUILTIN_MOVSHDUP,
26403 IX86_BUILTIN_MOVSLDUP,
26404 IX86_BUILTIN_ADDSUBPD,
26405 IX86_BUILTIN_HADDPD,
26406 IX86_BUILTIN_HSUBPD,
26407 IX86_BUILTIN_LDDQU,
26409 IX86_BUILTIN_MONITOR,
26410 IX86_BUILTIN_MWAIT,
26412 /* SSSE3. */
26413 IX86_BUILTIN_PHADDW,
26414 IX86_BUILTIN_PHADDD,
26415 IX86_BUILTIN_PHADDSW,
26416 IX86_BUILTIN_PHSUBW,
26417 IX86_BUILTIN_PHSUBD,
26418 IX86_BUILTIN_PHSUBSW,
26419 IX86_BUILTIN_PMADDUBSW,
26420 IX86_BUILTIN_PMULHRSW,
26421 IX86_BUILTIN_PSHUFB,
26422 IX86_BUILTIN_PSIGNB,
26423 IX86_BUILTIN_PSIGNW,
26424 IX86_BUILTIN_PSIGND,
26425 IX86_BUILTIN_PALIGNR,
26426 IX86_BUILTIN_PABSB,
26427 IX86_BUILTIN_PABSW,
26428 IX86_BUILTIN_PABSD,
26430 IX86_BUILTIN_PHADDW128,
26431 IX86_BUILTIN_PHADDD128,
26432 IX86_BUILTIN_PHADDSW128,
26433 IX86_BUILTIN_PHSUBW128,
26434 IX86_BUILTIN_PHSUBD128,
26435 IX86_BUILTIN_PHSUBSW128,
26436 IX86_BUILTIN_PMADDUBSW128,
26437 IX86_BUILTIN_PMULHRSW128,
26438 IX86_BUILTIN_PSHUFB128,
26439 IX86_BUILTIN_PSIGNB128,
26440 IX86_BUILTIN_PSIGNW128,
26441 IX86_BUILTIN_PSIGND128,
26442 IX86_BUILTIN_PALIGNR128,
26443 IX86_BUILTIN_PABSB128,
26444 IX86_BUILTIN_PABSW128,
26445 IX86_BUILTIN_PABSD128,
26447 /* AMDFAM10 - SSE4A New Instructions. */
26448 IX86_BUILTIN_MOVNTSD,
26449 IX86_BUILTIN_MOVNTSS,
26450 IX86_BUILTIN_EXTRQI,
26451 IX86_BUILTIN_EXTRQ,
26452 IX86_BUILTIN_INSERTQI,
26453 IX86_BUILTIN_INSERTQ,
26455 /* SSE4.1. */
26456 IX86_BUILTIN_BLENDPD,
26457 IX86_BUILTIN_BLENDPS,
26458 IX86_BUILTIN_BLENDVPD,
26459 IX86_BUILTIN_BLENDVPS,
26460 IX86_BUILTIN_PBLENDVB128,
26461 IX86_BUILTIN_PBLENDW128,
26463 IX86_BUILTIN_DPPD,
26464 IX86_BUILTIN_DPPS,
26466 IX86_BUILTIN_INSERTPS128,
26468 IX86_BUILTIN_MOVNTDQA,
26469 IX86_BUILTIN_MPSADBW128,
26470 IX86_BUILTIN_PACKUSDW128,
26471 IX86_BUILTIN_PCMPEQQ,
26472 IX86_BUILTIN_PHMINPOSUW128,
26474 IX86_BUILTIN_PMAXSB128,
26475 IX86_BUILTIN_PMAXSD128,
26476 IX86_BUILTIN_PMAXUD128,
26477 IX86_BUILTIN_PMAXUW128,
26479 IX86_BUILTIN_PMINSB128,
26480 IX86_BUILTIN_PMINSD128,
26481 IX86_BUILTIN_PMINUD128,
26482 IX86_BUILTIN_PMINUW128,
26484 IX86_BUILTIN_PMOVSXBW128,
26485 IX86_BUILTIN_PMOVSXBD128,
26486 IX86_BUILTIN_PMOVSXBQ128,
26487 IX86_BUILTIN_PMOVSXWD128,
26488 IX86_BUILTIN_PMOVSXWQ128,
26489 IX86_BUILTIN_PMOVSXDQ128,
26491 IX86_BUILTIN_PMOVZXBW128,
26492 IX86_BUILTIN_PMOVZXBD128,
26493 IX86_BUILTIN_PMOVZXBQ128,
26494 IX86_BUILTIN_PMOVZXWD128,
26495 IX86_BUILTIN_PMOVZXWQ128,
26496 IX86_BUILTIN_PMOVZXDQ128,
26498 IX86_BUILTIN_PMULDQ128,
26499 IX86_BUILTIN_PMULLD128,
26501 IX86_BUILTIN_ROUNDSD,
26502 IX86_BUILTIN_ROUNDSS,
26504 IX86_BUILTIN_ROUNDPD,
26505 IX86_BUILTIN_ROUNDPS,
26507 IX86_BUILTIN_FLOORPD,
26508 IX86_BUILTIN_CEILPD,
26509 IX86_BUILTIN_TRUNCPD,
26510 IX86_BUILTIN_RINTPD,
26511 IX86_BUILTIN_ROUNDPD_AZ,
26513 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26514 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26515 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26517 IX86_BUILTIN_FLOORPS,
26518 IX86_BUILTIN_CEILPS,
26519 IX86_BUILTIN_TRUNCPS,
26520 IX86_BUILTIN_RINTPS,
26521 IX86_BUILTIN_ROUNDPS_AZ,
26523 IX86_BUILTIN_FLOORPS_SFIX,
26524 IX86_BUILTIN_CEILPS_SFIX,
26525 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26527 IX86_BUILTIN_PTESTZ,
26528 IX86_BUILTIN_PTESTC,
26529 IX86_BUILTIN_PTESTNZC,
26531 IX86_BUILTIN_VEC_INIT_V2SI,
26532 IX86_BUILTIN_VEC_INIT_V4HI,
26533 IX86_BUILTIN_VEC_INIT_V8QI,
26534 IX86_BUILTIN_VEC_EXT_V2DF,
26535 IX86_BUILTIN_VEC_EXT_V2DI,
26536 IX86_BUILTIN_VEC_EXT_V4SF,
26537 IX86_BUILTIN_VEC_EXT_V4SI,
26538 IX86_BUILTIN_VEC_EXT_V8HI,
26539 IX86_BUILTIN_VEC_EXT_V2SI,
26540 IX86_BUILTIN_VEC_EXT_V4HI,
26541 IX86_BUILTIN_VEC_EXT_V16QI,
26542 IX86_BUILTIN_VEC_SET_V2DI,
26543 IX86_BUILTIN_VEC_SET_V4SF,
26544 IX86_BUILTIN_VEC_SET_V4SI,
26545 IX86_BUILTIN_VEC_SET_V8HI,
26546 IX86_BUILTIN_VEC_SET_V4HI,
26547 IX86_BUILTIN_VEC_SET_V16QI,
26549 IX86_BUILTIN_VEC_PACK_SFIX,
26550 IX86_BUILTIN_VEC_PACK_SFIX256,
26552 /* SSE4.2. */
26553 IX86_BUILTIN_CRC32QI,
26554 IX86_BUILTIN_CRC32HI,
26555 IX86_BUILTIN_CRC32SI,
26556 IX86_BUILTIN_CRC32DI,
26558 IX86_BUILTIN_PCMPESTRI128,
26559 IX86_BUILTIN_PCMPESTRM128,
26560 IX86_BUILTIN_PCMPESTRA128,
26561 IX86_BUILTIN_PCMPESTRC128,
26562 IX86_BUILTIN_PCMPESTRO128,
26563 IX86_BUILTIN_PCMPESTRS128,
26564 IX86_BUILTIN_PCMPESTRZ128,
26565 IX86_BUILTIN_PCMPISTRI128,
26566 IX86_BUILTIN_PCMPISTRM128,
26567 IX86_BUILTIN_PCMPISTRA128,
26568 IX86_BUILTIN_PCMPISTRC128,
26569 IX86_BUILTIN_PCMPISTRO128,
26570 IX86_BUILTIN_PCMPISTRS128,
26571 IX86_BUILTIN_PCMPISTRZ128,
26573 IX86_BUILTIN_PCMPGTQ,
26575 /* AES instructions */
26576 IX86_BUILTIN_AESENC128,
26577 IX86_BUILTIN_AESENCLAST128,
26578 IX86_BUILTIN_AESDEC128,
26579 IX86_BUILTIN_AESDECLAST128,
26580 IX86_BUILTIN_AESIMC128,
26581 IX86_BUILTIN_AESKEYGENASSIST128,
26583 /* PCLMUL instruction */
26584 IX86_BUILTIN_PCLMULQDQ128,
26586 /* AVX */
26587 IX86_BUILTIN_ADDPD256,
26588 IX86_BUILTIN_ADDPS256,
26589 IX86_BUILTIN_ADDSUBPD256,
26590 IX86_BUILTIN_ADDSUBPS256,
26591 IX86_BUILTIN_ANDPD256,
26592 IX86_BUILTIN_ANDPS256,
26593 IX86_BUILTIN_ANDNPD256,
26594 IX86_BUILTIN_ANDNPS256,
26595 IX86_BUILTIN_BLENDPD256,
26596 IX86_BUILTIN_BLENDPS256,
26597 IX86_BUILTIN_BLENDVPD256,
26598 IX86_BUILTIN_BLENDVPS256,
26599 IX86_BUILTIN_DIVPD256,
26600 IX86_BUILTIN_DIVPS256,
26601 IX86_BUILTIN_DPPS256,
26602 IX86_BUILTIN_HADDPD256,
26603 IX86_BUILTIN_HADDPS256,
26604 IX86_BUILTIN_HSUBPD256,
26605 IX86_BUILTIN_HSUBPS256,
26606 IX86_BUILTIN_MAXPD256,
26607 IX86_BUILTIN_MAXPS256,
26608 IX86_BUILTIN_MINPD256,
26609 IX86_BUILTIN_MINPS256,
26610 IX86_BUILTIN_MULPD256,
26611 IX86_BUILTIN_MULPS256,
26612 IX86_BUILTIN_ORPD256,
26613 IX86_BUILTIN_ORPS256,
26614 IX86_BUILTIN_SHUFPD256,
26615 IX86_BUILTIN_SHUFPS256,
26616 IX86_BUILTIN_SUBPD256,
26617 IX86_BUILTIN_SUBPS256,
26618 IX86_BUILTIN_XORPD256,
26619 IX86_BUILTIN_XORPS256,
26620 IX86_BUILTIN_CMPSD,
26621 IX86_BUILTIN_CMPSS,
26622 IX86_BUILTIN_CMPPD,
26623 IX86_BUILTIN_CMPPS,
26624 IX86_BUILTIN_CMPPD256,
26625 IX86_BUILTIN_CMPPS256,
26626 IX86_BUILTIN_CVTDQ2PD256,
26627 IX86_BUILTIN_CVTDQ2PS256,
26628 IX86_BUILTIN_CVTPD2PS256,
26629 IX86_BUILTIN_CVTPS2DQ256,
26630 IX86_BUILTIN_CVTPS2PD256,
26631 IX86_BUILTIN_CVTTPD2DQ256,
26632 IX86_BUILTIN_CVTPD2DQ256,
26633 IX86_BUILTIN_CVTTPS2DQ256,
26634 IX86_BUILTIN_EXTRACTF128PD256,
26635 IX86_BUILTIN_EXTRACTF128PS256,
26636 IX86_BUILTIN_EXTRACTF128SI256,
26637 IX86_BUILTIN_VZEROALL,
26638 IX86_BUILTIN_VZEROUPPER,
26639 IX86_BUILTIN_VPERMILVARPD,
26640 IX86_BUILTIN_VPERMILVARPS,
26641 IX86_BUILTIN_VPERMILVARPD256,
26642 IX86_BUILTIN_VPERMILVARPS256,
26643 IX86_BUILTIN_VPERMILPD,
26644 IX86_BUILTIN_VPERMILPS,
26645 IX86_BUILTIN_VPERMILPD256,
26646 IX86_BUILTIN_VPERMILPS256,
26647 IX86_BUILTIN_VPERMIL2PD,
26648 IX86_BUILTIN_VPERMIL2PS,
26649 IX86_BUILTIN_VPERMIL2PD256,
26650 IX86_BUILTIN_VPERMIL2PS256,
26651 IX86_BUILTIN_VPERM2F128PD256,
26652 IX86_BUILTIN_VPERM2F128PS256,
26653 IX86_BUILTIN_VPERM2F128SI256,
26654 IX86_BUILTIN_VBROADCASTSS,
26655 IX86_BUILTIN_VBROADCASTSD256,
26656 IX86_BUILTIN_VBROADCASTSS256,
26657 IX86_BUILTIN_VBROADCASTPD256,
26658 IX86_BUILTIN_VBROADCASTPS256,
26659 IX86_BUILTIN_VINSERTF128PD256,
26660 IX86_BUILTIN_VINSERTF128PS256,
26661 IX86_BUILTIN_VINSERTF128SI256,
26662 IX86_BUILTIN_LOADUPD256,
26663 IX86_BUILTIN_LOADUPS256,
26664 IX86_BUILTIN_STOREUPD256,
26665 IX86_BUILTIN_STOREUPS256,
26666 IX86_BUILTIN_LDDQU256,
26667 IX86_BUILTIN_MOVNTDQ256,
26668 IX86_BUILTIN_MOVNTPD256,
26669 IX86_BUILTIN_MOVNTPS256,
26670 IX86_BUILTIN_LOADDQU256,
26671 IX86_BUILTIN_STOREDQU256,
26672 IX86_BUILTIN_MASKLOADPD,
26673 IX86_BUILTIN_MASKLOADPS,
26674 IX86_BUILTIN_MASKSTOREPD,
26675 IX86_BUILTIN_MASKSTOREPS,
26676 IX86_BUILTIN_MASKLOADPD256,
26677 IX86_BUILTIN_MASKLOADPS256,
26678 IX86_BUILTIN_MASKSTOREPD256,
26679 IX86_BUILTIN_MASKSTOREPS256,
26680 IX86_BUILTIN_MOVSHDUP256,
26681 IX86_BUILTIN_MOVSLDUP256,
26682 IX86_BUILTIN_MOVDDUP256,
26684 IX86_BUILTIN_SQRTPD256,
26685 IX86_BUILTIN_SQRTPS256,
26686 IX86_BUILTIN_SQRTPS_NR256,
26687 IX86_BUILTIN_RSQRTPS256,
26688 IX86_BUILTIN_RSQRTPS_NR256,
26690 IX86_BUILTIN_RCPPS256,
26692 IX86_BUILTIN_ROUNDPD256,
26693 IX86_BUILTIN_ROUNDPS256,
26695 IX86_BUILTIN_FLOORPD256,
26696 IX86_BUILTIN_CEILPD256,
26697 IX86_BUILTIN_TRUNCPD256,
26698 IX86_BUILTIN_RINTPD256,
26699 IX86_BUILTIN_ROUNDPD_AZ256,
26701 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26702 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26703 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26705 IX86_BUILTIN_FLOORPS256,
26706 IX86_BUILTIN_CEILPS256,
26707 IX86_BUILTIN_TRUNCPS256,
26708 IX86_BUILTIN_RINTPS256,
26709 IX86_BUILTIN_ROUNDPS_AZ256,
26711 IX86_BUILTIN_FLOORPS_SFIX256,
26712 IX86_BUILTIN_CEILPS_SFIX256,
26713 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26715 IX86_BUILTIN_UNPCKHPD256,
26716 IX86_BUILTIN_UNPCKLPD256,
26717 IX86_BUILTIN_UNPCKHPS256,
26718 IX86_BUILTIN_UNPCKLPS256,
26720 IX86_BUILTIN_SI256_SI,
26721 IX86_BUILTIN_PS256_PS,
26722 IX86_BUILTIN_PD256_PD,
26723 IX86_BUILTIN_SI_SI256,
26724 IX86_BUILTIN_PS_PS256,
26725 IX86_BUILTIN_PD_PD256,
26727 IX86_BUILTIN_VTESTZPD,
26728 IX86_BUILTIN_VTESTCPD,
26729 IX86_BUILTIN_VTESTNZCPD,
26730 IX86_BUILTIN_VTESTZPS,
26731 IX86_BUILTIN_VTESTCPS,
26732 IX86_BUILTIN_VTESTNZCPS,
26733 IX86_BUILTIN_VTESTZPD256,
26734 IX86_BUILTIN_VTESTCPD256,
26735 IX86_BUILTIN_VTESTNZCPD256,
26736 IX86_BUILTIN_VTESTZPS256,
26737 IX86_BUILTIN_VTESTCPS256,
26738 IX86_BUILTIN_VTESTNZCPS256,
26739 IX86_BUILTIN_PTESTZ256,
26740 IX86_BUILTIN_PTESTC256,
26741 IX86_BUILTIN_PTESTNZC256,
26743 IX86_BUILTIN_MOVMSKPD256,
26744 IX86_BUILTIN_MOVMSKPS256,
26746 /* AVX2 */
26747 IX86_BUILTIN_MPSADBW256,
26748 IX86_BUILTIN_PABSB256,
26749 IX86_BUILTIN_PABSW256,
26750 IX86_BUILTIN_PABSD256,
26751 IX86_BUILTIN_PACKSSDW256,
26752 IX86_BUILTIN_PACKSSWB256,
26753 IX86_BUILTIN_PACKUSDW256,
26754 IX86_BUILTIN_PACKUSWB256,
26755 IX86_BUILTIN_PADDB256,
26756 IX86_BUILTIN_PADDW256,
26757 IX86_BUILTIN_PADDD256,
26758 IX86_BUILTIN_PADDQ256,
26759 IX86_BUILTIN_PADDSB256,
26760 IX86_BUILTIN_PADDSW256,
26761 IX86_BUILTIN_PADDUSB256,
26762 IX86_BUILTIN_PADDUSW256,
26763 IX86_BUILTIN_PALIGNR256,
26764 IX86_BUILTIN_AND256I,
26765 IX86_BUILTIN_ANDNOT256I,
26766 IX86_BUILTIN_PAVGB256,
26767 IX86_BUILTIN_PAVGW256,
26768 IX86_BUILTIN_PBLENDVB256,
26769 IX86_BUILTIN_PBLENDVW256,
26770 IX86_BUILTIN_PCMPEQB256,
26771 IX86_BUILTIN_PCMPEQW256,
26772 IX86_BUILTIN_PCMPEQD256,
26773 IX86_BUILTIN_PCMPEQQ256,
26774 IX86_BUILTIN_PCMPGTB256,
26775 IX86_BUILTIN_PCMPGTW256,
26776 IX86_BUILTIN_PCMPGTD256,
26777 IX86_BUILTIN_PCMPGTQ256,
26778 IX86_BUILTIN_PHADDW256,
26779 IX86_BUILTIN_PHADDD256,
26780 IX86_BUILTIN_PHADDSW256,
26781 IX86_BUILTIN_PHSUBW256,
26782 IX86_BUILTIN_PHSUBD256,
26783 IX86_BUILTIN_PHSUBSW256,
26784 IX86_BUILTIN_PMADDUBSW256,
26785 IX86_BUILTIN_PMADDWD256,
26786 IX86_BUILTIN_PMAXSB256,
26787 IX86_BUILTIN_PMAXSW256,
26788 IX86_BUILTIN_PMAXSD256,
26789 IX86_BUILTIN_PMAXUB256,
26790 IX86_BUILTIN_PMAXUW256,
26791 IX86_BUILTIN_PMAXUD256,
26792 IX86_BUILTIN_PMINSB256,
26793 IX86_BUILTIN_PMINSW256,
26794 IX86_BUILTIN_PMINSD256,
26795 IX86_BUILTIN_PMINUB256,
26796 IX86_BUILTIN_PMINUW256,
26797 IX86_BUILTIN_PMINUD256,
26798 IX86_BUILTIN_PMOVMSKB256,
26799 IX86_BUILTIN_PMOVSXBW256,
26800 IX86_BUILTIN_PMOVSXBD256,
26801 IX86_BUILTIN_PMOVSXBQ256,
26802 IX86_BUILTIN_PMOVSXWD256,
26803 IX86_BUILTIN_PMOVSXWQ256,
26804 IX86_BUILTIN_PMOVSXDQ256,
26805 IX86_BUILTIN_PMOVZXBW256,
26806 IX86_BUILTIN_PMOVZXBD256,
26807 IX86_BUILTIN_PMOVZXBQ256,
26808 IX86_BUILTIN_PMOVZXWD256,
26809 IX86_BUILTIN_PMOVZXWQ256,
26810 IX86_BUILTIN_PMOVZXDQ256,
26811 IX86_BUILTIN_PMULDQ256,
26812 IX86_BUILTIN_PMULHRSW256,
26813 IX86_BUILTIN_PMULHUW256,
26814 IX86_BUILTIN_PMULHW256,
26815 IX86_BUILTIN_PMULLW256,
26816 IX86_BUILTIN_PMULLD256,
26817 IX86_BUILTIN_PMULUDQ256,
26818 IX86_BUILTIN_POR256,
26819 IX86_BUILTIN_PSADBW256,
26820 IX86_BUILTIN_PSHUFB256,
26821 IX86_BUILTIN_PSHUFD256,
26822 IX86_BUILTIN_PSHUFHW256,
26823 IX86_BUILTIN_PSHUFLW256,
26824 IX86_BUILTIN_PSIGNB256,
26825 IX86_BUILTIN_PSIGNW256,
26826 IX86_BUILTIN_PSIGND256,
26827 IX86_BUILTIN_PSLLDQI256,
26828 IX86_BUILTIN_PSLLWI256,
26829 IX86_BUILTIN_PSLLW256,
26830 IX86_BUILTIN_PSLLDI256,
26831 IX86_BUILTIN_PSLLD256,
26832 IX86_BUILTIN_PSLLQI256,
26833 IX86_BUILTIN_PSLLQ256,
26834 IX86_BUILTIN_PSRAWI256,
26835 IX86_BUILTIN_PSRAW256,
26836 IX86_BUILTIN_PSRADI256,
26837 IX86_BUILTIN_PSRAD256,
26838 IX86_BUILTIN_PSRLDQI256,
26839 IX86_BUILTIN_PSRLWI256,
26840 IX86_BUILTIN_PSRLW256,
26841 IX86_BUILTIN_PSRLDI256,
26842 IX86_BUILTIN_PSRLD256,
26843 IX86_BUILTIN_PSRLQI256,
26844 IX86_BUILTIN_PSRLQ256,
26845 IX86_BUILTIN_PSUBB256,
26846 IX86_BUILTIN_PSUBW256,
26847 IX86_BUILTIN_PSUBD256,
26848 IX86_BUILTIN_PSUBQ256,
26849 IX86_BUILTIN_PSUBSB256,
26850 IX86_BUILTIN_PSUBSW256,
26851 IX86_BUILTIN_PSUBUSB256,
26852 IX86_BUILTIN_PSUBUSW256,
26853 IX86_BUILTIN_PUNPCKHBW256,
26854 IX86_BUILTIN_PUNPCKHWD256,
26855 IX86_BUILTIN_PUNPCKHDQ256,
26856 IX86_BUILTIN_PUNPCKHQDQ256,
26857 IX86_BUILTIN_PUNPCKLBW256,
26858 IX86_BUILTIN_PUNPCKLWD256,
26859 IX86_BUILTIN_PUNPCKLDQ256,
26860 IX86_BUILTIN_PUNPCKLQDQ256,
26861 IX86_BUILTIN_PXOR256,
26862 IX86_BUILTIN_MOVNTDQA256,
26863 IX86_BUILTIN_VBROADCASTSS_PS,
26864 IX86_BUILTIN_VBROADCASTSS_PS256,
26865 IX86_BUILTIN_VBROADCASTSD_PD256,
26866 IX86_BUILTIN_VBROADCASTSI256,
26867 IX86_BUILTIN_PBLENDD256,
26868 IX86_BUILTIN_PBLENDD128,
26869 IX86_BUILTIN_PBROADCASTB256,
26870 IX86_BUILTIN_PBROADCASTW256,
26871 IX86_BUILTIN_PBROADCASTD256,
26872 IX86_BUILTIN_PBROADCASTQ256,
26873 IX86_BUILTIN_PBROADCASTB128,
26874 IX86_BUILTIN_PBROADCASTW128,
26875 IX86_BUILTIN_PBROADCASTD128,
26876 IX86_BUILTIN_PBROADCASTQ128,
26877 IX86_BUILTIN_VPERMVARSI256,
26878 IX86_BUILTIN_VPERMDF256,
26879 IX86_BUILTIN_VPERMVARSF256,
26880 IX86_BUILTIN_VPERMDI256,
26881 IX86_BUILTIN_VPERMTI256,
26882 IX86_BUILTIN_VEXTRACT128I256,
26883 IX86_BUILTIN_VINSERT128I256,
26884 IX86_BUILTIN_MASKLOADD,
26885 IX86_BUILTIN_MASKLOADQ,
26886 IX86_BUILTIN_MASKLOADD256,
26887 IX86_BUILTIN_MASKLOADQ256,
26888 IX86_BUILTIN_MASKSTORED,
26889 IX86_BUILTIN_MASKSTOREQ,
26890 IX86_BUILTIN_MASKSTORED256,
26891 IX86_BUILTIN_MASKSTOREQ256,
26892 IX86_BUILTIN_PSLLVV4DI,
26893 IX86_BUILTIN_PSLLVV2DI,
26894 IX86_BUILTIN_PSLLVV8SI,
26895 IX86_BUILTIN_PSLLVV4SI,
26896 IX86_BUILTIN_PSRAVV8SI,
26897 IX86_BUILTIN_PSRAVV4SI,
26898 IX86_BUILTIN_PSRLVV4DI,
26899 IX86_BUILTIN_PSRLVV2DI,
26900 IX86_BUILTIN_PSRLVV8SI,
26901 IX86_BUILTIN_PSRLVV4SI,
26903 IX86_BUILTIN_GATHERSIV2DF,
26904 IX86_BUILTIN_GATHERSIV4DF,
26905 IX86_BUILTIN_GATHERDIV2DF,
26906 IX86_BUILTIN_GATHERDIV4DF,
26907 IX86_BUILTIN_GATHERSIV4SF,
26908 IX86_BUILTIN_GATHERSIV8SF,
26909 IX86_BUILTIN_GATHERDIV4SF,
26910 IX86_BUILTIN_GATHERDIV8SF,
26911 IX86_BUILTIN_GATHERSIV2DI,
26912 IX86_BUILTIN_GATHERSIV4DI,
26913 IX86_BUILTIN_GATHERDIV2DI,
26914 IX86_BUILTIN_GATHERDIV4DI,
26915 IX86_BUILTIN_GATHERSIV4SI,
26916 IX86_BUILTIN_GATHERSIV8SI,
26917 IX86_BUILTIN_GATHERDIV4SI,
26918 IX86_BUILTIN_GATHERDIV8SI,
26920   /* Alternate 4-element gather builtins for the vectorizer, where
26921      all operands are 32-byte wide.  */
26922 IX86_BUILTIN_GATHERALTSIV4DF,
26923 IX86_BUILTIN_GATHERALTDIV8SF,
26924 IX86_BUILTIN_GATHERALTSIV4DI,
26925 IX86_BUILTIN_GATHERALTDIV8SI,
26927 /* TFmode support builtins. */
26928 IX86_BUILTIN_INFQ,
26929 IX86_BUILTIN_HUGE_VALQ,
26930 IX86_BUILTIN_FABSQ,
26931 IX86_BUILTIN_COPYSIGNQ,
26933 /* Vectorizer support builtins. */
26934 IX86_BUILTIN_CPYSGNPS,
26935 IX86_BUILTIN_CPYSGNPD,
26936 IX86_BUILTIN_CPYSGNPS256,
26937 IX86_BUILTIN_CPYSGNPD256,
26939 /* FMA4 instructions. */
26940 IX86_BUILTIN_VFMADDSS,
26941 IX86_BUILTIN_VFMADDSD,
26942 IX86_BUILTIN_VFMADDPS,
26943 IX86_BUILTIN_VFMADDPD,
26944 IX86_BUILTIN_VFMADDPS256,
26945 IX86_BUILTIN_VFMADDPD256,
26946 IX86_BUILTIN_VFMADDSUBPS,
26947 IX86_BUILTIN_VFMADDSUBPD,
26948 IX86_BUILTIN_VFMADDSUBPS256,
26949 IX86_BUILTIN_VFMADDSUBPD256,
26951 /* FMA3 instructions. */
26952 IX86_BUILTIN_VFMADDSS3,
26953 IX86_BUILTIN_VFMADDSD3,
26955 /* XOP instructions. */
26956 IX86_BUILTIN_VPCMOV,
26957 IX86_BUILTIN_VPCMOV_V2DI,
26958 IX86_BUILTIN_VPCMOV_V4SI,
26959 IX86_BUILTIN_VPCMOV_V8HI,
26960 IX86_BUILTIN_VPCMOV_V16QI,
26961 IX86_BUILTIN_VPCMOV_V4SF,
26962 IX86_BUILTIN_VPCMOV_V2DF,
26963 IX86_BUILTIN_VPCMOV256,
26964 IX86_BUILTIN_VPCMOV_V4DI256,
26965 IX86_BUILTIN_VPCMOV_V8SI256,
26966 IX86_BUILTIN_VPCMOV_V16HI256,
26967 IX86_BUILTIN_VPCMOV_V32QI256,
26968 IX86_BUILTIN_VPCMOV_V8SF256,
26969 IX86_BUILTIN_VPCMOV_V4DF256,
26971 IX86_BUILTIN_VPPERM,
26973 IX86_BUILTIN_VPMACSSWW,
26974 IX86_BUILTIN_VPMACSWW,
26975 IX86_BUILTIN_VPMACSSWD,
26976 IX86_BUILTIN_VPMACSWD,
26977 IX86_BUILTIN_VPMACSSDD,
26978 IX86_BUILTIN_VPMACSDD,
26979 IX86_BUILTIN_VPMACSSDQL,
26980 IX86_BUILTIN_VPMACSSDQH,
26981 IX86_BUILTIN_VPMACSDQL,
26982 IX86_BUILTIN_VPMACSDQH,
26983 IX86_BUILTIN_VPMADCSSWD,
26984 IX86_BUILTIN_VPMADCSWD,
26986 IX86_BUILTIN_VPHADDBW,
26987 IX86_BUILTIN_VPHADDBD,
26988 IX86_BUILTIN_VPHADDBQ,
26989 IX86_BUILTIN_VPHADDWD,
26990 IX86_BUILTIN_VPHADDWQ,
26991 IX86_BUILTIN_VPHADDDQ,
26992 IX86_BUILTIN_VPHADDUBW,
26993 IX86_BUILTIN_VPHADDUBD,
26994 IX86_BUILTIN_VPHADDUBQ,
26995 IX86_BUILTIN_VPHADDUWD,
26996 IX86_BUILTIN_VPHADDUWQ,
26997 IX86_BUILTIN_VPHADDUDQ,
26998 IX86_BUILTIN_VPHSUBBW,
26999 IX86_BUILTIN_VPHSUBWD,
27000 IX86_BUILTIN_VPHSUBDQ,
27002 IX86_BUILTIN_VPROTB,
27003 IX86_BUILTIN_VPROTW,
27004 IX86_BUILTIN_VPROTD,
27005 IX86_BUILTIN_VPROTQ,
27006 IX86_BUILTIN_VPROTB_IMM,
27007 IX86_BUILTIN_VPROTW_IMM,
27008 IX86_BUILTIN_VPROTD_IMM,
27009 IX86_BUILTIN_VPROTQ_IMM,
27011 IX86_BUILTIN_VPSHLB,
27012 IX86_BUILTIN_VPSHLW,
27013 IX86_BUILTIN_VPSHLD,
27014 IX86_BUILTIN_VPSHLQ,
27015 IX86_BUILTIN_VPSHAB,
27016 IX86_BUILTIN_VPSHAW,
27017 IX86_BUILTIN_VPSHAD,
27018 IX86_BUILTIN_VPSHAQ,
27020 IX86_BUILTIN_VFRCZSS,
27021 IX86_BUILTIN_VFRCZSD,
27022 IX86_BUILTIN_VFRCZPS,
27023 IX86_BUILTIN_VFRCZPD,
27024 IX86_BUILTIN_VFRCZPS256,
27025 IX86_BUILTIN_VFRCZPD256,
27027 IX86_BUILTIN_VPCOMEQUB,
27028 IX86_BUILTIN_VPCOMNEUB,
27029 IX86_BUILTIN_VPCOMLTUB,
27030 IX86_BUILTIN_VPCOMLEUB,
27031 IX86_BUILTIN_VPCOMGTUB,
27032 IX86_BUILTIN_VPCOMGEUB,
27033 IX86_BUILTIN_VPCOMFALSEUB,
27034 IX86_BUILTIN_VPCOMTRUEUB,
27036 IX86_BUILTIN_VPCOMEQUW,
27037 IX86_BUILTIN_VPCOMNEUW,
27038 IX86_BUILTIN_VPCOMLTUW,
27039 IX86_BUILTIN_VPCOMLEUW,
27040 IX86_BUILTIN_VPCOMGTUW,
27041 IX86_BUILTIN_VPCOMGEUW,
27042 IX86_BUILTIN_VPCOMFALSEUW,
27043 IX86_BUILTIN_VPCOMTRUEUW,
27045 IX86_BUILTIN_VPCOMEQUD,
27046 IX86_BUILTIN_VPCOMNEUD,
27047 IX86_BUILTIN_VPCOMLTUD,
27048 IX86_BUILTIN_VPCOMLEUD,
27049 IX86_BUILTIN_VPCOMGTUD,
27050 IX86_BUILTIN_VPCOMGEUD,
27051 IX86_BUILTIN_VPCOMFALSEUD,
27052 IX86_BUILTIN_VPCOMTRUEUD,
27054 IX86_BUILTIN_VPCOMEQUQ,
27055 IX86_BUILTIN_VPCOMNEUQ,
27056 IX86_BUILTIN_VPCOMLTUQ,
27057 IX86_BUILTIN_VPCOMLEUQ,
27058 IX86_BUILTIN_VPCOMGTUQ,
27059 IX86_BUILTIN_VPCOMGEUQ,
27060 IX86_BUILTIN_VPCOMFALSEUQ,
27061 IX86_BUILTIN_VPCOMTRUEUQ,
27063 IX86_BUILTIN_VPCOMEQB,
27064 IX86_BUILTIN_VPCOMNEB,
27065 IX86_BUILTIN_VPCOMLTB,
27066 IX86_BUILTIN_VPCOMLEB,
27067 IX86_BUILTIN_VPCOMGTB,
27068 IX86_BUILTIN_VPCOMGEB,
27069 IX86_BUILTIN_VPCOMFALSEB,
27070 IX86_BUILTIN_VPCOMTRUEB,
27072 IX86_BUILTIN_VPCOMEQW,
27073 IX86_BUILTIN_VPCOMNEW,
27074 IX86_BUILTIN_VPCOMLTW,
27075 IX86_BUILTIN_VPCOMLEW,
27076 IX86_BUILTIN_VPCOMGTW,
27077 IX86_BUILTIN_VPCOMGEW,
27078 IX86_BUILTIN_VPCOMFALSEW,
27079 IX86_BUILTIN_VPCOMTRUEW,
27081 IX86_BUILTIN_VPCOMEQD,
27082 IX86_BUILTIN_VPCOMNED,
27083 IX86_BUILTIN_VPCOMLTD,
27084 IX86_BUILTIN_VPCOMLED,
27085 IX86_BUILTIN_VPCOMGTD,
27086 IX86_BUILTIN_VPCOMGED,
27087 IX86_BUILTIN_VPCOMFALSED,
27088 IX86_BUILTIN_VPCOMTRUED,
27090 IX86_BUILTIN_VPCOMEQQ,
27091 IX86_BUILTIN_VPCOMNEQ,
27092 IX86_BUILTIN_VPCOMLTQ,
27093 IX86_BUILTIN_VPCOMLEQ,
27094 IX86_BUILTIN_VPCOMGTQ,
27095 IX86_BUILTIN_VPCOMGEQ,
27096 IX86_BUILTIN_VPCOMFALSEQ,
27097 IX86_BUILTIN_VPCOMTRUEQ,
27099 /* LWP instructions. */
27100 IX86_BUILTIN_LLWPCB,
27101 IX86_BUILTIN_SLWPCB,
27102 IX86_BUILTIN_LWPVAL32,
27103 IX86_BUILTIN_LWPVAL64,
27104 IX86_BUILTIN_LWPINS32,
27105 IX86_BUILTIN_LWPINS64,
27107 IX86_BUILTIN_CLZS,
27109 /* RTM */
27110 IX86_BUILTIN_XBEGIN,
27111 IX86_BUILTIN_XEND,
27112 IX86_BUILTIN_XABORT,
27113 IX86_BUILTIN_XTEST,
27115 /* BMI instructions. */
27116 IX86_BUILTIN_BEXTR32,
27117 IX86_BUILTIN_BEXTR64,
27118 IX86_BUILTIN_CTZS,
27120 /* TBM instructions. */
27121 IX86_BUILTIN_BEXTRI32,
27122 IX86_BUILTIN_BEXTRI64,
27124 /* BMI2 instructions. */
27125 IX86_BUILTIN_BZHI32,
27126 IX86_BUILTIN_BZHI64,
27127 IX86_BUILTIN_PDEP32,
27128 IX86_BUILTIN_PDEP64,
27129 IX86_BUILTIN_PEXT32,
27130 IX86_BUILTIN_PEXT64,
27132 /* ADX instructions. */
27133 IX86_BUILTIN_ADDCARRYX32,
27134 IX86_BUILTIN_ADDCARRYX64,
27136 /* FSGSBASE instructions. */
27137 IX86_BUILTIN_RDFSBASE32,
27138 IX86_BUILTIN_RDFSBASE64,
27139 IX86_BUILTIN_RDGSBASE32,
27140 IX86_BUILTIN_RDGSBASE64,
27141 IX86_BUILTIN_WRFSBASE32,
27142 IX86_BUILTIN_WRFSBASE64,
27143 IX86_BUILTIN_WRGSBASE32,
27144 IX86_BUILTIN_WRGSBASE64,
27146 /* RDRND instructions. */
27147 IX86_BUILTIN_RDRAND16_STEP,
27148 IX86_BUILTIN_RDRAND32_STEP,
27149 IX86_BUILTIN_RDRAND64_STEP,
27151 /* RDSEED instructions. */
27152 IX86_BUILTIN_RDSEED16_STEP,
27153 IX86_BUILTIN_RDSEED32_STEP,
27154 IX86_BUILTIN_RDSEED64_STEP,
27156 /* F16C instructions. */
27157 IX86_BUILTIN_CVTPH2PS,
27158 IX86_BUILTIN_CVTPH2PS256,
27159 IX86_BUILTIN_CVTPS2PH,
27160 IX86_BUILTIN_CVTPS2PH256,
27162   /* CFString built-in for Darwin.  */
27163 IX86_BUILTIN_CFSTRING,
27165 /* Builtins to get CPU type and supported features. */
27166 IX86_BUILTIN_CPU_INIT,
27167 IX86_BUILTIN_CPU_IS,
27168 IX86_BUILTIN_CPU_SUPPORTS,
27170 IX86_BUILTIN_MAX
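/* As a usage sketch (illustrative only, not a definition from this file):
   the CPU-detection builtins above back GCC's documented entry points,
   roughly as in

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel"))
       use_intel_tuned_path ();
     if (__builtin_cpu_supports ("avx2"))
       use_avx2_path ();

   The helper names and the particular string arguments here are examples
   only; the accepted strings are checked where IX86_BUILTIN_CPU_IS and
   IX86_BUILTIN_CPU_SUPPORTS are expanded.  */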
27173 /* Table for the ix86 builtin decls. */
27174 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27176 /* Table of all of the builtin functions that are possible with different ISAs
27177 but are waiting to be built until a function is declared to use that
27178 ISA. */
27179 struct builtin_isa {
27180 const char *name; /* function name */
27181 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27182 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27183 bool const_p; /* true if the declaration is constant */
27184 bool set_and_not_built_p;
27187 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27190 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save MASK,
27191    the isa_flags this builtin is defined for, in the ix86_builtins_isa array.
27192    Store the function decl in the ix86_builtins array.  Return the function
27193    decl, or NULL_TREE if the builtin was not added.
27195 If the front end has a special hook for builtin functions, delay adding
27196 builtin functions that aren't in the current ISA until the ISA is changed
27197    with function-specific optimization.  Doing so can save about 300K for the
27198 default compiler. When the builtin is expanded, check at that time whether
27199 it is valid.
27201    If the front end doesn't have a special hook, record all builtins, even those
27202    whose instruction set isn't part of the current ISA, in case the user uses
27203    function-specific options for a different ISA, so that we don't get scope
27204    errors if a builtin is added in the middle of a function scope.  */
27206 static inline tree
27207 def_builtin (HOST_WIDE_INT mask, const char *name,
27208 enum ix86_builtin_func_type tcode,
27209 enum ix86_builtins code)
27211 tree decl = NULL_TREE;
27213 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27215 ix86_builtins_isa[(int) code].isa = mask;
27217 mask &= ~OPTION_MASK_ISA_64BIT;
27218 if (mask == 0
27219 || (mask & ix86_isa_flags) != 0
27220 || (lang_hooks.builtin_function
27221 == lang_hooks.builtin_function_ext_scope))
27224 tree type = ix86_get_builtin_func_type (tcode);
27225 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27226 NULL, NULL_TREE);
27227 ix86_builtins[(int) code] = decl;
27228 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27230 else
27232 ix86_builtins[(int) code] = NULL_TREE;
27233 ix86_builtins_isa[(int) code].tcode = tcode;
27234 ix86_builtins_isa[(int) code].name = name;
27235 ix86_builtins_isa[(int) code].const_p = false;
27236 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27240 return decl;
27243 /* Like def_builtin, but also marks the function decl "const". */
27245 static inline tree
27246 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27247 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27249 tree decl = def_builtin (mask, name, tcode, code);
27250 if (decl)
27251 TREE_READONLY (decl) = 1;
27252 else
27253 ix86_builtins_isa[(int) code].const_p = true;
27255 return decl;
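/* A minimal sketch of how def_builtin and def_builtin_const are used
   (the particular mask, type and code values below are illustrative,
   though they follow the pattern of the tables in this file):

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
		  VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps",
			V4SF_FTYPE_V4SF, IX86_BUILTIN_SQRTPS);

   def_builtin_const additionally sets TREE_READONLY, so calls to such a
   builtin with identical arguments can be CSEd by the middle end.  */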
27258 /* Add any new builtin functions for a given ISA that may not have been
27259    declared yet.  This saves a bit of space compared to adding all of the
27260    declarations to the tree up front, even when they are never used.  */
27262 static void
27263 ix86_add_new_builtins (HOST_WIDE_INT isa)
27265 int i;
27267 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27269 if ((ix86_builtins_isa[i].isa & isa) != 0
27270 && ix86_builtins_isa[i].set_and_not_built_p)
27272 tree decl, type;
27274 /* Don't define the builtin again. */
27275 ix86_builtins_isa[i].set_and_not_built_p = false;
27277 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27278 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27279 type, i, BUILT_IN_MD, NULL,
27280 NULL_TREE);
27282 ix86_builtins[i] = decl;
27283 if (ix86_builtins_isa[i].const_p)
27284 TREE_READONLY (decl) = 1;
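/* A sketch of the scenario this deferral serves (assumed typical usage,
   not code from this file): a builtin recorded in ix86_builtins_isa but
   not yet declared gets declared once its ISA is enabled, e.g. through a
   target attribute in user code that includes <immintrin.h>:

     __attribute__ ((target ("avx2")))
     __m256i
     add_epi32 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);	// uses __builtin_ia32_paddd256
     }

   Compiling such a function without -mavx2 on the command line relies on
   ix86_add_new_builtins being run when the attribute switches the ISA.  */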
27289 /* Bits for builtin_description.flag. */
27291 /* Set when we don't support the comparison natively, and should
27292    swap the comparison operands in order to support it.  */
27293 #define BUILTIN_DESC_SWAP_OPERANDS 1
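/* As a sketch of the idea (not taken from any single table entry): a
   "greater than" builtin can be implemented with a "less than" insn
   pattern by swapping its operands, since A > B is equivalent to B < A.
   Entries that need this either carry BUILTIN_DESC_SWAP_OPERANDS or use a
   *_SWAP function type further below, e.g. __builtin_ia32_cmpgtps, which
   reuses the LT comparison with swapped operands.  */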
27295 struct builtin_description
27297 const HOST_WIDE_INT mask;
27298 const enum insn_code icode;
27299 const char *const name;
27300 const enum ix86_builtins code;
27301 const enum rtx_code comparison;
27302 const int flag;
27305 static const struct builtin_description bdesc_comi[] =
27307 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27308 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27309 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27310 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27311 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27312 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27313 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27315 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27316 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27317 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27318 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27329 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
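/* Reading one row of the table above as a sketch (field meanings follow
   struct builtin_description): the first entry says that when SSE is
   enabled, the builtin named "__builtin_ia32_comieq" with code
   IX86_BUILTIN_COMIEQSS is expanded through the CODE_FOR_sse_comi insn
   pattern using the UNEQ comparison.  In the intrinsic headers this
   builtin typically sits behind _mm_comieq_ss, roughly:

     static __inline int
     _mm_comieq_ss (__m128 __A, __m128 __B)
     {
       return __builtin_ia32_comieq ((__v4sf) __A, (__v4sf) __B);
     }

   The wrapper above shows the expected shape only; it is not a copy of
   xmmintrin.h.  */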
27333 static const struct builtin_description bdesc_pcmpestr[] =
27335 /* SSE4.2 */
27336 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27337 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27338 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27339 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27340 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27341 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27342 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27345 static const struct builtin_description bdesc_pcmpistr[] =
27347 /* SSE4.2 */
27348 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27349 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27350 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27351 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27352 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27353 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27354 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
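/* In the two string-compare tables above, the last field is not a swap
   flag but a flags mode: CCAmode, CCCmode, CCOmode, CCSmode and CCZmode
   tell the expander which bit of the flags register the builtin result is
   taken from.  As a sketch of the user-visible side (wrapper names assumed
   from the usual SSE4.2 intrinsic headers, shown for illustration only):

     int a = _mm_cmpistra (x, y, 0);	// __builtin_ia32_pcmpistria128
     int c = _mm_cmpistrc (x, y, 0);	// __builtin_ia32_pcmpistric128

   each of which reduces to one pcmpistri instruction followed by a test
   of the corresponding flag.  */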
27357 /* Special builtins with a variable number of arguments.  */
27358 static const struct builtin_description bdesc_special_args[] =
27360 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27361 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27362 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27364 /* MMX */
27365 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27367 /* 3DNow! */
27368 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27370 /* FXSR, XSAVE and XSAVEOPT */
27371 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27372 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27373 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27374 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27375 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27377 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27378 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27379 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27380 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27381 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27383 /* SSE */
27384 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27385 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27386 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27388 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27389 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27390 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27391 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27393 /* SSE or 3DNow!A */
27394 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27395 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27397 /* SSE2 */
27398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27400 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27401 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27403 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27405 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27407 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27412 /* SSE3 */
27413 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27415 /* SSE4.1 */
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27418 /* SSE4A */
27419 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27420 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27422 /* AVX */
27423 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27424 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27426 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27427 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27428 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27429 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27430 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27432 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27433 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27434 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27435 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27436 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27437 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27440 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27448 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27449 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27450 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27451 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27453 /* AVX2 */
27454 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27455 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27456 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27457 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27458 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27459 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27460 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27461 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27462 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27464 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27465 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27466 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27467 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27468 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27469 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27471 /* FSGSBASE */
27472 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27473 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27474 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27475 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27476 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27477 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27478 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27479 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27481 /* RTM */
27482 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27483 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27484 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
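/* The "special" table above collects builtins whose operands involve
   memory, pointers or other side effects (loads, stores, fences, state
   save/restore), which are handled by a separate expansion path from the
   pure value-to-value builtins in bdesc_args below.  A usage sketch for
   one such entry (illustrative wrapper shape, not copied from
   xmmintrin.h):

     static __inline void
     _mm_storeu_ps (float *__P, __m128 __A)
     {
       __builtin_ia32_storeups (__P, (__v4sf) __A);
     }

   where "__builtin_ia32_storeups" is the IX86_BUILTIN_STOREUPS row with
   function type VOID_FTYPE_PFLOAT_V4SF.  */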
27487 /* Builtins with a variable number of arguments.  */
27488 static const struct builtin_description bdesc_args[] =
27490 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27491 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27492 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27493 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27494 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27495 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27496 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27498 /* MMX */
27499 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27500 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27501 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27502 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27503 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27504 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27506 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27507 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27508 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27509 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27510 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27511 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27512 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27513 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27515 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27516 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27518 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27519 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27520 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27521 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27523 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27524 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27525 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27526 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27527 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27528 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27530 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27531 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27532 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27533 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27534 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27535 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27537 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27538 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27539 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27541 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27543 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27544 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27545 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27546 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27547 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27548 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27550 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27551 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27552 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27553 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27554 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27555 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27557 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27558 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27559 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27560 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27562 /* 3DNow! */
27563 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27564 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27565 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27566 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27568 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27569 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27570 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27571 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27572 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27573 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27574 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27575 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27576 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27577 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27578 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27579 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27580 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27581 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27582 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27584 /* 3DNow!A */
27585 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27586 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27587 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27588 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27589 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27590 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27592 /* SSE */
27593 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27594 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27595 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27596 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27597 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27598 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27599 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27600 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27601 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27602 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27603 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27604 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27606 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27608 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27609 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27610 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27611 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27612 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27613 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27614 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27615 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27617 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27618 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27619 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27620 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27621 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27622 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27623 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27624 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27625 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27626 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27627 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27628 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27629 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27630 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27631 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27632 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27633 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27634 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27635 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27636 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27638 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27639 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27640 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27641 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27643 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27644 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27645 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27646 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27648 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27650 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27651 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27652 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27653 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27654 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27656 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27657 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27658 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27660 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27662 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27663 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27664 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27666 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27667 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27669   /* SSE MMX or 3DNow!A */
27670 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27671 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27672 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27674 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27675 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27676 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27677 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27679 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27680 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27682 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27684 /* SSE2 */
27685 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27687 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27688 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27689 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27690 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27691 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27693 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27694 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27695 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27696 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27697 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27699 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27701 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27702 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27703 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27704 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27706 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27707 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27708 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27710 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27711 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27712 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27713 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27714 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27715 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27716 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27717 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
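/* The arithmetic entries above are what the <emmintrin.h> intrinsics expand
   to; e.g. _mm_add_pd is a wrapper around __builtin_ia32_addpd, which the
   table routes to the addv2df3 pattern.  */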
27719 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27720 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27721 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27722 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27723   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27724 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27725 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27726 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27727 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27728 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27729 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27730 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27731 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27732 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27733 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27734 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27735 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27737 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
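/* cmppd/cmpsd only encode EQ/LT/LE/UNORD and their negations, so the GT/GE
   variants reuse the LT/LE compares with the operands swapped (the *_SWAP
   prototype suffix), and the negated forms use the unordered UNGE/UNGT rtx
   codes.  */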
27740 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27741 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27743 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27745 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27747 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27748 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27750 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27752 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27753 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27754 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27756 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27758 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27759 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27760 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27761 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27762 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27763 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27764 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27765 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27767 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27768 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27770 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27771 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27772 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27773 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27774 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27776 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27777   { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27779 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27781 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27782 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27792 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27794 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27795 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27796 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27797 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27799 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27800 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27801 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27802 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27803 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27804 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27805 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27806 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27812 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27816 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27821 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27822 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27823 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27826 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27827 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27828 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27829 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27830 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27831 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27834 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27835 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27836 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27837 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27838 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27839 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27841 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27842 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27843 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27844 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
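/* The *_COUNT prototypes mark the last operand as a shift count (immediate
   or vector register), while pslldqi128/psrldqi128 use *_INT_CONVERT since
   their patterns work on V1TImode and shift by bits; <emmintrin.h> turns the
   byte count of _mm_slli_si128 into a bit count before calling the builtin.  */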
27846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27847 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27848 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27852   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27854 /* SSE2 MMX */
27855 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27856 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27858 /* SSE3 */
27859   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27860 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27862 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27863 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27864 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27865 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27866 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27867 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27869 /* SSSE3 */
27870 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27871 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27872 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27873 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27874 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27875 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27877 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27878 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27879 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27880 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27881 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27882 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27883 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27884 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27885 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27886 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27887 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27888 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27889 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27890 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27891 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27892 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27893 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27894 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27895 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27896 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27897 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27898 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27899 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27900 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27902 /* SSSE3. */
27903 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27904 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27906 /* SSE4.1 */
27907 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27908 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27909 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27910 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27911 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27912 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27913 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27914 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27915 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27916 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27918 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27919 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27920 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27921 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27922 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27923 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27924 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27925 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27926 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27927 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27928 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27929 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27930 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27932 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27933 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27934 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27935 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27936 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27937 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27938 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27939 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27940 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27941 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27942 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27943 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27945 /* SSE4.1 */
27946 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27947 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27948 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27949 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
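/* The floor/ceil/trunc/rint entries below reuse the same round patterns; the
   fixed ROUND_* mode is carried in the comparison slot (hence the enum
   rtx_code casts) and becomes the rounding immediate when the builtin is
   expanded.  */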
27951 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27952 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27953 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27954 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27956 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27957 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27959 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27960 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27962 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27963 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27964 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27965 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27967 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27968 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27970 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27971 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27973 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27974 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27975 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
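/* ptestz/ptestc/ptestnzc test ZF, CF and "neither flag" respectively; the
   EQ, LTU and GTU codes pick the matching flags condition when the result is
   materialized as an int.  The AVX vtest and ptest256 entries further down
   use the same scheme.  */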
27977 /* SSE4.2 */
27978 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27979 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27980 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27981 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27982 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27984 /* SSE4A */
27985 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27986 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27987 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27988 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27990 /* AES */
27991 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27992 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27994 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27995 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27996 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27997 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27999 /* PCLMUL */
28000 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28002 /* AVX */
28003 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28004 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28007 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28008 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28009 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28011 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28012 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28013 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28014 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28015 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28016 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28017 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28018 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28019 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28020 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28021 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28022 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28023 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28024 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28025 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28026 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28027 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28028 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28030 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28031 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28032 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28033 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28035 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28036 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28037 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28038 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28039 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28040 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28041 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28042 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28043 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28044 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28045 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28046 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28047 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28051 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28052 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28071 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28072 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28074 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28075 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28076 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28077 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28078 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28080 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28082 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28083 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28085 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28086 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28087 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28088 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28090 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28091 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28093 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28094 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28096 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28097 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28098 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28099 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28101 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28102 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28104 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28105 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28107 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28108 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28109 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28110 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28112 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28113 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28114 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28115 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28116 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28117 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28119 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28120 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28121 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28122 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28123 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28124 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28125 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28128 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28129 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28132 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28135 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28138 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28139 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28141 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28143 /* AVX2 */
28144 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28145 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28146 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28147 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28148 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28149 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28150 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28151 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28152 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28153 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28154 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28155 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28156 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28157 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28158 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28159 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28160 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28161 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28162 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28163 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28164 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28165 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28166 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28167 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28168 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28169 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28170 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28171 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28172 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28173 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28174 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28175 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28176 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28177 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28178 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28179 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28180 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28181 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28182 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28183 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28184 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28185 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28186 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28187 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28188 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28189 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28190 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28191 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28192 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28193 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28194 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28196 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28201 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28202 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28203 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28204 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28205 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28206 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28207 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28208 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28209 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28210 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28211 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28212 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28213 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28214 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28215 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28216 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28217 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28218 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28219 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28220 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28221 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28222 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28223 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28224 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28225 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28226 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28227 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28228 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28229 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28230 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28231 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28232 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28233 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28234 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28235 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28236 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28237 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28238 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28239 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28240 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28241 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28242 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28243 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28244 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28245 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28246 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28247 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28248 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28249 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28250 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28251 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28253 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28254 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28255 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28256 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28257 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28258 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28259 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28260 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28261 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28262 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28263 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28264 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28265 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28266 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28267 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28268 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28269 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28270 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28271 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28272 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28273 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28274 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28275 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28276 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28277 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28278 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28286 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28291 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28293 /* BMI */
28294 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28295 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28296 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28298 /* TBM */
28299 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28300 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28302 /* F16C */
28303 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28304 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28305 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28306 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28308 /* BMI2 */
28309 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28310 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28311 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28312 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28313 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28314 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28315 };
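/* A minimal, illustrative sketch of how a few of the scalar BMI/BMI2
   entries above surface in user code; the helper name is invented and
   the snippet assumes it is compiled with -mbmi -mbmi2.  */
#if 0
static unsigned int
example_bmi_usage (unsigned int x)
{
  /* BEXTR: bit-field extract; the start bit goes in bits 0-7 and the
     field length in bits 8-15 of the second operand.  */
  unsigned int field = __builtin_ia32_bextr_u32 (x, 4u | (8u << 8));
  /* PDEP deposits the low bits of X into the positions selected by the
     mask; PEXT gathers the masked bits of X into the low bits.  */
  unsigned int deposited = __builtin_ia32_pdep_si (x, 0xf0f0f0f0u);
  unsigned int extracted = __builtin_ia32_pext_si (x, 0xf0f0f0f0u);
  return field ^ deposited ^ extracted;
}
#endif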
28317 /* FMA4 and XOP. */
28318 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28319 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28320 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28321 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28322 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28323 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28324 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28325 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28326 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28327 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28328 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28329 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28330 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28331 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28332 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28333 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28334 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28335 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28336 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28337 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28338 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28339 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28340 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28341 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28342 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28343 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28344 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28345 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28346 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28347 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28348 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28349 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28350 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28351 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28352 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28353 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28354 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28355 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28356 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28357 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28358 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28359 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28360 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28361 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28362 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28363 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28364 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28365 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28366 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28367 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28368 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28369 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
28371 static const struct builtin_description bdesc_multi_arg[] =
28372 {
28373 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28374 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28375 UNKNOWN, (int)MULTI_ARG_3_SF },
28376 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28377 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28378 UNKNOWN, (int)MULTI_ARG_3_DF },
28380 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28381 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28382 UNKNOWN, (int)MULTI_ARG_3_SF },
28383 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28384 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28385 UNKNOWN, (int)MULTI_ARG_3_DF },
28387 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28388 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28389 UNKNOWN, (int)MULTI_ARG_3_SF },
28390 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28391 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28392 UNKNOWN, (int)MULTI_ARG_3_DF },
28393 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28394 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28395 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28396 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28397 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28398 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28400 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28401 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28402 UNKNOWN, (int)MULTI_ARG_3_SF },
28403 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28404 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28405 UNKNOWN, (int)MULTI_ARG_3_DF },
28406 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28407 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28408 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28409 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28410 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28411 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28413 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28414 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28415 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28416 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28417 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28418 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28419 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28421 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28422 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28423 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28424 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28425 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28426 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28427 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28429 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28431 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28432 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28433 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28434 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28435 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28436 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28437 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28438 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28439 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28440 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28441 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28442 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28444 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28445 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28446 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28447 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28448 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28449 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28450 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28451 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28452 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28453 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28454 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28455 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28456 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28457 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28458 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28459 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28461 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28462 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28463 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28464 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28465 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28466 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28468 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28469 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28470 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28471 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28472 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28473 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28474 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28475 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28476 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28477 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28478 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28479 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28480 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28481 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28482 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28484 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28485 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28486 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28487 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28488 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28489 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28490 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28492 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28493 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28494 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28495 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28496 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28500 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28503 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28504 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28508 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28509 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28510 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28511 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28512 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28513 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28514 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28516 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28517 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28518 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28519 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28520 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28521 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28522 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28524 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28525 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28526 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28527 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28528 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28529 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28530 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28532 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28533 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28535 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28540 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28541 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28543 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28551 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28559 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28561 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28564 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28567 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28570 };
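/* A minimal, illustrative sketch of how one of the FMA4 entries above is
   used; the type and function names are invented and the snippet assumes
   -mfma4 together with GCC's vector_size extension.  */
#if 0
typedef float example_v4sf __attribute__ ((vector_size (16)));

static example_v4sf
example_fma4_usage (example_v4sf a, example_v4sf b, example_v4sf c)
{
  /* Maps to the MULTI_ARG_3_SF row for "__builtin_ia32_vfmaddps":
     a fused multiply-add, a * b + c, in each float lane.  */
  return __builtin_ia32_vfmaddps (a, b, c);
}
#endif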
28573 /* TM vector builtins. */
28575 /* Reuse the existing x86-specific `struct builtin_description' because
28576 we're lazy. Add casts to make them fit. */
28577 static const struct builtin_description bdesc_tm[] =
28578 {
28579 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28580 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28581 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28582 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28583 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28584 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28585 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28587 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28588 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28589 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28590 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28591 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28592 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28593 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28595 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28596 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28597 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28598 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28599 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28600 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28601 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28603 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28604 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28605 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28606 };
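/* An illustrative sketch of the cast round-trip mentioned in the comment
   before this table: the target-independent BUILT_IN_TM_* codes stored
   above as ix86_builtins values are recovered as built_in_function codes
   when the table is walked (see ix86_init_tm_builtins below).  */
#if 0
static enum built_in_function
example_tm_code (void)
{
  /* bdesc_tm[0].code was stored as (enum ix86_builtins)
     BUILT_IN_TM_STORE_M64; cast it back for use with the generic
     builtin tables.  */
  return (enum built_in_function) bdesc_tm[0].code;
}
#endif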
28608 /* TM callbacks. */
28610 /* Return the builtin decl needed to load a vector of TYPE. */
28612 static tree
28613 ix86_builtin_tm_load (tree type)
28614 {
28615 if (TREE_CODE (type) == VECTOR_TYPE)
28616 {
28617 switch (tree_low_cst (TYPE_SIZE (type), 1))
28618 {
28619 case 64:
28620 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28621 case 128:
28622 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28623 case 256:
28624 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28625 }
28626 }
28627 return NULL_TREE;
28628 }
28630 /* Return the builtin decl needed to store a vector of TYPE. */
28632 static tree
28633 ix86_builtin_tm_store (tree type)
28634 {
28635 if (TREE_CODE (type) == VECTOR_TYPE)
28636 {
28637 switch (tree_low_cst (TYPE_SIZE (type), 1))
28638 {
28639 case 64:
28640 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28641 case 128:
28642 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28643 case 256:
28644 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28645 }
28646 }
28647 return NULL_TREE;
28648 }
28650 /* Initialize the transactional memory vector load/store builtins. */
28652 static void
28653 ix86_init_tm_builtins (void)
28655 enum ix86_builtin_func_type ftype;
28656 const struct builtin_description *d;
28657 size_t i;
28658 tree decl;
28659 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28660 tree attrs_log, attrs_type_log;
28662 if (!flag_tm)
28663 return;
28665 /* If there are no builtins defined, we must be compiling in a
28666 language without trans-mem support. */
28667 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28668 return;
28670 /* Use whatever attributes a normal TM load has. */
28671 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28672 attrs_load = DECL_ATTRIBUTES (decl);
28673 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28674 /* Use whatever attributes a normal TM store has. */
28675 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28676 attrs_store = DECL_ATTRIBUTES (decl);
28677 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28678 /* Use whatever attributes a normal TM log has. */
28679 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28680 attrs_log = DECL_ATTRIBUTES (decl);
28681 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28683 for (i = 0, d = bdesc_tm;
28684 i < ARRAY_SIZE (bdesc_tm);
28685 i++, d++)
28687 if ((d->mask & ix86_isa_flags) != 0
28688 || (lang_hooks.builtin_function
28689 == lang_hooks.builtin_function_ext_scope))
28691 tree type, attrs, attrs_type;
28692 enum built_in_function code = (enum built_in_function) d->code;
28694 ftype = (enum ix86_builtin_func_type) d->flag;
28695 type = ix86_get_builtin_func_type (ftype);
28697 if (BUILTIN_TM_LOAD_P (code))
28699 attrs = attrs_load;
28700 attrs_type = attrs_type_load;
28702 else if (BUILTIN_TM_STORE_P (code))
28704 attrs = attrs_store;
28705 attrs_type = attrs_type_store;
28707 else
28709 attrs = attrs_log;
28710 attrs_type = attrs_type_log;
28712 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28713 /* The builtin without the prefix for
28714 calling it directly. */
28715 d->name + strlen ("__builtin_"),
28716 attrs);
28717 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28718 set the TYPE_ATTRIBUTES. */
28719 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28721 set_builtin_decl (code, decl, false);
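/* Note on the add_builtin_function call above: passing
   d->name + strlen ("__builtin_") as the library name also makes each
   builtin callable under its bare _ITM_* spelling (for example _ITM_WM64
   as well as __builtin__ITM_WM64), matching the names the transactional
   memory runtime expects.  */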
28726 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28727 not in the current target ISA, so that the user can compile particular
28728 modules with target-specific options that differ from the command-line
28729 options. */
28730 static void
28731 ix86_init_mmx_sse_builtins (void)
28733 const struct builtin_description * d;
28734 enum ix86_builtin_func_type ftype;
28735 size_t i;
28737 /* Add all special builtins with variable number of operands. */
28738 for (i = 0, d = bdesc_special_args;
28739 i < ARRAY_SIZE (bdesc_special_args);
28740 i++, d++)
28742 if (d->name == 0)
28743 continue;
28745 ftype = (enum ix86_builtin_func_type) d->flag;
28746 def_builtin (d->mask, d->name, ftype, d->code);
28749 /* Add all builtins with variable number of operands. */
28750 for (i = 0, d = bdesc_args;
28751 i < ARRAY_SIZE (bdesc_args);
28752 i++, d++)
28754 if (d->name == 0)
28755 continue;
28757 ftype = (enum ix86_builtin_func_type) d->flag;
28758 def_builtin_const (d->mask, d->name, ftype, d->code);
28761 /* pcmpestr[im] insns. */
28762 for (i = 0, d = bdesc_pcmpestr;
28763 i < ARRAY_SIZE (bdesc_pcmpestr);
28764 i++, d++)
28766 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28767 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28768 else
28769 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28770 def_builtin_const (d->mask, d->name, ftype, d->code);
28773 /* pcmpistr[im] insns. */
28774 for (i = 0, d = bdesc_pcmpistr;
28775 i < ARRAY_SIZE (bdesc_pcmpistr);
28776 i++, d++)
28778 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28779 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28780 else
28781 ftype = INT_FTYPE_V16QI_V16QI_INT;
28782 def_builtin_const (d->mask, d->name, ftype, d->code);
28785 /* comi/ucomi insns. */
28786 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28788 if (d->mask == OPTION_MASK_ISA_SSE2)
28789 ftype = INT_FTYPE_V2DF_V2DF;
28790 else
28791 ftype = INT_FTYPE_V4SF_V4SF;
28792 def_builtin_const (d->mask, d->name, ftype, d->code);
28795 /* SSE */
28796 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28797 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28798 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28799 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
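/* A minimal, illustrative use of the two MXCSR builtins just defined
   (assumes -msse); 0x8000 is the architectural flush-to-zero bit:

     unsigned int mxcsr = __builtin_ia32_stmxcsr ();
     __builtin_ia32_ldmxcsr (mxcsr | 0x8000);  */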
28801 /* SSE or 3DNow!A */
28802 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28803 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28804 IX86_BUILTIN_MASKMOVQ);
28806 /* SSE2 */
28807 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28808 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28810 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28811 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28812 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28813 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
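/* A minimal sketch of the two SSE2 cache and ordering builtins just
   defined (assumes -msse2; ptr stands for any object address): flush one
   cache line, then fence the stores:

     __builtin_ia32_clflush (ptr);
     __builtin_ia32_mfence ();  */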
28815 /* SSE3. */
28816 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28817 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28818 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28819 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
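/* A minimal sketch of the SSE3 pair just defined (assumes -msse3; flag
   stands for any monitored object, and the underlying MONITOR/MWAIT
   instructions are normally usable only at privilege level 0 or where
   the OS enables them):

     __builtin_ia32_monitor (&flag, 0, 0);
     __builtin_ia32_mwait (0, 0);  */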
28821 /* AES */
28822 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28823 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28824 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28825 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28826 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28827 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28828 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28829 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28830 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28831 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28832 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28833 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
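/* A minimal, illustrative AES round using the builtins just defined
   (assumes -maes -msse2; the vector typedef and values are invented):

     typedef long long v2di __attribute__ ((vector_size (16)));
     v2di state = { 0x0123456789abcdefLL, 0x0f1e2d3c4b5a6978LL };
     v2di round_key = { 1LL, 2LL };
     state = __builtin_ia32_aesenc128 (state, round_key);  */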
28835 /* PCLMUL */
28836 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28837 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28839 /* RDRND */
28840 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28841 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28842 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28843 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28844 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28845 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28846 IX86_BUILTIN_RDRAND64_STEP);
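  /* Illustrative use of the RDRAND step builtins above (a sketch, not part
     of this file; these are the builtins the <immintrin.h> wrappers call):

         unsigned int r;
         if (__builtin_ia32_rdrand32_step (&r))
           consume_random (r);        /* hypothetical consumer */
         else
           handle_rdrand_failure ();  /* hardware reported no value */

     The return value is nonzero only when the hardware stored a random
     value through the pointer argument.  */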
28848 /* AVX2 */
28849 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28850 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28851 IX86_BUILTIN_GATHERSIV2DF);
28853 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28854 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28855 IX86_BUILTIN_GATHERSIV4DF);
28857 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28858 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28859 IX86_BUILTIN_GATHERDIV2DF);
28861 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28862 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28863 IX86_BUILTIN_GATHERDIV4DF);
28865 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28866 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28867 IX86_BUILTIN_GATHERSIV4SF);
28869 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28870 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28871 IX86_BUILTIN_GATHERSIV8SF);
28873 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28874 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28875 IX86_BUILTIN_GATHERDIV4SF);
28877 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28878 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28879 IX86_BUILTIN_GATHERDIV8SF);
28881 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28882 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28883 IX86_BUILTIN_GATHERSIV2DI);
28885 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28886 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28887 IX86_BUILTIN_GATHERSIV4DI);
28889 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28890 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28891 IX86_BUILTIN_GATHERDIV2DI);
28893 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28894 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28895 IX86_BUILTIN_GATHERDIV4DI);
28897 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28898 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28899 IX86_BUILTIN_GATHERSIV4SI);
28901 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28902 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28903 IX86_BUILTIN_GATHERSIV8SI);
28905 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28906 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28907 IX86_BUILTIN_GATHERDIV4SI);
28909 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28910 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28911 IX86_BUILTIN_GATHERDIV8SI);
28913 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28914 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28915 IX86_BUILTIN_GATHERALTSIV4DF);
28917 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28918 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28919 IX86_BUILTIN_GATHERALTDIV8SF);
28921 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28922 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28923 IX86_BUILTIN_GATHERALTSIV4DI);
28925 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28926 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28927 IX86_BUILTIN_GATHERALTDIV8SI);
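  /* An informal note on the gather signatures above (illustrative, not
     derived beyond the types shown): a signature such as
     V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT encodes the operand order
     (src, base pointer, index vector, mask, scale), which is the order
     the <immintrin.h> gather intrinsics like _mm_mask_i32gather_pd pass
     their arguments through to these builtins.  */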
28929 /* RTM. */
28930 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28931 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28933 /* MMX access to the vec_init patterns. */
28934 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28935 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28937 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28938 V4HI_FTYPE_HI_HI_HI_HI,
28939 IX86_BUILTIN_VEC_INIT_V4HI);
28941 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28942 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28943 IX86_BUILTIN_VEC_INIT_V8QI);
28945 /* Access to the vec_extract patterns. */
28946 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28947 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28948 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28949 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28950 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28951 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28952 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28953 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28954 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28955 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28957 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28958 "__builtin_ia32_vec_ext_v4hi",
28959 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28961 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28962 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28964 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28965 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28967 /* Access to the vec_set patterns. */
28968 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28969 "__builtin_ia32_vec_set_v2di",
28970 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28972 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28973 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28975 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28976 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28978 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28979 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28981 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28982 "__builtin_ia32_vec_set_v4hi",
28983 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28985 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28986 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28988 /* RDSEED */
28989 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28990 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28991 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28992 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28993 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28994 "__builtin_ia32_rdseed_di_step",
28995 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28997 /* ADCX */
28998 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28999 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29000 def_builtin (OPTION_MASK_ISA_64BIT,
29001 "__builtin_ia32_addcarryx_u64",
29002 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29003 IX86_BUILTIN_ADDCARRYX64);
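  /* Illustrative use of the add-with-carry builtins above (a sketch, not
     part of this file): chaining the carry across two 32-bit limbs.

         unsigned int lo, hi;
         unsigned char c;
         c = __builtin_ia32_addcarryx_u32 (0, a_lo, b_lo, &lo);
         c = __builtin_ia32_addcarryx_u32 (c, a_hi, b_hi, &hi);

     a_lo/a_hi/b_lo/b_hi are hypothetical operands; the carry out is the
     return value and the sum limb is stored through the last argument.  */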
29005 /* Add FMA4 multi-arg instructions. */
29006 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29008 if (d->name == 0)
29009 continue;
29011 ftype = (enum ix86_builtin_func_type) d->flag;
29012 def_builtin_const (d->mask, d->name, ftype, d->code);
29016 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29017 to return a pointer to VERSION_DECL if the outcome of the expression
29018 formed by PREDICATE_CHAIN is true. This function will be called during
29019 version dispatch to decide which function version to execute. It returns
29020 the basic block at the end, to which more conditions can be added. */
29022 static basic_block
29023 add_condition_to_bb (tree function_decl, tree version_decl,
29024 tree predicate_chain, basic_block new_bb)
29026 gimple return_stmt;
29027 tree convert_expr, result_var;
29028 gimple convert_stmt;
29029 gimple call_cond_stmt;
29030 gimple if_else_stmt;
29032 basic_block bb1, bb2, bb3;
29033 edge e12, e23;
29035 tree cond_var, and_expr_var = NULL_TREE;
29036 gimple_seq gseq;
29038 tree predicate_decl, predicate_arg;
29040 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29042 gcc_assert (new_bb != NULL);
29043 gseq = bb_seq (new_bb);
29046 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29047 build_fold_addr_expr (version_decl));
29048 result_var = create_tmp_var (ptr_type_node, NULL);
29049 convert_stmt = gimple_build_assign (result_var, convert_expr);
29050 return_stmt = gimple_build_return (result_var);
29052 if (predicate_chain == NULL_TREE)
29054 gimple_seq_add_stmt (&gseq, convert_stmt);
29055 gimple_seq_add_stmt (&gseq, return_stmt);
29056 set_bb_seq (new_bb, gseq);
29057 gimple_set_bb (convert_stmt, new_bb);
29058 gimple_set_bb (return_stmt, new_bb);
29059 pop_cfun ();
29060 return new_bb;
29063 while (predicate_chain != NULL)
29065 cond_var = create_tmp_var (integer_type_node, NULL);
29066 predicate_decl = TREE_PURPOSE (predicate_chain);
29067 predicate_arg = TREE_VALUE (predicate_chain);
29068 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29069 gimple_call_set_lhs (call_cond_stmt, cond_var);
29071 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29072 gimple_set_bb (call_cond_stmt, new_bb);
29073 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29075 predicate_chain = TREE_CHAIN (predicate_chain);
29077 if (and_expr_var == NULL)
29078 and_expr_var = cond_var;
29079 else
29081 gimple assign_stmt;
29082 /* Use MIN_EXPR to check whether any of the condition values is zero:
29083 and_expr_var = min_expr <cond_var, and_expr_var>. */
29084 assign_stmt = gimple_build_assign (and_expr_var,
29085 build2 (MIN_EXPR, integer_type_node,
29086 cond_var, and_expr_var));
29088 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29089 gimple_set_bb (assign_stmt, new_bb);
29090 gimple_seq_add_stmt (&gseq, assign_stmt);
29094 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29095 integer_zero_node,
29096 NULL_TREE, NULL_TREE);
29097 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29098 gimple_set_bb (if_else_stmt, new_bb);
29099 gimple_seq_add_stmt (&gseq, if_else_stmt);
29101 gimple_seq_add_stmt (&gseq, convert_stmt);
29102 gimple_seq_add_stmt (&gseq, return_stmt);
29103 set_bb_seq (new_bb, gseq);
29105 bb1 = new_bb;
29106 e12 = split_block (bb1, if_else_stmt);
29107 bb2 = e12->dest;
29108 e12->flags &= ~EDGE_FALLTHRU;
29109 e12->flags |= EDGE_TRUE_VALUE;
29111 e23 = split_block (bb2, return_stmt);
29113 gimple_set_bb (convert_stmt, bb2);
29114 gimple_set_bb (return_stmt, bb2);
29116 bb3 = e23->dest;
29117 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29119 remove_edge (e23);
29120 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29122 pop_cfun ();
29124 return bb3;
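/* Informally (a sketch of what the statements built above amount to, not
   generated text), the sequence appended for one non-default version is:

       cond_1 = predicate_1 (arg_1);         // one call per chain element
       cond_2 = predicate_2 (arg_2);
       and_var = MIN_EXPR <cond_2, cond_1>;  // zero if any predicate failed
       if (and_var > 0)
         return (void *) &version_decl;
       // control falls through to the block returned for the next version
*/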
29127 /* This parses the attribute arguments to target in DECL and determines
29128 the right builtin to use to match the platform specification.
29129 It returns the priority value for this version decl. If PREDICATE_LIST
29130 is not NULL, it stores the list of cpu features that need to be checked
29131 before dispatching this function. */
29133 static unsigned int
29134 get_builtin_code_for_version (tree decl, tree *predicate_list)
29136 tree attrs;
29137 struct cl_target_option cur_target;
29138 tree target_node;
29139 struct cl_target_option *new_target;
29140 const char *arg_str = NULL;
29141 const char *attrs_str = NULL;
29142 char *tok_str = NULL;
29143 char *token;
29145 /* Priority of i386 features, greater value is higher priority. This is
29146 used to decide the order in which function dispatch must happen. For
29147 instance, a version specialized for SSE4.2 should be checked for dispatch
29148 before a version for SSE3, as SSE4.2 implies SSE3. */
29149 enum feature_priority
29151 P_ZERO = 0,
29152 P_MMX,
29153 P_SSE,
29154 P_SSE2,
29155 P_SSE3,
29156 P_SSSE3,
29157 P_PROC_SSSE3,
29158 P_SSE4_a,
29159 P_PROC_SSE4_a,
29160 P_SSE4_1,
29161 P_SSE4_2,
29162 P_PROC_SSE4_2,
29163 P_POPCNT,
29164 P_AVX,
29165 P_AVX2,
29166 P_FMA,
29167 P_PROC_FMA
29170 enum feature_priority priority = P_ZERO;
29172 /* These are the target attribute strings for which a dispatcher is
29173 available, from fold_builtin_cpu. */
29175 static struct _feature_list
29177 const char *const name;
29178 const enum feature_priority priority;
29180 const feature_list[] =
29182 {"mmx", P_MMX},
29183 {"sse", P_SSE},
29184 {"sse2", P_SSE2},
29185 {"sse3", P_SSE3},
29186 {"ssse3", P_SSSE3},
29187 {"sse4.1", P_SSE4_1},
29188 {"sse4.2", P_SSE4_2},
29189 {"popcnt", P_POPCNT},
29190 {"avx", P_AVX},
29191 {"avx2", P_AVX2}
29195 static unsigned int NUM_FEATURES
29196 = sizeof (feature_list) / sizeof (struct _feature_list);
29198 unsigned int i;
29200 tree predicate_chain = NULL_TREE;
29201 tree predicate_decl, predicate_arg;
29203 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29204 gcc_assert (attrs != NULL);
29206 attrs = TREE_VALUE (TREE_VALUE (attrs));
29208 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29209 attrs_str = TREE_STRING_POINTER (attrs);
29211 /* Return priority zero for default function. */
29212 if (strcmp (attrs_str, "default") == 0)
29213 return 0;
29215 /* Handle arch= if specified. For priority, set it to be 1 more than
29216 the best instruction set the processor can handle. For instance, if
29217 there is a version for atom and a version for ssse3 (the highest ISA
29218 priority for atom), the atom version must be checked for dispatch
29219 before the ssse3 version. */
29220 if (strstr (attrs_str, "arch=") != NULL)
29222 cl_target_option_save (&cur_target, &global_options);
29223 target_node = ix86_valid_target_attribute_tree (attrs);
29225 gcc_assert (target_node);
29226 new_target = TREE_TARGET_OPTION (target_node);
29227 gcc_assert (new_target);
29229 if (new_target->arch_specified && new_target->arch > 0)
29231 switch (new_target->arch)
29233 case PROCESSOR_CORE2:
29234 arg_str = "core2";
29235 priority = P_PROC_SSSE3;
29236 break;
29237 case PROCESSOR_COREI7:
29238 arg_str = "corei7";
29239 priority = P_PROC_SSE4_2;
29240 break;
29241 case PROCESSOR_ATOM:
29242 arg_str = "atom";
29243 priority = P_PROC_SSSE3;
29244 break;
29245 case PROCESSOR_AMDFAM10:
29246 arg_str = "amdfam10h";
29247 priority = P_PROC_SSE4_a;
29248 break;
29249 case PROCESSOR_BDVER1:
29250 arg_str = "bdver1";
29251 priority = P_PROC_FMA;
29252 break;
29253 case PROCESSOR_BDVER2:
29254 arg_str = "bdver2";
29255 priority = P_PROC_FMA;
29256 break;
29260 cl_target_option_restore (&global_options, &cur_target);
29262 if (predicate_list && arg_str == NULL)
29264 error_at (DECL_SOURCE_LOCATION (decl),
29265 "No dispatcher found for the versioning attributes");
29266 return 0;
29269 if (predicate_list)
29271 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29272 /* For a C string literal the length includes the trailing NULL. */
29273 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29274 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29275 predicate_chain);
29279 /* Process feature name. */
29280 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29281 strcpy (tok_str, attrs_str);
29282 token = strtok (tok_str, ",");
29283 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29285 while (token != NULL)
29287 /* Do not process "arch=" */
29288 if (strncmp (token, "arch=", 5) == 0)
29290 token = strtok (NULL, ",");
29291 continue;
29293 for (i = 0; i < NUM_FEATURES; ++i)
29295 if (strcmp (token, feature_list[i].name) == 0)
29297 if (predicate_list)
29299 predicate_arg = build_string_literal (
29300 strlen (feature_list[i].name) + 1,
29301 feature_list[i].name);
29302 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29303 predicate_chain);
29305 /* Find the maximum priority feature. */
29306 if (feature_list[i].priority > priority)
29307 priority = feature_list[i].priority;
29309 break;
29312 if (predicate_list && i == NUM_FEATURES)
29314 error_at (DECL_SOURCE_LOCATION (decl),
29315 "No dispatcher found for %s", token);
29316 return 0;
29318 token = strtok (NULL, ",");
29320 free (tok_str);
29322 if (predicate_list && predicate_chain == NULL_TREE)
29324 error_at (DECL_SOURCE_LOCATION (decl),
29325 "No dispatcher found for the versioning attributes : %s",
29326 attrs_str);
29327 return 0;
29329 else if (predicate_list)
29331 predicate_chain = nreverse (predicate_chain);
29332 *predicate_list = predicate_chain;
29335 return priority;
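/* For illustration (informal, not derived beyond the tables above): a
   version declared with target("avx,popcnt") gets priority P_AVX, the
   maximum of its feature priorities, while target("arch=corei7") yields
   P_PROC_SSE4_2, which ranks just above the feature versions corei7
   implies (sse4.2 and below) but still below an avx version.  */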
29338 /* This compares the priority of target features in function DECL1
29339 and DECL2. It returns positive value if DECL1 is higher priority,
29340 negative value if DECL2 is higher priority and 0 if they are the
29341 same. */
29343 static int
29344 ix86_compare_version_priority (tree decl1, tree decl2)
29346 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29347 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29349 return (int)priority1 - (int)priority2;
29352 /* V1 and V2 point to function versions with different priorities
29353 based on the target ISA. This function compares their priorities. */
29355 static int
29356 feature_compare (const void *v1, const void *v2)
29358 typedef struct _function_version_info
29360 tree version_decl;
29361 tree predicate_chain;
29362 unsigned int dispatch_priority;
29363 } function_version_info;
29365 const function_version_info c1 = *(const function_version_info *)v1;
29366 const function_version_info c2 = *(const function_version_info *)v2;
29367 return (c2.dispatch_priority - c1.dispatch_priority);
29370 /* This function generates the dispatch function for
29371 multi-versioned functions. DISPATCH_DECL is the function which will
29372 contain the dispatch logic. FNDECLS are the function choices for
29373 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29374 in DISPATCH_DECL in which the dispatch code is generated. */
29376 static int
29377 dispatch_function_versions (tree dispatch_decl,
29378 void *fndecls_p,
29379 basic_block *empty_bb)
29381 tree default_decl;
29382 gimple ifunc_cpu_init_stmt;
29383 gimple_seq gseq;
29384 int ix;
29385 tree ele;
29386 vec<tree> *fndecls;
29387 unsigned int num_versions = 0;
29388 unsigned int actual_versions = 0;
29389 unsigned int i;
29391 struct _function_version_info
29393 tree version_decl;
29394 tree predicate_chain;
29395 unsigned int dispatch_priority;
29396 }*function_version_info;
29398 gcc_assert (dispatch_decl != NULL
29399 && fndecls_p != NULL
29400 && empty_bb != NULL);
29402 /* fndecls_p is actually a vector. */
29403 fndecls = static_cast<vec<tree> *> (fndecls_p);
29405 /* At least one more version other than the default. */
29406 num_versions = fndecls->length ();
29407 gcc_assert (num_versions >= 2);
29409 function_version_info = (struct _function_version_info *)
29410 XNEWVEC (struct _function_version_info, (num_versions - 1));
29412 /* The first version in the vector is the default decl. */
29413 default_decl = (*fndecls)[0];
29415 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29417 gseq = bb_seq (*empty_bb);
29418 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29419 constructors, so explicitly call __builtin_cpu_init here. */
29420 ifunc_cpu_init_stmt = gimple_build_call_vec (
29421 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29422 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29423 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29424 set_bb_seq (*empty_bb, gseq);
29426 pop_cfun ();
29429 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29431 tree version_decl = ele;
29432 tree predicate_chain = NULL_TREE;
29433 unsigned int priority;
29434 /* Get attribute string, parse it and find the right predicate decl.
29435 The predicate function could be a lengthy combination of many
29436 features, like arch-type and various isa-variants. */
29437 priority = get_builtin_code_for_version (version_decl,
29438 &predicate_chain);
29440 if (predicate_chain == NULL_TREE)
29441 continue;
29443 function_version_info [actual_versions].version_decl = version_decl;
29444 function_version_info [actual_versions].predicate_chain
29445 = predicate_chain;
29446 function_version_info [actual_versions].dispatch_priority = priority;
29447 actual_versions++;
29450 /* Sort the versions according to descending order of dispatch priority. The
29451 priority is based on the ISA. This is not a perfect solution. There
29452 could still be ambiguity. If more than one function version is suitable
29453 to execute, which one should be dispatched? In the future, allow the user
29454 to specify a dispatch priority next to the version. */
29455 qsort (function_version_info, actual_versions,
29456 sizeof (struct _function_version_info), feature_compare);
29458 for (i = 0; i < actual_versions; ++i)
29459 *empty_bb = add_condition_to_bb (dispatch_decl,
29460 function_version_info[i].version_decl,
29461 function_version_info[i].predicate_chain,
29462 *empty_bb);
29464 /* Dispatch the default version at the end. */
29465 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29466 NULL, *empty_bb);
29468 free (function_version_info);
29469 return 0;
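/* Schematically (an informal sketch of the result, not generated text),
   for versions V_avx2, V_sse4_2 and a default D, the resolver body built
   here tests the highest-priority predicates first:

       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))   return &V_avx2;
       if (__builtin_cpu_supports ("sse4.2")) return &V_sse4_2;
       return &D;   // default version, appended last with no predicate
*/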
29472 /* Comparator function to be used in qsort routine to sort attribute
29473 specification strings to "target". */
29475 static int
29476 attr_strcmp (const void *v1, const void *v2)
29478 const char *c1 = *(char *const*)v1;
29479 const char *c2 = *(char *const*)v2;
29480 return strcmp (c1, c2);
29483 /* ARGLIST is the argument to target attribute. This function tokenizes
29484 the comma separated arguments, sorts them and returns a string which
29485 is a unique identifier for the comma separated arguments. It also
29486 replaces non-identifier characters "=,-" with "_". */
29488 static char *
29489 sorted_attr_string (tree arglist)
29491 tree arg;
29492 size_t str_len_sum = 0;
29493 char **args = NULL;
29494 char *attr_str, *ret_str;
29495 char *attr = NULL;
29496 unsigned int argnum = 1;
29497 unsigned int i;
29499 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29501 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29502 size_t len = strlen (str);
29503 str_len_sum += len + 1;
29504 if (arg != arglist)
29505 argnum++;
29506 for (i = 0; i < strlen (str); i++)
29507 if (str[i] == ',')
29508 argnum++;
29511 attr_str = XNEWVEC (char, str_len_sum);
29512 str_len_sum = 0;
29513 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29515 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29516 size_t len = strlen (str);
29517 memcpy (attr_str + str_len_sum, str, len);
29518 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29519 str_len_sum += len + 1;
29522 /* Replace "=,-" with "_". */
29523 for (i = 0; i < strlen (attr_str); i++)
29524 if (attr_str[i] == '=' || attr_str[i]== '-')
29525 attr_str[i] = '_';
29527 if (argnum == 1)
29528 return attr_str;
29530 args = XNEWVEC (char *, argnum);
29532 i = 0;
29533 attr = strtok (attr_str, ",");
29534 while (attr != NULL)
29536 args[i] = attr;
29537 i++;
29538 attr = strtok (NULL, ",");
29541 qsort (args, argnum, sizeof (char *), attr_strcmp);
29543 ret_str = XNEWVEC (char, str_len_sum);
29544 str_len_sum = 0;
29545 for (i = 0; i < argnum; i++)
29547 size_t len = strlen (args[i]);
29548 memcpy (ret_str + str_len_sum, args[i], len);
29549 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29550 str_len_sum += len + 1;
29553 XDELETEVEC (args);
29554 XDELETEVEC (attr_str);
29555 return ret_str;
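/* For example (illustrative only), both target("popcnt,avx") and
   target("avx,popcnt") produce the identifier "avx_popcnt", and
   target("arch=corei7,sse4.2") produces "arch_corei7_sse4.2": '=' is
   rewritten to '_', the comma-separated tokens are sorted, and '_'
   joins them.  */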
29558 /* This function changes the assembler name for functions that are
29559 versions. If DECL is a function version and has a "target"
29560 attribute, it appends the attribute string to its assembler name. */
29562 static tree
29563 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29565 tree version_attr;
29566 const char *orig_name, *version_string;
29567 char *attr_str, *assembler_name;
29569 if (DECL_DECLARED_INLINE_P (decl)
29570 && lookup_attribute ("gnu_inline",
29571 DECL_ATTRIBUTES (decl)))
29572 error_at (DECL_SOURCE_LOCATION (decl),
29573 "Function versions cannot be marked as gnu_inline,"
29574 " bodies have to be generated");
29576 if (DECL_VIRTUAL_P (decl)
29577 || DECL_VINDEX (decl))
29578 sorry ("Virtual function multiversioning not supported");
29580 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29582 /* target attribute string cannot be NULL. */
29583 gcc_assert (version_attr != NULL_TREE);
29585 orig_name = IDENTIFIER_POINTER (id);
29586 version_string
29587 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29589 if (strcmp (version_string, "default") == 0)
29590 return id;
29592 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29593 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29595 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29597 /* Allow assembler name to be modified if already set. */
29598 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29599 SET_DECL_RTL (decl, NULL);
29601 tree ret = get_identifier (assembler_name);
29602 XDELETEVEC (attr_str);
29603 XDELETEVEC (assembler_name);
29604 return ret;
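/* For example (illustrative only), a version of foo declared with
   __attribute__ ((target ("avx,popcnt"))) ends up with the assembler
   name "foo.avx_popcnt", while the version marked "default" keeps its
   original assembler name.  */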
29607 /* This function returns true if FN1 and FN2 are versions of the same function,
29608 that is, the target strings of the function decls are different. This assumes
29609 that FN1 and FN2 have the same signature. */
29611 static bool
29612 ix86_function_versions (tree fn1, tree fn2)
29614 tree attr1, attr2;
29615 char *target1, *target2;
29616 bool result;
29618 if (TREE_CODE (fn1) != FUNCTION_DECL
29619 || TREE_CODE (fn2) != FUNCTION_DECL)
29620 return false;
29622 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29623 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29625 /* At least one function decl should have the target attribute specified. */
29626 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29627 return false;
29629 /* Diagnose missing target attribute if one of the decls is already
29630 multi-versioned. */
29631 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29633 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29635 if (attr2 != NULL_TREE)
29637 tree tem = fn1;
29638 fn1 = fn2;
29639 fn2 = tem;
29640 attr1 = attr2;
29642 error_at (DECL_SOURCE_LOCATION (fn2),
29643 "missing %<target%> attribute for multi-versioned %D",
29644 fn2);
29645 inform (DECL_SOURCE_LOCATION (fn1),
29646 "previous declaration of %D", fn1);
29647 /* Prevent diagnosing of the same error multiple times. */
29648 DECL_ATTRIBUTES (fn2)
29649 = tree_cons (get_identifier ("target"),
29650 copy_node (TREE_VALUE (attr1)),
29651 DECL_ATTRIBUTES (fn2));
29653 return false;
29656 target1 = sorted_attr_string (TREE_VALUE (attr1));
29657 target2 = sorted_attr_string (TREE_VALUE (attr2));
29659 /* The sorted target strings must be different for fn1 and fn2
29660 to be versions. */
29661 if (strcmp (target1, target2) == 0)
29662 result = false;
29663 else
29664 result = true;
29666 XDELETEVEC (target1);
29667 XDELETEVEC (target2);
29669 return result;
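/* As an illustration (not part of this file): declarations of foo with
   target("avx,popcnt") and target("popcnt,avx") are NOT treated as
   distinct versions, because their sorted attribute strings compare
   equal, whereas target("avx") and target("sse4.2") are.  */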
29672 static tree
29673 ix86_mangle_decl_assembler_name (tree decl, tree id)
29675 /* For function version, add the target suffix to the assembler name. */
29676 if (TREE_CODE (decl) == FUNCTION_DECL
29677 && DECL_FUNCTION_VERSIONED (decl))
29678 id = ix86_mangle_function_version_assembler_name (decl, id);
29679 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29680 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29681 #endif
29683 return id;
29686 /* Return a new name by appending SUFFIX to the DECL name. If MAKE_UNIQUE
29687 is true, append the full path name of the source file. */
29689 static char *
29690 make_name (tree decl, const char *suffix, bool make_unique)
29692 char *global_var_name;
29693 int name_len;
29694 const char *name;
29695 const char *unique_name = NULL;
29697 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29699 /* Get a unique name that can be used globally without any chances
29700 of collision at link time. */
29701 if (make_unique)
29702 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29704 name_len = strlen (name) + strlen (suffix) + 2;
29706 if (make_unique)
29707 name_len += strlen (unique_name) + 1;
29708 global_var_name = XNEWVEC (char, name_len);
29710 /* Use '.' to concatenate names as it is demangler friendly. */
29711 if (make_unique)
29712 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29713 suffix);
29714 else
29715 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29717 return global_var_name;
29720 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29722 /* Make a dispatcher declaration for the multi-versioned function DECL.
29723 Calls to the function DECL will be replaced with calls to the dispatcher
29724 by the front end. Return the decl created. */
29726 static tree
29727 make_dispatcher_decl (const tree decl)
29729 tree func_decl;
29730 char *func_name;
29731 tree fn_type, func_type;
29732 bool is_uniq = false;
29734 if (TREE_PUBLIC (decl) == 0)
29735 is_uniq = true;
29737 func_name = make_name (decl, "ifunc", is_uniq);
29739 fn_type = TREE_TYPE (decl);
29740 func_type = build_function_type (TREE_TYPE (fn_type),
29741 TYPE_ARG_TYPES (fn_type));
29743 func_decl = build_fn_decl (func_name, func_type);
29744 XDELETEVEC (func_name);
29745 TREE_USED (func_decl) = 1;
29746 DECL_CONTEXT (func_decl) = NULL_TREE;
29747 DECL_INITIAL (func_decl) = error_mark_node;
29748 DECL_ARTIFICIAL (func_decl) = 1;
29749 /* Mark this function as external; the resolver will flip it again if
29750 it gets generated. */
29751 DECL_EXTERNAL (func_decl) = 1;
29752 /* This will be an IFUNC, and IFUNCs have to be externally visible. */
29753 TREE_PUBLIC (func_decl) = 1;
29755 return func_decl;
29758 #endif
29760 /* Returns true if DECL is multi-versioned and is the default function,
29761 that is, it is not tagged with a target-specific optimization. */
29763 static bool
29764 is_function_default_version (const tree decl)
29766 if (TREE_CODE (decl) != FUNCTION_DECL
29767 || !DECL_FUNCTION_VERSIONED (decl))
29768 return false;
29769 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29770 gcc_assert (attr);
29771 attr = TREE_VALUE (TREE_VALUE (attr));
29772 return (TREE_CODE (attr) == STRING_CST
29773 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29776 /* Make a dispatcher declaration for the multi-versioned function DECL.
29777 Calls to the function DECL will be replaced with calls to the dispatcher
29778 by the front end. Returns the decl of the dispatcher function. */
29780 static tree
29781 ix86_get_function_versions_dispatcher (void *decl)
29783 tree fn = (tree) decl;
29784 struct cgraph_node *node = NULL;
29785 struct cgraph_node *default_node = NULL;
29786 struct cgraph_function_version_info *node_v = NULL;
29787 struct cgraph_function_version_info *first_v = NULL;
29789 tree dispatch_decl = NULL;
29791 struct cgraph_function_version_info *default_version_info = NULL;
29793 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29795 node = cgraph_get_node (fn);
29796 gcc_assert (node != NULL);
29798 node_v = get_cgraph_node_version (node);
29799 gcc_assert (node_v != NULL);
29801 if (node_v->dispatcher_resolver != NULL)
29802 return node_v->dispatcher_resolver;
29804 /* Find the default version and make it the first node. */
29805 first_v = node_v;
29806 /* Go to the beginning of the chain. */
29807 while (first_v->prev != NULL)
29808 first_v = first_v->prev;
29809 default_version_info = first_v;
29810 while (default_version_info != NULL)
29812 if (is_function_default_version
29813 (default_version_info->this_node->symbol.decl))
29814 break;
29815 default_version_info = default_version_info->next;
29818 /* If there is no default node, just return NULL. */
29819 if (default_version_info == NULL)
29820 return NULL;
29822 /* Make default info the first node. */
29823 if (first_v != default_version_info)
29825 default_version_info->prev->next = default_version_info->next;
29826 if (default_version_info->next)
29827 default_version_info->next->prev = default_version_info->prev;
29828 first_v->prev = default_version_info;
29829 default_version_info->next = first_v;
29830 default_version_info->prev = NULL;
29833 default_node = default_version_info->this_node;
29835 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29836 if (targetm.has_ifunc_p ())
29838 struct cgraph_function_version_info *it_v = NULL;
29839 struct cgraph_node *dispatcher_node = NULL;
29840 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29842 /* Right now, the dispatching is done via ifunc. */
29843 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29845 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29846 gcc_assert (dispatcher_node != NULL);
29847 dispatcher_node->dispatcher_function = 1;
29848 dispatcher_version_info
29849 = insert_new_cgraph_node_version (dispatcher_node);
29850 dispatcher_version_info->next = default_version_info;
29851 dispatcher_node->symbol.definition = 1;
29853 /* Set the dispatcher for all the versions. */
29854 it_v = default_version_info;
29855 while (it_v != NULL)
29857 it_v->dispatcher_resolver = dispatch_decl;
29858 it_v = it_v->next;
29861 else
29862 #endif
29864 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29865 "multiversioning needs ifunc which is not supported "
29866 "on this target");
29869 return dispatch_decl;
29872 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29873 it to CHAIN. */
29875 static tree
29876 make_attribute (const char *name, const char *arg_name, tree chain)
29878 tree attr_name;
29879 tree attr_arg_name;
29880 tree attr_args;
29881 tree attr;
29883 attr_name = get_identifier (name);
29884 attr_arg_name = build_string (strlen (arg_name), arg_name);
29885 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29886 attr = tree_cons (attr_name, attr_args, chain);
29887 return attr;
29890 /* Make the resolver function decl to dispatch the versions of
29891 a multi-versioned function, DEFAULT_DECL. Create an
29892 empty basic block in the resolver and store the pointer in
29893 EMPTY_BB. Return the decl of the resolver function. */
29895 static tree
29896 make_resolver_func (const tree default_decl,
29897 const tree dispatch_decl,
29898 basic_block *empty_bb)
29900 char *resolver_name;
29901 tree decl, type, decl_name, t;
29902 bool is_uniq = false;
29904 /* IFUNCs have to be globally visible. So, if the default_decl is
29905 not, then the name of the IFUNC should be made unique. */
29906 if (TREE_PUBLIC (default_decl) == 0)
29907 is_uniq = true;
29909 /* Append the filename to the resolver function if the versions are
29910 not externally visible. This is because the resolver function has
29911 to be externally visible for the loader to find it. So, appending
29912 the filename will prevent conflicts with a resolver function from
29913 another module which is based on the same version name. */
29914 resolver_name = make_name (default_decl, "resolver", is_uniq);
29916 /* The resolver function should return a (void *). */
29917 type = build_function_type_list (ptr_type_node, NULL_TREE);
29919 decl = build_fn_decl (resolver_name, type);
29920 decl_name = get_identifier (resolver_name);
29921 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29923 DECL_NAME (decl) = decl_name;
29924 TREE_USED (decl) = 1;
29925 DECL_ARTIFICIAL (decl) = 1;
29926 DECL_IGNORED_P (decl) = 0;
29927 /* IFUNC resolvers have to be externally visible. */
29928 TREE_PUBLIC (decl) = 1;
29929 DECL_UNINLINABLE (decl) = 1;
29931 /* Resolver is not external, body is generated. */
29932 DECL_EXTERNAL (decl) = 0;
29933 DECL_EXTERNAL (dispatch_decl) = 0;
29935 DECL_CONTEXT (decl) = NULL_TREE;
29936 DECL_INITIAL (decl) = make_node (BLOCK);
29937 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29939 if (DECL_COMDAT_GROUP (default_decl)
29940 || TREE_PUBLIC (default_decl))
29942 /* In this case, each translation unit with a call to this
29943 versioned function will put out a resolver. Ensure it
29944 is comdat to keep just one copy. */
29945 DECL_COMDAT (decl) = 1;
29946 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29948 /* Build result decl and add to function_decl. */
29949 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29950 DECL_ARTIFICIAL (t) = 1;
29951 DECL_IGNORED_P (t) = 1;
29952 DECL_RESULT (decl) = t;
29954 gimplify_function_tree (decl);
29955 push_cfun (DECL_STRUCT_FUNCTION (decl));
29956 *empty_bb = init_lowered_empty_function (decl, false);
29958 cgraph_add_new_function (decl, true);
29959 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29961 pop_cfun ();
29963 gcc_assert (dispatch_decl != NULL);
29964 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29965 DECL_ATTRIBUTES (dispatch_decl)
29966 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29968 /* Create the alias for dispatch to resolver here. */
29969 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29970 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29971 XDELETEVEC (resolver_name);
29972 return decl;
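/* In outline (an informal sketch, not literal source): the resolver decl
   returned here later receives the dispatch body, and dispatch_decl is
   marked roughly as

       extern ret_t foo_dispatcher (args)
         __attribute__ ((ifunc ("foo.resolver")));

   so the dynamic loader runs the resolver once and binds calls through
   the dispatcher to whichever version it returns.  Symbol names shown
   are illustrative.  */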
29975 /* Generate the dispatching code body to dispatch multi-versioned function
29976 DECL. The target hook is called to process the "target" attributes and
29977 provide the code to dispatch the right function at run-time. NODE points
29978 to the dispatcher decl whose body will be created. */
29980 static tree
29981 ix86_generate_version_dispatcher_body (void *node_p)
29983 tree resolver_decl;
29984 basic_block empty_bb;
29985 vec<tree> fn_ver_vec = vNULL;
29986 tree default_ver_decl;
29987 struct cgraph_node *versn;
29988 struct cgraph_node *node;
29990 struct cgraph_function_version_info *node_version_info = NULL;
29991 struct cgraph_function_version_info *versn_info = NULL;
29993 node = (cgraph_node *)node_p;
29995 node_version_info = get_cgraph_node_version (node);
29996 gcc_assert (node->dispatcher_function
29997 && node_version_info != NULL);
29999 if (node_version_info->dispatcher_resolver)
30000 return node_version_info->dispatcher_resolver;
30002 /* The first version in the chain corresponds to the default version. */
30003 default_ver_decl = node_version_info->next->this_node->symbol.decl;
30005 /* node is going to be an alias, so remove the finalized bit. */
30006 node->symbol.definition = false;
30008 resolver_decl = make_resolver_func (default_ver_decl,
30009 node->symbol.decl, &empty_bb);
30011 node_version_info->dispatcher_resolver = resolver_decl;
30013 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30015 fn_ver_vec.create (2);
30017 for (versn_info = node_version_info->next; versn_info;
30018 versn_info = versn_info->next)
30020 versn = versn_info->this_node;
30021 /* Check for virtual functions here again, as by this time it should
30022 have been determined if this function needs a vtable index or
30023 not. This happens for methods in derived classes that override
30024 virtual methods in base classes but are not explicitly marked as
30025 virtual. */
30026 if (DECL_VINDEX (versn->symbol.decl))
30027 sorry ("Virtual function multiversioning not supported");
30029 fn_ver_vec.safe_push (versn->symbol.decl);
30032 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30033 fn_ver_vec.release ();
30034 rebuild_cgraph_edges ();
30035 pop_cfun ();
30036 return resolver_decl;
30038 /* This builds the processor_model struct type defined in
30039 libgcc/config/i386/cpuinfo.c */
30041 static tree
30042 build_processor_model_struct (void)
30044 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30045 "__cpu_features"};
30046 tree field = NULL_TREE, field_chain = NULL_TREE;
30047 int i;
30048 tree type = make_node (RECORD_TYPE);
30050 /* The first 3 fields are unsigned int. */
30051 for (i = 0; i < 3; ++i)
30053 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30054 get_identifier (field_name[i]), unsigned_type_node);
30055 if (field_chain != NULL_TREE)
30056 DECL_CHAIN (field) = field_chain;
30057 field_chain = field;
30060 /* The last field is an array of unsigned integers of size one. */
30061 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30062 get_identifier (field_name[3]),
30063 build_array_type (unsigned_type_node,
30064 build_index_type (size_one_node)));
30065 if (field_chain != NULL_TREE)
30066 DECL_CHAIN (field) = field_chain;
30067 field_chain = field;
30069 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30070 return type;
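/* For reference, the layout built above mirrors (a sketch; see
   libgcc/config/i386/cpuinfo.c for the authoritative definition):

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };
*/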
30073 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30075 static tree
30076 make_var_decl (tree type, const char *name)
30078 tree new_decl;
30080 new_decl = build_decl (UNKNOWN_LOCATION,
30081 VAR_DECL,
30082 get_identifier(name),
30083 type);
30085 DECL_EXTERNAL (new_decl) = 1;
30086 TREE_STATIC (new_decl) = 1;
30087 TREE_PUBLIC (new_decl) = 1;
30088 DECL_INITIAL (new_decl) = 0;
30089 DECL_ARTIFICIAL (new_decl) = 0;
30090 DECL_PRESERVE_P (new_decl) = 1;
30092 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30093 assemble_variable (new_decl, 0, 0, 0);
30095 return new_decl;
30098 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30099 into an integer defined in libgcc/config/i386/cpuinfo.c */
30101 static tree
30102 fold_builtin_cpu (tree fndecl, tree *args)
30104 unsigned int i;
30105 enum ix86_builtins fn_code = (enum ix86_builtins)
30106 DECL_FUNCTION_CODE (fndecl);
30107 tree param_string_cst = NULL;
30109 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30110 enum processor_features
30112 F_CMOV = 0,
30113 F_MMX,
30114 F_POPCNT,
30115 F_SSE,
30116 F_SSE2,
30117 F_SSE3,
30118 F_SSSE3,
30119 F_SSE4_1,
30120 F_SSE4_2,
30121 F_AVX,
30122 F_AVX2,
30123 F_MAX
30126 /* These are the values for vendor types and cpu types and subtypes
30127 in cpuinfo.c. Cpu types and subtypes should be subtracted by
30128 the corresponding start value. */
30129 enum processor_model
30131 M_INTEL = 1,
30132 M_AMD,
30133 M_CPU_TYPE_START,
30134 M_INTEL_ATOM,
30135 M_INTEL_CORE2,
30136 M_INTEL_COREI7,
30137 M_AMDFAM10H,
30138 M_AMDFAM15H,
30139 M_INTEL_SLM,
30140 M_CPU_SUBTYPE_START,
30141 M_INTEL_COREI7_NEHALEM,
30142 M_INTEL_COREI7_WESTMERE,
30143 M_INTEL_COREI7_SANDYBRIDGE,
30144 M_AMDFAM10H_BARCELONA,
30145 M_AMDFAM10H_SHANGHAI,
30146 M_AMDFAM10H_ISTANBUL,
30147 M_AMDFAM15H_BDVER1,
30148 M_AMDFAM15H_BDVER2,
30149 M_AMDFAM15H_BDVER3
30152 static struct _arch_names_table
30154 const char *const name;
30155 const enum processor_model model;
30157 const arch_names_table[] =
30159 {"amd", M_AMD},
30160 {"intel", M_INTEL},
30161 {"atom", M_INTEL_ATOM},
30162 {"slm", M_INTEL_SLM},
30163 {"core2", M_INTEL_CORE2},
30164 {"corei7", M_INTEL_COREI7},
30165 {"nehalem", M_INTEL_COREI7_NEHALEM},
30166 {"westmere", M_INTEL_COREI7_WESTMERE},
30167 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30168 {"amdfam10h", M_AMDFAM10H},
30169 {"barcelona", M_AMDFAM10H_BARCELONA},
30170 {"shanghai", M_AMDFAM10H_SHANGHAI},
30171 {"istanbul", M_AMDFAM10H_ISTANBUL},
30172 {"amdfam15h", M_AMDFAM15H},
30173 {"bdver1", M_AMDFAM15H_BDVER1},
30174 {"bdver2", M_AMDFAM15H_BDVER2},
30175 {"bdver3", M_AMDFAM15H_BDVER3},
30178 static struct _isa_names_table
30180 const char *const name;
30181 const enum processor_features feature;
30183 const isa_names_table[] =
30185 {"cmov", F_CMOV},
30186 {"mmx", F_MMX},
30187 {"popcnt", F_POPCNT},
30188 {"sse", F_SSE},
30189 {"sse2", F_SSE2},
30190 {"sse3", F_SSE3},
30191 {"ssse3", F_SSSE3},
30192 {"sse4.1", F_SSE4_1},
30193 {"sse4.2", F_SSE4_2},
30194 {"avx", F_AVX},
30195 {"avx2", F_AVX2}
30198 tree __processor_model_type = build_processor_model_struct ();
30199 tree __cpu_model_var = make_var_decl (__processor_model_type,
30200 "__cpu_model");
30203 varpool_add_new_variable (__cpu_model_var);
30205 gcc_assert ((args != NULL) && (*args != NULL));
30207 param_string_cst = *args;
30208 while (param_string_cst
30209 && TREE_CODE (param_string_cst) != STRING_CST)
30211 /* *args must be an expr that can contain other EXPRs leading to a
30212 STRING_CST. */
30213 if (!EXPR_P (param_string_cst))
30215 error ("Parameter to builtin must be a string constant or literal");
30216 return integer_zero_node;
30218 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30221 gcc_assert (param_string_cst);
30223 if (fn_code == IX86_BUILTIN_CPU_IS)
30225 tree ref;
30226 tree field;
30227 tree final;
30229 unsigned int field_val = 0;
30230 unsigned int NUM_ARCH_NAMES
30231 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30233 for (i = 0; i < NUM_ARCH_NAMES; i++)
30234 if (strcmp (arch_names_table[i].name,
30235 TREE_STRING_POINTER (param_string_cst)) == 0)
30236 break;
30238 if (i == NUM_ARCH_NAMES)
30240 error ("Parameter to builtin not valid: %s",
30241 TREE_STRING_POINTER (param_string_cst));
30242 return integer_zero_node;
30245 field = TYPE_FIELDS (__processor_model_type);
30246 field_val = arch_names_table[i].model;
30248 /* CPU types are stored in the next field. */
30249 if (field_val > M_CPU_TYPE_START
30250 && field_val < M_CPU_SUBTYPE_START)
30252 field = DECL_CHAIN (field);
30253 field_val -= M_CPU_TYPE_START;
30256 /* CPU subtypes are stored in the next field. */
30257 if (field_val > M_CPU_SUBTYPE_START)
30259 field = DECL_CHAIN (DECL_CHAIN (field));
30260 field_val -= M_CPU_SUBTYPE_START;
30263 /* Get the appropriate field in __cpu_model. */
30264 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30265 field, NULL_TREE);
30267 /* Check the value. */
30268 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30269 build_int_cstu (unsigned_type_node, field_val));
30270 return build1 (CONVERT_EXPR, integer_type_node, final);
30272 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30274 tree ref;
30275 tree array_elt;
30276 tree field;
30277 tree final;
30279 unsigned int field_val = 0;
30280 unsigned int NUM_ISA_NAMES
30281 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30283 for (i = 0; i < NUM_ISA_NAMES; i++)
30284 if (strcmp (isa_names_table[i].name,
30285 TREE_STRING_POINTER (param_string_cst)) == 0)
30286 break;
30288 if (i == NUM_ISA_NAMES)
30290 error ("Parameter to builtin not valid: %s",
30291 TREE_STRING_POINTER (param_string_cst));
30292 return integer_zero_node;
30295 field = TYPE_FIELDS (__processor_model_type);
30296 /* Get the last field, which is __cpu_features. */
30297 while (DECL_CHAIN (field))
30298 field = DECL_CHAIN (field);
30300 /* Get the appropriate field: __cpu_model.__cpu_features */
30301 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30302 field, NULL_TREE);
30304 /* Access the 0th element of __cpu_features array. */
30305 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30306 integer_zero_node, NULL_TREE, NULL_TREE);
30308 field_val = (1 << isa_names_table[i].feature);
30309 /* Return __cpu_model.__cpu_features[0] & field_val */
30310 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30311 build_int_cstu (unsigned_type_node, field_val));
30312 return build1 (CONVERT_EXPR, integer_type_node, final);
30314 gcc_unreachable ();
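/* Roughly (an informal sketch of the trees built above, not generated
   output), the two folds amount to:

       __builtin_cpu_is ("corei7")
         ==> (int) (__cpu_model.__cpu_type
                    == M_INTEL_COREI7 - M_CPU_TYPE_START)

       __builtin_cpu_supports ("avx")
         ==> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX))
*/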
30317 static tree
30318 ix86_fold_builtin (tree fndecl, int n_args,
30319 tree *args, bool ignore ATTRIBUTE_UNUSED)
30321 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30323 enum ix86_builtins fn_code = (enum ix86_builtins)
30324 DECL_FUNCTION_CODE (fndecl);
30325 if (fn_code == IX86_BUILTIN_CPU_IS
30326 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30328 gcc_assert (n_args == 1);
30329 return fold_builtin_cpu (fndecl, args);
30333 #ifdef SUBTARGET_FOLD_BUILTIN
30334 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30335 #endif
30337 return NULL_TREE;
30340 /* Make builtins to detect cpu type and features supported. NAME is
30341 the builtin name, CODE is the builtin code, and FTYPE is the function
30342 type of the builtin. */
30344 static void
30345 make_cpu_type_builtin (const char* name, int code,
30346 enum ix86_builtin_func_type ftype, bool is_const)
30348 tree decl;
30349 tree type;
30351 type = ix86_get_builtin_func_type (ftype);
30352 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30353 NULL, NULL_TREE);
30354 gcc_assert (decl != NULL_TREE);
30355 ix86_builtins[(int) code] = decl;
30356 TREE_READONLY (decl) = is_const;
30359 /* Make builtins to get CPU type and features supported. The created
30360 builtins are :
30362 __builtin_cpu_init (), to detect cpu type and features,
30363 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30364 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30367 static void
30368 ix86_init_platform_type_builtins (void)
30370 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30371 INT_FTYPE_VOID, false);
30372 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30373 INT_FTYPE_PCCHAR, true);
30374 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30375 INT_FTYPE_PCCHAR, true);
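/* Illustrative use of the builtins created above (a sketch, not part of
   this file):

       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx2"))
         run_avx2_path ();       // hypothetical helpers
       else
         run_generic_path ();

   Resolver bodies generated for multiversioned functions call
   __builtin_cpu_init themselves, since IFUNC resolvers run before
   constructors (see dispatch_function_versions).  */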
30378 /* Internal method for ix86_init_builtins. */
30380 static void
30381 ix86_init_builtins_va_builtins_abi (void)
30383 tree ms_va_ref, sysv_va_ref;
30384 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30385 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30386 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30387 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30389 if (!TARGET_64BIT)
30390 return;
30391 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30392 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30393 ms_va_ref = build_reference_type (ms_va_list_type_node);
30394 sysv_va_ref =
30395 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30397 fnvoid_va_end_ms =
30398 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30399 fnvoid_va_start_ms =
30400 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30401 fnvoid_va_end_sysv =
30402 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30403 fnvoid_va_start_sysv =
30404 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30405 NULL_TREE);
30406 fnvoid_va_copy_ms =
30407 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30408 NULL_TREE);
30409 fnvoid_va_copy_sysv =
30410 build_function_type_list (void_type_node, sysv_va_ref,
30411 sysv_va_ref, NULL_TREE);
30413 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30414 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30415 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30416 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30417 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30418 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30419 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30420 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30421 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30422 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30423 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30424 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30427 static void
30428 ix86_init_builtin_types (void)
30430 tree float128_type_node, float80_type_node;
30432 /* The __float80 type. */
30433 float80_type_node = long_double_type_node;
30434 if (TYPE_MODE (float80_type_node) != XFmode)
30436 /* The __float80 type. */
30437 float80_type_node = make_node (REAL_TYPE);
30439 TYPE_PRECISION (float80_type_node) = 80;
30440 layout_type (float80_type_node);
30442 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30444 /* The __float128 type. */
30445 float128_type_node = make_node (REAL_TYPE);
30446 TYPE_PRECISION (float128_type_node) = 128;
30447 layout_type (float128_type_node);
30448 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30450 /* This macro is built by i386-builtin-types.awk. */
30451 DEFINE_BUILTIN_PRIMITIVE_TYPES;
30454 static void
30455 ix86_init_builtins (void)
30457 tree t;
30459 ix86_init_builtin_types ();
30461 /* Builtins to get CPU type and features. */
30462 ix86_init_platform_type_builtins ();
30464 /* TFmode support builtins. */
30465 def_builtin_const (0, "__builtin_infq",
30466 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30467 def_builtin_const (0, "__builtin_huge_valq",
30468 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30470 /* We will expand them to a normal call if SSE isn't available, since
30471 they are used by libgcc. */
30472 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30473 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30474 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30475 TREE_READONLY (t) = 1;
30476 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30478 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30479 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30480 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30481 TREE_READONLY (t) = 1;
30482 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30484 ix86_init_tm_builtins ();
30485 ix86_init_mmx_sse_builtins ();
30487 if (TARGET_LP64)
30488 ix86_init_builtins_va_builtins_abi ();
30490 #ifdef SUBTARGET_INIT_BUILTINS
30491 SUBTARGET_INIT_BUILTINS;
30492 #endif
30495 /* Return the ix86 builtin for CODE. */
30497 static tree
30498 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30500 if (code >= IX86_BUILTIN_MAX)
30501 return error_mark_node;
30503 return ix86_builtins[code];
30506 /* Errors in the source file can cause expand_expr to return const0_rtx
30507 where we expect a vector. To avoid crashing, use one of the vector
30508 clear instructions. */
30509 static rtx
30510 safe_vector_operand (rtx x, enum machine_mode mode)
30512 if (x == const0_rtx)
30513 x = CONST0_RTX (mode);
30514 return x;
30517 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30519 static rtx
30520 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30522 rtx pat;
30523 tree arg0 = CALL_EXPR_ARG (exp, 0);
30524 tree arg1 = CALL_EXPR_ARG (exp, 1);
30525 rtx op0 = expand_normal (arg0);
30526 rtx op1 = expand_normal (arg1);
30527 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30528 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30529 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30531 if (VECTOR_MODE_P (mode0))
30532 op0 = safe_vector_operand (op0, mode0);
30533 if (VECTOR_MODE_P (mode1))
30534 op1 = safe_vector_operand (op1, mode1);
30536 if (optimize || !target
30537 || GET_MODE (target) != tmode
30538 || !insn_data[icode].operand[0].predicate (target, tmode))
30539 target = gen_reg_rtx (tmode);
30541 if (GET_MODE (op1) == SImode && mode1 == TImode)
30543 rtx x = gen_reg_rtx (V4SImode);
30544 emit_insn (gen_sse2_loadd (x, op1));
30545 op1 = gen_lowpart (TImode, x);
30548 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30549 op0 = copy_to_mode_reg (mode0, op0);
30550 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30551 op1 = copy_to_mode_reg (mode1, op1);
30553 pat = GEN_FCN (icode) (target, op0, op1);
30554 if (! pat)
30555 return 0;
30557 emit_insn (pat);
30559 return target;
30562 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30564 static rtx
30565 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30566 enum ix86_builtin_func_type m_type,
30567 enum rtx_code sub_code)
30569 rtx pat;
30570 int i;
30571 int nargs;
30572 bool comparison_p = false;
30573 bool tf_p = false;
30574 bool last_arg_constant = false;
30575 int num_memory = 0;
30576 struct {
30577 rtx op;
30578 enum machine_mode mode;
30579 } args[4];
30581 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30583 switch (m_type)
30585 case MULTI_ARG_4_DF2_DI_I:
30586 case MULTI_ARG_4_DF2_DI_I1:
30587 case MULTI_ARG_4_SF2_SI_I:
30588 case MULTI_ARG_4_SF2_SI_I1:
30589 nargs = 4;
30590 last_arg_constant = true;
30591 break;
30593 case MULTI_ARG_3_SF:
30594 case MULTI_ARG_3_DF:
30595 case MULTI_ARG_3_SF2:
30596 case MULTI_ARG_3_DF2:
30597 case MULTI_ARG_3_DI:
30598 case MULTI_ARG_3_SI:
30599 case MULTI_ARG_3_SI_DI:
30600 case MULTI_ARG_3_HI:
30601 case MULTI_ARG_3_HI_SI:
30602 case MULTI_ARG_3_QI:
30603 case MULTI_ARG_3_DI2:
30604 case MULTI_ARG_3_SI2:
30605 case MULTI_ARG_3_HI2:
30606 case MULTI_ARG_3_QI2:
30607 nargs = 3;
30608 break;
30610 case MULTI_ARG_2_SF:
30611 case MULTI_ARG_2_DF:
30612 case MULTI_ARG_2_DI:
30613 case MULTI_ARG_2_SI:
30614 case MULTI_ARG_2_HI:
30615 case MULTI_ARG_2_QI:
30616 nargs = 2;
30617 break;
30619 case MULTI_ARG_2_DI_IMM:
30620 case MULTI_ARG_2_SI_IMM:
30621 case MULTI_ARG_2_HI_IMM:
30622 case MULTI_ARG_2_QI_IMM:
30623 nargs = 2;
30624 last_arg_constant = true;
30625 break;
30627 case MULTI_ARG_1_SF:
30628 case MULTI_ARG_1_DF:
30629 case MULTI_ARG_1_SF2:
30630 case MULTI_ARG_1_DF2:
30631 case MULTI_ARG_1_DI:
30632 case MULTI_ARG_1_SI:
30633 case MULTI_ARG_1_HI:
30634 case MULTI_ARG_1_QI:
30635 case MULTI_ARG_1_SI_DI:
30636 case MULTI_ARG_1_HI_DI:
30637 case MULTI_ARG_1_HI_SI:
30638 case MULTI_ARG_1_QI_DI:
30639 case MULTI_ARG_1_QI_SI:
30640 case MULTI_ARG_1_QI_HI:
30641 nargs = 1;
30642 break;
30644 case MULTI_ARG_2_DI_CMP:
30645 case MULTI_ARG_2_SI_CMP:
30646 case MULTI_ARG_2_HI_CMP:
30647 case MULTI_ARG_2_QI_CMP:
30648 nargs = 2;
30649 comparison_p = true;
30650 break;
30652 case MULTI_ARG_2_SF_TF:
30653 case MULTI_ARG_2_DF_TF:
30654 case MULTI_ARG_2_DI_TF:
30655 case MULTI_ARG_2_SI_TF:
30656 case MULTI_ARG_2_HI_TF:
30657 case MULTI_ARG_2_QI_TF:
30658 nargs = 2;
30659 tf_p = true;
30660 break;
30662 default:
30663 gcc_unreachable ();
30666 if (optimize || !target
30667 || GET_MODE (target) != tmode
30668 || !insn_data[icode].operand[0].predicate (target, tmode))
30669 target = gen_reg_rtx (tmode);
30671 gcc_assert (nargs <= 4);
30673 for (i = 0; i < nargs; i++)
30675 tree arg = CALL_EXPR_ARG (exp, i);
30676 rtx op = expand_normal (arg);
30677 int adjust = (comparison_p) ? 1 : 0;
30678 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30680 if (last_arg_constant && i == nargs - 1)
30682 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30684 enum insn_code new_icode = icode;
30685 switch (icode)
30687 case CODE_FOR_xop_vpermil2v2df3:
30688 case CODE_FOR_xop_vpermil2v4sf3:
30689 case CODE_FOR_xop_vpermil2v4df3:
30690 case CODE_FOR_xop_vpermil2v8sf3:
30691 error ("the last argument must be a 2-bit immediate");
30692 return gen_reg_rtx (tmode);
30693 case CODE_FOR_xop_rotlv2di3:
30694 new_icode = CODE_FOR_rotlv2di3;
30695 goto xop_rotl;
30696 case CODE_FOR_xop_rotlv4si3:
30697 new_icode = CODE_FOR_rotlv4si3;
30698 goto xop_rotl;
30699 case CODE_FOR_xop_rotlv8hi3:
30700 new_icode = CODE_FOR_rotlv8hi3;
30701 goto xop_rotl;
30702 case CODE_FOR_xop_rotlv16qi3:
30703 new_icode = CODE_FOR_rotlv16qi3;
30704 xop_rotl:
30705 if (CONST_INT_P (op))
30707 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30708 op = GEN_INT (INTVAL (op) & mask);
30709 gcc_checking_assert
30710 (insn_data[icode].operand[i + 1].predicate (op, mode));
30712 else
30714 gcc_checking_assert
30715 (nargs == 2
30716 && insn_data[new_icode].operand[0].mode == tmode
30717 && insn_data[new_icode].operand[1].mode == tmode
30718 && insn_data[new_icode].operand[2].mode == mode
30719 && insn_data[new_icode].operand[0].predicate
30720 == insn_data[icode].operand[0].predicate
30721 && insn_data[new_icode].operand[1].predicate
30722 == insn_data[icode].operand[1].predicate);
30723 icode = new_icode;
30724 goto non_constant;
30726 break;
30727 default:
30728 gcc_unreachable ();
30732 else
30734 non_constant:
30735 if (VECTOR_MODE_P (mode))
30736 op = safe_vector_operand (op, mode);
30738 /* If we aren't optimizing, only allow one memory operand to be
30739 generated. */
30740 if (memory_operand (op, mode))
30741 num_memory++;
30743 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30745 if (optimize
30746 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30747 || num_memory > 1)
30748 op = force_reg (mode, op);
30751 args[i].op = op;
30752 args[i].mode = mode;
30755 switch (nargs)
30757 case 1:
30758 pat = GEN_FCN (icode) (target, args[0].op);
30759 break;
30761 case 2:
30762 if (tf_p)
30763 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30764 GEN_INT ((int)sub_code));
30765 else if (! comparison_p)
30766 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30767 else
30769 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30770 args[0].op,
30771 args[1].op);
30773 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30775 break;
30777 case 3:
30778 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30779 break;
30781 case 4:
30782 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30783 break;
30785 default:
30786 gcc_unreachable ();
30789 if (! pat)
30790 return 0;
30792 emit_insn (pat);
30793 return target;
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
                                    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  enum machine_mode tmode = insn_data[icode].operand[0].mode;
  enum machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
                         tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    {
      rtx tmp = gen_reg_rtx (mode1);
      emit_move_insn (tmp, op1);
      op1 = op0;
      op0 = tmp;
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of comi insns.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
                      rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
    {
      rtx tmp = op1;
      op1 = op0;
      op0 = tmp;
    }

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
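  /* Descriptive note: the comi/ucomi insn only sets the flags register, so
     the integer result is built by hand: test the flags (SET_DEST of PAT)
     with COMPARISON and store the 0/1 outcome into the low byte, via a
     QImode STRICT_LOW_PART, of the zeroed SImode pseudo set up above; the
     SImode register is then returned.  */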
  emit_insn (gen_rtx_SET (VOIDmode,
                          gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          SET_DEST (pat),
                                          const0_rtx)));

  return SUBREG_REG (target);
}

/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

static rtx
ix86_expand_sse_round (const struct builtin_description *d, tree exp,
                       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
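
  /* Descriptive note: for the rounding builtins the COMPARISON field of the
     descriptor is reused to carry the rounding-mode immediate, so it is
     emitted directly as the CONST_INT operand of the insn.  */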
  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

static rtx
ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
                                     tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  op0 = safe_vector_operand (op0, mode0);
  op1 = safe_vector_operand (op1, mode1);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
                       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (VOIDmode,
                          gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          SET_DEST (pat),
                                          const0_rtx)));

  return SUBREG_REG (target);
}
31052 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31054 static rtx
31055 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31056 tree exp, rtx target)
31058 rtx pat;
31059 tree arg0 = CALL_EXPR_ARG (exp, 0);
31060 tree arg1 = CALL_EXPR_ARG (exp, 1);
31061 tree arg2 = CALL_EXPR_ARG (exp, 2);
31062 tree arg3 = CALL_EXPR_ARG (exp, 3);
31063 tree arg4 = CALL_EXPR_ARG (exp, 4);
31064 rtx scratch0, scratch1;
31065 rtx op0 = expand_normal (arg0);
31066 rtx op1 = expand_normal (arg1);
31067 rtx op2 = expand_normal (arg2);
31068 rtx op3 = expand_normal (arg3);
31069 rtx op4 = expand_normal (arg4);
31070 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31072 tmode0 = insn_data[d->icode].operand[0].mode;
31073 tmode1 = insn_data[d->icode].operand[1].mode;
31074 modev2 = insn_data[d->icode].operand[2].mode;
31075 modei3 = insn_data[d->icode].operand[3].mode;
31076 modev4 = insn_data[d->icode].operand[4].mode;
31077 modei5 = insn_data[d->icode].operand[5].mode;
31078 modeimm = insn_data[d->icode].operand[6].mode;
31080 if (VECTOR_MODE_P (modev2))
31081 op0 = safe_vector_operand (op0, modev2);
31082 if (VECTOR_MODE_P (modev4))
31083 op2 = safe_vector_operand (op2, modev4);
31085 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31086 op0 = copy_to_mode_reg (modev2, op0);
31087 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31088 op1 = copy_to_mode_reg (modei3, op1);
31089 if ((optimize && !register_operand (op2, modev4))
31090 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31091 op2 = copy_to_mode_reg (modev4, op2);
31092 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31093 op3 = copy_to_mode_reg (modei5, op3);
31095 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31097 error ("the fifth argument must be an 8-bit immediate");
31098 return const0_rtx;
31101 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31103 if (optimize || !target
31104 || GET_MODE (target) != tmode0
31105 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31106 target = gen_reg_rtx (tmode0);
31108 scratch1 = gen_reg_rtx (tmode1);
31110 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31112 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31114 if (optimize || !target
31115 || GET_MODE (target) != tmode1
31116 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31117 target = gen_reg_rtx (tmode1);
31119 scratch0 = gen_reg_rtx (tmode0);
31121 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31123 else
31125 gcc_assert (d->flag);
31127 scratch0 = gen_reg_rtx (tmode0);
31128 scratch1 = gen_reg_rtx (tmode1);
31130 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31133 if (! pat)
31134 return 0;
31136 emit_insn (pat);
31138 if (d->flag)
31140 target = gen_reg_rtx (SImode);
31141 emit_move_insn (target, const0_rtx);
31142 target = gen_rtx_SUBREG (QImode, target, 0);
31144 emit_insn
31145 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31146 gen_rtx_fmt_ee (EQ, QImode,
31147 gen_rtx_REG ((enum machine_mode) d->flag,
31148 FLAGS_REG),
31149 const0_rtx)));
31150 return SUBREG_REG (target);
31152 else
31153 return target;
31157 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31159 static rtx
31160 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31161 tree exp, rtx target)
31163 rtx pat;
31164 tree arg0 = CALL_EXPR_ARG (exp, 0);
31165 tree arg1 = CALL_EXPR_ARG (exp, 1);
31166 tree arg2 = CALL_EXPR_ARG (exp, 2);
31167 rtx scratch0, scratch1;
31168 rtx op0 = expand_normal (arg0);
31169 rtx op1 = expand_normal (arg1);
31170 rtx op2 = expand_normal (arg2);
31171 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31173 tmode0 = insn_data[d->icode].operand[0].mode;
31174 tmode1 = insn_data[d->icode].operand[1].mode;
31175 modev2 = insn_data[d->icode].operand[2].mode;
31176 modev3 = insn_data[d->icode].operand[3].mode;
31177 modeimm = insn_data[d->icode].operand[4].mode;
31179 if (VECTOR_MODE_P (modev2))
31180 op0 = safe_vector_operand (op0, modev2);
31181 if (VECTOR_MODE_P (modev3))
31182 op1 = safe_vector_operand (op1, modev3);
31184 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31185 op0 = copy_to_mode_reg (modev2, op0);
31186 if ((optimize && !register_operand (op1, modev3))
31187 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31188 op1 = copy_to_mode_reg (modev3, op1);
31190 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31192 error ("the third argument must be an 8-bit immediate");
31193 return const0_rtx;
31196 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31198 if (optimize || !target
31199 || GET_MODE (target) != tmode0
31200 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31201 target = gen_reg_rtx (tmode0);
31203 scratch1 = gen_reg_rtx (tmode1);
31205 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31207 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31209 if (optimize || !target
31210 || GET_MODE (target) != tmode1
31211 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31212 target = gen_reg_rtx (tmode1);
31214 scratch0 = gen_reg_rtx (tmode0);
31216 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31218 else
31220 gcc_assert (d->flag);
31222 scratch0 = gen_reg_rtx (tmode0);
31223 scratch1 = gen_reg_rtx (tmode1);
31225 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31228 if (! pat)
31229 return 0;
31231 emit_insn (pat);
31233 if (d->flag)
31235 target = gen_reg_rtx (SImode);
31236 emit_move_insn (target, const0_rtx);
31237 target = gen_rtx_SUBREG (QImode, target, 0);
31239 emit_insn
31240 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31241 gen_rtx_fmt_ee (EQ, QImode,
31242 gen_rtx_REG ((enum machine_mode) d->flag,
31243 FLAGS_REG),
31244 const0_rtx)));
31245 return SUBREG_REG (target);
31247 else
31248 return target;
31251 /* Subroutine of ix86_expand_builtin to take care of insns with
31252 variable number of operands. */
31254 static rtx
31255 ix86_expand_args_builtin (const struct builtin_description *d,
31256 tree exp, rtx target)
31258 rtx pat, real_target;
31259 unsigned int i, nargs;
31260 unsigned int nargs_constant = 0;
31261 int num_memory = 0;
31262 struct
31264 rtx op;
31265 enum machine_mode mode;
31266 } args[4];
31267 bool last_arg_count = false;
31268 enum insn_code icode = d->icode;
31269 const struct insn_data_d *insn_p = &insn_data[icode];
31270 enum machine_mode tmode = insn_p->operand[0].mode;
31271 enum machine_mode rmode = VOIDmode;
31272 bool swap = false;
31273 enum rtx_code comparison = d->comparison;
31275 switch ((enum ix86_builtin_func_type) d->flag)
31277 case V2DF_FTYPE_V2DF_ROUND:
31278 case V4DF_FTYPE_V4DF_ROUND:
31279 case V4SF_FTYPE_V4SF_ROUND:
31280 case V8SF_FTYPE_V8SF_ROUND:
31281 case V4SI_FTYPE_V4SF_ROUND:
31282 case V8SI_FTYPE_V8SF_ROUND:
31283 return ix86_expand_sse_round (d, exp, target);
31284 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31285 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31286 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31287 case INT_FTYPE_V8SF_V8SF_PTEST:
31288 case INT_FTYPE_V4DI_V4DI_PTEST:
31289 case INT_FTYPE_V4DF_V4DF_PTEST:
31290 case INT_FTYPE_V4SF_V4SF_PTEST:
31291 case INT_FTYPE_V2DI_V2DI_PTEST:
31292 case INT_FTYPE_V2DF_V2DF_PTEST:
31293 return ix86_expand_sse_ptest (d, exp, target);
31294 case FLOAT128_FTYPE_FLOAT128:
31295 case FLOAT_FTYPE_FLOAT:
31296 case INT_FTYPE_INT:
31297 case UINT64_FTYPE_INT:
31298 case UINT16_FTYPE_UINT16:
31299 case INT64_FTYPE_INT64:
31300 case INT64_FTYPE_V4SF:
31301 case INT64_FTYPE_V2DF:
31302 case INT_FTYPE_V16QI:
31303 case INT_FTYPE_V8QI:
31304 case INT_FTYPE_V8SF:
31305 case INT_FTYPE_V4DF:
31306 case INT_FTYPE_V4SF:
31307 case INT_FTYPE_V2DF:
31308 case INT_FTYPE_V32QI:
31309 case V16QI_FTYPE_V16QI:
31310 case V8SI_FTYPE_V8SF:
31311 case V8SI_FTYPE_V4SI:
31312 case V8HI_FTYPE_V8HI:
31313 case V8HI_FTYPE_V16QI:
31314 case V8QI_FTYPE_V8QI:
31315 case V8SF_FTYPE_V8SF:
31316 case V8SF_FTYPE_V8SI:
31317 case V8SF_FTYPE_V4SF:
31318 case V8SF_FTYPE_V8HI:
31319 case V4SI_FTYPE_V4SI:
31320 case V4SI_FTYPE_V16QI:
31321 case V4SI_FTYPE_V4SF:
31322 case V4SI_FTYPE_V8SI:
31323 case V4SI_FTYPE_V8HI:
31324 case V4SI_FTYPE_V4DF:
31325 case V4SI_FTYPE_V2DF:
31326 case V4HI_FTYPE_V4HI:
31327 case V4DF_FTYPE_V4DF:
31328 case V4DF_FTYPE_V4SI:
31329 case V4DF_FTYPE_V4SF:
31330 case V4DF_FTYPE_V2DF:
31331 case V4SF_FTYPE_V4SF:
31332 case V4SF_FTYPE_V4SI:
31333 case V4SF_FTYPE_V8SF:
31334 case V4SF_FTYPE_V4DF:
31335 case V4SF_FTYPE_V8HI:
31336 case V4SF_FTYPE_V2DF:
31337 case V2DI_FTYPE_V2DI:
31338 case V2DI_FTYPE_V16QI:
31339 case V2DI_FTYPE_V8HI:
31340 case V2DI_FTYPE_V4SI:
31341 case V2DF_FTYPE_V2DF:
31342 case V2DF_FTYPE_V4SI:
31343 case V2DF_FTYPE_V4DF:
31344 case V2DF_FTYPE_V4SF:
31345 case V2DF_FTYPE_V2SI:
31346 case V2SI_FTYPE_V2SI:
31347 case V2SI_FTYPE_V4SF:
31348 case V2SI_FTYPE_V2SF:
31349 case V2SI_FTYPE_V2DF:
31350 case V2SF_FTYPE_V2SF:
31351 case V2SF_FTYPE_V2SI:
31352 case V32QI_FTYPE_V32QI:
31353 case V32QI_FTYPE_V16QI:
31354 case V16HI_FTYPE_V16HI:
31355 case V16HI_FTYPE_V8HI:
31356 case V8SI_FTYPE_V8SI:
31357 case V16HI_FTYPE_V16QI:
31358 case V8SI_FTYPE_V16QI:
31359 case V4DI_FTYPE_V16QI:
31360 case V8SI_FTYPE_V8HI:
31361 case V4DI_FTYPE_V8HI:
31362 case V4DI_FTYPE_V4SI:
31363 case V4DI_FTYPE_V2DI:
31364 nargs = 1;
31365 break;
31366 case V4SF_FTYPE_V4SF_VEC_MERGE:
31367 case V2DF_FTYPE_V2DF_VEC_MERGE:
31368 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31369 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31370 case V16QI_FTYPE_V16QI_V16QI:
31371 case V16QI_FTYPE_V8HI_V8HI:
31372 case V8QI_FTYPE_V8QI_V8QI:
31373 case V8QI_FTYPE_V4HI_V4HI:
31374 case V8HI_FTYPE_V8HI_V8HI:
31375 case V8HI_FTYPE_V16QI_V16QI:
31376 case V8HI_FTYPE_V4SI_V4SI:
31377 case V8SF_FTYPE_V8SF_V8SF:
31378 case V8SF_FTYPE_V8SF_V8SI:
31379 case V4SI_FTYPE_V4SI_V4SI:
31380 case V4SI_FTYPE_V8HI_V8HI:
31381 case V4SI_FTYPE_V4SF_V4SF:
31382 case V4SI_FTYPE_V2DF_V2DF:
31383 case V4HI_FTYPE_V4HI_V4HI:
31384 case V4HI_FTYPE_V8QI_V8QI:
31385 case V4HI_FTYPE_V2SI_V2SI:
31386 case V4DF_FTYPE_V4DF_V4DF:
31387 case V4DF_FTYPE_V4DF_V4DI:
31388 case V4SF_FTYPE_V4SF_V4SF:
31389 case V4SF_FTYPE_V4SF_V4SI:
31390 case V4SF_FTYPE_V4SF_V2SI:
31391 case V4SF_FTYPE_V4SF_V2DF:
31392 case V4SF_FTYPE_V4SF_DI:
31393 case V4SF_FTYPE_V4SF_SI:
31394 case V2DI_FTYPE_V2DI_V2DI:
31395 case V2DI_FTYPE_V16QI_V16QI:
31396 case V2DI_FTYPE_V4SI_V4SI:
31397 case V2UDI_FTYPE_V4USI_V4USI:
31398 case V2DI_FTYPE_V2DI_V16QI:
31399 case V2DI_FTYPE_V2DF_V2DF:
31400 case V2SI_FTYPE_V2SI_V2SI:
31401 case V2SI_FTYPE_V4HI_V4HI:
31402 case V2SI_FTYPE_V2SF_V2SF:
31403 case V2DF_FTYPE_V2DF_V2DF:
31404 case V2DF_FTYPE_V2DF_V4SF:
31405 case V2DF_FTYPE_V2DF_V2DI:
31406 case V2DF_FTYPE_V2DF_DI:
31407 case V2DF_FTYPE_V2DF_SI:
31408 case V2SF_FTYPE_V2SF_V2SF:
31409 case V1DI_FTYPE_V1DI_V1DI:
31410 case V1DI_FTYPE_V8QI_V8QI:
31411 case V1DI_FTYPE_V2SI_V2SI:
31412 case V32QI_FTYPE_V16HI_V16HI:
31413 case V16HI_FTYPE_V8SI_V8SI:
31414 case V32QI_FTYPE_V32QI_V32QI:
31415 case V16HI_FTYPE_V32QI_V32QI:
31416 case V16HI_FTYPE_V16HI_V16HI:
31417 case V8SI_FTYPE_V4DF_V4DF:
31418 case V8SI_FTYPE_V8SI_V8SI:
31419 case V8SI_FTYPE_V16HI_V16HI:
31420 case V4DI_FTYPE_V4DI_V4DI:
31421 case V4DI_FTYPE_V8SI_V8SI:
31422 case V4UDI_FTYPE_V8USI_V8USI:
31423 if (comparison == UNKNOWN)
31424 return ix86_expand_binop_builtin (icode, exp, target);
31425 nargs = 2;
31426 break;
31427 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31428 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31429 gcc_assert (comparison != UNKNOWN);
31430 nargs = 2;
31431 swap = true;
31432 break;
31433 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31434 case V16HI_FTYPE_V16HI_SI_COUNT:
31435 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31436 case V8SI_FTYPE_V8SI_SI_COUNT:
31437 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31438 case V4DI_FTYPE_V4DI_INT_COUNT:
31439 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31440 case V8HI_FTYPE_V8HI_SI_COUNT:
31441 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31442 case V4SI_FTYPE_V4SI_SI_COUNT:
31443 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31444 case V4HI_FTYPE_V4HI_SI_COUNT:
31445 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31446 case V2DI_FTYPE_V2DI_SI_COUNT:
31447 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31448 case V2SI_FTYPE_V2SI_SI_COUNT:
31449 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31450 case V1DI_FTYPE_V1DI_SI_COUNT:
31451 nargs = 2;
31452 last_arg_count = true;
31453 break;
31454 case UINT64_FTYPE_UINT64_UINT64:
31455 case UINT_FTYPE_UINT_UINT:
31456 case UINT_FTYPE_UINT_USHORT:
31457 case UINT_FTYPE_UINT_UCHAR:
31458 case UINT16_FTYPE_UINT16_INT:
31459 case UINT8_FTYPE_UINT8_INT:
31460 nargs = 2;
31461 break;
31462 case V2DI_FTYPE_V2DI_INT_CONVERT:
31463 nargs = 2;
31464 rmode = V1TImode;
31465 nargs_constant = 1;
31466 break;
31467 case V4DI_FTYPE_V4DI_INT_CONVERT:
31468 nargs = 2;
31469 rmode = V2TImode;
31470 nargs_constant = 1;
31471 break;
31472 case V8HI_FTYPE_V8HI_INT:
31473 case V8HI_FTYPE_V8SF_INT:
31474 case V8HI_FTYPE_V4SF_INT:
31475 case V8SF_FTYPE_V8SF_INT:
31476 case V4SI_FTYPE_V4SI_INT:
31477 case V4SI_FTYPE_V8SI_INT:
31478 case V4HI_FTYPE_V4HI_INT:
31479 case V4DF_FTYPE_V4DF_INT:
31480 case V4SF_FTYPE_V4SF_INT:
31481 case V4SF_FTYPE_V8SF_INT:
31482 case V2DI_FTYPE_V2DI_INT:
31483 case V2DF_FTYPE_V2DF_INT:
31484 case V2DF_FTYPE_V4DF_INT:
31485 case V16HI_FTYPE_V16HI_INT:
31486 case V8SI_FTYPE_V8SI_INT:
31487 case V4DI_FTYPE_V4DI_INT:
31488 case V2DI_FTYPE_V4DI_INT:
31489 nargs = 2;
31490 nargs_constant = 1;
31491 break;
31492 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31493 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31494 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31495 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31496 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31497 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31498 nargs = 3;
31499 break;
31500 case V32QI_FTYPE_V32QI_V32QI_INT:
31501 case V16HI_FTYPE_V16HI_V16HI_INT:
31502 case V16QI_FTYPE_V16QI_V16QI_INT:
31503 case V4DI_FTYPE_V4DI_V4DI_INT:
31504 case V8HI_FTYPE_V8HI_V8HI_INT:
31505 case V8SI_FTYPE_V8SI_V8SI_INT:
31506 case V8SI_FTYPE_V8SI_V4SI_INT:
31507 case V8SF_FTYPE_V8SF_V8SF_INT:
31508 case V8SF_FTYPE_V8SF_V4SF_INT:
31509 case V4SI_FTYPE_V4SI_V4SI_INT:
31510 case V4DF_FTYPE_V4DF_V4DF_INT:
31511 case V4DF_FTYPE_V4DF_V2DF_INT:
31512 case V4SF_FTYPE_V4SF_V4SF_INT:
31513 case V2DI_FTYPE_V2DI_V2DI_INT:
31514 case V4DI_FTYPE_V4DI_V2DI_INT:
31515 case V2DF_FTYPE_V2DF_V2DF_INT:
31516 nargs = 3;
31517 nargs_constant = 1;
31518 break;
31519 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31520 nargs = 3;
31521 rmode = V4DImode;
31522 nargs_constant = 1;
31523 break;
31524 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31525 nargs = 3;
31526 rmode = V2DImode;
31527 nargs_constant = 1;
31528 break;
31529 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31530 nargs = 3;
31531 rmode = DImode;
31532 nargs_constant = 1;
31533 break;
31534 case V2DI_FTYPE_V2DI_UINT_UINT:
31535 nargs = 3;
31536 nargs_constant = 2;
31537 break;
31538 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31539 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31540 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31541 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31542 nargs = 4;
31543 nargs_constant = 1;
31544 break;
31545 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31546 nargs = 4;
31547 nargs_constant = 2;
31548 break;
31549 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31550 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31551 nargs = 4;
31552 break;
31553 default:
31554 gcc_unreachable ();
31557 gcc_assert (nargs <= ARRAY_SIZE (args));
31559 if (comparison != UNKNOWN)
31561 gcc_assert (nargs == 2);
31562 return ix86_expand_sse_compare (d, exp, target, swap);
31565 if (rmode == VOIDmode || rmode == tmode)
31567 if (optimize
31568 || target == 0
31569 || GET_MODE (target) != tmode
31570 || !insn_p->operand[0].predicate (target, tmode))
31571 target = gen_reg_rtx (tmode);
31572 real_target = target;
31574 else
31576 target = gen_reg_rtx (rmode);
31577 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31580 for (i = 0; i < nargs; i++)
31582 tree arg = CALL_EXPR_ARG (exp, i);
31583 rtx op = expand_normal (arg);
31584 enum machine_mode mode = insn_p->operand[i + 1].mode;
31585 bool match = insn_p->operand[i + 1].predicate (op, mode);
31587 if (last_arg_count && (i + 1) == nargs)
31589 /* SIMD shift insns take either an 8-bit immediate or
31590 register as count. But builtin functions take int as
31591 count. If count doesn't match, we put it in register. */
31592 if (!match)
31594 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31595 if (!insn_p->operand[i + 1].predicate (op, mode))
31596 op = copy_to_reg (op);
31599 else if ((nargs - i) <= nargs_constant)
31601 if (!match)
31602 switch (icode)
31604 case CODE_FOR_avx2_inserti128:
31605 case CODE_FOR_avx2_extracti128:
31606 error ("the last argument must be a 1-bit immediate");
31607 return const0_rtx;
31609 case CODE_FOR_sse4_1_roundsd:
31610 case CODE_FOR_sse4_1_roundss:
31612 case CODE_FOR_sse4_1_roundpd:
31613 case CODE_FOR_sse4_1_roundps:
31614 case CODE_FOR_avx_roundpd256:
31615 case CODE_FOR_avx_roundps256:
31617 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31618 case CODE_FOR_sse4_1_roundps_sfix:
31619 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31620 case CODE_FOR_avx_roundps_sfix256:
31622 case CODE_FOR_sse4_1_blendps:
31623 case CODE_FOR_avx_blendpd256:
31624 case CODE_FOR_avx_vpermilv4df:
31625 error ("the last argument must be a 4-bit immediate");
31626 return const0_rtx;
31628 case CODE_FOR_sse4_1_blendpd:
31629 case CODE_FOR_avx_vpermilv2df:
31630 case CODE_FOR_xop_vpermil2v2df3:
31631 case CODE_FOR_xop_vpermil2v4sf3:
31632 case CODE_FOR_xop_vpermil2v4df3:
31633 case CODE_FOR_xop_vpermil2v8sf3:
31634 error ("the last argument must be a 2-bit immediate");
31635 return const0_rtx;
31637 case CODE_FOR_avx_vextractf128v4df:
31638 case CODE_FOR_avx_vextractf128v8sf:
31639 case CODE_FOR_avx_vextractf128v8si:
31640 case CODE_FOR_avx_vinsertf128v4df:
31641 case CODE_FOR_avx_vinsertf128v8sf:
31642 case CODE_FOR_avx_vinsertf128v8si:
31643 error ("the last argument must be a 1-bit immediate");
31644 return const0_rtx;
31646 case CODE_FOR_avx_vmcmpv2df3:
31647 case CODE_FOR_avx_vmcmpv4sf3:
31648 case CODE_FOR_avx_cmpv2df3:
31649 case CODE_FOR_avx_cmpv4sf3:
31650 case CODE_FOR_avx_cmpv4df3:
31651 case CODE_FOR_avx_cmpv8sf3:
31652 error ("the last argument must be a 5-bit immediate");
31653 return const0_rtx;
31655 default:
31656 switch (nargs_constant)
31658 case 2:
31659 if ((nargs - i) == nargs_constant)
31661 error ("the next to last argument must be an 8-bit immediate");
31662 break;
31664 case 1:
31665 error ("the last argument must be an 8-bit immediate");
31666 break;
31667 default:
31668 gcc_unreachable ();
31670 return const0_rtx;
31673 else
31675 if (VECTOR_MODE_P (mode))
31676 op = safe_vector_operand (op, mode);
31678 /* If we aren't optimizing, only allow one memory operand to
31679 be generated. */
31680 if (memory_operand (op, mode))
31681 num_memory++;
31683 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31685 if (optimize || !match || num_memory > 1)
31686 op = copy_to_mode_reg (mode, op);
31688 else
31690 op = copy_to_reg (op);
31691 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31695 args[i].op = op;
31696 args[i].mode = mode;
31699 switch (nargs)
31701 case 1:
31702 pat = GEN_FCN (icode) (real_target, args[0].op);
31703 break;
31704 case 2:
31705 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31706 break;
31707 case 3:
31708 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31709 args[2].op);
31710 break;
31711 case 4:
31712 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31713 args[2].op, args[3].op);
31714 break;
31715 default:
31716 gcc_unreachable ();
31719 if (! pat)
31720 return 0;
31722 emit_insn (pat);
31723 return target;
31726 /* Subroutine of ix86_expand_builtin to take care of special insns
31727 with variable number of operands. */
31729 static rtx
31730 ix86_expand_special_args_builtin (const struct builtin_description *d,
31731 tree exp, rtx target)
31733 tree arg;
31734 rtx pat, op;
31735 unsigned int i, nargs, arg_adjust, memory;
31736 struct
31738 rtx op;
31739 enum machine_mode mode;
31740 } args[3];
31741 enum insn_code icode = d->icode;
31742 bool last_arg_constant = false;
31743 const struct insn_data_d *insn_p = &insn_data[icode];
31744 enum machine_mode tmode = insn_p->operand[0].mode;
31745 enum { load, store } klass;
31747 switch ((enum ix86_builtin_func_type) d->flag)
31749 case VOID_FTYPE_VOID:
31750 emit_insn (GEN_FCN (icode) (target));
31751 return 0;
31752 case VOID_FTYPE_UINT64:
31753 case VOID_FTYPE_UNSIGNED:
31754 nargs = 0;
31755 klass = store;
31756 memory = 0;
31757 break;
31759 case INT_FTYPE_VOID:
31760 case UINT64_FTYPE_VOID:
31761 case UNSIGNED_FTYPE_VOID:
31762 nargs = 0;
31763 klass = load;
31764 memory = 0;
31765 break;
31766 case UINT64_FTYPE_PUNSIGNED:
31767 case V2DI_FTYPE_PV2DI:
31768 case V4DI_FTYPE_PV4DI:
31769 case V32QI_FTYPE_PCCHAR:
31770 case V16QI_FTYPE_PCCHAR:
31771 case V8SF_FTYPE_PCV4SF:
31772 case V8SF_FTYPE_PCFLOAT:
31773 case V4SF_FTYPE_PCFLOAT:
31774 case V4DF_FTYPE_PCV2DF:
31775 case V4DF_FTYPE_PCDOUBLE:
31776 case V2DF_FTYPE_PCDOUBLE:
31777 case VOID_FTYPE_PVOID:
31778 nargs = 1;
31779 klass = load;
31780 memory = 0;
31781 break;
31782 case VOID_FTYPE_PV2SF_V4SF:
31783 case VOID_FTYPE_PV4DI_V4DI:
31784 case VOID_FTYPE_PV2DI_V2DI:
31785 case VOID_FTYPE_PCHAR_V32QI:
31786 case VOID_FTYPE_PCHAR_V16QI:
31787 case VOID_FTYPE_PFLOAT_V8SF:
31788 case VOID_FTYPE_PFLOAT_V4SF:
31789 case VOID_FTYPE_PDOUBLE_V4DF:
31790 case VOID_FTYPE_PDOUBLE_V2DF:
31791 case VOID_FTYPE_PLONGLONG_LONGLONG:
31792 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31793 case VOID_FTYPE_PINT_INT:
31794 nargs = 1;
31795 klass = store;
31796 /* Reserve memory operand for target. */
31797 memory = ARRAY_SIZE (args);
31798 break;
31799 case V4SF_FTYPE_V4SF_PCV2SF:
31800 case V2DF_FTYPE_V2DF_PCDOUBLE:
31801 nargs = 2;
31802 klass = load;
31803 memory = 1;
31804 break;
31805 case V8SF_FTYPE_PCV8SF_V8SI:
31806 case V4DF_FTYPE_PCV4DF_V4DI:
31807 case V4SF_FTYPE_PCV4SF_V4SI:
31808 case V2DF_FTYPE_PCV2DF_V2DI:
31809 case V8SI_FTYPE_PCV8SI_V8SI:
31810 case V4DI_FTYPE_PCV4DI_V4DI:
31811 case V4SI_FTYPE_PCV4SI_V4SI:
31812 case V2DI_FTYPE_PCV2DI_V2DI:
31813 nargs = 2;
31814 klass = load;
31815 memory = 0;
31816 break;
31817 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31818 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31819 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31820 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31821 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31822 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31823 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31824 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31825 nargs = 2;
31826 klass = store;
31827 /* Reserve memory operand for target. */
31828 memory = ARRAY_SIZE (args);
31829 break;
31830 case VOID_FTYPE_UINT_UINT_UINT:
31831 case VOID_FTYPE_UINT64_UINT_UINT:
31832 case UCHAR_FTYPE_UINT_UINT_UINT:
31833 case UCHAR_FTYPE_UINT64_UINT_UINT:
31834 nargs = 3;
31835 klass = load;
31836 memory = ARRAY_SIZE (args);
31837 last_arg_constant = true;
31838 break;
31839 default:
31840 gcc_unreachable ();
31843 gcc_assert (nargs <= ARRAY_SIZE (args));
31845 if (klass == store)
31847 arg = CALL_EXPR_ARG (exp, 0);
31848 op = expand_normal (arg);
31849 gcc_assert (target == 0);
31850 if (memory)
31852 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31853 target = gen_rtx_MEM (tmode, op);
31855 else
31856 target = force_reg (tmode, op);
31857 arg_adjust = 1;
31859 else
31861 arg_adjust = 0;
31862 if (optimize
31863 || target == 0
31864 || !register_operand (target, tmode)
31865 || GET_MODE (target) != tmode)
31866 target = gen_reg_rtx (tmode);
31869 for (i = 0; i < nargs; i++)
31871 enum machine_mode mode = insn_p->operand[i + 1].mode;
31872 bool match;
31874 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31875 op = expand_normal (arg);
31876 match = insn_p->operand[i + 1].predicate (op, mode);
31878 if (last_arg_constant && (i + 1) == nargs)
31880 if (!match)
31882 if (icode == CODE_FOR_lwp_lwpvalsi3
31883 || icode == CODE_FOR_lwp_lwpinssi3
31884 || icode == CODE_FOR_lwp_lwpvaldi3
31885 || icode == CODE_FOR_lwp_lwpinsdi3)
31886 error ("the last argument must be a 32-bit immediate");
31887 else
31888 error ("the last argument must be an 8-bit immediate");
31889 return const0_rtx;
31892 else
31894 if (i == memory)
31896 /* This must be the memory operand. */
31897 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31898 op = gen_rtx_MEM (mode, op);
31899 gcc_assert (GET_MODE (op) == mode
31900 || GET_MODE (op) == VOIDmode);
31902 else
31904 /* This must be a register.  */
31905 if (VECTOR_MODE_P (mode))
31906 op = safe_vector_operand (op, mode);
31908 gcc_assert (GET_MODE (op) == mode
31909 || GET_MODE (op) == VOIDmode);
31910 op = copy_to_mode_reg (mode, op);
31914 args[i].op = op;
31915 args[i].mode = mode;
31918 switch (nargs)
31920 case 0:
31921 pat = GEN_FCN (icode) (target);
31922 break;
31923 case 1:
31924 pat = GEN_FCN (icode) (target, args[0].op);
31925 break;
31926 case 2:
31927 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31928 break;
31929 case 3:
31930 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31931 break;
31932 default:
31933 gcc_unreachable ();
31936 if (! pat)
31937 return 0;
31938 emit_insn (pat);
31939 return klass == store ? 0 : target;
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static int
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!host_integerp (arg, 1)
      || (elt = tree_low_cst (arg, 1), elt > max))
    {
      error ("selector must be an integer constant in the range 0..%wi", max);
      return 0;
    }

  return elt;
}
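
/* For instance, for a four-element vector type MAX is 3, so a selector of
   4 or more is rejected with the error above and 0 is used instead.  */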
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  enum machine_mode tmode = TYPE_MODE (type);
  enum machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  enum machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  enum machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
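
/* Illustrative example: _mm_extract_epi16 and _mm_insert_epi16 from
   emmintrin.h reach the two helpers above through the
   __builtin_ia32_vec_ext_v8hi and __builtin_ia32_vec_set_v8hi builtins,
   with the element selector validated by get_element_number.  */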
32064 /* Expand an expression EXP that calls a built-in function,
32065 with result going to TARGET if that's convenient
32066 (and in mode MODE if that's convenient).
32067 SUBTARGET may be used as the target for computing one of EXP's operands.
32068 IGNORE is nonzero if the value is to be ignored. */
32070 static rtx
32071 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32072 enum machine_mode mode, int ignore)
32074 const struct builtin_description *d;
32075 size_t i;
32076 enum insn_code icode;
32077 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32078 tree arg0, arg1, arg2, arg3, arg4;
32079 rtx op0, op1, op2, op3, op4, pat, insn;
32080 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32081 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32083 /* For CPU builtins that can be folded, fold first and expand the fold. */
32084 switch (fcode)
32086 case IX86_BUILTIN_CPU_INIT:
32088 /* Make it call __cpu_indicator_init in libgcc. */
32089 tree call_expr, fndecl, type;
32090 type = build_function_type_list (integer_type_node, NULL_TREE);
32091 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32092 call_expr = build_call_expr (fndecl, 0);
32093 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32095 case IX86_BUILTIN_CPU_IS:
32096 case IX86_BUILTIN_CPU_SUPPORTS:
32098 tree arg0 = CALL_EXPR_ARG (exp, 0);
32099 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32100 gcc_assert (fold_expr != NULL_TREE);
32101 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32105 /* Determine whether the builtin function is available under the current ISA.
32106 Originally the builtin was not created if it wasn't applicable to the
32107 current ISA based on the command line switches. With function specific
32108 options, we need to check in the context of the function making the call
32109 whether it is supported. */
32110 if (ix86_builtins_isa[fcode].isa
32111 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32113 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32114 NULL, (enum fpmath_unit) 0, false);
32116 if (!opts)
32117 error ("%qE needs unknown isa option", fndecl);
32118 else
32120 gcc_assert (opts != NULL);
32121 error ("%qE needs isa option %s", fndecl, opts);
32122 free (opts);
32124 return const0_rtx;
32127 switch (fcode)
32129 case IX86_BUILTIN_MASKMOVQ:
32130 case IX86_BUILTIN_MASKMOVDQU:
32131 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32132 ? CODE_FOR_mmx_maskmovq
32133 : CODE_FOR_sse2_maskmovdqu);
32134 /* Note the arg order is different from the operand order. */
32135 arg1 = CALL_EXPR_ARG (exp, 0);
32136 arg2 = CALL_EXPR_ARG (exp, 1);
32137 arg0 = CALL_EXPR_ARG (exp, 2);
32138 op0 = expand_normal (arg0);
32139 op1 = expand_normal (arg1);
32140 op2 = expand_normal (arg2);
32141 mode0 = insn_data[icode].operand[0].mode;
32142 mode1 = insn_data[icode].operand[1].mode;
32143 mode2 = insn_data[icode].operand[2].mode;
32145 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32146 op0 = gen_rtx_MEM (mode1, op0);
32148 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32149 op0 = copy_to_mode_reg (mode0, op0);
32150 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32151 op1 = copy_to_mode_reg (mode1, op1);
32152 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32153 op2 = copy_to_mode_reg (mode2, op2);
32154 pat = GEN_FCN (icode) (op0, op1, op2);
32155 if (! pat)
32156 return 0;
32157 emit_insn (pat);
32158 return 0;
32160 case IX86_BUILTIN_LDMXCSR:
32161 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32162 target = assign_386_stack_local (SImode, SLOT_TEMP);
32163 emit_move_insn (target, op0);
32164 emit_insn (gen_sse_ldmxcsr (target));
32165 return 0;
32167 case IX86_BUILTIN_STMXCSR:
32168 target = assign_386_stack_local (SImode, SLOT_TEMP);
32169 emit_insn (gen_sse_stmxcsr (target));
32170 return copy_to_mode_reg (SImode, target);
32172 case IX86_BUILTIN_CLFLUSH:
32173 arg0 = CALL_EXPR_ARG (exp, 0);
32174 op0 = expand_normal (arg0);
32175 icode = CODE_FOR_sse2_clflush;
32176 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32177 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32179 emit_insn (gen_sse2_clflush (op0));
32180 return 0;
32182 case IX86_BUILTIN_MONITOR:
32183 arg0 = CALL_EXPR_ARG (exp, 0);
32184 arg1 = CALL_EXPR_ARG (exp, 1);
32185 arg2 = CALL_EXPR_ARG (exp, 2);
32186 op0 = expand_normal (arg0);
32187 op1 = expand_normal (arg1);
32188 op2 = expand_normal (arg2);
32189 if (!REG_P (op0))
32190 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32191 if (!REG_P (op1))
32192 op1 = copy_to_mode_reg (SImode, op1);
32193 if (!REG_P (op2))
32194 op2 = copy_to_mode_reg (SImode, op2);
32195 emit_insn (ix86_gen_monitor (op0, op1, op2));
32196 return 0;
32198 case IX86_BUILTIN_MWAIT:
32199 arg0 = CALL_EXPR_ARG (exp, 0);
32200 arg1 = CALL_EXPR_ARG (exp, 1);
32201 op0 = expand_normal (arg0);
32202 op1 = expand_normal (arg1);
32203 if (!REG_P (op0))
32204 op0 = copy_to_mode_reg (SImode, op0);
32205 if (!REG_P (op1))
32206 op1 = copy_to_mode_reg (SImode, op1);
32207 emit_insn (gen_sse3_mwait (op0, op1));
32208 return 0;
32210 case IX86_BUILTIN_VEC_INIT_V2SI:
32211 case IX86_BUILTIN_VEC_INIT_V4HI:
32212 case IX86_BUILTIN_VEC_INIT_V8QI:
32213 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32215 case IX86_BUILTIN_VEC_EXT_V2DF:
32216 case IX86_BUILTIN_VEC_EXT_V2DI:
32217 case IX86_BUILTIN_VEC_EXT_V4SF:
32218 case IX86_BUILTIN_VEC_EXT_V4SI:
32219 case IX86_BUILTIN_VEC_EXT_V8HI:
32220 case IX86_BUILTIN_VEC_EXT_V2SI:
32221 case IX86_BUILTIN_VEC_EXT_V4HI:
32222 case IX86_BUILTIN_VEC_EXT_V16QI:
32223 return ix86_expand_vec_ext_builtin (exp, target);
32225 case IX86_BUILTIN_VEC_SET_V2DI:
32226 case IX86_BUILTIN_VEC_SET_V4SF:
32227 case IX86_BUILTIN_VEC_SET_V4SI:
32228 case IX86_BUILTIN_VEC_SET_V8HI:
32229 case IX86_BUILTIN_VEC_SET_V4HI:
32230 case IX86_BUILTIN_VEC_SET_V16QI:
32231 return ix86_expand_vec_set_builtin (exp);
32233 case IX86_BUILTIN_INFQ:
32234 case IX86_BUILTIN_HUGE_VALQ:
32236 REAL_VALUE_TYPE inf;
32237 rtx tmp;
32239 real_inf (&inf);
32240 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32242 tmp = validize_mem (force_const_mem (mode, tmp));
32244 if (target == 0)
32245 target = gen_reg_rtx (mode);
32247 emit_move_insn (target, tmp);
32248 return target;
32251 case IX86_BUILTIN_RDPMC:
32252 case IX86_BUILTIN_RDTSC:
32253 case IX86_BUILTIN_RDTSCP:
32255 op0 = gen_reg_rtx (DImode);
32256 op1 = gen_reg_rtx (DImode);
32258 if (fcode == IX86_BUILTIN_RDPMC)
32260 arg0 = CALL_EXPR_ARG (exp, 0);
32261 op2 = expand_normal (arg0);
32262 if (!register_operand (op2, SImode))
32263 op2 = copy_to_mode_reg (SImode, op2);
32265 insn = (TARGET_64BIT
32266 ? gen_rdpmc_rex64 (op0, op1, op2)
32267 : gen_rdpmc (op0, op2));
32268 emit_insn (insn);
32270 else if (fcode == IX86_BUILTIN_RDTSC)
32272 insn = (TARGET_64BIT
32273 ? gen_rdtsc_rex64 (op0, op1)
32274 : gen_rdtsc (op0));
32275 emit_insn (insn);
32277 else
32279 op2 = gen_reg_rtx (SImode);
32281 insn = (TARGET_64BIT
32282 ? gen_rdtscp_rex64 (op0, op1, op2)
32283 : gen_rdtscp (op0, op2));
32284 emit_insn (insn);
32286 arg0 = CALL_EXPR_ARG (exp, 0);
32287 op4 = expand_normal (arg0);
32288 if (!address_operand (op4, VOIDmode))
32290 op4 = convert_memory_address (Pmode, op4);
32291 op4 = copy_addr_to_reg (op4);
32293 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32296 if (target == 0)
32298 /* mode is VOIDmode if __builtin_rd* has been called
32299 without lhs. */
32300 if (mode == VOIDmode)
32301 return target;
32302 target = gen_reg_rtx (mode);
32305 if (TARGET_64BIT)
32307 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32308 op1, 1, OPTAB_DIRECT);
32309 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32310 op0, 1, OPTAB_DIRECT);
32313 emit_move_insn (target, op0);
32314 return target;
32316 case IX86_BUILTIN_FXSAVE:
32317 case IX86_BUILTIN_FXRSTOR:
32318 case IX86_BUILTIN_FXSAVE64:
32319 case IX86_BUILTIN_FXRSTOR64:
32320 switch (fcode)
32322 case IX86_BUILTIN_FXSAVE:
32323 icode = CODE_FOR_fxsave;
32324 break;
32325 case IX86_BUILTIN_FXRSTOR:
32326 icode = CODE_FOR_fxrstor;
32327 break;
32328 case IX86_BUILTIN_FXSAVE64:
32329 icode = CODE_FOR_fxsave64;
32330 break;
32331 case IX86_BUILTIN_FXRSTOR64:
32332 icode = CODE_FOR_fxrstor64;
32333 break;
32334 default:
32335 gcc_unreachable ();
32338 arg0 = CALL_EXPR_ARG (exp, 0);
32339 op0 = expand_normal (arg0);
32341 if (!address_operand (op0, VOIDmode))
32343 op0 = convert_memory_address (Pmode, op0);
32344 op0 = copy_addr_to_reg (op0);
32346 op0 = gen_rtx_MEM (BLKmode, op0);
32348 pat = GEN_FCN (icode) (op0);
32349 if (pat)
32350 emit_insn (pat);
32351 return 0;
32353 case IX86_BUILTIN_XSAVE:
32354 case IX86_BUILTIN_XRSTOR:
32355 case IX86_BUILTIN_XSAVE64:
32356 case IX86_BUILTIN_XRSTOR64:
32357 case IX86_BUILTIN_XSAVEOPT:
32358 case IX86_BUILTIN_XSAVEOPT64:
32359 arg0 = CALL_EXPR_ARG (exp, 0);
32360 arg1 = CALL_EXPR_ARG (exp, 1);
32361 op0 = expand_normal (arg0);
32362 op1 = expand_normal (arg1);
32364 if (!address_operand (op0, VOIDmode))
32366 op0 = convert_memory_address (Pmode, op0);
32367 op0 = copy_addr_to_reg (op0);
32369 op0 = gen_rtx_MEM (BLKmode, op0);
32371 op1 = force_reg (DImode, op1);
32373 if (TARGET_64BIT)
32375 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32376 NULL, 1, OPTAB_DIRECT);
32377 switch (fcode)
32379 case IX86_BUILTIN_XSAVE:
32380 icode = CODE_FOR_xsave_rex64;
32381 break;
32382 case IX86_BUILTIN_XRSTOR:
32383 icode = CODE_FOR_xrstor_rex64;
32384 break;
32385 case IX86_BUILTIN_XSAVE64:
32386 icode = CODE_FOR_xsave64;
32387 break;
32388 case IX86_BUILTIN_XRSTOR64:
32389 icode = CODE_FOR_xrstor64;
32390 break;
32391 case IX86_BUILTIN_XSAVEOPT:
32392 icode = CODE_FOR_xsaveopt_rex64;
32393 break;
32394 case IX86_BUILTIN_XSAVEOPT64:
32395 icode = CODE_FOR_xsaveopt64;
32396 break;
32397 default:
32398 gcc_unreachable ();
32401 op2 = gen_lowpart (SImode, op2);
32402 op1 = gen_lowpart (SImode, op1);
32403 pat = GEN_FCN (icode) (op0, op1, op2);
32405 else
32407 switch (fcode)
32409 case IX86_BUILTIN_XSAVE:
32410 icode = CODE_FOR_xsave;
32411 break;
32412 case IX86_BUILTIN_XRSTOR:
32413 icode = CODE_FOR_xrstor;
32414 break;
32415 case IX86_BUILTIN_XSAVEOPT:
32416 icode = CODE_FOR_xsaveopt;
32417 break;
32418 default:
32419 gcc_unreachable ();
32421 pat = GEN_FCN (icode) (op0, op1);
32424 if (pat)
32425 emit_insn (pat);
32426 return 0;
32428 case IX86_BUILTIN_LLWPCB:
32429 arg0 = CALL_EXPR_ARG (exp, 0);
32430 op0 = expand_normal (arg0);
32431 icode = CODE_FOR_lwp_llwpcb;
32432 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32433 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32434 emit_insn (gen_lwp_llwpcb (op0));
32435 return 0;
32437 case IX86_BUILTIN_SLWPCB:
32438 icode = CODE_FOR_lwp_slwpcb;
32439 if (!target
32440 || !insn_data[icode].operand[0].predicate (target, Pmode))
32441 target = gen_reg_rtx (Pmode);
32442 emit_insn (gen_lwp_slwpcb (target));
32443 return target;
32445 case IX86_BUILTIN_BEXTRI32:
32446 case IX86_BUILTIN_BEXTRI64:
32447 arg0 = CALL_EXPR_ARG (exp, 0);
32448 arg1 = CALL_EXPR_ARG (exp, 1);
32449 op0 = expand_normal (arg0);
32450 op1 = expand_normal (arg1);
32451 icode = (fcode == IX86_BUILTIN_BEXTRI32
32452 ? CODE_FOR_tbm_bextri_si
32453 : CODE_FOR_tbm_bextri_di);
32454 if (!CONST_INT_P (op1))
32456 error ("last argument must be an immediate");
32457 return const0_rtx;
32459 else
32461 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32462 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32463 op1 = GEN_INT (length);
32464 op2 = GEN_INT (lsb_index);
32465 pat = GEN_FCN (icode) (target, op0, op1, op2);
32466 if (pat)
32467 emit_insn (pat);
32468 return target;
32471 case IX86_BUILTIN_RDRAND16_STEP:
32472 icode = CODE_FOR_rdrandhi_1;
32473 mode0 = HImode;
32474 goto rdrand_step;
32476 case IX86_BUILTIN_RDRAND32_STEP:
32477 icode = CODE_FOR_rdrandsi_1;
32478 mode0 = SImode;
32479 goto rdrand_step;
32481 case IX86_BUILTIN_RDRAND64_STEP:
32482 icode = CODE_FOR_rdranddi_1;
32483 mode0 = DImode;
32485 rdrand_step:
32486 op0 = gen_reg_rtx (mode0);
32487 emit_insn (GEN_FCN (icode) (op0));
32489 arg0 = CALL_EXPR_ARG (exp, 0);
32490 op1 = expand_normal (arg0);
32491 if (!address_operand (op1, VOIDmode))
32493 op1 = convert_memory_address (Pmode, op1);
32494 op1 = copy_addr_to_reg (op1);
32496 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32498 op1 = gen_reg_rtx (SImode);
32499 emit_move_insn (op1, CONST1_RTX (SImode));
32501 /* Emit SImode conditional move. */
32502 if (mode0 == HImode)
32504 op2 = gen_reg_rtx (SImode);
32505 emit_insn (gen_zero_extendhisi2 (op2, op0));
32507 else if (mode0 == SImode)
32508 op2 = op0;
32509 else
32510 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32512 if (target == 0)
32513 target = gen_reg_rtx (SImode);
32515 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32516 const0_rtx);
32517 emit_insn (gen_rtx_SET (VOIDmode, target,
32518 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32519 return target;
32521 case IX86_BUILTIN_RDSEED16_STEP:
32522 icode = CODE_FOR_rdseedhi_1;
32523 mode0 = HImode;
32524 goto rdseed_step;
32526 case IX86_BUILTIN_RDSEED32_STEP:
32527 icode = CODE_FOR_rdseedsi_1;
32528 mode0 = SImode;
32529 goto rdseed_step;
32531 case IX86_BUILTIN_RDSEED64_STEP:
32532 icode = CODE_FOR_rdseeddi_1;
32533 mode0 = DImode;
32535 rdseed_step:
32536 op0 = gen_reg_rtx (mode0);
32537 emit_insn (GEN_FCN (icode) (op0));
32539 arg0 = CALL_EXPR_ARG (exp, 0);
32540 op1 = expand_normal (arg0);
32541 if (!address_operand (op1, VOIDmode))
32543 op1 = convert_memory_address (Pmode, op1);
32544 op1 = copy_addr_to_reg (op1);
32546 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32548 op2 = gen_reg_rtx (QImode);
32550 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32551 const0_rtx);
32552 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32554 if (target == 0)
32555 target = gen_reg_rtx (SImode);
32557 emit_insn (gen_zero_extendqisi2 (target, op2));
32558 return target;
32560 case IX86_BUILTIN_ADDCARRYX32:
32561 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32562 mode0 = SImode;
32563 goto addcarryx;
32565 case IX86_BUILTIN_ADDCARRYX64:
32566 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32567 mode0 = DImode;
32569 addcarryx:
32570 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32571 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32572 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32573 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32575 op0 = gen_reg_rtx (QImode);
32577 /* Generate CF from input operand. */
32578 op1 = expand_normal (arg0);
32579 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32580 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32582 /* Gen ADCX instruction to compute X+Y+CF. */
32583 op2 = expand_normal (arg1);
32584 op3 = expand_normal (arg2);
32586 if (!REG_P (op2))
32587 op2 = copy_to_mode_reg (mode0, op2);
32588 if (!REG_P (op3))
32589 op3 = copy_to_mode_reg (mode0, op3);
32591 op0 = gen_reg_rtx (mode0);
32593 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32594 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32595 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32597 /* Store the result. */
32598 op4 = expand_normal (arg3);
32599 if (!address_operand (op4, VOIDmode))
32601 op4 = convert_memory_address (Pmode, op4);
32602 op4 = copy_addr_to_reg (op4);
32604 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32606 /* Return current CF value. */
32607 if (target == 0)
32608 target = gen_reg_rtx (QImode);
32610 PUT_MODE (pat, QImode);
32611 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32612 return target;
32614 case IX86_BUILTIN_GATHERSIV2DF:
32615 icode = CODE_FOR_avx2_gathersiv2df;
32616 goto gather_gen;
32617 case IX86_BUILTIN_GATHERSIV4DF:
32618 icode = CODE_FOR_avx2_gathersiv4df;
32619 goto gather_gen;
32620 case IX86_BUILTIN_GATHERDIV2DF:
32621 icode = CODE_FOR_avx2_gatherdiv2df;
32622 goto gather_gen;
32623 case IX86_BUILTIN_GATHERDIV4DF:
32624 icode = CODE_FOR_avx2_gatherdiv4df;
32625 goto gather_gen;
32626 case IX86_BUILTIN_GATHERSIV4SF:
32627 icode = CODE_FOR_avx2_gathersiv4sf;
32628 goto gather_gen;
32629 case IX86_BUILTIN_GATHERSIV8SF:
32630 icode = CODE_FOR_avx2_gathersiv8sf;
32631 goto gather_gen;
32632 case IX86_BUILTIN_GATHERDIV4SF:
32633 icode = CODE_FOR_avx2_gatherdiv4sf;
32634 goto gather_gen;
32635 case IX86_BUILTIN_GATHERDIV8SF:
32636 icode = CODE_FOR_avx2_gatherdiv8sf;
32637 goto gather_gen;
32638 case IX86_BUILTIN_GATHERSIV2DI:
32639 icode = CODE_FOR_avx2_gathersiv2di;
32640 goto gather_gen;
32641 case IX86_BUILTIN_GATHERSIV4DI:
32642 icode = CODE_FOR_avx2_gathersiv4di;
32643 goto gather_gen;
32644 case IX86_BUILTIN_GATHERDIV2DI:
32645 icode = CODE_FOR_avx2_gatherdiv2di;
32646 goto gather_gen;
32647 case IX86_BUILTIN_GATHERDIV4DI:
32648 icode = CODE_FOR_avx2_gatherdiv4di;
32649 goto gather_gen;
32650 case IX86_BUILTIN_GATHERSIV4SI:
32651 icode = CODE_FOR_avx2_gathersiv4si;
32652 goto gather_gen;
32653 case IX86_BUILTIN_GATHERSIV8SI:
32654 icode = CODE_FOR_avx2_gathersiv8si;
32655 goto gather_gen;
32656 case IX86_BUILTIN_GATHERDIV4SI:
32657 icode = CODE_FOR_avx2_gatherdiv4si;
32658 goto gather_gen;
32659 case IX86_BUILTIN_GATHERDIV8SI:
32660 icode = CODE_FOR_avx2_gatherdiv8si;
32661 goto gather_gen;
32662 case IX86_BUILTIN_GATHERALTSIV4DF:
32663 icode = CODE_FOR_avx2_gathersiv4df;
32664 goto gather_gen;
32665 case IX86_BUILTIN_GATHERALTDIV8SF:
32666 icode = CODE_FOR_avx2_gatherdiv8sf;
32667 goto gather_gen;
32668 case IX86_BUILTIN_GATHERALTSIV4DI:
32669 icode = CODE_FOR_avx2_gathersiv4di;
32670 goto gather_gen;
32671 case IX86_BUILTIN_GATHERALTDIV8SI:
32672 icode = CODE_FOR_avx2_gatherdiv8si;
32673 goto gather_gen;
32675 gather_gen:
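/* Common expansion for the AVX2 gather builtins.  The call arguments are
   (src, base pointer, index vector, mask, scale); as noted below, this
   order differs from the insn operand order.  */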
32676 arg0 = CALL_EXPR_ARG (exp, 0);
32677 arg1 = CALL_EXPR_ARG (exp, 1);
32678 arg2 = CALL_EXPR_ARG (exp, 2);
32679 arg3 = CALL_EXPR_ARG (exp, 3);
32680 arg4 = CALL_EXPR_ARG (exp, 4);
32681 op0 = expand_normal (arg0);
32682 op1 = expand_normal (arg1);
32683 op2 = expand_normal (arg2);
32684 op3 = expand_normal (arg3);
32685 op4 = expand_normal (arg4);
32686 /* Note the arg order is different from the operand order. */
32687 mode0 = insn_data[icode].operand[1].mode;
32688 mode2 = insn_data[icode].operand[3].mode;
32689 mode3 = insn_data[icode].operand[4].mode;
32690 mode4 = insn_data[icode].operand[5].mode;
32692 if (target == NULL_RTX
32693 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32694 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32695 else
32696 subtarget = target;
32698 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32699 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
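/* The "ALT" siv variants gather four 64-bit elements using 32-bit
   indices; the vectorizer supplies a V8SI index vector of which only the
   low half is meaningful, so extract it.  */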
32701 rtx half = gen_reg_rtx (V4SImode);
32702 if (!nonimmediate_operand (op2, V8SImode))
32703 op2 = copy_to_mode_reg (V8SImode, op2);
32704 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32705 op2 = half;
32707 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32708 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
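/* The "ALT" div variants gather 32-bit elements using 64-bit indices, so
   only four elements are produced; the source and mask operands arrive
   as 256-bit vectors and only their low halves are used.  */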
32710 rtx (*gen) (rtx, rtx);
32711 rtx half = gen_reg_rtx (mode0);
32712 if (mode0 == V4SFmode)
32713 gen = gen_vec_extract_lo_v8sf;
32714 else
32715 gen = gen_vec_extract_lo_v8si;
32716 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32717 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32718 emit_insn (gen (half, op0));
32719 op0 = half;
32720 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32721 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32722 emit_insn (gen (half, op3));
32723 op3 = half;
32726 /* Force the memory operand to use only a base register here. We
32727 don't want to do this for the memory operands of other builtin
32728 functions. */
32729 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32731 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32732 op0 = copy_to_mode_reg (mode0, op0);
32733 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32734 op1 = copy_to_mode_reg (Pmode, op1);
32735 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32736 op2 = copy_to_mode_reg (mode2, op2);
32737 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32738 op3 = copy_to_mode_reg (mode3, op3);
32739 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32741 error ("last argument must be scale 1, 2, 4, 8");
32742 return const0_rtx;
32745 /* Optimize. If mask is known to have all high bits set,
32746 replace op0 with pc_rtx to signal that the instruction
32747 overwrites the whole destination and doesn't use its
32748 previous contents. */
32749 if (optimize)
32751 if (TREE_CODE (arg3) == VECTOR_CST)
32753 unsigned int negative = 0;
32754 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32756 tree cst = VECTOR_CST_ELT (arg3, i);
32757 if (TREE_CODE (cst) == INTEGER_CST
32758 && tree_int_cst_sign_bit (cst))
32759 negative++;
32760 else if (TREE_CODE (cst) == REAL_CST
32761 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32762 negative++;
32764 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32765 op0 = pc_rtx;
32767 else if (TREE_CODE (arg3) == SSA_NAME)
32769 /* Also recognize when the mask is built like:
32770 __v2df src = _mm_setzero_pd ();
32771 __v2df mask = _mm_cmpeq_pd (src, src);
32773 __v8sf src = _mm256_setzero_ps ();
32774 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32775 as that is a cheaper way to load all ones into
32776 a register than having to load a constant from
32777 memory. */
32778 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32779 if (is_gimple_call (def_stmt))
32781 tree fndecl = gimple_call_fndecl (def_stmt);
32782 if (fndecl
32783 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32784 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32786 case IX86_BUILTIN_CMPPD:
32787 case IX86_BUILTIN_CMPPS:
32788 case IX86_BUILTIN_CMPPD256:
32789 case IX86_BUILTIN_CMPPS256:
32790 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32791 break;
32792 /* FALLTHRU */
32793 case IX86_BUILTIN_CMPEQPD:
32794 case IX86_BUILTIN_CMPEQPS:
32795 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32796 && initializer_zerop (gimple_call_arg (def_stmt,
32797 1)))
32798 op0 = pc_rtx;
32799 break;
32800 default:
32801 break;
32807 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32808 if (! pat)
32809 return const0_rtx;
32810 emit_insn (pat);
32812 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32813 || fcode == IX86_BUILTIN_GATHERDIV8SI)
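/* The non-ALT div8sf/div8si builtins return only four elements while the
   insn produces a 256-bit vector, so hand back the low half of the
   result.  */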
32815 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32816 ? V4SFmode : V4SImode;
32817 if (target == NULL_RTX)
32818 target = gen_reg_rtx (tmode);
32819 if (tmode == V4SFmode)
32820 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32821 else
32822 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32824 else
32825 target = subtarget;
32827 return target;
32829 case IX86_BUILTIN_XABORT:
32830 icode = CODE_FOR_xabort;
32831 arg0 = CALL_EXPR_ARG (exp, 0);
32832 op0 = expand_normal (arg0);
32833 mode0 = insn_data[icode].operand[0].mode;
32834 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32836 error ("the xabort's argument must be an 8-bit immediate");
32837 return const0_rtx;
32839 emit_insn (gen_xabort (op0));
32840 return 0;
32842 default:
32843 break;
32846 for (i = 0, d = bdesc_special_args;
32847 i < ARRAY_SIZE (bdesc_special_args);
32848 i++, d++)
32849 if (d->code == fcode)
32850 return ix86_expand_special_args_builtin (d, exp, target);
32852 for (i = 0, d = bdesc_args;
32853 i < ARRAY_SIZE (bdesc_args);
32854 i++, d++)
32855 if (d->code == fcode)
32856 switch (fcode)
32858 case IX86_BUILTIN_FABSQ:
32859 case IX86_BUILTIN_COPYSIGNQ:
32860 if (!TARGET_SSE)
32861 /* Emit a normal call if SSE isn't available. */
32862 return expand_call (exp, target, ignore);
32863 default:
32864 return ix86_expand_args_builtin (d, exp, target);
32867 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32868 if (d->code == fcode)
32869 return ix86_expand_sse_comi (d, exp, target);
32871 for (i = 0, d = bdesc_pcmpestr;
32872 i < ARRAY_SIZE (bdesc_pcmpestr);
32873 i++, d++)
32874 if (d->code == fcode)
32875 return ix86_expand_sse_pcmpestr (d, exp, target);
32877 for (i = 0, d = bdesc_pcmpistr;
32878 i < ARRAY_SIZE (bdesc_pcmpistr);
32879 i++, d++)
32880 if (d->code == fcode)
32881 return ix86_expand_sse_pcmpistr (d, exp, target);
32883 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32884 if (d->code == fcode)
32885 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32886 (enum ix86_builtin_func_type)
32887 d->flag, d->comparison);
32889 gcc_unreachable ();
32892 /* Returns a function decl for a vectorized version of the builtin function
32893 with builtin function code FN, result vector type TYPE_OUT and input
32894 vector type TYPE_IN, or NULL_TREE if it is not available. */
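/* For example, a loop that computes sqrt (b[i]) over doubles can be
   vectorized by replacing the scalar call with the SQRTPD builtin
   returned here (SQRTPD256 for 256-bit vectors).  */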
32896 static tree
32897 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32898 tree type_in)
32900 enum machine_mode in_mode, out_mode;
32901 int in_n, out_n;
32902 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32904 if (TREE_CODE (type_out) != VECTOR_TYPE
32905 || TREE_CODE (type_in) != VECTOR_TYPE
32906 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32907 return NULL_TREE;
32909 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32910 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32911 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32912 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32914 switch (fn)
32916 case BUILT_IN_SQRT:
32917 if (out_mode == DFmode && in_mode == DFmode)
32919 if (out_n == 2 && in_n == 2)
32920 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32921 else if (out_n == 4 && in_n == 4)
32922 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32924 break;
32926 case BUILT_IN_SQRTF:
32927 if (out_mode == SFmode && in_mode == SFmode)
32929 if (out_n == 4 && in_n == 4)
32930 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32931 else if (out_n == 8 && in_n == 8)
32932 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32934 break;
32936 case BUILT_IN_IFLOOR:
32937 case BUILT_IN_LFLOOR:
32938 case BUILT_IN_LLFLOOR:
32939 /* The round insn does not trap on denormals. */
32940 if (flag_trapping_math || !TARGET_ROUND)
32941 break;
32943 if (out_mode == SImode && in_mode == DFmode)
32945 if (out_n == 4 && in_n == 2)
32946 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32947 else if (out_n == 8 && in_n == 4)
32948 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32950 break;
32952 case BUILT_IN_IFLOORF:
32953 case BUILT_IN_LFLOORF:
32954 case BUILT_IN_LLFLOORF:
32955 /* The round insn does not trap on denormals. */
32956 if (flag_trapping_math || !TARGET_ROUND)
32957 break;
32959 if (out_mode == SImode && in_mode == SFmode)
32961 if (out_n == 4 && in_n == 4)
32962 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32963 else if (out_n == 8 && in_n == 8)
32964 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32966 break;
32968 case BUILT_IN_ICEIL:
32969 case BUILT_IN_LCEIL:
32970 case BUILT_IN_LLCEIL:
32971 /* The round insn does not trap on denormals. */
32972 if (flag_trapping_math || !TARGET_ROUND)
32973 break;
32975 if (out_mode == SImode && in_mode == DFmode)
32977 if (out_n == 4 && in_n == 2)
32978 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32979 else if (out_n == 8 && in_n == 4)
32980 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32982 break;
32984 case BUILT_IN_ICEILF:
32985 case BUILT_IN_LCEILF:
32986 case BUILT_IN_LLCEILF:
32987 /* The round insn does not trap on denormals. */
32988 if (flag_trapping_math || !TARGET_ROUND)
32989 break;
32991 if (out_mode == SImode && in_mode == SFmode)
32993 if (out_n == 4 && in_n == 4)
32994 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32995 else if (out_n == 8 && in_n == 8)
32996 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32998 break;
33000 case BUILT_IN_IRINT:
33001 case BUILT_IN_LRINT:
33002 case BUILT_IN_LLRINT:
33003 if (out_mode == SImode && in_mode == DFmode)
33005 if (out_n == 4 && in_n == 2)
33006 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33007 else if (out_n == 8 && in_n == 4)
33008 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33010 break;
33012 case BUILT_IN_IRINTF:
33013 case BUILT_IN_LRINTF:
33014 case BUILT_IN_LLRINTF:
33015 if (out_mode == SImode && in_mode == SFmode)
33017 if (out_n == 4 && in_n == 4)
33018 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33019 else if (out_n == 8 && in_n == 8)
33020 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33022 break;
33024 case BUILT_IN_IROUND:
33025 case BUILT_IN_LROUND:
33026 case BUILT_IN_LLROUND:
33027 /* The round insn does not trap on denormals. */
33028 if (flag_trapping_math || !TARGET_ROUND)
33029 break;
33031 if (out_mode == SImode && in_mode == DFmode)
33033 if (out_n == 4 && in_n == 2)
33034 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33035 else if (out_n == 8 && in_n == 4)
33036 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33038 break;
33040 case BUILT_IN_IROUNDF:
33041 case BUILT_IN_LROUNDF:
33042 case BUILT_IN_LLROUNDF:
33043 /* The round insn does not trap on denormals. */
33044 if (flag_trapping_math || !TARGET_ROUND)
33045 break;
33047 if (out_mode == SImode && in_mode == SFmode)
33049 if (out_n == 4 && in_n == 4)
33050 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33051 else if (out_n == 8 && in_n == 8)
33052 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33054 break;
33056 case BUILT_IN_COPYSIGN:
33057 if (out_mode == DFmode && in_mode == DFmode)
33059 if (out_n == 2 && in_n == 2)
33060 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33061 else if (out_n == 4 && in_n == 4)
33062 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33064 break;
33066 case BUILT_IN_COPYSIGNF:
33067 if (out_mode == SFmode && in_mode == SFmode)
33069 if (out_n == 4 && in_n == 4)
33070 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33071 else if (out_n == 8 && in_n == 8)
33072 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33074 break;
33076 case BUILT_IN_FLOOR:
33077 /* The round insn does not trap on denormals. */
33078 if (flag_trapping_math || !TARGET_ROUND)
33079 break;
33081 if (out_mode == DFmode && in_mode == DFmode)
33083 if (out_n == 2 && in_n == 2)
33084 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33085 else if (out_n == 4 && in_n == 4)
33086 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33088 break;
33090 case BUILT_IN_FLOORF:
33091 /* The round insn does not trap on denormals. */
33092 if (flag_trapping_math || !TARGET_ROUND)
33093 break;
33095 if (out_mode == SFmode && in_mode == SFmode)
33097 if (out_n == 4 && in_n == 4)
33098 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33099 else if (out_n == 8 && in_n == 8)
33100 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33102 break;
33104 case BUILT_IN_CEIL:
33105 /* The round insn does not trap on denormals. */
33106 if (flag_trapping_math || !TARGET_ROUND)
33107 break;
33109 if (out_mode == DFmode && in_mode == DFmode)
33111 if (out_n == 2 && in_n == 2)
33112 return ix86_builtins[IX86_BUILTIN_CEILPD];
33113 else if (out_n == 4 && in_n == 4)
33114 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33116 break;
33118 case BUILT_IN_CEILF:
33119 /* The round insn does not trap on denormals. */
33120 if (flag_trapping_math || !TARGET_ROUND)
33121 break;
33123 if (out_mode == SFmode && in_mode == SFmode)
33125 if (out_n == 4 && in_n == 4)
33126 return ix86_builtins[IX86_BUILTIN_CEILPS];
33127 else if (out_n == 8 && in_n == 8)
33128 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33130 break;
33132 case BUILT_IN_TRUNC:
33133 /* The round insn does not trap on denormals. */
33134 if (flag_trapping_math || !TARGET_ROUND)
33135 break;
33137 if (out_mode == DFmode && in_mode == DFmode)
33139 if (out_n == 2 && in_n == 2)
33140 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33141 else if (out_n == 4 && in_n == 4)
33142 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33144 break;
33146 case BUILT_IN_TRUNCF:
33147 /* The round insn does not trap on denormals. */
33148 if (flag_trapping_math || !TARGET_ROUND)
33149 break;
33151 if (out_mode == SFmode && in_mode == SFmode)
33153 if (out_n == 4 && in_n == 4)
33154 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33155 else if (out_n == 8 && in_n == 8)
33156 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33158 break;
33160 case BUILT_IN_RINT:
33161 /* The round insn does not trap on denormals. */
33162 if (flag_trapping_math || !TARGET_ROUND)
33163 break;
33165 if (out_mode == DFmode && in_mode == DFmode)
33167 if (out_n == 2 && in_n == 2)
33168 return ix86_builtins[IX86_BUILTIN_RINTPD];
33169 else if (out_n == 4 && in_n == 4)
33170 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33172 break;
33174 case BUILT_IN_RINTF:
33175 /* The round insn does not trap on denormals. */
33176 if (flag_trapping_math || !TARGET_ROUND)
33177 break;
33179 if (out_mode == SFmode && in_mode == SFmode)
33181 if (out_n == 4 && in_n == 4)
33182 return ix86_builtins[IX86_BUILTIN_RINTPS];
33183 else if (out_n == 8 && in_n == 8)
33184 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33186 break;
33188 case BUILT_IN_ROUND:
33189 /* The round insn does not trap on denormals. */
33190 if (flag_trapping_math || !TARGET_ROUND)
33191 break;
33193 if (out_mode == DFmode && in_mode == DFmode)
33195 if (out_n == 2 && in_n == 2)
33196 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33197 else if (out_n == 4 && in_n == 4)
33198 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33200 break;
33202 case BUILT_IN_ROUNDF:
33203 /* The round insn does not trap on denormals. */
33204 if (flag_trapping_math || !TARGET_ROUND)
33205 break;
33207 if (out_mode == SFmode && in_mode == SFmode)
33209 if (out_n == 4 && in_n == 4)
33210 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33211 else if (out_n == 8 && in_n == 8)
33212 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33214 break;
33216 case BUILT_IN_FMA:
33217 if (out_mode == DFmode && in_mode == DFmode)
33219 if (out_n == 2 && in_n == 2)
33220 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33221 if (out_n == 4 && in_n == 4)
33222 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33224 break;
33226 case BUILT_IN_FMAF:
33227 if (out_mode == SFmode && in_mode == SFmode)
33229 if (out_n == 4 && in_n == 4)
33230 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33231 if (out_n == 8 && in_n == 8)
33232 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33234 break;
33236 default:
33237 break;
33240 /* Dispatch to a handler for a vectorization library. */
33241 if (ix86_veclib_handler)
33242 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33243 type_in);
33245 return NULL_TREE;
33248 /* Handler for an SVML-style interface to
33249 a library with vectorized intrinsics. */
33251 static tree
33252 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33254 char name[20];
33255 tree fntype, new_fndecl, args;
33256 unsigned arity;
33257 const char *bname;
33258 enum machine_mode el_mode, in_mode;
33259 int n, in_n;
33261 /* The SVML is suitable for unsafe math only. */
33262 if (!flag_unsafe_math_optimizations)
33263 return NULL_TREE;
33265 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33266 n = TYPE_VECTOR_SUBPARTS (type_out);
33267 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33268 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33269 if (el_mode != in_mode
33270 || n != in_n)
33271 return NULL_TREE;
33273 switch (fn)
33275 case BUILT_IN_EXP:
33276 case BUILT_IN_LOG:
33277 case BUILT_IN_LOG10:
33278 case BUILT_IN_POW:
33279 case BUILT_IN_TANH:
33280 case BUILT_IN_TAN:
33281 case BUILT_IN_ATAN:
33282 case BUILT_IN_ATAN2:
33283 case BUILT_IN_ATANH:
33284 case BUILT_IN_CBRT:
33285 case BUILT_IN_SINH:
33286 case BUILT_IN_SIN:
33287 case BUILT_IN_ASINH:
33288 case BUILT_IN_ASIN:
33289 case BUILT_IN_COSH:
33290 case BUILT_IN_COS:
33291 case BUILT_IN_ACOSH:
33292 case BUILT_IN_ACOS:
33293 if (el_mode != DFmode || n != 2)
33294 return NULL_TREE;
33295 break;
33297 case BUILT_IN_EXPF:
33298 case BUILT_IN_LOGF:
33299 case BUILT_IN_LOG10F:
33300 case BUILT_IN_POWF:
33301 case BUILT_IN_TANHF:
33302 case BUILT_IN_TANF:
33303 case BUILT_IN_ATANF:
33304 case BUILT_IN_ATAN2F:
33305 case BUILT_IN_ATANHF:
33306 case BUILT_IN_CBRTF:
33307 case BUILT_IN_SINHF:
33308 case BUILT_IN_SINF:
33309 case BUILT_IN_ASINHF:
33310 case BUILT_IN_ASINF:
33311 case BUILT_IN_COSHF:
33312 case BUILT_IN_COSF:
33313 case BUILT_IN_ACOSHF:
33314 case BUILT_IN_ACOSF:
33315 if (el_mode != SFmode || n != 4)
33316 return NULL_TREE;
33317 break;
33319 default:
33320 return NULL_TREE;
33323 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
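/* The SVML entry points (used with -mveclibabi=svml) are derived from the
   scalar builtin name, e.g. BUILT_IN_SINF becomes "vmlsSin4" and
   BUILT_IN_SIN becomes "vmldSin2"; log is special-cased just below.  */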
33325 if (fn == BUILT_IN_LOGF)
33326 strcpy (name, "vmlsLn4");
33327 else if (fn == BUILT_IN_LOG)
33328 strcpy (name, "vmldLn2");
33329 else if (n == 4)
33331 sprintf (name, "vmls%s", bname+10);
33332 name[strlen (name)-1] = '4';
33334 else
33335 sprintf (name, "vmld%s2", bname+10);
33337 /* Convert to uppercase. */
33338 name[4] &= ~0x20;
33340 arity = 0;
33341 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33342 args;
33343 args = TREE_CHAIN (args))
33344 arity++;
33346 if (arity == 1)
33347 fntype = build_function_type_list (type_out, type_in, NULL);
33348 else
33349 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33351 /* Build a function declaration for the vectorized function. */
33352 new_fndecl = build_decl (BUILTINS_LOCATION,
33353 FUNCTION_DECL, get_identifier (name), fntype);
33354 TREE_PUBLIC (new_fndecl) = 1;
33355 DECL_EXTERNAL (new_fndecl) = 1;
33356 DECL_IS_NOVOPS (new_fndecl) = 1;
33357 TREE_READONLY (new_fndecl) = 1;
33359 return new_fndecl;
33362 /* Handler for an ACML-style interface to
33363 a library with vectorized intrinsics. */
33365 static tree
33366 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33368 char name[20] = "__vr.._";
33369 tree fntype, new_fndecl, args;
33370 unsigned arity;
33371 const char *bname;
33372 enum machine_mode el_mode, in_mode;
33373 int n, in_n;
33375 /* The ACML is 64-bit only and suitable for unsafe math only, as
33376 it does not correctly support parts of IEEE with the required
33377 precision such as denormals. */
33378 if (!TARGET_64BIT
33379 || !flag_unsafe_math_optimizations)
33380 return NULL_TREE;
33382 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33383 n = TYPE_VECTOR_SUBPARTS (type_out);
33384 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33385 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33386 if (el_mode != in_mode
33387 || n != in_n)
33388 return NULL_TREE;
33390 switch (fn)
33392 case BUILT_IN_SIN:
33393 case BUILT_IN_COS:
33394 case BUILT_IN_EXP:
33395 case BUILT_IN_LOG:
33396 case BUILT_IN_LOG2:
33397 case BUILT_IN_LOG10:
33398 name[4] = 'd';
33399 name[5] = '2';
33400 if (el_mode != DFmode
33401 || n != 2)
33402 return NULL_TREE;
33403 break;
33405 case BUILT_IN_SINF:
33406 case BUILT_IN_COSF:
33407 case BUILT_IN_EXPF:
33408 case BUILT_IN_POWF:
33409 case BUILT_IN_LOGF:
33410 case BUILT_IN_LOG2F:
33411 case BUILT_IN_LOG10F:
33412 name[4] = 's';
33413 name[5] = '4';
33414 if (el_mode != SFmode
33415 || n != 4)
33416 return NULL_TREE;
33417 break;
33419 default:
33420 return NULL_TREE;
33423 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33424 sprintf (name + 7, "%s", bname+10);
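/* This yields ACML names such as "__vrd2_sin" or "__vrs4_sinf": the
   'd'/'s' and '2'/'4' filled in above select the double or float
   variant, and the scalar builtin name (minus the "__builtin_" prefix)
   is appended.  */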
33426 arity = 0;
33427 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33428 args;
33429 args = TREE_CHAIN (args))
33430 arity++;
33432 if (arity == 1)
33433 fntype = build_function_type_list (type_out, type_in, NULL);
33434 else
33435 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33437 /* Build a function declaration for the vectorized function. */
33438 new_fndecl = build_decl (BUILTINS_LOCATION,
33439 FUNCTION_DECL, get_identifier (name), fntype);
33440 TREE_PUBLIC (new_fndecl) = 1;
33441 DECL_EXTERNAL (new_fndecl) = 1;
33442 DECL_IS_NOVOPS (new_fndecl) = 1;
33443 TREE_READONLY (new_fndecl) = 1;
33445 return new_fndecl;
33448 /* Returns a decl of a function that implements gather load with
33449 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
33450 Return NULL_TREE if it is not available. */
33452 static tree
33453 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33454 const_tree index_type, int scale)
33456 bool si;
33457 enum ix86_builtins code;
33459 if (! TARGET_AVX2)
33460 return NULL_TREE;
33462 if ((TREE_CODE (index_type) != INTEGER_TYPE
33463 && !POINTER_TYPE_P (index_type))
33464 || (TYPE_MODE (index_type) != SImode
33465 && TYPE_MODE (index_type) != DImode))
33466 return NULL_TREE;
33468 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33469 return NULL_TREE;
33471 /* v*gather* insn sign extends index to pointer mode. */
33472 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33473 && TYPE_UNSIGNED (index_type))
33474 return NULL_TREE;
33476 if (scale <= 0
33477 || scale > 8
33478 || (scale & (scale - 1)) != 0)
33479 return NULL_TREE;
33481 si = TYPE_MODE (index_type) == SImode;
33482 switch (TYPE_MODE (mem_vectype))
33484 case V2DFmode:
33485 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33486 break;
33487 case V4DFmode:
33488 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33489 break;
33490 case V2DImode:
33491 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33492 break;
33493 case V4DImode:
33494 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33495 break;
33496 case V4SFmode:
33497 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33498 break;
33499 case V8SFmode:
33500 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33501 break;
33502 case V4SImode:
33503 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33504 break;
33505 case V8SImode:
33506 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33507 break;
33508 default:
33509 return NULL_TREE;
33512 return ix86_builtins[code];
33515 /* Returns a decl of a target-specific builtin that implements the
33516 reciprocal of the function FN, or NULL_TREE if not available. */
33518 static tree
33519 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33520 bool sqrt ATTRIBUTE_UNUSED)
33522 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33523 && flag_finite_math_only && !flag_trapping_math
33524 && flag_unsafe_math_optimizations))
33525 return NULL_TREE;
33527 if (md_fn)
33528 /* Machine dependent builtins. */
33529 switch (fn)
33531 /* Vectorized version of sqrt to rsqrt conversion. */
33532 case IX86_BUILTIN_SQRTPS_NR:
33533 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33535 case IX86_BUILTIN_SQRTPS_NR256:
33536 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33538 default:
33539 return NULL_TREE;
33541 else
33542 /* Normal builtins. */
33543 switch (fn)
33545 /* Sqrt to rsqrt conversion. */
33546 case BUILT_IN_SQRTF:
33547 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33549 default:
33550 return NULL_TREE;
33554 /* Helper for avx_vpermilps256_operand et al. This is also used by
33555 the expansion functions to turn the parallel back into a mask.
33556 The return value is 0 for no match and the imm8+1 for a match. */
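/* For example, for V4SFmode a parallel selecting elements 1,0,3,2
   (swapping each pair) encodes as imm8 0xb1, so this function returns
   0xb2.  */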
33559 int avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33561 unsigned i, nelt = GET_MODE_NUNITS (mode);
33562 unsigned mask = 0;
33563 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33565 if (XVECLEN (par, 0) != (int) nelt)
33566 return 0;
33568 /* Validate that all of the elements are constants, and not totally
33569 out of range. Copy the data into an integral array to make the
33570 subsequent checks easier. */
33571 for (i = 0; i < nelt; ++i)
33573 rtx er = XVECEXP (par, 0, i);
33574 unsigned HOST_WIDE_INT ei;
33576 if (!CONST_INT_P (er))
33577 return 0;
33578 ei = INTVAL (er);
33579 if (ei >= nelt)
33580 return 0;
33581 ipar[i] = ei;
33584 switch (mode)
33586 case V4DFmode:
33587 /* In the 256-bit DFmode case, we can only move elements within
33588 a 128-bit lane. */
33589 for (i = 0; i < 2; ++i)
33591 if (ipar[i] >= 2)
33592 return 0;
33593 mask |= ipar[i] << i;
33595 for (i = 2; i < 4; ++i)
33597 if (ipar[i] < 2)
33598 return 0;
33599 mask |= (ipar[i] - 2) << i;
33601 break;
33603 case V8SFmode:
33604 /* In the 256-bit SFmode case, we have full freedom of movement
33605 within the low 128-bit lane, but the high 128-bit lane must
33606 mirror the exact same pattern. */
33607 for (i = 0; i < 4; ++i)
33608 if (ipar[i] + 4 != ipar[i + 4])
33609 return 0;
33610 nelt = 4;
33611 /* FALLTHRU */
33613 case V2DFmode:
33614 case V4SFmode:
33615 /* In the 128-bit case, we've full freedom in the placement of
33616 the elements from the source operand. */
33617 for (i = 0; i < nelt; ++i)
33618 mask |= ipar[i] << (i * (nelt / 2));
33619 break;
33621 default:
33622 gcc_unreachable ();
33625 /* Make sure success has a non-zero value by adding one. */
33626 return mask + 1;
33629 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33630 the expansion functions to turn the parallel back into a mask.
33631 The return value is 0 for no match and the imm8+1 for a match. */
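/* For example, for V4DFmode a parallel selecting elements 2,3,4,5 (the
   high lane of the first operand followed by the low lane of the second)
   encodes as imm8 0x21, so this function returns 0x22.  */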
33634 int avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33636 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33637 unsigned mask = 0;
33638 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33640 if (XVECLEN (par, 0) != (int) nelt)
33641 return 0;
33643 /* Validate that all of the elements are constants, and not totally
33644 out of range. Copy the data into an integral array to make the
33645 subsequent checks easier. */
33646 for (i = 0; i < nelt; ++i)
33648 rtx er = XVECEXP (par, 0, i);
33649 unsigned HOST_WIDE_INT ei;
33651 if (!CONST_INT_P (er))
33652 return 0;
33653 ei = INTVAL (er);
33654 if (ei >= 2 * nelt)
33655 return 0;
33656 ipar[i] = ei;
33659 /* Validate that each half of the permute selects consecutive elements,
i.e. maps to a whole 128-bit half of a source operand. */
33660 for (i = 0; i < nelt2 - 1; ++i)
33661 if (ipar[i] + 1 != ipar[i + 1])
33662 return 0;
33663 for (i = nelt2; i < nelt - 1; ++i)
33664 if (ipar[i] + 1 != ipar[i + 1])
33665 return 0;
33667 /* Reconstruct the mask. */
33668 for (i = 0; i < 2; ++i)
33670 unsigned e = ipar[i * nelt2];
33671 if (e % nelt2)
33672 return 0;
33673 e /= nelt2;
33674 mask |= e << (i * 4);
33677 /* Make sure success has a non-zero value by adding one. */
33678 return mask + 1;
33681 /* Store OPERAND to memory after reload is completed. This means
33682 that we can't easily use assign_stack_local. */
33684 rtx ix86_force_to_memory (enum machine_mode mode, rtx operand)
33686 rtx result;
33688 gcc_assert (reload_completed);
33689 if (ix86_using_red_zone ())
33691 result = gen_rtx_MEM (mode,
33692 gen_rtx_PLUS (Pmode,
33693 stack_pointer_rtx,
33694 GEN_INT (-RED_ZONE_SIZE)));
33695 emit_move_insn (result, operand);
33697 else if (TARGET_64BIT)
33699 switch (mode)
33701 case HImode:
33702 case SImode:
33703 operand = gen_lowpart (DImode, operand);
33704 /* FALLTHRU */
33705 case DImode:
33706 emit_insn (
33707 gen_rtx_SET (VOIDmode,
33708 gen_rtx_MEM (DImode,
33709 gen_rtx_PRE_DEC (DImode,
33710 stack_pointer_rtx)),
33711 operand));
33712 break;
33713 default:
33714 gcc_unreachable ();
33716 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33718 else
33720 switch (mode)
33722 case DImode:
33724 rtx operands[2];
33725 split_double_mode (mode, &operand, 1, operands, operands + 1);
33726 emit_insn (
33727 gen_rtx_SET (VOIDmode,
33728 gen_rtx_MEM (SImode,
33729 gen_rtx_PRE_DEC (Pmode,
33730 stack_pointer_rtx)),
33731 operands[1]));
33732 emit_insn (
33733 gen_rtx_SET (VOIDmode,
33734 gen_rtx_MEM (SImode,
33735 gen_rtx_PRE_DEC (Pmode,
33736 stack_pointer_rtx)),
33737 operands[0]));
33739 break;
33740 case HImode:
33741 /* Store HImodes as SImodes. */
33742 operand = gen_lowpart (SImode, operand);
33743 /* FALLTHRU */
33744 case SImode:
33745 emit_insn (
33746 gen_rtx_SET (VOIDmode,
33747 gen_rtx_MEM (GET_MODE (operand),
33748 gen_rtx_PRE_DEC (SImode,
33749 stack_pointer_rtx)),
33750 operand));
33751 break;
33752 default:
33753 gcc_unreachable ();
33755 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33757 return result;
33760 /* Free the operand from memory. */
33761 void
33762 ix86_free_from_memory (enum machine_mode mode)
33764 if (!ix86_using_red_zone ())
33766 int size;
33768 if (mode == DImode || TARGET_64BIT)
33769 size = 8;
33770 else
33771 size = 4;
33772 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33773 to a pop or add instruction if registers are available. */
33774 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33775 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33776 GEN_INT (size))));
33780 /* Return a register priority for hard reg REGNO. */
33781 static int
33782 ix86_register_priority (int hard_regno)
33784 /* ebp and r13 as the base always want a displacement, r12 as the
33785 base always wants an index. So discourage their usage in an
33786 address. */
33787 if (hard_regno == R12_REG || hard_regno == R13_REG)
33788 return 0;
33789 if (hard_regno == BP_REG)
33790 return 1;
33791 /* New x86-64 int registers result in bigger code size. Discourage
33792 them. */
33793 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33794 return 2;
33795 /* New x86-64 SSE registers result in bigger code size. Discourage
33796 them. */
33797 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33798 return 2;
33799 /* Usage of AX register results in smaller code. Prefer it. */
33800 if (hard_regno == 0)
33801 return 4;
33802 return 3;
33805 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33807 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33808 QImode must go into class Q_REGS.
33809 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33810 movdf to do mem-to-mem moves through integer regs. */
33812 static reg_class_t
33813 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33815 enum machine_mode mode = GET_MODE (x);
33817 /* We're only allowed to return a subclass of CLASS. Many of the
33818 following checks fail for NO_REGS, so eliminate that early. */
33819 if (regclass == NO_REGS)
33820 return NO_REGS;
33822 /* All classes can load zeros. */
33823 if (x == CONST0_RTX (mode))
33824 return regclass;
33826 /* Force constants into memory if we are loading a (nonzero) constant into
33827 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
33828 instructions to load from a constant. */
33829 if (CONSTANT_P (x)
33830 && (MAYBE_MMX_CLASS_P (regclass)
33831 || MAYBE_SSE_CLASS_P (regclass)
33832 || MAYBE_MASK_CLASS_P (regclass)))
33833 return NO_REGS;
33835 /* Prefer SSE regs only, if we can use them for math. */
33836 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33837 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33839 /* Floating-point constants need more complex checks. */
33840 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33842 /* General regs can load everything. */
33843 if (reg_class_subset_p (regclass, GENERAL_REGS))
33844 return regclass;
33846 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33847 zero above. We only want to wind up preferring 80387 registers if
33848 we plan on doing computation with them. */
33849 if (TARGET_80387
33850 && standard_80387_constant_p (x) > 0)
33852 /* Limit class to non-sse. */
33853 if (regclass == FLOAT_SSE_REGS)
33854 return FLOAT_REGS;
33855 if (regclass == FP_TOP_SSE_REGS)
33856 return FP_TOP_REG;
33857 if (regclass == FP_SECOND_SSE_REGS)
33858 return FP_SECOND_REG;
33859 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33860 return regclass;
33863 return NO_REGS;
33866 /* Generally when we see PLUS here, it's the function invariant
33867 (plus soft-fp const_int), which can only be computed into general
33868 regs. */
33869 if (GET_CODE (x) == PLUS)
33870 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33872 /* QImode constants are easy to load, but non-constant QImode data
33873 must go into Q_REGS. */
33874 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33876 if (reg_class_subset_p (regclass, Q_REGS))
33877 return regclass;
33878 if (reg_class_subset_p (Q_REGS, regclass))
33879 return Q_REGS;
33880 return NO_REGS;
33883 return regclass;
33886 /* Discourage putting floating-point values in SSE registers unless
33887 SSE math is being used, and likewise for the 387 registers. */
33888 static reg_class_t
33889 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33891 enum machine_mode mode = GET_MODE (x);
33893 /* Restrict the output reload class to the register bank that we are doing
33894 math on. If we would like not to return a subset of CLASS, reject this
33895 alternative: if reload cannot do this, it will still use its choice. */
33896 mode = GET_MODE (x);
33897 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33898 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
33900 if (X87_FLOAT_MODE_P (mode))
33902 if (regclass == FP_TOP_SSE_REGS)
33903 return FP_TOP_REG;
33904 else if (regclass == FP_SECOND_SSE_REGS)
33905 return FP_SECOND_REG;
33906 else
33907 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33910 return regclass;
33913 static reg_class_t
33914 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33915 enum machine_mode mode, secondary_reload_info *sri)
33917 /* Double-word spills from general registers to non-offsettable memory
33918 references (zero-extended addresses) require special handling. */
33919 if (TARGET_64BIT
33920 && MEM_P (x)
33921 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33922 && INTEGER_CLASS_P (rclass)
33923 && !offsettable_memref_p (x))
33925 sri->icode = (in_p
33926 ? CODE_FOR_reload_noff_load
33927 : CODE_FOR_reload_noff_store);
33928 /* Add the cost of moving address to a temporary. */
33929 sri->extra_cost = 1;
33931 return NO_REGS;
33934 /* QImode spills from non-QI registers require
33935 intermediate register on 32bit targets. */
33936 if (mode == QImode
33937 && (MAYBE_MASK_CLASS_P (rclass)
33938 || (!TARGET_64BIT && !in_p
33939 && INTEGER_CLASS_P (rclass)
33940 && MAYBE_NON_Q_CLASS_P (rclass))))
33942 int regno;
33944 if (REG_P (x))
33945 regno = REGNO (x);
33946 else
33947 regno = -1;
33949 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33950 regno = true_regnum (x);
33952 /* Return Q_REGS if the operand is in memory. */
33953 if (regno == -1)
33954 return Q_REGS;
33957 /* This condition handles the corner case where an expression involving
33958 pointers gets vectorized. We're trying to use the address of a
33959 stack slot as a vector initializer.
33961 (set (reg:V2DI 74 [ vect_cst_.2 ])
33962 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33964 Eventually frame gets turned into sp+offset like this:
33966 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33967 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33968 (const_int 392 [0x188]))))
33970 That later gets turned into:
33972 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33973 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33974 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33976 We'll have the following reload recorded:
33978 Reload 0: reload_in (DI) =
33979 (plus:DI (reg/f:DI 7 sp)
33980 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33981 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33982 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33983 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33984 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33985 reload_reg_rtx: (reg:V2DI 22 xmm1)
33987 This isn't going to work since SSE instructions can't handle scalar
33988 additions. Returning GENERAL_REGS forces the addition into an integer
33989 register, and reload can handle subsequent reloads without problems. */
33991 if (in_p && GET_CODE (x) == PLUS
33992 && SSE_CLASS_P (rclass)
33993 && SCALAR_INT_MODE_P (mode))
33994 return GENERAL_REGS;
33996 return NO_REGS;
33999 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34001 static bool
34002 ix86_class_likely_spilled_p (reg_class_t rclass)
34004 switch (rclass)
34006 case AREG:
34007 case DREG:
34008 case CREG:
34009 case BREG:
34010 case AD_REGS:
34011 case SIREG:
34012 case DIREG:
34013 case SSE_FIRST_REG:
34014 case FP_TOP_REG:
34015 case FP_SECOND_REG:
34016 return true;
34018 default:
34019 break;
34022 return false;
34025 /* If we are copying between general and FP registers, we need a memory
34026 location. The same is true for SSE and MMX registers.
34028 To optimize register_move_cost performance, allow inline variant.
34030 The macro can't work reliably when one of the CLASSES is a class containing
34031 registers from multiple units (SSE, MMX, integer). We avoid this by never
34032 combining those units in a single alternative in the machine description.
34033 Ensure that this constraint holds to avoid unexpected surprises.
34035 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34036 enforce these sanity checks. */
34038 static inline bool
34039 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34040 enum machine_mode mode, int strict)
34042 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34043 return false;
34044 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34045 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34046 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34047 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34048 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34049 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34051 gcc_assert (!strict || lra_in_progress);
34052 return true;
34055 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34056 return true;
34058 /* ??? This is a lie. We do have moves between mmx and general, and between
34059 mmx and sse2. But by saying we need secondary memory we discourage the
34060 register allocator from using the mmx registers unless needed. */
34061 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34062 return true;
34064 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34066 /* SSE1 doesn't have any direct moves from other classes. */
34067 if (!TARGET_SSE2)
34068 return true;
34070 /* If the target says that inter-unit moves are more expensive
34071 than moving through memory, then don't generate them. */
34072 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34073 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34074 return true;
34076 /* Between SSE and general, we have moves no larger than word size. */
34077 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34078 return true;
34081 return false;
34084 bool
34085 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34086 enum machine_mode mode, int strict)
34088 return inline_secondary_memory_needed (class1, class2, mode, strict);
34091 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34093 On the 80386, this is the size of MODE in words,
34094 except in the FP regs, where a single reg is always enough. */
34096 static unsigned char
34097 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34099 if (MAYBE_INTEGER_CLASS_P (rclass))
34101 if (mode == XFmode)
34102 return (TARGET_64BIT ? 2 : 3);
34103 else if (mode == XCmode)
34104 return (TARGET_64BIT ? 4 : 6);
34105 else
34106 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34108 else
34110 if (COMPLEX_MODE_P (mode))
34111 return 2;
34112 else
34113 return 1;
34117 /* Return true if the registers in CLASS cannot represent the change from
34118 modes FROM to TO. */
34120 bool
34121 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34122 enum reg_class regclass)
34124 if (from == to)
34125 return false;
34127 /* x87 registers can't do subreg at all, as all values are reformatted
34128 to extended precision. */
34129 if (MAYBE_FLOAT_CLASS_P (regclass))
34130 return true;
34132 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34134 /* Vector registers do not support QI or HImode loads. If we don't
34135 disallow a change to these modes, reload will assume it's ok to
34136 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34137 the vec_dupv4hi pattern. */
34138 if (GET_MODE_SIZE (from) < 4)
34139 return true;
34141 /* Vector registers do not support subreg with nonzero offsets, which
34142 are otherwise valid for integer registers. Since we can't see
34143 whether we have a nonzero offset from here, prohibit all
34144 nonparadoxical subregs changing size. */
34145 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34146 return true;
34149 return false;
34152 /* Return the cost of moving data of mode M between a
34153 register and memory. A value of 2 is the default; this cost is
34154 relative to those in `REGISTER_MOVE_COST'.
34156 This function is used extensively by register_move_cost, which is used to
34157 build tables at startup, so make it inline in this case. IN is nonzero
34158 for a load and zero for a store; when IN is 2, return the maximum of both costs.
34160 If moving between registers and memory is more expensive than
34161 between two registers, you should define this macro to express the
34162 relative cost.
34164 Also model the increased cost of moving QImode registers in non
34165 Q_REGS classes. */
34167 static inline int
34168 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34169 int in)
34171 int cost;
34172 if (FLOAT_CLASS_P (regclass))
34174 int index;
34175 switch (mode)
34177 case SFmode:
34178 index = 0;
34179 break;
34180 case DFmode:
34181 index = 1;
34182 break;
34183 case XFmode:
34184 index = 2;
34185 break;
34186 default:
34187 return 100;
34189 if (in == 2)
34190 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34191 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34193 if (SSE_CLASS_P (regclass))
34195 int index;
34196 switch (GET_MODE_SIZE (mode))
34198 case 4:
34199 index = 0;
34200 break;
34201 case 8:
34202 index = 1;
34203 break;
34204 case 16:
34205 index = 2;
34206 break;
34207 default:
34208 return 100;
34210 if (in == 2)
34211 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34212 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34214 if (MMX_CLASS_P (regclass))
34216 int index;
34217 switch (GET_MODE_SIZE (mode))
34219 case 4:
34220 index = 0;
34221 break;
34222 case 8:
34223 index = 1;
34224 break;
34225 default:
34226 return 100;
34228 if (in == 2)
34229 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34230 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34232 switch (GET_MODE_SIZE (mode))
34234 case 1:
34235 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34237 if (!in)
34238 return ix86_cost->int_store[0];
34239 if (TARGET_PARTIAL_REG_DEPENDENCY
34240 && optimize_function_for_speed_p (cfun))
34241 cost = ix86_cost->movzbl_load;
34242 else
34243 cost = ix86_cost->int_load[0];
34244 if (in == 2)
34245 return MAX (cost, ix86_cost->int_store[0]);
34246 return cost;
34248 else
34250 if (in == 2)
34251 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34252 if (in)
34253 return ix86_cost->movzbl_load;
34254 else
34255 return ix86_cost->int_store[0] + 4;
34257 break;
34258 case 2:
34259 if (in == 2)
34260 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34261 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34262 default:
34263 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34264 if (mode == TFmode)
34265 mode = XFmode;
34266 if (in == 2)
34267 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34268 else if (in)
34269 cost = ix86_cost->int_load[2];
34270 else
34271 cost = ix86_cost->int_store[2];
34272 return (cost * (((int) GET_MODE_SIZE (mode)
34273 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34277 static int
34278 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34279 bool in)
34281 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34285 /* Return the cost of moving data from a register in class CLASS1 to
34286 one in class CLASS2.
34288 It is not required that the cost always equal 2 when FROM is the same as TO;
34289 on some machines it is expensive to move between registers if they are not
34290 general registers. */
34292 static int
34293 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34294 reg_class_t class2_i)
34296 enum reg_class class1 = (enum reg_class) class1_i;
34297 enum reg_class class2 = (enum reg_class) class2_i;
34299 /* In case we require secondary memory, compute cost of the store followed
34300 by a load. In order to avoid bad register allocation choices, we need
34301 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34303 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34305 int cost = 1;
34307 cost += inline_memory_move_cost (mode, class1, 2);
34308 cost += inline_memory_move_cost (mode, class2, 2);
34310 /* When copying from a general purpose register we may emit multiple
34311 stores followed by a single load, causing a memory size mismatch stall.
34312 Count this as an arbitrarily high cost of 20. */
34313 if (targetm.class_max_nregs (class1, mode)
34314 > targetm.class_max_nregs (class2, mode))
34315 cost += 20;
34317 /* In the case of FP/MMX moves, the registers actually overlap, and we
34318 have to switch modes in order to treat them differently. */
34319 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34320 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34321 cost += 20;
34323 return cost;
34326 /* Moves between SSE/MMX and integer unit are expensive. */
34327 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34328 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34330 /* ??? By keeping returned value relatively high, we limit the number
34331 of moves between integer and MMX/SSE registers for all targets.
34332 Additionally, high value prevents problem with x86_modes_tieable_p(),
34333 where integer modes in MMX/SSE registers are not tieable
34334 because of missing QImode and HImode moves to, from or between
34335 MMX/SSE registers. */
34336 return MAX (8, ix86_cost->mmxsse_to_integer);
34338 if (MAYBE_FLOAT_CLASS_P (class1))
34339 return ix86_cost->fp_move;
34340 if (MAYBE_SSE_CLASS_P (class1))
34341 return ix86_cost->sse_move;
34342 if (MAYBE_MMX_CLASS_P (class1))
34343 return ix86_cost->mmx_move;
34344 return 2;
34347 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34348 MODE. */
34350 bool
34351 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34353 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
34354 if (CC_REGNO_P (regno))
34355 return GET_MODE_CLASS (mode) == MODE_CC;
34356 if (GET_MODE_CLASS (mode) == MODE_CC
34357 || GET_MODE_CLASS (mode) == MODE_RANDOM
34358 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34359 return false;
34360 if (STACK_REGNO_P (regno))
34361 return VALID_FP_MODE_P (mode);
34362 if (MASK_REGNO_P (regno))
34363 return VALID_MASK_REG_MODE (mode);
34364 if (SSE_REGNO_P (regno))
34366 /* We implement the move patterns for all vector modes into and
34367 out of SSE registers, even when no operation instructions
34368 are available. */
34370 /* For AVX-512 we allow, regardless of regno:
34371 - XI mode
34372 - any of 512-bit wide vector mode
34373 - any scalar mode. */
34374 if (TARGET_AVX512F
34375 && (mode == XImode
34376 || VALID_AVX512F_REG_MODE (mode)
34377 || VALID_AVX512F_SCALAR_MODE (mode)))
34378 return true;
34380 /* xmm16-xmm31 are only available for AVX-512. */
34381 if (EXT_REX_SSE_REGNO_P (regno))
34382 return false;
34384 /* OImode move is available only when AVX is enabled. */
34385 return ((TARGET_AVX && mode == OImode)
34386 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34387 || VALID_SSE_REG_MODE (mode)
34388 || VALID_SSE2_REG_MODE (mode)
34389 || VALID_MMX_REG_MODE (mode)
34390 || VALID_MMX_REG_MODE_3DNOW (mode));
34392 if (MMX_REGNO_P (regno))
34394 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34395 so if the register is available at all, then we can move data of
34396 the given mode into or out of it. */
34397 return (VALID_MMX_REG_MODE (mode)
34398 || VALID_MMX_REG_MODE_3DNOW (mode));
34401 if (mode == QImode)
34403 /* Take care for QImode values - they can be in non-QI regs,
34404 but then they do cause partial register stalls. */
34405 if (ANY_QI_REGNO_P (regno))
34406 return true;
34407 if (!TARGET_PARTIAL_REG_STALL)
34408 return true;
34409 /* LRA checks if the hard register is OK for the given mode.
34410 QImode values can live in non-QI regs, so we allow all
34411 registers here. */
34412 if (lra_in_progress)
34413 return true;
34414 return !can_create_pseudo_p ();
34416 /* We handle both integer and floats in the general purpose registers. */
34417 else if (VALID_INT_MODE_P (mode))
34418 return true;
34419 else if (VALID_FP_MODE_P (mode))
34420 return true;
34421 else if (VALID_DFP_MODE_P (mode))
34422 return true;
34423 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34424 on to use that value in smaller contexts, this can easily force a
34425 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34426 supporting DImode, allow it. */
34427 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34428 return true;
34430 return false;
34433 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34434 tieable integer mode. */
34436 static bool
34437 ix86_tieable_integer_mode_p (enum machine_mode mode)
34439 switch (mode)
34441 case HImode:
34442 case SImode:
34443 return true;
34445 case QImode:
34446 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34448 case DImode:
34449 return TARGET_64BIT;
34451 default:
34452 return false;
34456 /* Return true if MODE1 is accessible in a register that can hold MODE2
34457 without copying. That is, all register classes that can hold MODE2
34458 can also hold MODE1. */
34460 bool
34461 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34463 if (mode1 == mode2)
34464 return true;
34466 if (ix86_tieable_integer_mode_p (mode1)
34467 && ix86_tieable_integer_mode_p (mode2))
34468 return true;
34470 /* MODE2 being XFmode implies fp stack or general regs, which means we
34471 can tie any smaller floating point modes to it. Note that we do not
34472 tie this with TFmode. */
34473 if (mode2 == XFmode)
34474 return mode1 == SFmode || mode1 == DFmode;
34476 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34477 that we can tie it with SFmode. */
34478 if (mode2 == DFmode)
34479 return mode1 == SFmode;
34481 /* If MODE2 is only appropriate for an SSE register, then tie with
34482 any other mode acceptable to SSE registers. */
34483 if (GET_MODE_SIZE (mode2) == 32
34484 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34485 return (GET_MODE_SIZE (mode1) == 32
34486 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34487 if (GET_MODE_SIZE (mode2) == 16
34488 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34489 return (GET_MODE_SIZE (mode1) == 16
34490 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34492 /* If MODE2 is appropriate for an MMX register, then tie
34493 with any other mode acceptable to MMX registers. */
34494 if (GET_MODE_SIZE (mode2) == 8
34495 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34496 return (GET_MODE_SIZE (mode1) == 8
34497 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34499 return false;
34502 /* Return the cost of moving between two registers of mode MODE. */
34504 static int
34505 ix86_set_reg_reg_cost (enum machine_mode mode)
34507 unsigned int units = UNITS_PER_WORD;
34509 switch (GET_MODE_CLASS (mode))
34511 default:
34512 break;
34514 case MODE_CC:
34515 units = GET_MODE_SIZE (CCmode);
34516 break;
34518 case MODE_FLOAT:
34519 if ((TARGET_SSE && mode == TFmode)
34520 || (TARGET_80387 && mode == XFmode)
34521 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34522 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34523 units = GET_MODE_SIZE (mode);
34524 break;
34526 case MODE_COMPLEX_FLOAT:
34527 if ((TARGET_SSE && mode == TCmode)
34528 || (TARGET_80387 && mode == XCmode)
34529 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34530 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34531 units = GET_MODE_SIZE (mode);
34532 break;
34534 case MODE_VECTOR_INT:
34535 case MODE_VECTOR_FLOAT:
34536 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
34537 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34538 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34539 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34540 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34541 units = GET_MODE_SIZE (mode);
34544 /* Return the cost of moving between two registers of mode MODE,
34545 assuming that the move will be in pieces of at most UNITS bytes. */
34546 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
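/* Worked example of the formula above (illustration only): a DImode
   register-to-register set on a 32-bit target has GET_MODE_SIZE == 8 and
   UNITS_PER_WORD == 4, so the cost is COSTS_N_INSNS ((8 + 4 - 1) / 4)
   == COSTS_N_INSNS (2); the same set on x86_64 (UNITS_PER_WORD == 8)
   costs COSTS_N_INSNS (1).  */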
34549 /* Compute a (partial) cost for rtx X. Return true if the complete
34550 cost has been computed, and false if subexpressions should be
34551 scanned. In either case, *TOTAL contains the cost result. */
34553 static bool
34554 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34555 bool speed)
34557 enum rtx_code code = (enum rtx_code) code_i;
34558 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34559 enum machine_mode mode = GET_MODE (x);
34560 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34562 switch (code)
34564 case SET:
34565 if (register_operand (SET_DEST (x), VOIDmode)
34566 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34568 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34569 return true;
34571 return false;
34573 case CONST_INT:
34574 case CONST:
34575 case LABEL_REF:
34576 case SYMBOL_REF:
34577 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34578 *total = 3;
34579 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34580 *total = 2;
34581 else if (flag_pic && SYMBOLIC_CONST (x)
34582 && (!TARGET_64BIT
34583 || (GET_CODE (x) != LABEL_REF
34584 && (GET_CODE (x) != SYMBOL_REF
34585 || !SYMBOL_REF_LOCAL_P (x)))))
34586 *total = 1;
34587 else
34588 *total = 0;
34589 return true;
34591 case CONST_DOUBLE:
34592 if (mode == VOIDmode)
34594 *total = 0;
34595 return true;
34597 switch (standard_80387_constant_p (x))
34599 case 1: /* 0.0 */
34600 *total = 1;
34601 return true;
34602 default: /* Other constants */
34603 *total = 2;
34604 return true;
34605 case 0:
34606 case -1:
34607 break;
34609 if (SSE_FLOAT_MODE_P (mode))
34611 case CONST_VECTOR:
34612 switch (standard_sse_constant_p (x))
34614 case 0:
34615 break;
34616 case 1: /* 0: xor eliminates false dependency */
34617 *total = 0;
34618 return true;
34619 default: /* -1: cmp contains false dependency */
34620 *total = 1;
34621 return true;
34624 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34625 it'll probably end up. Add a penalty for size. */
34626 *total = (COSTS_N_INSNS (1)
34627 + (flag_pic != 0 && !TARGET_64BIT)
34628 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34629 return true;
34631 case ZERO_EXTEND:
34632 /* The zero extension is often completely free on x86_64, so make
34633 it as cheap as possible. */
34634 if (TARGET_64BIT && mode == DImode
34635 && GET_MODE (XEXP (x, 0)) == SImode)
34636 *total = 1;
34637 else if (TARGET_ZERO_EXTEND_WITH_AND)
34638 *total = cost->add;
34639 else
34640 *total = cost->movzx;
34641 return false;
34643 case SIGN_EXTEND:
34644 *total = cost->movsx;
34645 return false;
34647 case ASHIFT:
34648 if (SCALAR_INT_MODE_P (mode)
34649 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34650 && CONST_INT_P (XEXP (x, 1)))
34652 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34653 if (value == 1)
34655 *total = cost->add;
34656 return false;
34658 if ((value == 2 || value == 3)
34659 && cost->lea <= cost->shift_const)
34661 *total = cost->lea;
34662 return false;
34665 /* FALLTHRU */
34667 case ROTATE:
34668 case ASHIFTRT:
34669 case LSHIFTRT:
34670 case ROTATERT:
34671 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34673 /* ??? Should be SSE vector operation cost. */
34674 /* At least for published AMD latencies, this really is the same
34675 as the latency for a simple fpu operation like fabs. */
34676 /* V*QImode is emulated with 1-11 insns. */
34677 if (mode == V16QImode || mode == V32QImode)
34679 int count = 11;
34680 if (TARGET_XOP && mode == V16QImode)
34682 /* For XOP we use vpshab, which requires a broadcast of the
34683 value to the variable shift insn. For constants this
34684 means a V16QImode constant in memory; even when we can perform
34685 the shift with one insn, set the cost to prefer paddb. */
34686 if (CONSTANT_P (XEXP (x, 1)))
34688 *total = (cost->fabs
34689 + rtx_cost (XEXP (x, 0), code, 0, speed)
34690 + (speed ? 2 : COSTS_N_BYTES (16)));
34691 return true;
34693 count = 3;
34695 else if (TARGET_SSSE3)
34696 count = 7;
34697 *total = cost->fabs * count;
34699 else
34700 *total = cost->fabs;
34702 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34704 if (CONST_INT_P (XEXP (x, 1)))
34706 if (INTVAL (XEXP (x, 1)) > 32)
34707 *total = cost->shift_const + COSTS_N_INSNS (2);
34708 else
34709 *total = cost->shift_const * 2;
34711 else
34713 if (GET_CODE (XEXP (x, 1)) == AND)
34714 *total = cost->shift_var * 2;
34715 else
34716 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34719 else
34721 if (CONST_INT_P (XEXP (x, 1)))
34722 *total = cost->shift_const;
34723 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34724 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34726 /* Return the cost after shift-and truncation. */
34727 *total = cost->shift_var;
34728 return true;
34730 else
34731 *total = cost->shift_var;
34733 return false;
34735 case FMA:
34737 rtx sub;
34739 gcc_assert (FLOAT_MODE_P (mode));
34740 gcc_assert (TARGET_FMA || TARGET_FMA4);
34742 /* ??? SSE scalar/vector cost should be used here. */
34743 /* ??? Bald assumption that fma has the same cost as fmul. */
34744 *total = cost->fmul;
34745 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34747 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34748 sub = XEXP (x, 0);
34749 if (GET_CODE (sub) == NEG)
34750 sub = XEXP (sub, 0);
34751 *total += rtx_cost (sub, FMA, 0, speed);
34753 sub = XEXP (x, 2);
34754 if (GET_CODE (sub) == NEG)
34755 sub = XEXP (sub, 0);
34756 *total += rtx_cost (sub, FMA, 2, speed);
34757 return true;
34760 case MULT:
34761 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34763 /* ??? SSE scalar cost should be used here. */
34764 *total = cost->fmul;
34765 return false;
34767 else if (X87_FLOAT_MODE_P (mode))
34769 *total = cost->fmul;
34770 return false;
34772 else if (FLOAT_MODE_P (mode))
34774 /* ??? SSE vector cost should be used here. */
34775 *total = cost->fmul;
34776 return false;
34778 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34780 /* V*QImode is emulated with 7-13 insns. */
34781 if (mode == V16QImode || mode == V32QImode)
34783 int extra = 11;
34784 if (TARGET_XOP && mode == V16QImode)
34785 extra = 5;
34786 else if (TARGET_SSSE3)
34787 extra = 6;
34788 *total = cost->fmul * 2 + cost->fabs * extra;
34790 /* V*DImode is emulated with 5-8 insns. */
34791 else if (mode == V2DImode || mode == V4DImode)
34793 if (TARGET_XOP && mode == V2DImode)
34794 *total = cost->fmul * 2 + cost->fabs * 3;
34795 else
34796 *total = cost->fmul * 3 + cost->fabs * 5;
34798 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34799 insns, including two PMULUDQ. */
34800 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34801 *total = cost->fmul * 2 + cost->fabs * 5;
34802 else
34803 *total = cost->fmul;
34804 return false;
34806 else
34808 rtx op0 = XEXP (x, 0);
34809 rtx op1 = XEXP (x, 1);
34810 int nbits;
34811 if (CONST_INT_P (XEXP (x, 1)))
34813 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34814 for (nbits = 0; value != 0; value &= value - 1)
34815 nbits++;
34817 else
34818 /* This is arbitrary. */
34819 nbits = 7;
34821 /* Compute costs correctly for widening multiplication. */
34822 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34823 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34824 == GET_MODE_SIZE (mode))
34826 int is_mulwiden = 0;
34827 enum machine_mode inner_mode = GET_MODE (op0);
34829 if (GET_CODE (op0) == GET_CODE (op1))
34830 is_mulwiden = 1, op1 = XEXP (op1, 0);
34831 else if (CONST_INT_P (op1))
34833 if (GET_CODE (op0) == SIGN_EXTEND)
34834 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34835 == INTVAL (op1);
34836 else
34837 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34840 if (is_mulwiden)
34841 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34844 *total = (cost->mult_init[MODE_INDEX (mode)]
34845 + nbits * cost->mult_bit
34846 + rtx_cost (op0, outer_code, opno, speed)
34847 + rtx_cost (op1, outer_code, opno, speed));
34849 return true;
34852 case DIV:
34853 case UDIV:
34854 case MOD:
34855 case UMOD:
34856 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34857 /* ??? SSE cost should be used here. */
34858 *total = cost->fdiv;
34859 else if (X87_FLOAT_MODE_P (mode))
34860 *total = cost->fdiv;
34861 else if (FLOAT_MODE_P (mode))
34862 /* ??? SSE vector cost should be used here. */
34863 *total = cost->fdiv;
34864 else
34865 *total = cost->divide[MODE_INDEX (mode)];
34866 return false;
34868 case PLUS:
34869 if (GET_MODE_CLASS (mode) == MODE_INT
34870 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34872 if (GET_CODE (XEXP (x, 0)) == PLUS
34873 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34874 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34875 && CONSTANT_P (XEXP (x, 1)))
34877 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34878 if (val == 2 || val == 4 || val == 8)
34880 *total = cost->lea;
34881 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34882 outer_code, opno, speed);
34883 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34884 outer_code, opno, speed);
34885 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34886 return true;
34889 else if (GET_CODE (XEXP (x, 0)) == MULT
34890 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34892 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34893 if (val == 2 || val == 4 || val == 8)
34895 *total = cost->lea;
34896 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34897 outer_code, opno, speed);
34898 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34899 return true;
34902 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34904 *total = cost->lea;
34905 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34906 outer_code, opno, speed);
34907 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34908 outer_code, opno, speed);
34909 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34910 return true;
34913 /* FALLTHRU */
34915 case MINUS:
34916 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34918 /* ??? SSE cost should be used here. */
34919 *total = cost->fadd;
34920 return false;
34922 else if (X87_FLOAT_MODE_P (mode))
34924 *total = cost->fadd;
34925 return false;
34927 else if (FLOAT_MODE_P (mode))
34929 /* ??? SSE vector cost should be used here. */
34930 *total = cost->fadd;
34931 return false;
34933 /* FALLTHRU */
34935 case AND:
34936 case IOR:
34937 case XOR:
34938 if (GET_MODE_CLASS (mode) == MODE_INT
34939 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34941 *total = (cost->add * 2
34942 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34943 << (GET_MODE (XEXP (x, 0)) != DImode))
34944 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34945 << (GET_MODE (XEXP (x, 1)) != DImode)));
34946 return true;
34948 /* FALLTHRU */
34950 case NEG:
34951 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34953 /* ??? SSE cost should be used here. */
34954 *total = cost->fchs;
34955 return false;
34957 else if (X87_FLOAT_MODE_P (mode))
34959 *total = cost->fchs;
34960 return false;
34962 else if (FLOAT_MODE_P (mode))
34964 /* ??? SSE vector cost should be used here. */
34965 *total = cost->fchs;
34966 return false;
34968 /* FALLTHRU */
34970 case NOT:
34971 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34973 /* ??? Should be SSE vector operation cost. */
34974 /* At least for published AMD latencies, this really is the same
34975 as the latency for a simple fpu operation like fabs. */
34976 *total = cost->fabs;
34978 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34979 *total = cost->add * 2;
34980 else
34981 *total = cost->add;
34982 return false;
34984 case COMPARE:
34985 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34986 && XEXP (XEXP (x, 0), 1) == const1_rtx
34987 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34988 && XEXP (x, 1) == const0_rtx)
34990 /* This kind of construct is implemented using test[bwl].
34991 Treat it as if we had an AND. */
34992 *total = (cost->add
34993 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34994 + rtx_cost (const1_rtx, outer_code, opno, speed));
34995 return true;
34997 return false;
34999 case FLOAT_EXTEND:
35000 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35001 *total = 0;
35002 return false;
35004 case ABS:
35005 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35006 /* ??? SSE cost should be used here. */
35007 *total = cost->fabs;
35008 else if (X87_FLOAT_MODE_P (mode))
35009 *total = cost->fabs;
35010 else if (FLOAT_MODE_P (mode))
35011 /* ??? SSE vector cost should be used here. */
35012 *total = cost->fabs;
35013 return false;
35015 case SQRT:
35016 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35017 /* ??? SSE cost should be used here. */
35018 *total = cost->fsqrt;
35019 else if (X87_FLOAT_MODE_P (mode))
35020 *total = cost->fsqrt;
35021 else if (FLOAT_MODE_P (mode))
35022 /* ??? SSE vector cost should be used here. */
35023 *total = cost->fsqrt;
35024 return false;
35026 case UNSPEC:
35027 if (XINT (x, 1) == UNSPEC_TP)
35028 *total = 0;
35029 return false;
35031 case VEC_SELECT:
35032 case VEC_CONCAT:
35033 case VEC_MERGE:
35034 case VEC_DUPLICATE:
35035 /* ??? Assume all of these vector manipulation patterns are
35036 recognizable, in which case they all pretty much have the
35037 same cost. */
35038 *total = cost->fabs;
35039 return true;
35041 default:
35042 return false;
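/* Minimal standalone sketch (not part of the cost code above) of the bit
   counting used in the MULT case: each set bit of a constant multiplier is
   charged roughly one shift/add step, so a multiply by 10 (binary 1010) is
   charged for two bits, while shifts by 2 or 3 are costed as an lea when
   that is cheaper than a constant shift.  */
static int
mult_bit_count_sketch (unsigned long long value)
{
  int nbits = 0;

  /* Kernighan's trick: clearing the lowest set bit once per iteration
     counts the set bits.  */
  while (value != 0)
    {
      value &= value - 1;
      nbits++;
    }
  return nbits;
}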
35046 #if TARGET_MACHO
35048 static int current_machopic_label_num;
35050 /* Given a symbol name and its associated stub, write out the
35051 definition of the stub. */
35053 void
35054 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35056 unsigned int length;
35057 char *binder_name, *symbol_name, lazy_ptr_name[32];
35058 int label = ++current_machopic_label_num;
35060 /* For 64-bit we shouldn't get here. */
35061 gcc_assert (!TARGET_64BIT);
35063 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35064 symb = targetm.strip_name_encoding (symb);
35066 length = strlen (stub);
35067 binder_name = XALLOCAVEC (char, length + 32);
35068 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35070 length = strlen (symb);
35071 symbol_name = XALLOCAVEC (char, length + 32);
35072 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35074 sprintf (lazy_ptr_name, "L%d$lz", label);
35076 if (MACHOPIC_ATT_STUB)
35077 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35078 else if (MACHOPIC_PURE)
35079 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35080 else
35081 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35083 fprintf (file, "%s:\n", stub);
35084 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35086 if (MACHOPIC_ATT_STUB)
35088 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35090 else if (MACHOPIC_PURE)
35092 /* PIC stub. */
35093 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35094 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35095 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35096 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35097 label, lazy_ptr_name, label);
35098 fprintf (file, "\tjmp\t*%%ecx\n");
35100 else
35101 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35103 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35104 it needs no stub-binding-helper. */
35105 if (MACHOPIC_ATT_STUB)
35106 return;
35108 fprintf (file, "%s:\n", binder_name);
35110 if (MACHOPIC_PURE)
35112 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35113 fprintf (file, "\tpushl\t%%ecx\n");
35115 else
35116 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35118 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35120 /* N.B. Keep the correspondence of these
35121 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35122 old-pic/new-pic/non-pic stubs; altering this will break
35123 compatibility with existing dylibs. */
35124 if (MACHOPIC_PURE)
35126 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35127 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35129 else
35130 /* 16-byte -mdynamic-no-pic stub. */
35131 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35133 fprintf (file, "%s:\n", lazy_ptr_name);
35134 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35135 fprintf (file, ASM_LONG "%s\n", binder_name);
35137 #endif /* TARGET_MACHO */
35139 /* Order the registers for register allocator. */
35141 void
35142 x86_order_regs_for_local_alloc (void)
35144 int pos = 0;
35145 int i;
35147 /* First allocate the local general purpose registers. */
35148 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35149 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35150 reg_alloc_order [pos++] = i;
35152 /* Global general purpose registers. */
35153 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35154 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35155 reg_alloc_order [pos++] = i;
35157 /* x87 registers come first in case we are doing FP math
35158 using them. */
35159 if (!TARGET_SSE_MATH)
35160 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35161 reg_alloc_order [pos++] = i;
35163 /* SSE registers. */
35164 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35165 reg_alloc_order [pos++] = i;
35166 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35167 reg_alloc_order [pos++] = i;
35169 /* Extended REX SSE registers. */
35170 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35171 reg_alloc_order [pos++] = i;
35173 /* Mask register. */
35174 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35175 reg_alloc_order [pos++] = i;
35177 /* x87 registers. */
35178 if (TARGET_SSE_MATH)
35179 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35180 reg_alloc_order [pos++] = i;
35182 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35183 reg_alloc_order [pos++] = i;
35185 /* Initialize the rest of the array, as we do not allocate some registers
35186 at all. */
35187 while (pos < FIRST_PSEUDO_REGISTER)
35188 reg_alloc_order [pos++] = 0;
35191 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35192 in struct attribute_spec handler. */
35193 static tree
35194 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35195 tree args,
35196 int flags ATTRIBUTE_UNUSED,
35197 bool *no_add_attrs)
35199 if (TREE_CODE (*node) != FUNCTION_TYPE
35200 && TREE_CODE (*node) != METHOD_TYPE
35201 && TREE_CODE (*node) != FIELD_DECL
35202 && TREE_CODE (*node) != TYPE_DECL)
35204 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35205 name);
35206 *no_add_attrs = true;
35207 return NULL_TREE;
35209 if (TARGET_64BIT)
35211 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35212 name);
35213 *no_add_attrs = true;
35214 return NULL_TREE;
35216 if (is_attribute_p ("callee_pop_aggregate_return", name))
35218 tree cst;
35220 cst = TREE_VALUE (args);
35221 if (TREE_CODE (cst) != INTEGER_CST)
35223 warning (OPT_Wattributes,
35224 "%qE attribute requires an integer constant argument",
35225 name);
35226 *no_add_attrs = true;
35228 else if (compare_tree_int (cst, 0) != 0
35229 && compare_tree_int (cst, 1) != 0)
35231 warning (OPT_Wattributes,
35232 "argument to %qE attribute is neither zero, nor one",
35233 name);
35234 *no_add_attrs = true;
35237 return NULL_TREE;
35240 return NULL_TREE;
35243 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35244 struct attribute_spec.handler. */
35245 static tree
35246 ix86_handle_abi_attribute (tree *node, tree name,
35247 tree args ATTRIBUTE_UNUSED,
35248 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35250 if (TREE_CODE (*node) != FUNCTION_TYPE
35251 && TREE_CODE (*node) != METHOD_TYPE
35252 && TREE_CODE (*node) != FIELD_DECL
35253 && TREE_CODE (*node) != TYPE_DECL)
35255 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35256 name);
35257 *no_add_attrs = true;
35258 return NULL_TREE;
35261 /* Can combine regparm with all attributes but fastcall. */
35262 if (is_attribute_p ("ms_abi", name))
35264 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35266 error ("ms_abi and sysv_abi attributes are not compatible");
35269 return NULL_TREE;
35271 else if (is_attribute_p ("sysv_abi", name))
35273 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35275 error ("ms_abi and sysv_abi attributes are not compatible");
35278 return NULL_TREE;
35281 return NULL_TREE;
35284 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35285 struct attribute_spec.handler. */
35286 static tree
35287 ix86_handle_struct_attribute (tree *node, tree name,
35288 tree args ATTRIBUTE_UNUSED,
35289 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35291 tree *type = NULL;
35292 if (DECL_P (*node))
35294 if (TREE_CODE (*node) == TYPE_DECL)
35295 type = &TREE_TYPE (*node);
35297 else
35298 type = node;
35300 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35302 warning (OPT_Wattributes, "%qE attribute ignored",
35303 name);
35304 *no_add_attrs = true;
35307 else if ((is_attribute_p ("ms_struct", name)
35308 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35309 || ((is_attribute_p ("gcc_struct", name)
35310 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35312 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35313 name);
35314 *no_add_attrs = true;
35317 return NULL_TREE;
35320 static tree
35321 ix86_handle_fndecl_attribute (tree *node, tree name,
35322 tree args ATTRIBUTE_UNUSED,
35323 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35325 if (TREE_CODE (*node) != FUNCTION_DECL)
35327 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35328 name);
35329 *no_add_attrs = true;
35331 return NULL_TREE;
35334 static bool
35335 ix86_ms_bitfield_layout_p (const_tree record_type)
35337 return ((TARGET_MS_BITFIELD_LAYOUT
35338 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35339 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35342 /* Returns an expression indicating where the this parameter is
35343 located on entry to the FUNCTION. */
35345 static rtx
35346 x86_this_parameter (tree function)
35348 tree type = TREE_TYPE (function);
35349 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35350 int nregs;
35352 if (TARGET_64BIT)
35354 const int *parm_regs;
35356 if (ix86_function_type_abi (type) == MS_ABI)
35357 parm_regs = x86_64_ms_abi_int_parameter_registers;
35358 else
35359 parm_regs = x86_64_int_parameter_registers;
35360 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35363 nregs = ix86_function_regparm (type, function);
35365 if (nregs > 0 && !stdarg_p (type))
35367 int regno;
35368 unsigned int ccvt = ix86_get_callcvt (type);
35370 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35371 regno = aggr ? DX_REG : CX_REG;
35372 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35374 regno = CX_REG;
35375 if (aggr)
35376 return gen_rtx_MEM (SImode,
35377 plus_constant (Pmode, stack_pointer_rtx, 4));
35379 else
35381 regno = AX_REG;
35382 if (aggr)
35384 regno = DX_REG;
35385 if (nregs == 1)
35386 return gen_rtx_MEM (SImode,
35387 plus_constant (Pmode,
35388 stack_pointer_rtx, 4));
35391 return gen_rtx_REG (SImode, regno);
35394 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35395 aggr ? 8 : 4));
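/* Illustrative summary of the cases above (not additional logic): for a
   32-bit fastcall method the this pointer arrives in %ecx, or in %edx when
   a hidden aggregate-return pointer occupies %ecx; for thiscall it is %ecx,
   with the aggregate case loading it from 4(%esp); in the 64-bit ABIs it is
   simply the first or second integer parameter register, depending on
   whether there is a hidden return pointer.  */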
35398 /* Determine whether x86_output_mi_thunk can succeed. */
35400 static bool
35401 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35402 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35403 HOST_WIDE_INT vcall_offset, const_tree function)
35405 /* 64-bit can handle anything. */
35406 if (TARGET_64BIT)
35407 return true;
35409 /* For 32-bit, everything's fine if we have one free register. */
35410 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35411 return true;
35413 /* Need a free register for vcall_offset. */
35414 if (vcall_offset)
35415 return false;
35417 /* Need a free register for GOT references. */
35418 if (flag_pic && !targetm.binds_local_p (function))
35419 return false;
35421 /* Otherwise ok. */
35422 return true;
35425 /* Output the assembler code for a thunk function. THUNK_DECL is the
35426 declaration for the thunk function itself, FUNCTION is the decl for
35427 the target function. DELTA is an immediate constant offset to be
35428 added to THIS. If VCALL_OFFSET is nonzero, the word at
35429 *(*this + vcall_offset) should be added to THIS. */
35431 static void
35432 x86_output_mi_thunk (FILE *file,
35433 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35434 HOST_WIDE_INT vcall_offset, tree function)
35436 rtx this_param = x86_this_parameter (function);
35437 rtx this_reg, tmp, fnaddr;
35438 unsigned int tmp_regno;
35440 if (TARGET_64BIT)
35441 tmp_regno = R10_REG;
35442 else
35444 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35445 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35446 tmp_regno = AX_REG;
35447 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35448 tmp_regno = DX_REG;
35449 else
35450 tmp_regno = CX_REG;
35453 emit_note (NOTE_INSN_PROLOGUE_END);
35455 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35456 pull it in now and let DELTA benefit. */
35457 if (REG_P (this_param))
35458 this_reg = this_param;
35459 else if (vcall_offset)
35461 /* Put the this parameter into %eax. */
35462 this_reg = gen_rtx_REG (Pmode, AX_REG);
35463 emit_move_insn (this_reg, this_param);
35465 else
35466 this_reg = NULL_RTX;
35468 /* Adjust the this parameter by a fixed constant. */
35469 if (delta)
35471 rtx delta_rtx = GEN_INT (delta);
35472 rtx delta_dst = this_reg ? this_reg : this_param;
35474 if (TARGET_64BIT)
35476 if (!x86_64_general_operand (delta_rtx, Pmode))
35478 tmp = gen_rtx_REG (Pmode, tmp_regno);
35479 emit_move_insn (tmp, delta_rtx);
35480 delta_rtx = tmp;
35484 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35487 /* Adjust the this parameter by a value stored in the vtable. */
35488 if (vcall_offset)
35490 rtx vcall_addr, vcall_mem, this_mem;
35492 tmp = gen_rtx_REG (Pmode, tmp_regno);
35494 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35495 if (Pmode != ptr_mode)
35496 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35497 emit_move_insn (tmp, this_mem);
35499 /* Adjust the this parameter. */
35500 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35501 if (TARGET_64BIT
35502 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35504 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35505 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35506 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35509 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35510 if (Pmode != ptr_mode)
35511 emit_insn (gen_addsi_1_zext (this_reg,
35512 gen_rtx_REG (ptr_mode,
35513 REGNO (this_reg)),
35514 vcall_mem));
35515 else
35516 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35519 /* If necessary, drop THIS back to its stack slot. */
35520 if (this_reg && this_reg != this_param)
35521 emit_move_insn (this_param, this_reg);
35523 fnaddr = XEXP (DECL_RTL (function), 0);
35524 if (TARGET_64BIT)
35526 if (!flag_pic || targetm.binds_local_p (function)
35527 || TARGET_PECOFF)
35529 else
35531 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35532 tmp = gen_rtx_CONST (Pmode, tmp);
35533 fnaddr = gen_rtx_MEM (Pmode, tmp);
35536 else
35538 if (!flag_pic || targetm.binds_local_p (function))
35540 #if TARGET_MACHO
35541 else if (TARGET_MACHO)
35543 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35544 fnaddr = XEXP (fnaddr, 0);
35546 #endif /* TARGET_MACHO */
35547 else
35549 tmp = gen_rtx_REG (Pmode, CX_REG);
35550 output_set_got (tmp, NULL_RTX);
35552 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35553 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35554 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35558 /* Our sibling call patterns do not allow memories, because we have no
35559 predicate that can distinguish between frame and non-frame memory.
35560 For our purposes here, we can get away with (ab)using a jump pattern,
35561 because we're going to do no optimization. */
35562 if (MEM_P (fnaddr))
35563 emit_jump_insn (gen_indirect_jump (fnaddr));
35564 else
35566 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35567 fnaddr = legitimize_pic_address (fnaddr,
35568 gen_rtx_REG (Pmode, tmp_regno));
35570 if (!sibcall_insn_operand (fnaddr, word_mode))
35572 tmp = gen_rtx_REG (word_mode, tmp_regno);
35573 if (GET_MODE (fnaddr) != word_mode)
35574 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35575 emit_move_insn (tmp, fnaddr);
35576 fnaddr = tmp;
35579 tmp = gen_rtx_MEM (QImode, fnaddr);
35580 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35581 tmp = emit_call_insn (tmp);
35582 SIBLING_CALL_P (tmp) = 1;
35584 emit_barrier ();
35586 /* Emit just enough of rest_of_compilation to get the insns emitted.
35587 Note that use_thunk calls assemble_start_function et al. */
35588 tmp = get_insns ();
35589 shorten_branches (tmp);
35590 final_start_function (tmp, file, 1);
35591 final (tmp, file, 1);
35592 final_end_function ();
35595 static void
35596 x86_file_start (void)
35598 default_file_start ();
35599 #if TARGET_MACHO
35600 darwin_file_start ();
35601 #endif
35602 if (X86_FILE_START_VERSION_DIRECTIVE)
35603 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35604 if (X86_FILE_START_FLTUSED)
35605 fputs ("\t.global\t__fltused\n", asm_out_file);
35606 if (ix86_asm_dialect == ASM_INTEL)
35607 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35611 x86_field_alignment (tree field, int computed)
35613 enum machine_mode mode;
35614 tree type = TREE_TYPE (field);
35616 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35617 return computed;
35618 mode = TYPE_MODE (strip_array_types (type));
35619 if (mode == DFmode || mode == DCmode
35620 || GET_MODE_CLASS (mode) == MODE_INT
35621 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35622 return MIN (32, computed);
35623 return computed;
35626 /* Output assembler code to FILE to increment profiler label # LABELNO
35627 for profiling a function entry. */
35628 void
35629 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35631 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35632 : MCOUNT_NAME);
35634 if (TARGET_64BIT)
35636 #ifndef NO_PROFILE_COUNTERS
35637 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35638 #endif
35640 if (!TARGET_PECOFF && flag_pic)
35641 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35642 else
35643 fprintf (file, "\tcall\t%s\n", mcount_name);
35645 else if (flag_pic)
35647 #ifndef NO_PROFILE_COUNTERS
35648 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35649 LPREFIX, labelno);
35650 #endif
35651 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35653 else
35655 #ifndef NO_PROFILE_COUNTERS
35656 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35657 LPREFIX, labelno);
35658 #endif
35659 fprintf (file, "\tcall\t%s\n", mcount_name);
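/* Example of the output produced above (illustrative; the actual symbol
   names come from MCOUNT_NAME, LPREFIX and PROFILE_COUNT_REGISTER): with
   -m64, -fpic and profile counters enabled this emits roughly
        leaq    <LPREFIX>P<N>(%rip),%r11
        call    *<mcount>@GOTPCREL(%rip)
   whereas the non-PIC 32-bit case emits
        movl    $<LPREFIX>P<N>,%<counter-reg>
        call    <mcount>  */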
35663 /* We don't have exact information about the insn sizes, but we may assume
35664 quite safely that we are informed about all 1 byte insns and memory
35665 address sizes. This is enough to eliminate unnecessary padding in
35666 99% of cases. */
35668 static int
35669 min_insn_size (rtx insn)
35671 int l = 0, len;
35673 if (!INSN_P (insn) || !active_insn_p (insn))
35674 return 0;
35676 /* Discard alignments we've emitted and jump instructions. */
35677 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35678 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35679 return 0;
35681 /* Important case - calls are always 5 bytes.
35682 It is common to have many calls in a row. */
35683 if (CALL_P (insn)
35684 && symbolic_reference_mentioned_p (PATTERN (insn))
35685 && !SIBLING_CALL_P (insn))
35686 return 5;
35687 len = get_attr_length (insn);
35688 if (len <= 1)
35689 return 1;
35691 /* For normal instructions we rely on get_attr_length being exact,
35692 with a few exceptions. */
35693 if (!JUMP_P (insn))
35695 enum attr_type type = get_attr_type (insn);
35697 switch (type)
35699 case TYPE_MULTI:
35700 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35701 || asm_noperands (PATTERN (insn)) >= 0)
35702 return 0;
35703 break;
35704 case TYPE_OTHER:
35705 case TYPE_FCMP:
35706 break;
35707 default:
35708 /* Otherwise trust get_attr_length. */
35709 return len;
35712 l = get_attr_length_address (insn);
35713 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35714 l = 4;
35716 if (l)
35717 return 1+l;
35718 else
35719 return 2;
35722 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35724 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
35725 window. */
35727 static void
35728 ix86_avoid_jump_mispredicts (void)
35730 rtx insn, start = get_insns ();
35731 int nbytes = 0, njumps = 0;
35732 int isjump = 0;
35734 /* Look for all minimal intervals of instructions containing 4 jumps.
35735 The intervals are bounded by START and INSN. NBYTES is the total
35736 size of instructions in the interval including INSN and not including
35737 START. When NBYTES is smaller than 16, it is possible
35738 that START and INSN end up in the same 16-byte window.
35740 The smallest offset at which INSN can start is the case where START
35741 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
35742 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
35744 for (insn = start; insn; insn = NEXT_INSN (insn))
35746 int min_size;
35748 if (LABEL_P (insn))
35750 int align = label_to_alignment (insn);
35751 int max_skip = label_to_max_skip (insn);
35753 if (max_skip > 15)
35754 max_skip = 15;
35755 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35756 already in the current 16 byte page, because otherwise
35757 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35758 bytes to reach 16 byte boundary. */
35759 if (align <= 0
35760 || (align <= 3 && max_skip != (1 << align) - 1))
35761 max_skip = 0;
35762 if (dump_file)
35763 fprintf (dump_file, "Label %i with max_skip %i\n",
35764 INSN_UID (insn), max_skip);
35765 if (max_skip)
35767 while (nbytes + max_skip >= 16)
35769 start = NEXT_INSN (start);
35770 if (JUMP_P (start) || CALL_P (start))
35771 njumps--, isjump = 1;
35772 else
35773 isjump = 0;
35774 nbytes -= min_insn_size (start);
35777 continue;
35780 min_size = min_insn_size (insn);
35781 nbytes += min_size;
35782 if (dump_file)
35783 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35784 INSN_UID (insn), min_size);
35785 if (JUMP_P (insn) || CALL_P (insn))
35786 njumps++;
35787 else
35788 continue;
35790 while (njumps > 3)
35792 start = NEXT_INSN (start);
35793 if (JUMP_P (start) || CALL_P (start))
35794 njumps--, isjump = 1;
35795 else
35796 isjump = 0;
35797 nbytes -= min_insn_size (start);
35799 gcc_assert (njumps >= 0);
35800 if (dump_file)
35801 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35802 INSN_UID (start), INSN_UID (insn), nbytes);
35804 if (njumps == 3 && isjump && nbytes < 16)
35806 int padsize = 15 - nbytes + min_insn_size (insn);
35808 if (dump_file)
35809 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35810 INSN_UID (insn), padsize);
35811 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35815 #endif
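/* Simplified standalone sketch of the window check performed above (an
   approximation for illustration; the pass itself maintains a sliding
   window over the insn stream): given minimum insn sizes and jump flags,
   report whether instruction I closes a window of at most 16 bytes that
   already contains four jumps.  */
static int
four_jumps_in_window_sketch (const int *size, const unsigned char *is_jump,
			     int i)
{
  int bytes = 0, jumps = 0, j;

  /* Walk backwards from I, accumulating minimum sizes, and stop once the
     window would exceed 16 bytes.  */
  for (j = i; j >= 0 && bytes + size[j] <= 16; j--)
    {
      bytes += size[j];
      jumps += is_jump[j] != 0;
    }
  return jumps >= 4;
}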
35817 /* AMD Athlon works faster
35818 when RET is not the destination of a conditional jump or directly preceded
35819 by another jump instruction. We avoid the penalty by inserting a NOP just
35820 before the RET instruction in such cases. */
35821 static void
35822 ix86_pad_returns (void)
35824 edge e;
35825 edge_iterator ei;
35827 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35829 basic_block bb = e->src;
35830 rtx ret = BB_END (bb);
35831 rtx prev;
35832 bool replace = false;
35834 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35835 || optimize_bb_for_size_p (bb))
35836 continue;
35837 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35838 if (active_insn_p (prev) || LABEL_P (prev))
35839 break;
35840 if (prev && LABEL_P (prev))
35842 edge e;
35843 edge_iterator ei;
35845 FOR_EACH_EDGE (e, ei, bb->preds)
35846 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35847 && !(e->flags & EDGE_FALLTHRU))
35849 replace = true;
35850 break;
35853 if (!replace)
35855 prev = prev_active_insn (ret);
35856 if (prev
35857 && ((JUMP_P (prev) && any_condjump_p (prev))
35858 || CALL_P (prev)))
35859 replace = true;
35860 /* Empty functions get a branch mispredict even when
35861 the jump destination is not visible to us. */
35862 if (!prev && !optimize_function_for_size_p (cfun))
35863 replace = true;
35865 if (replace)
35867 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35868 delete_insn (ret);
35873 /* Count the minimum number of instructions in BB. Return 4 if the
35874 number of instructions >= 4. */
35876 static int
35877 ix86_count_insn_bb (basic_block bb)
35879 rtx insn;
35880 int insn_count = 0;
35882 /* Count number of instructions in this block. Return 4 if the number
35883 of instructions >= 4. */
35884 FOR_BB_INSNS (bb, insn)
35886 /* This only happens in exit blocks. */
35887 if (JUMP_P (insn)
35888 && ANY_RETURN_P (PATTERN (insn)))
35889 break;
35891 if (NONDEBUG_INSN_P (insn)
35892 && GET_CODE (PATTERN (insn)) != USE
35893 && GET_CODE (PATTERN (insn)) != CLOBBER)
35895 insn_count++;
35896 if (insn_count >= 4)
35897 return insn_count;
35901 return insn_count;
35905 /* Count the minimum number of instructions in code path in BB.
35906 Return 4 if the number of instructions >= 4. */
35908 static int
35909 ix86_count_insn (basic_block bb)
35911 edge e;
35912 edge_iterator ei;
35913 int min_prev_count;
35915 /* Only bother counting instructions along paths with no
35916 more than 2 basic blocks between entry and exit. Given
35917 that BB has an edge to exit, determine if a predecessor
35918 of BB has an edge from entry. If so, compute the number
35919 of instructions in the predecessor block. If there
35920 happen to be multiple such blocks, compute the minimum. */
35921 min_prev_count = 4;
35922 FOR_EACH_EDGE (e, ei, bb->preds)
35924 edge prev_e;
35925 edge_iterator prev_ei;
35927 if (e->src == ENTRY_BLOCK_PTR)
35929 min_prev_count = 0;
35930 break;
35932 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35934 if (prev_e->src == ENTRY_BLOCK_PTR)
35936 int count = ix86_count_insn_bb (e->src);
35937 if (count < min_prev_count)
35938 min_prev_count = count;
35939 break;
35944 if (min_prev_count < 4)
35945 min_prev_count += ix86_count_insn_bb (bb);
35947 return min_prev_count;
35950 /* Pad short function to 4 instructions. */
35952 static void
35953 ix86_pad_short_function (void)
35955 edge e;
35956 edge_iterator ei;
35958 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35960 rtx ret = BB_END (e->src);
35961 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35963 int insn_count = ix86_count_insn (e->src);
35965 /* Pad short function. */
35966 if (insn_count < 4)
35968 rtx insn = ret;
35970 /* Find epilogue. */
35971 while (insn
35972 && (!NOTE_P (insn)
35973 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35974 insn = PREV_INSN (insn);
35976 if (!insn)
35977 insn = ret;
35979 /* Two NOPs count as one instruction. */
35980 insn_count = 2 * (4 - insn_count);
35981 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
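/* Worked example of the padding above (illustration only): if the shortest
   path into this return block contains two instructions, insn_count becomes
   2 * (4 - 2) == 4 and gen_nops emits four NOPs before the epilogue; since
   two NOPs are counted as one instruction, the padded path is treated as
   having the required four instructions.  */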
35987 /* Fix up a Windows system unwinder issue. If an EH region falls through into
35988 the epilogue, the Windows system unwinder will apply epilogue logic and
35989 produce incorrect offsets. This can be avoided by adding a nop between
35990 the last insn that can throw and the first insn of the epilogue. */
35992 static void
35993 ix86_seh_fixup_eh_fallthru (void)
35995 edge e;
35996 edge_iterator ei;
35998 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36000 rtx insn, next;
36002 /* Find the beginning of the epilogue. */
36003 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36004 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36005 break;
36006 if (insn == NULL)
36007 continue;
36009 /* We only care about preceding insns that can throw. */
36010 insn = prev_active_insn (insn);
36011 if (insn == NULL || !can_throw_internal (insn))
36012 continue;
36014 /* Do not separate calls from their debug information. */
36015 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36016 if (NOTE_P (next)
36017 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36018 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36019 insn = next;
36020 else
36021 break;
36023 emit_insn_after (gen_nops (const1_rtx), insn);
36027 /* Implement machine specific optimizations. We implement padding of returns
36028 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
36029 static void
36030 ix86_reorg (void)
36032 /* We are freeing block_for_insn in the toplev to keep compatibility
36033 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36034 compute_bb_for_insn ();
36036 if (TARGET_SEH && current_function_has_exception_handlers ())
36037 ix86_seh_fixup_eh_fallthru ();
36039 if (optimize && optimize_function_for_speed_p (cfun))
36041 if (TARGET_PAD_SHORT_FUNCTION)
36042 ix86_pad_short_function ();
36043 else if (TARGET_PAD_RETURNS)
36044 ix86_pad_returns ();
36045 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36046 if (TARGET_FOUR_JUMP_LIMIT)
36047 ix86_avoid_jump_mispredicts ();
36048 #endif
36052 /* Return nonzero when QImode register that must be represented via REX prefix
36053 is used. */
36054 bool
36055 x86_extended_QIreg_mentioned_p (rtx insn)
36057 int i;
36058 extract_insn_cached (insn);
36059 for (i = 0; i < recog_data.n_operands; i++)
36060 if (GENERAL_REG_P (recog_data.operand[i])
36061 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36062 return true;
36063 return false;
36066 /* Return nonzero when P points to register encoded via REX prefix.
36067 Called via for_each_rtx. */
36068 static int
36069 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36071 unsigned int regno;
36072 if (!REG_P (*p))
36073 return 0;
36074 regno = REGNO (*p);
36075 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36078 /* Return true when INSN mentions register that must be encoded using REX
36079 prefix. */
36080 bool
36081 x86_extended_reg_mentioned_p (rtx insn)
36083 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36084 extended_reg_mentioned_1, NULL);
36087 /* If profitable, negate (without causing overflow) integer constant
36088 of mode MODE at location LOC. Return true in this case. */
36089 bool
36090 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36092 HOST_WIDE_INT val;
36094 if (!CONST_INT_P (*loc))
36095 return false;
36097 switch (mode)
36099 case DImode:
36100 /* DImode x86_64 constants must fit in 32 bits. */
36101 gcc_assert (x86_64_immediate_operand (*loc, mode));
36103 mode = SImode;
36104 break;
36106 case SImode:
36107 case HImode:
36108 case QImode:
36109 break;
36111 default:
36112 gcc_unreachable ();
36115 /* Avoid overflows. */
36116 if (mode_signbit_p (mode, *loc))
36117 return false;
36119 val = INTVAL (*loc);
36121 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36122 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
36123 if ((val < 0 && val != -128)
36124 || val == 128)
36126 *loc = GEN_INT (-val);
36127 return true;
36130 return false;
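/* Worked examples of the transformation above (illustration only): a
   constant of -4 is replaced by 4, so the output becomes `subl $4,%eax'
   instead of `addl $-4,%eax'; 128 is replaced by -128 because -128 still
   fits in a sign-extended 8-bit immediate while 128 does not, and -128
   itself is left alone for the same reason.  */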
36133 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36134 optabs would emit if we didn't have TFmode patterns. */
36136 void
36137 x86_emit_floatuns (rtx operands[2])
36139 rtx neglab, donelab, i0, i1, f0, in, out;
36140 enum machine_mode mode, inmode;
36142 inmode = GET_MODE (operands[1]);
36143 gcc_assert (inmode == SImode || inmode == DImode);
36145 out = operands[0];
36146 in = force_reg (inmode, operands[1]);
36147 mode = GET_MODE (out);
36148 neglab = gen_label_rtx ();
36149 donelab = gen_label_rtx ();
36150 f0 = gen_reg_rtx (mode);
36152 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36154 expand_float (out, in, 0);
36156 emit_jump_insn (gen_jump (donelab));
36157 emit_barrier ();
36159 emit_label (neglab);
36161 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36162 1, OPTAB_DIRECT);
36163 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36164 1, OPTAB_DIRECT);
36165 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36167 expand_float (f0, i0, 0);
36169 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36171 emit_label (donelab);
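/* Minimal standalone sketch of the conversion emitted above (illustration
   only, assuming the usual two's-complement conversion between signed and
   unsigned): for values with the sign bit set, halve the input while
   folding the discarded low bit back in so rounding stays correct, convert
   as signed, then double the result.  */
static double
floatuns_sketch (unsigned long long u)
{
  if ((long long) u >= 0)
    return (double) (long long) u;	/* Fits as a signed value.  */

  unsigned long long half = (u >> 1) | (u & 1);
  double d = (double) (long long) half;
  return d + d;
}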
36174 /* AVX512F does support 64-byte integer vector operations,
36175 thus the longest vector we are faced with is V64QImode. */
36176 #define MAX_VECT_LEN 64
36178 struct expand_vec_perm_d
36180 rtx target, op0, op1;
36181 unsigned char perm[MAX_VECT_LEN];
36182 enum machine_mode vmode;
36183 unsigned char nelt;
36184 bool one_operand_p;
36185 bool testing_p;
36188 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36189 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36190 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36192 /* Get a vector mode of the same size as the original but with elements
36193 twice as wide. This is only guaranteed to apply to integral vectors. */
36195 static inline enum machine_mode
36196 get_mode_wider_vector (enum machine_mode o)
36198 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36199 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36200 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36201 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36202 return n;
36205 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36206 with all elements equal to VAR. Return true if successful. */
36208 static bool
36209 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36210 rtx target, rtx val)
36212 bool ok;
36214 switch (mode)
36216 case V2SImode:
36217 case V2SFmode:
36218 if (!mmx_ok)
36219 return false;
36220 /* FALLTHRU */
36222 case V4DFmode:
36223 case V4DImode:
36224 case V8SFmode:
36225 case V8SImode:
36226 case V2DFmode:
36227 case V2DImode:
36228 case V4SFmode:
36229 case V4SImode:
36231 rtx insn, dup;
36233 /* First attempt to recognize VAL as-is. */
36234 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36235 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36236 if (recog_memoized (insn) < 0)
36238 rtx seq;
36239 /* If that fails, force VAL into a register. */
36241 start_sequence ();
36242 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36243 seq = get_insns ();
36244 end_sequence ();
36245 if (seq)
36246 emit_insn_before (seq, insn);
36248 ok = recog_memoized (insn) >= 0;
36249 gcc_assert (ok);
36252 return true;
36254 case V4HImode:
36255 if (!mmx_ok)
36256 return false;
36257 if (TARGET_SSE || TARGET_3DNOW_A)
36259 rtx x;
36261 val = gen_lowpart (SImode, val);
36262 x = gen_rtx_TRUNCATE (HImode, val);
36263 x = gen_rtx_VEC_DUPLICATE (mode, x);
36264 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36265 return true;
36267 goto widen;
36269 case V8QImode:
36270 if (!mmx_ok)
36271 return false;
36272 goto widen;
36274 case V8HImode:
36275 if (TARGET_SSE2)
36277 struct expand_vec_perm_d dperm;
36278 rtx tmp1, tmp2;
36280 permute:
36281 memset (&dperm, 0, sizeof (dperm));
36282 dperm.target = target;
36283 dperm.vmode = mode;
36284 dperm.nelt = GET_MODE_NUNITS (mode);
36285 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36286 dperm.one_operand_p = true;
36288 /* Extend to SImode using a paradoxical SUBREG. */
36289 tmp1 = gen_reg_rtx (SImode);
36290 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36292 /* Insert the SImode value as low element of a V4SImode vector. */
36293 tmp2 = gen_lowpart (V4SImode, dperm.op0);
36294 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36296 ok = (expand_vec_perm_1 (&dperm)
36297 || expand_vec_perm_broadcast_1 (&dperm));
36298 gcc_assert (ok);
36299 return ok;
36301 goto widen;
36303 case V16QImode:
36304 if (TARGET_SSE2)
36305 goto permute;
36306 goto widen;
36308 widen:
36309 /* Replicate the value once into the next wider mode and recurse. */
36311 enum machine_mode smode, wsmode, wvmode;
36312 rtx x;
36314 smode = GET_MODE_INNER (mode);
36315 wvmode = get_mode_wider_vector (mode);
36316 wsmode = GET_MODE_INNER (wvmode);
36318 val = convert_modes (wsmode, smode, val, true);
36319 x = expand_simple_binop (wsmode, ASHIFT, val,
36320 GEN_INT (GET_MODE_BITSIZE (smode)),
36321 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36322 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36324 x = gen_lowpart (wvmode, target);
36325 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36326 gcc_assert (ok);
36327 return ok;
36330 case V16HImode:
36331 case V32QImode:
36333 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36334 rtx x = gen_reg_rtx (hvmode);
36336 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36337 gcc_assert (ok);
36339 x = gen_rtx_VEC_CONCAT (mode, x, x);
36340 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36342 return true;
36344 default:
36345 return false;
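/* Minimal standalone sketch of the "widen" strategy above (illustration
   only): a narrow scalar is replicated into the next wider scalar mode by
   shifting and OR-ing, e.g. the byte 0xAB becomes 0xABAB and then
   0xABABABAB, after which the wider vector mode is broadcast instead.  */
static unsigned int
replicate_byte_sketch (unsigned char b)
{
  unsigned int v = b;

  v |= v << 8;		/* QImode value replicated into HImode.  */
  v |= v << 16;		/* HImode value replicated into SImode.  */
  return v;
}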
36349 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36350 whose ONE_VAR element is VAR, and other elements are zero. Return true
36351 if successful. */
36353 static bool
36354 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36355 rtx target, rtx var, int one_var)
36357 enum machine_mode vsimode;
36358 rtx new_target;
36359 rtx x, tmp;
36360 bool use_vector_set = false;
36362 switch (mode)
36364 case V2DImode:
36365 /* For SSE4.1, we normally use vector set. But if the second
36366 element is zero and inter-unit moves are OK, we use movq
36367 instead. */
36368 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36369 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36370 && one_var == 0));
36371 break;
36372 case V16QImode:
36373 case V4SImode:
36374 case V4SFmode:
36375 use_vector_set = TARGET_SSE4_1;
36376 break;
36377 case V8HImode:
36378 use_vector_set = TARGET_SSE2;
36379 break;
36380 case V4HImode:
36381 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36382 break;
36383 case V32QImode:
36384 case V16HImode:
36385 case V8SImode:
36386 case V8SFmode:
36387 case V4DFmode:
36388 use_vector_set = TARGET_AVX;
36389 break;
36390 case V4DImode:
36391 /* Use ix86_expand_vector_set in 64bit mode only. */
36392 use_vector_set = TARGET_AVX && TARGET_64BIT;
36393 break;
36394 default:
36395 break;
36398 if (use_vector_set)
36400 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36401 var = force_reg (GET_MODE_INNER (mode), var);
36402 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36403 return true;
36406 switch (mode)
36408 case V2SFmode:
36409 case V2SImode:
36410 if (!mmx_ok)
36411 return false;
36412 /* FALLTHRU */
36414 case V2DFmode:
36415 case V2DImode:
36416 if (one_var != 0)
36417 return false;
36418 var = force_reg (GET_MODE_INNER (mode), var);
36419 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36420 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36421 return true;
36423 case V4SFmode:
36424 case V4SImode:
36425 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36426 new_target = gen_reg_rtx (mode);
36427 else
36428 new_target = target;
36429 var = force_reg (GET_MODE_INNER (mode), var);
36430 x = gen_rtx_VEC_DUPLICATE (mode, var);
36431 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36432 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36433 if (one_var != 0)
36435 /* We need to shuffle the value to the correct position, so
36436 create a new pseudo to store the intermediate result. */
36438 /* With SSE2, we can use the integer shuffle insns. */
36439 if (mode != V4SFmode && TARGET_SSE2)
36441 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36442 const1_rtx,
36443 GEN_INT (one_var == 1 ? 0 : 1),
36444 GEN_INT (one_var == 2 ? 0 : 1),
36445 GEN_INT (one_var == 3 ? 0 : 1)));
36446 if (target != new_target)
36447 emit_move_insn (target, new_target);
36448 return true;
36451 /* Otherwise convert the intermediate result to V4SFmode and
36452 use the SSE1 shuffle instructions. */
36453 if (mode != V4SFmode)
36455 tmp = gen_reg_rtx (V4SFmode);
36456 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36458 else
36459 tmp = new_target;
36461 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36462 const1_rtx,
36463 GEN_INT (one_var == 1 ? 0 : 1),
36464 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36465 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36467 if (mode != V4SFmode)
36468 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36469 else if (tmp != target)
36470 emit_move_insn (target, tmp);
36472 else if (target != new_target)
36473 emit_move_insn (target, new_target);
36474 return true;
36476 case V8HImode:
36477 case V16QImode:
36478 vsimode = V4SImode;
36479 goto widen;
36480 case V4HImode:
36481 case V8QImode:
36482 if (!mmx_ok)
36483 return false;
36484 vsimode = V2SImode;
36485 goto widen;
36486 widen:
36487 if (one_var != 0)
36488 return false;
36490 /* Zero extend the variable element to SImode and recurse. */
36491 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36493 x = gen_reg_rtx (vsimode);
36494 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36495 var, one_var))
36496 gcc_unreachable ();
36498 emit_move_insn (target, gen_lowpart (mode, x));
36499 return true;
36501 default:
36502 return false;
36506 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36507 consisting of the values in VALS. It is known that all elements
36508 except ONE_VAR are constants. Return true if successful. */
36510 static bool
36511 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36512 rtx target, rtx vals, int one_var)
36514 rtx var = XVECEXP (vals, 0, one_var);
36515 enum machine_mode wmode;
36516 rtx const_vec, x;
36518 const_vec = copy_rtx (vals);
36519 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36520 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36522 switch (mode)
36524 case V2DFmode:
36525 case V2DImode:
36526 case V2SFmode:
36527 case V2SImode:
36528 /* For the two element vectors, it's just as easy to use
36529 the general case. */
36530 return false;
36532 case V4DImode:
36533 /* Use ix86_expand_vector_set in 64bit mode only. */
36534 if (!TARGET_64BIT)
36535 return false;
36536 case V4DFmode:
36537 case V8SFmode:
36538 case V8SImode:
36539 case V16HImode:
36540 case V32QImode:
36541 case V4SFmode:
36542 case V4SImode:
36543 case V8HImode:
36544 case V4HImode:
36545 break;
36547 case V16QImode:
36548 if (TARGET_SSE4_1)
36549 break;
36550 wmode = V8HImode;
36551 goto widen;
36552 case V8QImode:
36553 wmode = V4HImode;
36554 goto widen;
36555 widen:
36556 /* There's no way to set one QImode entry easily. Combine
36557 the variable value with its adjacent constant value, and
36558 promote to an HImode set. */
36559 x = XVECEXP (vals, 0, one_var ^ 1);
36560 if (one_var & 1)
36562 var = convert_modes (HImode, QImode, var, true);
36563 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36564 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36565 x = GEN_INT (INTVAL (x) & 0xff);
36567 else
36569 var = convert_modes (HImode, QImode, var, true);
36570 x = gen_int_mode (INTVAL (x) << 8, HImode);
36572 if (x != const0_rtx)
36573 var = expand_simple_binop (HImode, IOR, var, x, var,
36574 1, OPTAB_LIB_WIDEN);
36576 x = gen_reg_rtx (wmode);
36577 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36578 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36580 emit_move_insn (target, gen_lowpart (mode, x));
36581 return true;
36583 default:
36584 return false;
36587 emit_move_insn (target, const_vec);
36588 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36589 return true;
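/* Worked example of the widening path above (illustration only): to build a
   V16QImode vector whose only variable element is at index 5, the variable
   byte is zero-extended to HImode and shifted into the high half (5 is
   odd), OR-ed with the constant byte at index 4, and the combined HImode
   value is then inserted as element 2 (5 >> 1) of the V8HImode view of the
   constant vector.  */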
36592 /* A subroutine of ix86_expand_vector_init_general. Use vector
36593 concatenate to handle the most general case: all values variable,
36594 and none identical. */
36596 static void
36597 ix86_expand_vector_init_concat (enum machine_mode mode,
36598 rtx target, rtx *ops, int n)
36600 enum machine_mode cmode, hmode = VOIDmode;
36601 rtx first[8], second[4];
36602 rtvec v;
36603 int i, j;
36605 switch (n)
36607 case 2:
36608 switch (mode)
36610 case V8SImode:
36611 cmode = V4SImode;
36612 break;
36613 case V8SFmode:
36614 cmode = V4SFmode;
36615 break;
36616 case V4DImode:
36617 cmode = V2DImode;
36618 break;
36619 case V4DFmode:
36620 cmode = V2DFmode;
36621 break;
36622 case V4SImode:
36623 cmode = V2SImode;
36624 break;
36625 case V4SFmode:
36626 cmode = V2SFmode;
36627 break;
36628 case V2DImode:
36629 cmode = DImode;
36630 break;
36631 case V2SImode:
36632 cmode = SImode;
36633 break;
36634 case V2DFmode:
36635 cmode = DFmode;
36636 break;
36637 case V2SFmode:
36638 cmode = SFmode;
36639 break;
36640 default:
36641 gcc_unreachable ();
36644 if (!register_operand (ops[1], cmode))
36645 ops[1] = force_reg (cmode, ops[1]);
36646 if (!register_operand (ops[0], cmode))
36647 ops[0] = force_reg (cmode, ops[0]);
36648 emit_insn (gen_rtx_SET (VOIDmode, target,
36649 gen_rtx_VEC_CONCAT (mode, ops[0],
36650 ops[1])));
36651 break;
36653 case 4:
36654 switch (mode)
36656 case V4DImode:
36657 cmode = V2DImode;
36658 break;
36659 case V4DFmode:
36660 cmode = V2DFmode;
36661 break;
36662 case V4SImode:
36663 cmode = V2SImode;
36664 break;
36665 case V4SFmode:
36666 cmode = V2SFmode;
36667 break;
36668 default:
36669 gcc_unreachable ();
36671 goto half;
36673 case 8:
36674 switch (mode)
36676 case V8SImode:
36677 cmode = V2SImode;
36678 hmode = V4SImode;
36679 break;
36680 case V8SFmode:
36681 cmode = V2SFmode;
36682 hmode = V4SFmode;
36683 break;
36684 default:
36685 gcc_unreachable ();
36687 goto half;
36689 half:
36690 /* FIXME: We process inputs backward to help RA. PR 36222. */
36691 i = n - 1;
36692 j = (n >> 1) - 1;
36693 for (; i > 0; i -= 2, j--)
36695 first[j] = gen_reg_rtx (cmode);
36696 v = gen_rtvec (2, ops[i - 1], ops[i]);
36697 ix86_expand_vector_init (false, first[j],
36698 gen_rtx_PARALLEL (cmode, v));
36701 n >>= 1;
36702 if (n > 2)
36704 gcc_assert (hmode != VOIDmode);
36705 for (i = j = 0; i < n; i += 2, j++)
36707 second[j] = gen_reg_rtx (hmode);
36708 ix86_expand_vector_init_concat (hmode, second [j],
36709 &first [i], 2);
36711 n >>= 1;
36712 ix86_expand_vector_init_concat (mode, target, second, n);
36714 else
36715 ix86_expand_vector_init_concat (mode, target, first, n);
36716 break;
36718 default:
36719 gcc_unreachable ();
36723 /* A subroutine of ix86_expand_vector_init_general. Use vector
36724 interleave to handle the most general case: all values variable,
36725 and none identical. */
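/* E.g. for V16QImode each pair of adjacent input bytes is first packed
into the low 16 bits of a fresh vector (ops[2*i] via an SImode move into
lane 0, ops[2*i+1] via gen_load_even into position 1); the resulting
vectors are then combined with successive interleave-low operations at
V8HImode, V4SImode and finally V2DImode granularity.  */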
36727 static void
36728 ix86_expand_vector_init_interleave (enum machine_mode mode,
36729 rtx target, rtx *ops, int n)
36731 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36732 int i, j;
36733 rtx op0, op1;
36734 rtx (*gen_load_even) (rtx, rtx, rtx);
36735 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36736 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36738 switch (mode)
36740 case V8HImode:
36741 gen_load_even = gen_vec_setv8hi;
36742 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36743 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36744 inner_mode = HImode;
36745 first_imode = V4SImode;
36746 second_imode = V2DImode;
36747 third_imode = VOIDmode;
36748 break;
36749 case V16QImode:
36750 gen_load_even = gen_vec_setv16qi;
36751 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36752 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36753 inner_mode = QImode;
36754 first_imode = V8HImode;
36755 second_imode = V4SImode;
36756 third_imode = V2DImode;
36757 break;
36758 default:
36759 gcc_unreachable ();
36762 for (i = 0; i < n; i++)
36764 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36765 op0 = gen_reg_rtx (SImode);
36766 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36768 /* Insert the SImode value as low element of V4SImode vector. */
36769 op1 = gen_reg_rtx (V4SImode);
36770 op0 = gen_rtx_VEC_MERGE (V4SImode,
36771 gen_rtx_VEC_DUPLICATE (V4SImode,
36772 op0),
36773 CONST0_RTX (V4SImode),
36774 const1_rtx);
36775 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36777 /* Cast the V4SImode vector back to a vector in the original mode. */
36778 op0 = gen_reg_rtx (mode);
36779 emit_move_insn (op0, gen_lowpart (mode, op1));
36781 /* Load even elements into the second position. */
36782 emit_insn (gen_load_even (op0,
36783 force_reg (inner_mode,
36784 ops [i + i + 1]),
36785 const1_rtx));
36787 /* Cast vector to FIRST_IMODE vector. */
36788 ops[i] = gen_reg_rtx (first_imode);
36789 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36792 /* Interleave low FIRST_IMODE vectors. */
36793 for (i = j = 0; i < n; i += 2, j++)
36795 op0 = gen_reg_rtx (first_imode);
36796 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36798 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36799 ops[j] = gen_reg_rtx (second_imode);
36800 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36803 /* Interleave low SECOND_IMODE vectors. */
36804 switch (second_imode)
36806 case V4SImode:
36807 for (i = j = 0; i < n / 2; i += 2, j++)
36809 op0 = gen_reg_rtx (second_imode);
36810 emit_insn (gen_interleave_second_low (op0, ops[i],
36811 ops[i + 1]));
36813 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36814 vector. */
36815 ops[j] = gen_reg_rtx (third_imode);
36816 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36818 second_imode = V2DImode;
36819 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36820 /* FALLTHRU */
36822 case V2DImode:
36823 op0 = gen_reg_rtx (second_imode);
36824 emit_insn (gen_interleave_second_low (op0, ops[0],
36825 ops[1]));
36827 /* Cast the SECOND_IMODE vector back to a vector in the original
36828 mode. */
36829 emit_insn (gen_rtx_SET (VOIDmode, target,
36830 gen_lowpart (mode, op0)));
36831 break;
36833 default:
36834 gcc_unreachable ();
36838 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36839 all values variable, and none identical. */
36841 static void
36842 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36843 rtx target, rtx vals)
36845 rtx ops[32], op0, op1;
36846 enum machine_mode half_mode = VOIDmode;
36847 int n, i;
36849 switch (mode)
36851 case V2SFmode:
36852 case V2SImode:
36853 if (!mmx_ok && !TARGET_SSE)
36854 break;
36855 /* FALLTHRU */
36857 case V8SFmode:
36858 case V8SImode:
36859 case V4DFmode:
36860 case V4DImode:
36861 case V4SFmode:
36862 case V4SImode:
36863 case V2DFmode:
36864 case V2DImode:
36865 n = GET_MODE_NUNITS (mode);
36866 for (i = 0; i < n; i++)
36867 ops[i] = XVECEXP (vals, 0, i);
36868 ix86_expand_vector_init_concat (mode, target, ops, n);
36869 return;
36871 case V32QImode:
36872 half_mode = V16QImode;
36873 goto half;
36875 case V16HImode:
36876 half_mode = V8HImode;
36877 goto half;
36879 half:
36880 n = GET_MODE_NUNITS (mode);
36881 for (i = 0; i < n; i++)
36882 ops[i] = XVECEXP (vals, 0, i);
36883 op0 = gen_reg_rtx (half_mode);
36884 op1 = gen_reg_rtx (half_mode);
36885 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36886 n >> 2);
36887 ix86_expand_vector_init_interleave (half_mode, op1,
36888 &ops [n >> 1], n >> 2);
36889 emit_insn (gen_rtx_SET (VOIDmode, target,
36890 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36891 return;
36893 case V16QImode:
36894 if (!TARGET_SSE4_1)
36895 break;
36896 /* FALLTHRU */
36898 case V8HImode:
36899 if (!TARGET_SSE2)
36900 break;
36902 /* Don't use ix86_expand_vector_init_interleave if we can't
36903 move from GPR to SSE register directly. */
36904 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
36905 break;
36907 n = GET_MODE_NUNITS (mode);
36908 for (i = 0; i < n; i++)
36909 ops[i] = XVECEXP (vals, 0, i);
36910 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36911 return;
36913 case V4HImode:
36914 case V8QImode:
36915 break;
36917 default:
36918 gcc_unreachable ();
36922 int i, j, n_elts, n_words, n_elt_per_word;
36923 enum machine_mode inner_mode;
36924 rtx words[4], shift;
36926 inner_mode = GET_MODE_INNER (mode);
36927 n_elts = GET_MODE_NUNITS (mode);
36928 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36929 n_elt_per_word = n_elts / n_words;
36930 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36932 for (i = 0; i < n_words; ++i)
36934 rtx word = NULL_RTX;
36936 for (j = 0; j < n_elt_per_word; ++j)
36938 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36939 elt = convert_modes (word_mode, inner_mode, elt, true);
36941 if (j == 0)
36942 word = elt;
36943 else
36945 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36946 word, 1, OPTAB_LIB_WIDEN);
36947 word = expand_simple_binop (word_mode, IOR, word, elt,
36948 word, 1, OPTAB_LIB_WIDEN);
36952 words[i] = word;
36955 if (n_words == 1)
36956 emit_move_insn (target, gen_lowpart (mode, words[0]));
36957 else if (n_words == 2)
36959 rtx tmp = gen_reg_rtx (mode);
36960 emit_clobber (tmp);
36961 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36962 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36963 emit_move_insn (target, tmp);
36965 else if (n_words == 4)
36967 rtx tmp = gen_reg_rtx (V4SImode);
36968 gcc_assert (word_mode == SImode);
36969 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36970 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36971 emit_move_insn (target, gen_lowpart (mode, tmp));
36973 else
36974 gcc_unreachable ();
36978 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36979 instructions unless MMX_OK is true. */
36981 void
36982 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36984 enum machine_mode mode = GET_MODE (target);
36985 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36986 int n_elts = GET_MODE_NUNITS (mode);
36987 int n_var = 0, one_var = -1;
36988 bool all_same = true, all_const_zero = true;
36989 int i;
36990 rtx x;
36992 for (i = 0; i < n_elts; ++i)
36994 x = XVECEXP (vals, 0, i);
36995 if (!(CONST_INT_P (x)
36996 || GET_CODE (x) == CONST_DOUBLE
36997 || GET_CODE (x) == CONST_FIXED))
36998 n_var++, one_var = i;
36999 else if (x != CONST0_RTX (inner_mode))
37000 all_const_zero = false;
37001 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37002 all_same = false;
37005 /* Constants are best loaded from the constant pool. */
37006 if (n_var == 0)
37008 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37009 return;
37012 /* If all values are identical, broadcast the value. */
37013 if (all_same
37014 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37015 XVECEXP (vals, 0, 0)))
37016 return;
37018 /* Values where only one field is non-constant are best loaded from
37019 the pool and overwritten via move later. */
37020 if (n_var == 1)
37022 if (all_const_zero
37023 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37024 XVECEXP (vals, 0, one_var),
37025 one_var))
37026 return;
37028 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37029 return;
37032 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37035 void
37036 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37038 enum machine_mode mode = GET_MODE (target);
37039 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37040 enum machine_mode half_mode;
37041 bool use_vec_merge = false;
37042 rtx tmp;
37043 static rtx (*gen_extract[6][2]) (rtx, rtx)
37045 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37046 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37047 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37048 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37049 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37050 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37052 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37054 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37055 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37056 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37057 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37058 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37059 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37061 int i, j, n;
37063 switch (mode)
37065 case V2SFmode:
37066 case V2SImode:
37067 if (mmx_ok)
37069 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37070 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37071 if (elt == 0)
37072 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37073 else
37074 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37075 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37076 return;
37078 break;
37080 case V2DImode:
37081 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37082 if (use_vec_merge)
37083 break;
37085 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37086 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37087 if (elt == 0)
37088 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37089 else
37090 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37091 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37092 return;
37094 case V2DFmode:
37096 rtx op0, op1;
37098 /* For the two element vectors, we implement a VEC_CONCAT with
37099 the extraction of the other element. */
37101 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37102 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37104 if (elt == 0)
37105 op0 = val, op1 = tmp;
37106 else
37107 op0 = tmp, op1 = val;
37109 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37110 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37112 return;
37114 case V4SFmode:
37115 use_vec_merge = TARGET_SSE4_1;
37116 if (use_vec_merge)
37117 break;
37119 switch (elt)
37121 case 0:
37122 use_vec_merge = true;
37123 break;
37125 case 1:
37126 /* tmp = target = A B C D */
37127 tmp = copy_to_reg (target);
37128 /* target = A A B B */
37129 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37130 /* target = X A B B */
37131 ix86_expand_vector_set (false, target, val, 0);
37132 /* target = A X C D */
37133 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37134 const1_rtx, const0_rtx,
37135 GEN_INT (2+4), GEN_INT (3+4)));
37136 return;
37138 case 2:
37139 /* tmp = target = A B C D */
37140 tmp = copy_to_reg (target);
37141 /* tmp = X B C D */
37142 ix86_expand_vector_set (false, tmp, val, 0);
37143 /* target = A B X D */
37144 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37145 const0_rtx, const1_rtx,
37146 GEN_INT (0+4), GEN_INT (3+4)));
37147 return;
37149 case 3:
37150 /* tmp = target = A B C D */
37151 tmp = copy_to_reg (target);
37152 /* tmp = X B C D */
37153 ix86_expand_vector_set (false, tmp, val, 0);
37154 /* target = A B C X */
37155 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37156 const0_rtx, const1_rtx,
37157 GEN_INT (2+4), GEN_INT (0+4)));
37158 return;
37160 default:
37161 gcc_unreachable ();
37163 break;
37165 case V4SImode:
37166 use_vec_merge = TARGET_SSE4_1;
37167 if (use_vec_merge)
37168 break;
37170 /* Element 0 handled by vec_merge below. */
37171 if (elt == 0)
37173 use_vec_merge = true;
37174 break;
37177 if (TARGET_SSE2)
37179 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37180 store into element 0, then shuffle them back. */
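/* E.g. for ELT == 2 the shuffle order becomes { 2, 1, 0, 3 }, which
exchanges lanes 0 and 2; since that permutation is its own inverse, the
same pshufd applied after the element 0 store restores the original lane
order.  */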
37182 rtx order[4];
37184 order[0] = GEN_INT (elt);
37185 order[1] = const1_rtx;
37186 order[2] = const2_rtx;
37187 order[3] = GEN_INT (3);
37188 order[elt] = const0_rtx;
37190 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37191 order[1], order[2], order[3]));
37193 ix86_expand_vector_set (false, target, val, 0);
37195 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37196 order[1], order[2], order[3]));
37198 else
37200 /* For SSE1, we have to reuse the V4SF code. */
37201 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
37202 gen_lowpart (SFmode, val), elt);
37204 return;
37206 case V8HImode:
37207 use_vec_merge = TARGET_SSE2;
37208 break;
37209 case V4HImode:
37210 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37211 break;
37213 case V16QImode:
37214 use_vec_merge = TARGET_SSE4_1;
37215 break;
37217 case V8QImode:
37218 break;
37220 case V32QImode:
37221 half_mode = V16QImode;
37222 j = 0;
37223 n = 16;
37224 goto half;
37226 case V16HImode:
37227 half_mode = V8HImode;
37228 j = 1;
37229 n = 8;
37230 goto half;
37232 case V8SImode:
37233 half_mode = V4SImode;
37234 j = 2;
37235 n = 4;
37236 goto half;
37238 case V4DImode:
37239 half_mode = V2DImode;
37240 j = 3;
37241 n = 2;
37242 goto half;
37244 case V8SFmode:
37245 half_mode = V4SFmode;
37246 j = 4;
37247 n = 4;
37248 goto half;
37250 case V4DFmode:
37251 half_mode = V2DFmode;
37252 j = 5;
37253 n = 2;
37254 goto half;
37256 half:
37257 /* Compute offset. */
37258 i = elt / n;
37259 elt %= n;
37261 gcc_assert (i <= 1);
37263 /* Extract the half. */
37264 tmp = gen_reg_rtx (half_mode);
37265 emit_insn (gen_extract[j][i] (tmp, target));
37267 /* Put val in tmp at elt. */
37268 ix86_expand_vector_set (false, tmp, val, elt);
37270 /* Put it back. */
37271 emit_insn (gen_insert[j][i] (target, target, tmp));
37272 return;
37274 default:
37275 break;
37278 if (use_vec_merge)
37280 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37281 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37282 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37284 else
37286 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37288 emit_move_insn (mem, target);
37290 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37291 emit_move_insn (tmp, val);
37293 emit_move_insn (target, mem);
37297 void
37298 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37300 enum machine_mode mode = GET_MODE (vec);
37301 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37302 bool use_vec_extr = false;
37303 rtx tmp;
37305 switch (mode)
37307 case V2SImode:
37308 case V2SFmode:
37309 if (!mmx_ok)
37310 break;
37311 /* FALLTHRU */
37313 case V2DFmode:
37314 case V2DImode:
37315 use_vec_extr = true;
37316 break;
37318 case V4SFmode:
37319 use_vec_extr = TARGET_SSE4_1;
37320 if (use_vec_extr)
37321 break;
37323 switch (elt)
37325 case 0:
37326 tmp = vec;
37327 break;
37329 case 1:
37330 case 3:
37331 tmp = gen_reg_rtx (mode);
37332 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37333 GEN_INT (elt), GEN_INT (elt),
37334 GEN_INT (elt+4), GEN_INT (elt+4)));
37335 break;
37337 case 2:
37338 tmp = gen_reg_rtx (mode);
37339 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37340 break;
37342 default:
37343 gcc_unreachable ();
37345 vec = tmp;
37346 use_vec_extr = true;
37347 elt = 0;
37348 break;
37350 case V4SImode:
37351 use_vec_extr = TARGET_SSE4_1;
37352 if (use_vec_extr)
37353 break;
37355 if (TARGET_SSE2)
37357 switch (elt)
37359 case 0:
37360 tmp = vec;
37361 break;
37363 case 1:
37364 case 3:
37365 tmp = gen_reg_rtx (mode);
37366 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37367 GEN_INT (elt), GEN_INT (elt),
37368 GEN_INT (elt), GEN_INT (elt)));
37369 break;
37371 case 2:
37372 tmp = gen_reg_rtx (mode);
37373 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37374 break;
37376 default:
37377 gcc_unreachable ();
37379 vec = tmp;
37380 use_vec_extr = true;
37381 elt = 0;
37383 else
37385 /* For SSE1, we have to reuse the V4SF code. */
37386 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37387 gen_lowpart (V4SFmode, vec), elt);
37388 return;
37390 break;
37392 case V8HImode:
37393 use_vec_extr = TARGET_SSE2;
37394 break;
37395 case V4HImode:
37396 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37397 break;
37399 case V16QImode:
37400 use_vec_extr = TARGET_SSE4_1;
37401 break;
37403 case V8SFmode:
37404 if (TARGET_AVX)
37406 tmp = gen_reg_rtx (V4SFmode);
37407 if (elt < 4)
37408 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37409 else
37410 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37411 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37412 return;
37414 break;
37416 case V4DFmode:
37417 if (TARGET_AVX)
37419 tmp = gen_reg_rtx (V2DFmode);
37420 if (elt < 2)
37421 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37422 else
37423 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37424 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37425 return;
37427 break;
37429 case V32QImode:
37430 if (TARGET_AVX)
37432 tmp = gen_reg_rtx (V16QImode);
37433 if (elt < 16)
37434 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37435 else
37436 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37437 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37438 return;
37440 break;
37442 case V16HImode:
37443 if (TARGET_AVX)
37445 tmp = gen_reg_rtx (V8HImode);
37446 if (elt < 8)
37447 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37448 else
37449 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37450 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37451 return;
37453 break;
37455 case V8SImode:
37456 if (TARGET_AVX)
37458 tmp = gen_reg_rtx (V4SImode);
37459 if (elt < 4)
37460 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37461 else
37462 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37463 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37464 return;
37466 break;
37468 case V4DImode:
37469 if (TARGET_AVX)
37471 tmp = gen_reg_rtx (V2DImode);
37472 if (elt < 2)
37473 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37474 else
37475 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37476 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37477 return;
37479 break;
37481 case V8QImode:
37482 /* ??? Could extract the appropriate HImode element and shift. */
37483 default:
37484 break;
37487 if (use_vec_extr)
37489 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37490 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37492 /* Let the rtl optimizers know about the zero extension performed. */
37493 if (inner_mode == QImode || inner_mode == HImode)
37495 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37496 target = gen_lowpart (SImode, target);
37499 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37501 else
37503 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37505 emit_move_insn (mem, vec);
37507 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37508 emit_move_insn (target, tmp);
37512 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37513 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37514 The upper bits of DEST are undefined, though they shouldn't cause
37515 exceptions (some bits from src or all zeros are ok). */
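/* E.g. for a V4SImode SRC and I == 64 this emits a logical right shift of
the whole 128-bit vector by 32 bits (a psrldq byte shift), so element 1
lands in element 0 of DEST; for I == 128 the shift count is 64 bits and
the upper half moves down.  */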
37517 static void
37518 emit_reduc_half (rtx dest, rtx src, int i)
37520 rtx tem;
37521 switch (GET_MODE (src))
37523 case V4SFmode:
37524 if (i == 128)
37525 tem = gen_sse_movhlps (dest, src, src);
37526 else
37527 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37528 GEN_INT (1 + 4), GEN_INT (1 + 4));
37529 break;
37530 case V2DFmode:
37531 tem = gen_vec_interleave_highv2df (dest, src, src);
37532 break;
37533 case V16QImode:
37534 case V8HImode:
37535 case V4SImode:
37536 case V2DImode:
37537 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37538 gen_lowpart (V1TImode, src),
37539 GEN_INT (i / 2));
37540 break;
37541 case V8SFmode:
37542 if (i == 256)
37543 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37544 else
37545 tem = gen_avx_shufps256 (dest, src, src,
37546 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37547 break;
37548 case V4DFmode:
37549 if (i == 256)
37550 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37551 else
37552 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37553 break;
37554 case V32QImode:
37555 case V16HImode:
37556 case V8SImode:
37557 case V4DImode:
37558 if (i == 256)
37559 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37560 gen_lowpart (V4DImode, src),
37561 gen_lowpart (V4DImode, src),
37562 const1_rtx);
37563 else
37564 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37565 gen_lowpart (V2TImode, src),
37566 GEN_INT (i / 2));
37567 break;
37568 default:
37569 gcc_unreachable ();
37571 emit_insn (tem);
37574 /* Expand a vector reduction. FN is the binary pattern to reduce;
37575 DEST is the destination; IN is the input vector. */
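/* E.g. for a V4SImode input the loop below runs twice: first the upper
64 bits of the vector are brought down and combined element-wise with the
original, then the upper 32 bits of that partial result are brought down
and combined again, so element 0 of DEST ends up holding the reduction
over all four elements.  */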
37577 void
37578 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37580 rtx half, dst, vec = in;
37581 enum machine_mode mode = GET_MODE (in);
37582 int i;
37584 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37585 if (TARGET_SSE4_1
37586 && mode == V8HImode
37587 && fn == gen_uminv8hi3)
37589 emit_insn (gen_sse4_1_phminposuw (dest, in));
37590 return;
37593 for (i = GET_MODE_BITSIZE (mode);
37594 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37595 i >>= 1)
37597 half = gen_reg_rtx (mode);
37598 emit_reduc_half (half, vec, i);
37599 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37600 dst = dest;
37601 else
37602 dst = gen_reg_rtx (mode);
37603 emit_insn (fn (dst, half, vec));
37604 vec = dst;
37608 /* Target hook for scalar_mode_supported_p. */
37609 static bool
37610 ix86_scalar_mode_supported_p (enum machine_mode mode)
37612 if (DECIMAL_FLOAT_MODE_P (mode))
37613 return default_decimal_float_supported_p ();
37614 else if (mode == TFmode)
37615 return true;
37616 else
37617 return default_scalar_mode_supported_p (mode);
37620 /* Implements target hook vector_mode_supported_p. */
37621 static bool
37622 ix86_vector_mode_supported_p (enum machine_mode mode)
37624 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37625 return true;
37626 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37627 return true;
37628 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37629 return true;
37630 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37631 return true;
37632 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37633 return true;
37634 return false;
37637 /* Target hook for c_mode_for_suffix. */
37638 static enum machine_mode
37639 ix86_c_mode_for_suffix (char suffix)
37641 if (suffix == 'q')
37642 return TFmode;
37643 if (suffix == 'w')
37644 return XFmode;
37646 return VOIDmode;
37649 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37651 We do this in the new i386 backend to maintain source compatibility
37652 with the old cc0-based compiler. */
37654 static tree
37655 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37656 tree inputs ATTRIBUTE_UNUSED,
37657 tree clobbers)
37659 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37660 clobbers);
37661 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37662 clobbers);
37663 return clobbers;
37666 /* Implements target vector targetm.asm.encode_section_info. */
37668 static void ATTRIBUTE_UNUSED
37669 ix86_encode_section_info (tree decl, rtx rtl, int first)
37671 default_encode_section_info (decl, rtl, first);
37673 if (TREE_CODE (decl) == VAR_DECL
37674 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37675 && ix86_in_large_data_p (decl))
37676 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37679 /* Worker function for REVERSE_CONDITION. */
37681 enum rtx_code
37682 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37684 return (mode != CCFPmode && mode != CCFPUmode
37685 ? reverse_condition (code)
37686 : reverse_condition_maybe_unordered (code));
37689 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37690 to OPERANDS[0]. */
37692 const char *
37693 output_387_reg_move (rtx insn, rtx *operands)
37695 if (REG_P (operands[0]))
37697 if (REG_P (operands[1])
37698 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37700 if (REGNO (operands[0]) == FIRST_STACK_REG)
37701 return output_387_ffreep (operands, 0);
37702 return "fstp\t%y0";
37704 if (STACK_TOP_P (operands[0]))
37705 return "fld%Z1\t%y1";
37706 return "fst\t%y0";
37708 else if (MEM_P (operands[0]))
37710 gcc_assert (REG_P (operands[1]));
37711 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37712 return "fstp%Z0\t%y0";
37713 else
37715 /* There is no non-popping store to memory for XFmode.
37716 So if we need one, follow the store with a load. */
37717 if (GET_MODE (operands[0]) == XFmode)
37718 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37719 else
37720 return "fst%Z0\t%y0";
37723 else
37724 gcc_unreachable();
37727 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37728 FP status register is set. */
37730 void
37731 ix86_emit_fp_unordered_jump (rtx label)
37733 rtx reg = gen_reg_rtx (HImode);
37734 rtx temp;
37736 emit_insn (gen_x86_fnstsw_1 (reg));
37738 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37740 emit_insn (gen_x86_sahf_1 (reg));
37742 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37743 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37745 else
37747 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37749 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37750 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37753 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37754 gen_rtx_LABEL_REF (VOIDmode, label),
37755 pc_rtx);
37756 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37758 emit_jump_insn (temp);
37759 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37762 /* Output code to perform a log1p XFmode calculation. */
37764 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37766 rtx label1 = gen_label_rtx ();
37767 rtx label2 = gen_label_rtx ();
37769 rtx tmp = gen_reg_rtx (XFmode);
37770 rtx tmp2 = gen_reg_rtx (XFmode);
37771 rtx test;
37773 emit_insn (gen_absxf2 (tmp, op1));
37774 test = gen_rtx_GE (VOIDmode, tmp,
37775 CONST_DOUBLE_FROM_REAL_VALUE (
37776 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37777 XFmode));
37778 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37780 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37781 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37782 emit_jump (label2);
37784 emit_label (label1);
37785 emit_move_insn (tmp, CONST1_RTX (XFmode));
37786 emit_insn (gen_addxf3 (tmp, op1, tmp));
37787 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37788 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37790 emit_label (label2);
37793 /* Emit code for round calculation. */
37794 void ix86_emit_i387_round (rtx op0, rtx op1)
37796 enum machine_mode inmode = GET_MODE (op1);
37797 enum machine_mode outmode = GET_MODE (op0);
37798 rtx e1, e2, res, tmp, tmp1, half;
37799 rtx scratch = gen_reg_rtx (HImode);
37800 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37801 rtx jump_label = gen_label_rtx ();
37802 rtx insn;
37803 rtx (*gen_abs) (rtx, rtx);
37804 rtx (*gen_neg) (rtx, rtx);
37806 switch (inmode)
37808 case SFmode:
37809 gen_abs = gen_abssf2;
37810 break;
37811 case DFmode:
37812 gen_abs = gen_absdf2;
37813 break;
37814 case XFmode:
37815 gen_abs = gen_absxf2;
37816 break;
37817 default:
37818 gcc_unreachable ();
37821 switch (outmode)
37823 case SFmode:
37824 gen_neg = gen_negsf2;
37825 break;
37826 case DFmode:
37827 gen_neg = gen_negdf2;
37828 break;
37829 case XFmode:
37830 gen_neg = gen_negxf2;
37831 break;
37832 case HImode:
37833 gen_neg = gen_neghi2;
37834 break;
37835 case SImode:
37836 gen_neg = gen_negsi2;
37837 break;
37838 case DImode:
37839 gen_neg = gen_negdi2;
37840 break;
37841 default:
37842 gcc_unreachable ();
37845 e1 = gen_reg_rtx (inmode);
37846 e2 = gen_reg_rtx (inmode);
37847 res = gen_reg_rtx (outmode);
37849 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37851 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
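/* The fxam below captures the sign of OP1 before the fabs,
floor (fabs (op1) + 0.5) is computed in extended precision, and the
conditional negation at the end reapplies the saved sign,
e.g. round (-2.5) = -floor (2.5 + 0.5) = -3.  */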
37853 /* scratch = fxam(op1) */
37854 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37855 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37856 UNSPEC_FXAM)));
37857 /* e1 = fabs(op1) */
37858 emit_insn (gen_abs (e1, op1));
37860 /* e2 = e1 + 0.5 */
37861 half = force_reg (inmode, half);
37862 emit_insn (gen_rtx_SET (VOIDmode, e2,
37863 gen_rtx_PLUS (inmode, e1, half)));
37865 /* res = floor(e2) */
37866 if (inmode != XFmode)
37868 tmp1 = gen_reg_rtx (XFmode);
37870 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37871 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37873 else
37874 tmp1 = e2;
37876 switch (outmode)
37878 case SFmode:
37879 case DFmode:
37881 rtx tmp0 = gen_reg_rtx (XFmode);
37883 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37885 emit_insn (gen_rtx_SET (VOIDmode, res,
37886 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37887 UNSPEC_TRUNC_NOOP)));
37889 break;
37890 case XFmode:
37891 emit_insn (gen_frndintxf2_floor (res, tmp1));
37892 break;
37893 case HImode:
37894 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37895 break;
37896 case SImode:
37897 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37898 break;
37899 case DImode:
37900 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37901 break;
37902 default:
37903 gcc_unreachable ();
37906 /* flags = signbit(a) */
37907 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37909 /* if (flags) then res = -res */
37910 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37911 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37912 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37913 pc_rtx);
37914 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37915 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37916 JUMP_LABEL (insn) = jump_label;
37918 emit_insn (gen_neg (res, res));
37920 emit_label (jump_label);
37921 LABEL_NUSES (jump_label) = 1;
37923 emit_move_insn (op0, res);
37926 /* Output code to perform a Newton-Raphson approximation of a single precision
37927 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37929 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37931 rtx x0, x1, e0, e1;
37933 x0 = gen_reg_rtx (mode);
37934 e0 = gen_reg_rtx (mode);
37935 e1 = gen_reg_rtx (mode);
37936 x1 = gen_reg_rtx (mode);
37938 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
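/* This is one Newton-Raphson step for 1/b: with x0 = rcp (b),
x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0), which roughly doubles
the accuracy of the ~12-bit rcpss estimate; the result is then multiplied
by a.  */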
37940 b = force_reg (mode, b);
37942 /* x0 = rcp(b) estimate */
37943 emit_insn (gen_rtx_SET (VOIDmode, x0,
37944 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37945 UNSPEC_RCP)));
37946 /* e0 = x0 * b */
37947 emit_insn (gen_rtx_SET (VOIDmode, e0,
37948 gen_rtx_MULT (mode, x0, b)));
37950 /* e0 = x0 * e0 */
37951 emit_insn (gen_rtx_SET (VOIDmode, e0,
37952 gen_rtx_MULT (mode, x0, e0)));
37954 /* e1 = x0 + x0 */
37955 emit_insn (gen_rtx_SET (VOIDmode, e1,
37956 gen_rtx_PLUS (mode, x0, x0)));
37958 /* x1 = e1 - e0 */
37959 emit_insn (gen_rtx_SET (VOIDmode, x1,
37960 gen_rtx_MINUS (mode, e1, e0)));
37962 /* res = a * x1 */
37963 emit_insn (gen_rtx_SET (VOIDmode, res,
37964 gen_rtx_MULT (mode, a, x1)));
37967 /* Output code to perform a Newton-Raphson approximation of a
37968 single precision floating point [reciprocal] square root. */
37970 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37971 bool recip)
37973 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37974 REAL_VALUE_TYPE r;
37976 x0 = gen_reg_rtx (mode);
37977 e0 = gen_reg_rtx (mode);
37978 e1 = gen_reg_rtx (mode);
37979 e2 = gen_reg_rtx (mode);
37980 e3 = gen_reg_rtx (mode);
37982 real_from_integer (&r, VOIDmode, -3, -1, 0);
37983 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37985 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37986 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37988 if (VECTOR_MODE_P (mode))
37990 mthree = ix86_build_const_vector (mode, true, mthree);
37991 mhalf = ix86_build_const_vector (mode, true, mhalf);
37994 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37995 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
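/* Both forms are one Newton-Raphson step for 1/sqrt(a): with
x0 = rsqrtss (a), x1 = 0.5 * x0 * (3 - a * x0 * x0)
= -0.5 * x0 * (a * x0 * x0 - 3). For the sqrt variant the final factor is
-0.5 * (a * x0) rather than -0.5 * x0, exploiting a / sqrt(a) = sqrt(a).  */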
37997 a = force_reg (mode, a);
37999 /* x0 = rsqrt(a) estimate */
38000 emit_insn (gen_rtx_SET (VOIDmode, x0,
38001 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38002 UNSPEC_RSQRT)));
38004 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN result for sqrt (0.0). */
38005 if (!recip)
38007 rtx zero, mask;
38009 zero = gen_reg_rtx (mode);
38010 mask = gen_reg_rtx (mode);
38012 zero = force_reg (mode, CONST0_RTX(mode));
38013 emit_insn (gen_rtx_SET (VOIDmode, mask,
38014 gen_rtx_NE (mode, zero, a)));
38016 emit_insn (gen_rtx_SET (VOIDmode, x0,
38017 gen_rtx_AND (mode, x0, mask)));
38020 /* e0 = x0 * a */
38021 emit_insn (gen_rtx_SET (VOIDmode, e0,
38022 gen_rtx_MULT (mode, x0, a)));
38023 /* e1 = e0 * x0 */
38024 emit_insn (gen_rtx_SET (VOIDmode, e1,
38025 gen_rtx_MULT (mode, e0, x0)));
38027 /* e2 = e1 - 3. */
38028 mthree = force_reg (mode, mthree);
38029 emit_insn (gen_rtx_SET (VOIDmode, e2,
38030 gen_rtx_PLUS (mode, e1, mthree)));
38032 mhalf = force_reg (mode, mhalf);
38033 if (recip)
38034 /* e3 = -.5 * x0 */
38035 emit_insn (gen_rtx_SET (VOIDmode, e3,
38036 gen_rtx_MULT (mode, x0, mhalf)));
38037 else
38038 /* e3 = -.5 * e0 */
38039 emit_insn (gen_rtx_SET (VOIDmode, e3,
38040 gen_rtx_MULT (mode, e0, mhalf)));
38041 /* ret = e2 * e3 */
38042 emit_insn (gen_rtx_SET (VOIDmode, res,
38043 gen_rtx_MULT (mode, e2, e3)));
38046 #ifdef TARGET_SOLARIS
38047 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38049 static void
38050 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38051 tree decl)
38053 /* With Binutils 2.15, the "@unwind" marker must be specified on
38054 every occurrence of the ".eh_frame" section, not just the first
38055 one. */
38056 if (TARGET_64BIT
38057 && strcmp (name, ".eh_frame") == 0)
38059 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38060 flags & SECTION_WRITE ? "aw" : "a");
38061 return;
38064 #ifndef USE_GAS
38065 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38067 solaris_elf_asm_comdat_section (name, flags, decl);
38068 return;
38070 #endif
38072 default_elf_asm_named_section (name, flags, decl);
38074 #endif /* TARGET_SOLARIS */
38076 /* Return the mangling of TYPE if it is an extended fundamental type. */
38078 static const char *
38079 ix86_mangle_type (const_tree type)
38081 type = TYPE_MAIN_VARIANT (type);
38083 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38084 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38085 return NULL;
38087 switch (TYPE_MODE (type))
38089 case TFmode:
38090 /* __float128 is "g". */
38091 return "g";
38092 case XFmode:
38093 /* "long double" or __float80 is "e". */
38094 return "e";
38095 default:
38096 return NULL;
38100 /* For 32-bit code we can save PIC register setup by using
38101 __stack_chk_fail_local hidden function instead of calling
38102 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
38103 register, so it is better to call __stack_chk_fail directly. */
38105 static tree ATTRIBUTE_UNUSED
38106 ix86_stack_protect_fail (void)
38108 return TARGET_64BIT
38109 ? default_external_stack_protect_fail ()
38110 : default_hidden_stack_protect_fail ();
38113 /* Select a format to encode pointers in exception handling data. CODE
38114 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38115 true if the symbol may be affected by dynamic relocations.
38117 ??? All x86 object file formats are capable of representing this.
38118 After all, the relocation needed is the same as for the call insn.
38119 Whether or not a particular assembler allows us to enter such, I
38120 guess we'll have to see. */
38122 asm_preferred_eh_data_format (int code, int global)
38124 if (flag_pic)
38126 int type = DW_EH_PE_sdata8;
38127 if (!TARGET_64BIT
38128 || ix86_cmodel == CM_SMALL_PIC
38129 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38130 type = DW_EH_PE_sdata4;
38131 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38133 if (ix86_cmodel == CM_SMALL
38134 || (ix86_cmodel == CM_MEDIUM && code))
38135 return DW_EH_PE_udata4;
38136 return DW_EH_PE_absptr;
38139 /* Expand copysign from SIGN to the positive value ABS_VALUE
38140 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
38141 the sign-bit. */
38142 static void
38143 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38145 enum machine_mode mode = GET_MODE (sign);
38146 rtx sgn = gen_reg_rtx (mode);
38147 if (mask == NULL_RTX)
38149 enum machine_mode vmode;
38151 if (mode == SFmode)
38152 vmode = V4SFmode;
38153 else if (mode == DFmode)
38154 vmode = V2DFmode;
38155 else
38156 vmode = mode;
38158 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38159 if (!VECTOR_MODE_P (mode))
38161 /* We need to generate a scalar mode mask in this case. */
38162 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38163 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38164 mask = gen_reg_rtx (mode);
38165 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38168 else
38169 mask = gen_rtx_NOT (mode, mask);
38170 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38171 gen_rtx_AND (mode, mask, sign)));
38172 emit_insn (gen_rtx_SET (VOIDmode, result,
38173 gen_rtx_IOR (mode, abs_value, sgn)));
38176 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38177 mask for masking out the sign-bit is stored in *SMASK, if that is
38178 non-null. */
38179 static rtx
38180 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38182 enum machine_mode vmode, mode = GET_MODE (op0);
38183 rtx xa, mask;
38185 xa = gen_reg_rtx (mode);
38186 if (mode == SFmode)
38187 vmode = V4SFmode;
38188 else if (mode == DFmode)
38189 vmode = V2DFmode;
38190 else
38191 vmode = mode;
38192 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38193 if (!VECTOR_MODE_P (mode))
38195 /* We need to generate a scalar mode mask in this case. */
38196 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38197 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38198 mask = gen_reg_rtx (mode);
38199 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38201 emit_insn (gen_rtx_SET (VOIDmode, xa,
38202 gen_rtx_AND (mode, op0, mask)));
38204 if (smask)
38205 *smask = mask;
38207 return xa;
38210 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38211 swapping the operands if SWAP_OPERANDS is true. The expanded
38212 code is a forward jump to a newly created label in case the
38213 comparison is true. The generated label rtx is returned. */
38214 static rtx
38215 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38216 bool swap_operands)
38218 rtx label, tmp;
38220 if (swap_operands)
38222 tmp = op0;
38223 op0 = op1;
38224 op1 = tmp;
38227 label = gen_label_rtx ();
38228 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
38229 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38230 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
38231 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38232 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38233 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38234 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38235 JUMP_LABEL (tmp) = label;
38237 return label;
38240 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38241 using comparison code CODE. Operands are swapped for the comparison if
38242 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38243 static rtx
38244 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38245 bool swap_operands)
38247 rtx (*insn)(rtx, rtx, rtx, rtx);
38248 enum machine_mode mode = GET_MODE (op0);
38249 rtx mask = gen_reg_rtx (mode);
38251 if (swap_operands)
38253 rtx tmp = op0;
38254 op0 = op1;
38255 op1 = tmp;
38258 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38260 emit_insn (insn (mask, op0, op1,
38261 gen_rtx_fmt_ee (code, mode, op0, op1)));
38262 return mask;
38265 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38266 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
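/* The sequences below rely on the fact that for 0 <= x < 2**52 the sum
x + 2**52 lies in [2**52, 2**53), where the spacing between adjacent
doubles is 1.0; the addition therefore rounds x to an integer under the
current rounding mode, and subtracting 2**52 afterwards is exact. The
SFmode variant uses 2**23 in the same way.  */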
38267 static rtx
38268 ix86_gen_TWO52 (enum machine_mode mode)
38270 REAL_VALUE_TYPE TWO52r;
38271 rtx TWO52;
38273 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38274 TWO52 = const_double_from_real_value (TWO52r, mode);
38275 TWO52 = force_reg (mode, TWO52);
38277 return TWO52;
38280 /* Expand SSE sequence for computing lround from OP1 storing
38281 into OP0. */
38282 void
38283 ix86_expand_lround (rtx op0, rtx op1)
38285 /* C code for the stuff we're doing below:
38286 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38287 return (long)tmp;
38289 enum machine_mode mode = GET_MODE (op1);
38290 const struct real_format *fmt;
38291 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38292 rtx adj;
38294 /* load nextafter (0.5, 0.0) */
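/* Using the double just below 0.5 rather than 0.5 itself avoids a wrong
result for inputs slightly under 0.5: for the largest double smaller than
0.5, adding exactly 0.5 would round up to 1.0 and truncate to 1, whereas
adding pred (0.5) stays below 1.0 and correctly truncates to 0.  */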
38295 fmt = REAL_MODE_FORMAT (mode);
38296 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38297 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38299 /* adj = copysign (0.5, op1) */
38300 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38301 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38303 /* adj = op1 + adj */
38304 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38306 /* op0 = (imode)adj */
38307 expand_fix (op0, adj, 0);
38310 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
38311 into OPERAND0. */
38312 void
38313 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38315 /* C code for the stuff we're doing below (for do_floor):
38316 xi = (long)op1;
38317 xi -= (double)xi > op1 ? 1 : 0;
38318 return xi;
38320 enum machine_mode fmode = GET_MODE (op1);
38321 enum machine_mode imode = GET_MODE (op0);
38322 rtx ireg, freg, label, tmp;
38324 /* reg = (long)op1 */
38325 ireg = gen_reg_rtx (imode);
38326 expand_fix (ireg, op1, 0);
38328 /* freg = (double)reg */
38329 freg = gen_reg_rtx (fmode);
38330 expand_float (freg, ireg, 0);
38332 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38333 label = ix86_expand_sse_compare_and_jump (UNLE,
38334 freg, op1, !do_floor);
38335 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38336 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38337 emit_move_insn (ireg, tmp);
38339 emit_label (label);
38340 LABEL_NUSES (label) = 1;
38342 emit_move_insn (op0, ireg);
38345 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38346 result in OPERAND0. */
38347 void
38348 ix86_expand_rint (rtx operand0, rtx operand1)
38350 /* C code for the stuff we're doing below:
38351 xa = fabs (operand1);
38352 if (!isless (xa, 2**52))
38353 return operand1;
38354 xa = xa + 2**52 - 2**52;
38355 return copysign (xa, operand1);
38357 enum machine_mode mode = GET_MODE (operand0);
38358 rtx res, xa, label, TWO52, mask;
38360 res = gen_reg_rtx (mode);
38361 emit_move_insn (res, operand1);
38363 /* xa = abs (operand1) */
38364 xa = ix86_expand_sse_fabs (res, &mask);
38366 /* if (!isless (xa, TWO52)) goto label; */
38367 TWO52 = ix86_gen_TWO52 (mode);
38368 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38370 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38371 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38373 ix86_sse_copysign_to_positive (res, xa, res, mask);
38375 emit_label (label);
38376 LABEL_NUSES (label) = 1;
38378 emit_move_insn (operand0, res);
38381 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38382 into OPERAND0. */
38383 void
38384 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38386 /* C code for the stuff we expand below.
38387 double xa = fabs (x), x2;
38388 if (!isless (xa, TWO52))
38389 return x;
38390 xa = xa + TWO52 - TWO52;
38391 x2 = copysign (xa, x);
38392 Compensate. Floor:
38393 if (x2 > x)
38394 x2 -= 1;
38395 Compensate. Ceil:
38396 if (x2 < x)
38397 x2 -= -1;
38398 return x2;
38400 enum machine_mode mode = GET_MODE (operand0);
38401 rtx xa, TWO52, tmp, label, one, res, mask;
38403 TWO52 = ix86_gen_TWO52 (mode);
38405 /* Temporary for holding the result, initialized to the input
38406 operand to ease control flow. */
38407 res = gen_reg_rtx (mode);
38408 emit_move_insn (res, operand1);
38410 /* xa = abs (operand1) */
38411 xa = ix86_expand_sse_fabs (res, &mask);
38413 /* if (!isless (xa, TWO52)) goto label; */
38414 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38416 /* xa = xa + TWO52 - TWO52; */
38417 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38418 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38420 /* xa = copysign (xa, operand1) */
38421 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38423 /* generate 1.0 or -1.0 */
38424 one = force_reg (mode,
38425 const_double_from_real_value (do_floor
38426 ? dconst1 : dconstm1, mode));
38428 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38429 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38430 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38431 gen_rtx_AND (mode, one, tmp)));
38432 /* We always need to subtract here to preserve signed zero. */
38433 tmp = expand_simple_binop (mode, MINUS,
38434 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38435 emit_move_insn (res, tmp);
38437 emit_label (label);
38438 LABEL_NUSES (label) = 1;
38440 emit_move_insn (operand0, res);
38443 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38444 into OPERAND0. */
38445 void
38446 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38448 /* C code for the stuff we expand below.
38449 double xa = fabs (x), x2;
38450 if (!isless (xa, TWO52))
38451 return x;
38452 x2 = (double)(long)x;
38453 Compensate. Floor:
38454 if (x2 > x)
38455 x2 -= 1;
38456 Compensate. Ceil:
38457 if (x2 < x)
38458 x2 += 1;
38459 if (HONOR_SIGNED_ZEROS (mode))
38460 return copysign (x2, x);
38461 return x2;
38463 enum machine_mode mode = GET_MODE (operand0);
38464 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38466 TWO52 = ix86_gen_TWO52 (mode);
38468 /* Temporary for holding the result, initialized to the input
38469 operand to ease control flow. */
38470 res = gen_reg_rtx (mode);
38471 emit_move_insn (res, operand1);
38473 /* xa = abs (operand1) */
38474 xa = ix86_expand_sse_fabs (res, &mask);
38476 /* if (!isless (xa, TWO52)) goto label; */
38477 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38479 /* xa = (double)(long)x */
38480 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38481 expand_fix (xi, res, 0);
38482 expand_float (xa, xi, 0);
38484 /* generate 1.0 */
38485 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38487 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38488 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38489 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38490 gen_rtx_AND (mode, one, tmp)));
38491 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38492 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38493 emit_move_insn (res, tmp);
38495 if (HONOR_SIGNED_ZEROS (mode))
38496 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38498 emit_label (label);
38499 LABEL_NUSES (label) = 1;
38501 emit_move_insn (operand0, res);
38504 /* Expand SSE sequence for computing round from OPERAND1 storing
38505 into OPERAND0. Sequence that works without relying on DImode truncation
38506 via cvttsd2siq that is only available on 64bit targets. */
38507 void
38508 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38510 /* C code for the stuff we expand below.
38511 double xa = fabs (x), xa2, x2;
38512 if (!isless (xa, TWO52))
38513 return x;
38514 Using the absolute value and copying back sign makes
38515 -0.0 -> -0.0 correct.
38516 xa2 = xa + TWO52 - TWO52;
38517 Compensate.
38518 dxa = xa2 - xa;
38519 if (dxa <= -0.5)
38520 xa2 += 1;
38521 else if (dxa > 0.5)
38522 xa2 -= 1;
38523 x2 = copysign (xa2, x);
38524 return x2;
38526 enum machine_mode mode = GET_MODE (operand0);
38527 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38529 TWO52 = ix86_gen_TWO52 (mode);
38531 /* Temporary for holding the result, initialized to the input
38532 operand to ease control flow. */
38533 res = gen_reg_rtx (mode);
38534 emit_move_insn (res, operand1);
38536 /* xa = abs (operand1) */
38537 xa = ix86_expand_sse_fabs (res, &mask);
38539 /* if (!isless (xa, TWO52)) goto label; */
38540 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38542 /* xa2 = xa + TWO52 - TWO52; */
38543 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38544 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38546 /* dxa = xa2 - xa; */
38547 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38549 /* generate 0.5, 1.0 and -0.5 */
38550 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38551 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38552 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38553 0, OPTAB_DIRECT);
38555 /* Compensate. */
38556 tmp = gen_reg_rtx (mode);
38557 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38558 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38559 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38560 gen_rtx_AND (mode, one, tmp)));
38561 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38562 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38563 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38564 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38565 gen_rtx_AND (mode, one, tmp)));
38566 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38568 /* res = copysign (xa2, operand1) */
38569 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38571 emit_label (label);
38572 LABEL_NUSES (label) = 1;
38574 emit_move_insn (operand0, res);
38577 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38578 into OPERAND0. */
38579 void
38580 ix86_expand_trunc (rtx operand0, rtx operand1)
38582 /* C code for SSE variant we expand below.
38583 double xa = fabs (x), x2;
38584 if (!isless (xa, TWO52))
38585 return x;
38586 x2 = (double)(long)x;
38587 if (HONOR_SIGNED_ZEROS (mode))
38588 return copysign (x2, x);
38589 return x2;
38591 enum machine_mode mode = GET_MODE (operand0);
38592 rtx xa, xi, TWO52, label, res, mask;
38594 TWO52 = ix86_gen_TWO52 (mode);
38596 /* Temporary for holding the result, initialized to the input
38597 operand to ease control flow. */
38598 res = gen_reg_rtx (mode);
38599 emit_move_insn (res, operand1);
38601 /* xa = abs (operand1) */
38602 xa = ix86_expand_sse_fabs (res, &mask);
38604 /* if (!isless (xa, TWO52)) goto label; */
38605 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38607 /* x = (double)(long)x */
38608 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38609 expand_fix (xi, res, 0);
38610 expand_float (res, xi, 0);
38612 if (HONOR_SIGNED_ZEROS (mode))
38613 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38615 emit_label (label);
38616 LABEL_NUSES (label) = 1;
38618 emit_move_insn (operand0, res);
38621 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38622 into OPERAND0. */
38623 void
38624 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38626 enum machine_mode mode = GET_MODE (operand0);
38627 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38629 /* C code for SSE variant we expand below.
38630 double xa = fabs (x), x2;
38631 if (!isless (xa, TWO52))
38632 return x;
38633 xa2 = xa + TWO52 - TWO52;
38634 Compensate:
38635 if (xa2 > xa)
38636 xa2 -= 1.0;
38637 x2 = copysign (xa2, x);
38638 return x2;
38641 TWO52 = ix86_gen_TWO52 (mode);
38643 /* Temporary for holding the result, initialized to the input
38644 operand to ease control flow. */
38645 res = gen_reg_rtx (mode);
38646 emit_move_insn (res, operand1);
38648 /* xa = abs (operand1) */
38649 xa = ix86_expand_sse_fabs (res, &smask);
38651 /* if (!isless (xa, TWO52)) goto label; */
38652 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38654 /* res = xa + TWO52 - TWO52; */
38655 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38656 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38657 emit_move_insn (res, tmp);
38659 /* generate 1.0 */
38660 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38662 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38663 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38664 emit_insn (gen_rtx_SET (VOIDmode, mask,
38665 gen_rtx_AND (mode, mask, one)));
38666 tmp = expand_simple_binop (mode, MINUS,
38667 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38668 emit_move_insn (res, tmp);
38670 /* res = copysign (res, operand1) */
38671 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38673 emit_label (label);
38674 LABEL_NUSES (label) = 1;
38676 emit_move_insn (operand0, res);
38679 /* Expand SSE sequence for computing round from OPERAND1 storing
38680 into OPERAND0. */
38681 void
38682 ix86_expand_round (rtx operand0, rtx operand1)
38684 /* C code for the stuff we're doing below:
38685 double xa = fabs (x);
38686 if (!isless (xa, TWO52))
38687 return x;
38688 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38689 return copysign (xa, x);
38691 enum machine_mode mode = GET_MODE (operand0);
38692 rtx res, TWO52, xa, label, xi, half, mask;
38693 const struct real_format *fmt;
38694 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38696 /* Temporary for holding the result, initialized to the input
38697 operand to ease control flow. */
38698 res = gen_reg_rtx (mode);
38699 emit_move_insn (res, operand1);
38701 TWO52 = ix86_gen_TWO52 (mode);
38702 xa = ix86_expand_sse_fabs (res, &mask);
38703 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38705 /* load nextafter (0.5, 0.0) */
38706 fmt = REAL_MODE_FORMAT (mode);
38707 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38708 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38710 /* xa = xa + 0.5 */
38711 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38712 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38714 /* xa = (double)(int64_t)xa */
38715 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38716 expand_fix (xi, xa, 0);
38717 expand_float (xa, xi, 0);
38719 /* res = copysign (xa, operand1) */
38720 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38722 emit_label (label);
38723 LABEL_NUSES (label) = 1;
38725 emit_move_insn (operand0, res);
38728 /* Expand SSE sequence for computing round
38729 from OP1 storing into OP0 using sse4 round insn. */
38730 void
38731 ix86_expand_round_sse4 (rtx op0, rtx op1)
38733 enum machine_mode mode = GET_MODE (op0);
38734 rtx e1, e2, res, half;
38735 const struct real_format *fmt;
38736 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38737 rtx (*gen_copysign) (rtx, rtx, rtx);
38738 rtx (*gen_round) (rtx, rtx, rtx);
38740 switch (mode)
38742 case SFmode:
38743 gen_copysign = gen_copysignsf3;
38744 gen_round = gen_sse4_1_roundsf2;
38745 break;
38746 case DFmode:
38747 gen_copysign = gen_copysigndf3;
38748 gen_round = gen_sse4_1_rounddf2;
38749 break;
38750 default:
38751 gcc_unreachable ();
38754 /* round (a) = trunc (a + copysign (0.5, a)) */
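/* E.g. round (-2.3) == trunc (-2.3 + copysign (0.5, -2.3))
   == trunc (-2.8) == -2, and round (2.7) == trunc (3.2) == 3; the copysign
   makes one formula handle both signs (the constant actually loaded below
   is nextafter (0.5, 0.0), as in ix86_expand_round above).  */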
38756 /* load nextafter (0.5, 0.0) */
38757 fmt = REAL_MODE_FORMAT (mode);
38758 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38759 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38760 half = const_double_from_real_value (pred_half, mode);
38762 /* e1 = copysign (0.5, op1) */
38763 e1 = gen_reg_rtx (mode);
38764 emit_insn (gen_copysign (e1, half, op1));
38766 /* e2 = op1 + e1 */
38767 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38769 /* res = trunc (e2) */
38770 res = gen_reg_rtx (mode);
38771 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38773 emit_move_insn (op0, res);
38777 /* Table of valid machine attributes. */
38778 static const struct attribute_spec ix86_attribute_table[] =
38780 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38781 affects_type_identity } */
38782 /* Stdcall attribute says callee is responsible for popping arguments
38783 if they are not variable. */
38784 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38785 true },
38786 /* Fastcall attribute says callee is responsible for popping arguments
38787 if they are not variable. */
38788 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38789 true },
38790 /* Thiscall attribute says callee is responsible for popping arguments
38791 if they are not variable. */
38792 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38793 true },
38794 /* Cdecl attribute says the callee is a normal C declaration */
38795 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38796 true },
38797 /* Regparm attribute specifies how many integer arguments are to be
38798 passed in registers. */
38799 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38800 true },
38801 /* Sseregparm attribute says we are using x86_64 calling conventions
38802 for FP arguments. */
38803 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38804 true },
38805 /* The transactional memory builtins are implicitly regparm or fastcall
38806 depending on the ABI. Override the generic do-nothing attribute that
38807 these builtins were declared with. */
38808 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38809 true },
38810 /* force_align_arg_pointer says this function realigns the stack at entry. */
38811 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38812 false, true, true, ix86_handle_cconv_attribute, false },
38813 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38814 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38815 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38816 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38817 false },
38818 #endif
38819 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38820 false },
38821 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38822 false },
38823 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38824 SUBTARGET_ATTRIBUTE_TABLE,
38825 #endif
38826 /* ms_abi and sysv_abi calling convention function attributes. */
38827 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38828 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38829 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38830 false },
38831 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38832 ix86_handle_callee_pop_aggregate_return, true },
38833 /* End element. */
38834 { NULL, 0, 0, false, false, false, NULL, false }
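/* Illustrative uses of some of these attributes in user code (not part of
   this file):
     int  __attribute__ ((fastcall))     f (int, int);
     void __attribute__ ((regparm (3)))  g (int, int, int);
     struct __attribute__ ((ms_struct))  s { char c; int i; };  */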
38837 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38838 static int
38839 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38840 tree vectype,
38841 int misalign ATTRIBUTE_UNUSED)
38843 unsigned elements;
38845 switch (type_of_cost)
38847 case scalar_stmt:
38848 return ix86_cost->scalar_stmt_cost;
38850 case scalar_load:
38851 return ix86_cost->scalar_load_cost;
38853 case scalar_store:
38854 return ix86_cost->scalar_store_cost;
38856 case vector_stmt:
38857 return ix86_cost->vec_stmt_cost;
38859 case vector_load:
38860 return ix86_cost->vec_align_load_cost;
38862 case vector_store:
38863 return ix86_cost->vec_store_cost;
38865 case vec_to_scalar:
38866 return ix86_cost->vec_to_scalar_cost;
38868 case scalar_to_vec:
38869 return ix86_cost->scalar_to_vec_cost;
38871 case unaligned_load:
38872 case unaligned_store:
38873 return ix86_cost->vec_unalign_load_cost;
38875 case cond_branch_taken:
38876 return ix86_cost->cond_taken_branch_cost;
38878 case cond_branch_not_taken:
38879 return ix86_cost->cond_not_taken_branch_cost;
38881 case vec_perm:
38882 case vec_promote_demote:
38883 return ix86_cost->vec_stmt_cost;
38885 case vec_construct:
38886 elements = TYPE_VECTOR_SUBPARTS (vectype);
38887 return elements / 2 + 1;
38889 default:
38890 gcc_unreachable ();
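/* For example, a vec_construct of a V4SFmode vector has
   TYPE_VECTOR_SUBPARTS == 4, so the function returns 4 / 2 + 1 == 3,
   independently of the per-processor cost tables.  */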
38894 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38895 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38896 insn every time. */
38898 static GTY(()) rtx vselect_insn;
38900 /* Initialize vselect_insn. */
38902 static void
38903 init_vselect_insn (void)
38905 unsigned i;
38906 rtx x;
38908 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38909 for (i = 0; i < MAX_VECT_LEN; ++i)
38910 XVECEXP (x, 0, i) = const0_rtx;
38911 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38912 const0_rtx), x);
38913 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38914 start_sequence ();
38915 vselect_insn = emit_insn (x);
38916 end_sequence ();
38919 /* Construct (set target (vec_select op0 (parallel perm))) and
38920 return true if that's a valid instruction in the active ISA. */
38922 static bool
38923 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38924 unsigned nelt, bool testing_p)
38926 unsigned int i;
38927 rtx x, save_vconcat;
38928 int icode;
38930 if (vselect_insn == NULL_RTX)
38931 init_vselect_insn ();
38933 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38934 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38935 for (i = 0; i < nelt; ++i)
38936 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38937 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38938 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38939 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38940 SET_DEST (PATTERN (vselect_insn)) = target;
38941 icode = recog_memoized (vselect_insn);
38943 if (icode >= 0 && !testing_p)
38944 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38946 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38947 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38948 INSN_CODE (vselect_insn) = -1;
38950 return icode >= 0;
38953 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38955 static bool
38956 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38957 const unsigned char *perm, unsigned nelt,
38958 bool testing_p)
38960 enum machine_mode v2mode;
38961 rtx x;
38962 bool ok;
38964 if (vselect_insn == NULL_RTX)
38965 init_vselect_insn ();
38967 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38968 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38969 PUT_MODE (x, v2mode);
38970 XEXP (x, 0) = op0;
38971 XEXP (x, 1) = op1;
38972 ok = expand_vselect (target, x, perm, nelt, testing_p);
38973 XEXP (x, 0) = const0_rtx;
38974 XEXP (x, 1) = const0_rtx;
38975 return ok;
38978 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38979 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38981 static bool
38982 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38984 enum machine_mode vmode = d->vmode;
38985 unsigned i, mask, nelt = d->nelt;
38986 rtx target, op0, op1, x;
38987 rtx rperm[32], vperm;
38989 if (d->one_operand_p)
38990 return false;
38991 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38993 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38995 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38997 else
38998 return false;
39000 /* This is a blend, not a permute. Elements must stay in their
39001 respective lanes. */
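/* E.g. a V8SFmode d->perm of { 0, 9, 2, 11, 4, 13, 6, 15 } qualifies:
   element i is either i (from op0) or i + 8 (from op1).  The code below
   then builds mask == 0xaa and a single blend insn with that immediate
   implements the whole permutation.  */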
39002 for (i = 0; i < nelt; ++i)
39004 unsigned e = d->perm[i];
39005 if (!(e == i || e == i + nelt))
39006 return false;
39009 if (d->testing_p)
39010 return true;
39012 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39013 decision should be extracted elsewhere, so that we only try that
39014 sequence once all budget==3 options have been tried. */
39015 target = d->target;
39016 op0 = d->op0;
39017 op1 = d->op1;
39018 mask = 0;
39020 switch (vmode)
39022 case V4DFmode:
39023 case V8SFmode:
39024 case V2DFmode:
39025 case V4SFmode:
39026 case V8HImode:
39027 case V8SImode:
39028 for (i = 0; i < nelt; ++i)
39029 mask |= (d->perm[i] >= nelt) << i;
39030 break;
39032 case V2DImode:
39033 for (i = 0; i < 2; ++i)
39034 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39035 vmode = V8HImode;
39036 goto do_subreg;
39038 case V4SImode:
39039 for (i = 0; i < 4; ++i)
39040 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39041 vmode = V8HImode;
39042 goto do_subreg;
39044 case V16QImode:
39045 /* See if bytes move in pairs so we can use pblendw with
39046 an immediate argument, rather than pblendvb with a vector
39047 argument. */
39048 for (i = 0; i < 16; i += 2)
39049 if (d->perm[i] + 1 != d->perm[i + 1])
39051 use_pblendvb:
39052 for (i = 0; i < nelt; ++i)
39053 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39055 finish_pblendvb:
39056 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39057 vperm = force_reg (vmode, vperm);
39059 if (GET_MODE_SIZE (vmode) == 16)
39060 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39061 else
39062 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39063 return true;
39066 for (i = 0; i < 8; ++i)
39067 mask |= (d->perm[i * 2] >= 16) << i;
39068 vmode = V8HImode;
39069 /* FALLTHRU */
39071 do_subreg:
39072 target = gen_lowpart (vmode, target);
39073 op0 = gen_lowpart (vmode, op0);
39074 op1 = gen_lowpart (vmode, op1);
39075 break;
39077 case V32QImode:
39078 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39079 for (i = 0; i < 32; i += 2)
39080 if (d->perm[i] + 1 != d->perm[i + 1])
39081 goto use_pblendvb;
39082 /* See if bytes move in quadruplets. If yes, vpblendd
39083 with immediate can be used. */
39084 for (i = 0; i < 32; i += 4)
39085 if (d->perm[i] + 2 != d->perm[i + 2])
39086 break;
39087 if (i < 32)
39089 /* See if bytes move the same in both lanes. If yes,
39090 vpblendw with immediate can be used. */
39091 for (i = 0; i < 16; i += 2)
39092 if (d->perm[i] + 16 != d->perm[i + 16])
39093 goto use_pblendvb;
39095 /* Use vpblendw. */
39096 for (i = 0; i < 16; ++i)
39097 mask |= (d->perm[i * 2] >= 32) << i;
39098 vmode = V16HImode;
39099 goto do_subreg;
39102 /* Use vpblendd. */
39103 for (i = 0; i < 8; ++i)
39104 mask |= (d->perm[i * 4] >= 32) << i;
39105 vmode = V8SImode;
39106 goto do_subreg;
39108 case V16HImode:
39109 /* See if words move in pairs. If yes, vpblendd can be used. */
39110 for (i = 0; i < 16; i += 2)
39111 if (d->perm[i] + 1 != d->perm[i + 1])
39112 break;
39113 if (i < 16)
39115 /* See if words move the same in both lanes. If not,
39116 vpblendvb must be used. */
39117 for (i = 0; i < 8; i++)
39118 if (d->perm[i] + 8 != d->perm[i + 8])
39120 /* Use vpblendvb. */
39121 for (i = 0; i < 32; ++i)
39122 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39124 vmode = V32QImode;
39125 nelt = 32;
39126 target = gen_lowpart (vmode, target);
39127 op0 = gen_lowpart (vmode, op0);
39128 op1 = gen_lowpart (vmode, op1);
39129 goto finish_pblendvb;
39132 /* Use vpblendw. */
39133 for (i = 0; i < 16; ++i)
39134 mask |= (d->perm[i] >= 16) << i;
39135 break;
39138 /* Use vpblendd. */
39139 for (i = 0; i < 8; ++i)
39140 mask |= (d->perm[i * 2] >= 16) << i;
39141 vmode = V8SImode;
39142 goto do_subreg;
39144 case V4DImode:
39145 /* Use vpblendd. */
39146 for (i = 0; i < 4; ++i)
39147 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39148 vmode = V8SImode;
39149 goto do_subreg;
39151 default:
39152 gcc_unreachable ();
39155 /* This matches five different patterns with the different modes. */
39156 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39157 x = gen_rtx_SET (VOIDmode, target, x);
39158 emit_insn (x);
39160 return true;
39163 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39164 in terms of the variable form of vpermilps.
39166 Note that we will have already failed the immediate input vpermilps,
39167 which requires that the high and low part shuffle be identical; the
39168 variable form doesn't require that. */
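/* Illustrative example: a one-operand V8SFmode d->perm of
   { 1, 0, 3, 2, 5, 4, 7, 6 } passes the lane check below and yields the
   control vector { 1, 0, 3, 2, 1, 0, 3, 2 }, i.e. each 128-bit lane swaps
   adjacent pairs of floats.  */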
39170 static bool
39171 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39173 rtx rperm[8], vperm;
39174 unsigned i;
39176 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39177 return false;
39179 /* We can only permute within the 128-bit lane. */
39180 for (i = 0; i < 8; ++i)
39182 unsigned e = d->perm[i];
39183 if (i < 4 ? e >= 4 : e < 4)
39184 return false;
39187 if (d->testing_p)
39188 return true;
39190 for (i = 0; i < 8; ++i)
39192 unsigned e = d->perm[i];
39194 /* Within each 128-bit lane, the elements of op0 are numbered
39195 from 0 and the elements of op1 are numbered from 4. */
39196 if (e >= 8 + 4)
39197 e -= 8;
39198 else if (e >= 4)
39199 e -= 4;
39201 rperm[i] = GEN_INT (e);
39204 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39205 vperm = force_reg (V8SImode, vperm);
39206 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39208 return true;
39211 /* Return true if permutation D can be performed as VMODE permutation
39212 instead. */
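/* E.g. the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
   8, 9, 10, 11 } moves whole aligned groups of four bytes, so it is also
   valid as the V4SImode permutation { 1, 0, 3, 2 }.  */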
39214 static bool
39215 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39217 unsigned int i, j, chunk;
39219 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39220 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39221 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39222 return false;
39224 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39225 return true;
39227 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39228 for (i = 0; i < d->nelt; i += chunk)
39229 if (d->perm[i] & (chunk - 1))
39230 return false;
39231 else
39232 for (j = 1; j < chunk; ++j)
39233 if (d->perm[i] + j != d->perm[i + j])
39234 return false;
39236 return true;
39239 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39240 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
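/* Reminder of the (v)pshufb control encoding relied on below: each control
   byte selects one source byte from the same 128-bit lane via its low four
   bits, and a control byte with bit 7 set produces zero; vpshufb on 256-bit
   vectors applies this to each lane independently.  */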
39242 static bool
39243 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39245 unsigned i, nelt, eltsz, mask;
39246 unsigned char perm[32];
39247 enum machine_mode vmode = V16QImode;
39248 rtx rperm[32], vperm, target, op0, op1;
39250 nelt = d->nelt;
39252 if (!d->one_operand_p)
39254 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39256 if (TARGET_AVX2
39257 && valid_perm_using_mode_p (V2TImode, d))
39259 if (d->testing_p)
39260 return true;
39262 /* Use vperm2i128 insn. The pattern uses
39263 V4DImode instead of V2TImode. */
39264 target = gen_lowpart (V4DImode, d->target);
39265 op0 = gen_lowpart (V4DImode, d->op0);
39266 op1 = gen_lowpart (V4DImode, d->op1);
39267 rperm[0]
39268 = GEN_INT ((d->perm[0] / (nelt / 2))
39269 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
39270 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39271 return true;
39273 return false;
39276 else
39278 if (GET_MODE_SIZE (d->vmode) == 16)
39280 if (!TARGET_SSSE3)
39281 return false;
39283 else if (GET_MODE_SIZE (d->vmode) == 32)
39285 if (!TARGET_AVX2)
39286 return false;
39288 /* V4DImode should be already handled through
39289 expand_vselect by vpermq instruction. */
39290 gcc_assert (d->vmode != V4DImode);
39292 vmode = V32QImode;
39293 if (d->vmode == V8SImode
39294 || d->vmode == V16HImode
39295 || d->vmode == V32QImode)
39297 /* First see if vpermq can be used for
39298 V8SImode/V16HImode/V32QImode. */
39299 if (valid_perm_using_mode_p (V4DImode, d))
39301 for (i = 0; i < 4; i++)
39302 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39303 if (d->testing_p)
39304 return true;
39305 return expand_vselect (gen_lowpart (V4DImode, d->target),
39306 gen_lowpart (V4DImode, d->op0),
39307 perm, 4, false);
39310 /* Next see if vpermd can be used. */
39311 if (valid_perm_using_mode_p (V8SImode, d))
39312 vmode = V8SImode;
39314 /* Or if vpermps can be used. */
39315 else if (d->vmode == V8SFmode)
39316 vmode = V8SImode;
39318 if (vmode == V32QImode)
39320 /* vpshufb only works intra-lane; it is not
39321 possible to shuffle bytes between the lanes. */
39322 for (i = 0; i < nelt; ++i)
39323 if ((d->perm[i] ^ i) & (nelt / 2))
39324 return false;
39327 else
39328 return false;
39331 if (d->testing_p)
39332 return true;
39334 if (vmode == V8SImode)
39335 for (i = 0; i < 8; ++i)
39336 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39337 else
39339 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39340 if (!d->one_operand_p)
39341 mask = 2 * nelt - 1;
39342 else if (vmode == V16QImode)
39343 mask = nelt - 1;
39344 else
39345 mask = nelt / 2 - 1;
39347 for (i = 0; i < nelt; ++i)
39349 unsigned j, e = d->perm[i] & mask;
39350 for (j = 0; j < eltsz; ++j)
39351 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39355 vperm = gen_rtx_CONST_VECTOR (vmode,
39356 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39357 vperm = force_reg (vmode, vperm);
39359 target = gen_lowpart (vmode, d->target);
39360 op0 = gen_lowpart (vmode, d->op0);
39361 if (d->one_operand_p)
39363 if (vmode == V16QImode)
39364 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39365 else if (vmode == V32QImode)
39366 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39367 else if (vmode == V8SFmode)
39368 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39369 else
39370 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39372 else
39374 op1 = gen_lowpart (vmode, d->op1);
39375 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39378 return true;
39381 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39382 in a single instruction. */
39384 static bool
39385 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39387 unsigned i, nelt = d->nelt;
39388 unsigned char perm2[MAX_VECT_LEN];
39390 /* Check plain VEC_SELECT first, because AVX has instructions that could
39391 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39392 input where SEL+CONCAT may not. */
39393 if (d->one_operand_p)
39395 int mask = nelt - 1;
39396 bool identity_perm = true;
39397 bool broadcast_perm = true;
39399 for (i = 0; i < nelt; i++)
39401 perm2[i] = d->perm[i] & mask;
39402 if (perm2[i] != i)
39403 identity_perm = false;
39404 if (perm2[i])
39405 broadcast_perm = false;
39408 if (identity_perm)
39410 if (!d->testing_p)
39411 emit_move_insn (d->target, d->op0);
39412 return true;
39414 else if (broadcast_perm && TARGET_AVX2)
39416 /* Use vpbroadcast{b,w,d}. */
39417 rtx (*gen) (rtx, rtx) = NULL;
39418 switch (d->vmode)
39420 case V32QImode:
39421 gen = gen_avx2_pbroadcastv32qi_1;
39422 break;
39423 case V16HImode:
39424 gen = gen_avx2_pbroadcastv16hi_1;
39425 break;
39426 case V8SImode:
39427 gen = gen_avx2_pbroadcastv8si_1;
39428 break;
39429 case V16QImode:
39430 gen = gen_avx2_pbroadcastv16qi;
39431 break;
39432 case V8HImode:
39433 gen = gen_avx2_pbroadcastv8hi;
39434 break;
39435 case V8SFmode:
39436 gen = gen_avx2_vec_dupv8sf_1;
39437 break;
39438 /* For other modes prefer other shuffles this function creates. */
39439 default: break;
39441 if (gen != NULL)
39443 if (!d->testing_p)
39444 emit_insn (gen (d->target, d->op0));
39445 return true;
39449 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39450 return true;
39452 /* There are plenty of patterns in sse.md that are written for
39453 SEL+CONCAT and are not replicated for a single op. Perhaps
39454 that should be changed, to avoid the nastiness here. */
39456 /* Recognize interleave style patterns, which means incrementing
39457 every other permutation operand. */
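/* E.g. for nelt == 4 and d->perm == { 0, 0, 1, 1 }, perm2 becomes
   { 0, 4, 1, 5 }; as a vec_select of (vec_concat op0 op0) that is an
   ordinary unpcklps/punpckldq-style interleave.  */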
39458 for (i = 0; i < nelt; i += 2)
39460 perm2[i] = d->perm[i] & mask;
39461 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39463 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39464 d->testing_p))
39465 return true;
39467 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39468 if (nelt >= 4)
39470 for (i = 0; i < nelt; i += 4)
39472 perm2[i + 0] = d->perm[i + 0] & mask;
39473 perm2[i + 1] = d->perm[i + 1] & mask;
39474 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39475 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39478 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39479 d->testing_p))
39480 return true;
39484 /* Finally, try the fully general two operand permute. */
39485 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39486 d->testing_p))
39487 return true;
39489 /* Recognize interleave style patterns with reversed operands. */
39490 if (!d->one_operand_p)
39492 for (i = 0; i < nelt; ++i)
39494 unsigned e = d->perm[i];
39495 if (e >= nelt)
39496 e -= nelt;
39497 else
39498 e += nelt;
39499 perm2[i] = e;
39502 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39503 d->testing_p))
39504 return true;
39507 /* Try the SSE4.1 blend variable merge instructions. */
39508 if (expand_vec_perm_blend (d))
39509 return true;
39511 /* Try one of the AVX vpermil variable permutations. */
39512 if (expand_vec_perm_vpermil (d))
39513 return true;
39515 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39516 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39517 if (expand_vec_perm_pshufb (d))
39518 return true;
39520 return false;
39523 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39524 in terms of a pair of pshuflw + pshufhw instructions. */
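/* E.g. the V8HImode permutation { 3, 2, 1, 0, 5, 4, 7, 6 } qualifies: the
   pshuflw reverses the four low words (leaving words 4..7 alone) and the
   pshufhw then swaps pairs among the four high words (leaving 0..3 alone).  */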
39526 static bool
39527 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39529 unsigned char perm2[MAX_VECT_LEN];
39530 unsigned i;
39531 bool ok;
39533 if (d->vmode != V8HImode || !d->one_operand_p)
39534 return false;
39536 /* The two permutations only operate in 64-bit lanes. */
39537 for (i = 0; i < 4; ++i)
39538 if (d->perm[i] >= 4)
39539 return false;
39540 for (i = 4; i < 8; ++i)
39541 if (d->perm[i] < 4)
39542 return false;
39544 if (d->testing_p)
39545 return true;
39547 /* Emit the pshuflw. */
39548 memcpy (perm2, d->perm, 4);
39549 for (i = 4; i < 8; ++i)
39550 perm2[i] = i;
39551 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39552 gcc_assert (ok);
39554 /* Emit the pshufhw. */
39555 memcpy (perm2 + 4, d->perm + 4, 4);
39556 for (i = 0; i < 4; ++i)
39557 perm2[i] = i;
39558 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39559 gcc_assert (ok);
39561 return true;
39564 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39565 the permutation using the SSSE3 palignr instruction. This succeeds
39566 when all of the elements in PERM fit within one vector and we merely
39567 need to shift them down so that a single vector permutation has a
39568 chance to succeed. */
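/* E.g. for V16QImode with d->perm == { 3, 4, ..., 18 }: min == 3, so the
   palignr shifts the 32-byte concatenation of the operands down by three
   bytes; the remaining permutation is then the identity and we are done.  */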
39570 static bool
39571 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39573 unsigned i, nelt = d->nelt;
39574 unsigned min, max;
39575 bool in_order, ok;
39576 rtx shift;
39578 /* Even with AVX, palignr only operates on 128-bit vectors. */
39579 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39580 return false;
39582 min = nelt, max = 0;
39583 for (i = 0; i < nelt; ++i)
39585 unsigned e = d->perm[i];
39586 if (e < min)
39587 min = e;
39588 if (e > max)
39589 max = e;
39591 if (min == 0 || max - min >= nelt)
39592 return false;
39594 /* Given that we have SSSE3, we know we'll be able to implement the
39595 single operand permutation after the palignr with pshufb. */
39596 if (d->testing_p)
39597 return true;
39599 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39600 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39601 gen_lowpart (TImode, d->op1),
39602 gen_lowpart (TImode, d->op0), shift));
39604 d->op0 = d->op1 = d->target;
39605 d->one_operand_p = true;
39607 in_order = true;
39608 for (i = 0; i < nelt; ++i)
39610 unsigned e = d->perm[i] - min;
39611 if (e != i)
39612 in_order = false;
39613 d->perm[i] = e;
39616 /* Test for the degenerate case where the alignment by itself
39617 produces the desired permutation. */
39618 if (in_order)
39619 return true;
39621 ok = expand_vec_perm_1 (d);
39622 gcc_assert (ok);
39624 return ok;
39627 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39629 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39630 a two vector permutation into a single vector permutation by using
39631 an interleave operation to merge the vectors. */
39633 static bool
39634 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39636 struct expand_vec_perm_d dremap, dfinal;
39637 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39638 unsigned HOST_WIDE_INT contents;
39639 unsigned char remap[2 * MAX_VECT_LEN];
39640 rtx seq;
39641 bool ok, same_halves = false;
39643 if (GET_MODE_SIZE (d->vmode) == 16)
39645 if (d->one_operand_p)
39646 return false;
39648 else if (GET_MODE_SIZE (d->vmode) == 32)
39650 if (!TARGET_AVX)
39651 return false;
39652 /* For 32-byte modes allow even d->one_operand_p.
39653 The lack of cross-lane shuffling in some instructions
39654 might prevent a single insn shuffle. */
39655 dfinal = *d;
39656 dfinal.testing_p = true;
39657 /* If expand_vec_perm_interleave3 can expand this into
39658 a 3 insn sequence, give up and let it be expanded that
39659 way instead. While that is one insn longer, it doesn't
39660 need a memory operand, and in the common case where the
39661 interleave low and interleave high permutations with the
39662 same operands are adjacent, the pair needs only 4 insns
39663 in total after CSE. */
39664 if (expand_vec_perm_interleave3 (&dfinal))
39665 return false;
39667 else
39668 return false;
39670 /* Examine from whence the elements come. */
39671 contents = 0;
39672 for (i = 0; i < nelt; ++i)
39673 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39675 memset (remap, 0xff, sizeof (remap));
39676 dremap = *d;
39678 if (GET_MODE_SIZE (d->vmode) == 16)
39680 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39682 /* Split the two input vectors into 4 halves. */
39683 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39684 h2 = h1 << nelt2;
39685 h3 = h2 << nelt2;
39686 h4 = h3 << nelt2;
39688 /* If the elements are all from the low halves, use interleave low;
39689 similarly for interleave high. If the elements are from mis-matched
39690 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
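/* Concretely, for nelt == 4 the four halves are h1 == 0x3 (low half of
   op0), h2 == 0xc (high half of op0), h3 == 0x30 (low half of op1) and
   h4 == 0xc0 (high half of op1); `contents' has one bit for each source
   element the permutation references.  */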
39691 if ((contents & (h1 | h3)) == contents)
39693 /* punpckl* */
39694 for (i = 0; i < nelt2; ++i)
39696 remap[i] = i * 2;
39697 remap[i + nelt] = i * 2 + 1;
39698 dremap.perm[i * 2] = i;
39699 dremap.perm[i * 2 + 1] = i + nelt;
39701 if (!TARGET_SSE2 && d->vmode == V4SImode)
39702 dremap.vmode = V4SFmode;
39704 else if ((contents & (h2 | h4)) == contents)
39706 /* punpckh* */
39707 for (i = 0; i < nelt2; ++i)
39709 remap[i + nelt2] = i * 2;
39710 remap[i + nelt + nelt2] = i * 2 + 1;
39711 dremap.perm[i * 2] = i + nelt2;
39712 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39714 if (!TARGET_SSE2 && d->vmode == V4SImode)
39715 dremap.vmode = V4SFmode;
39717 else if ((contents & (h1 | h4)) == contents)
39719 /* shufps */
39720 for (i = 0; i < nelt2; ++i)
39722 remap[i] = i;
39723 remap[i + nelt + nelt2] = i + nelt2;
39724 dremap.perm[i] = i;
39725 dremap.perm[i + nelt2] = i + nelt + nelt2;
39727 if (nelt != 4)
39729 /* shufpd */
39730 dremap.vmode = V2DImode;
39731 dremap.nelt = 2;
39732 dremap.perm[0] = 0;
39733 dremap.perm[1] = 3;
39736 else if ((contents & (h2 | h3)) == contents)
39738 /* shufps */
39739 for (i = 0; i < nelt2; ++i)
39741 remap[i + nelt2] = i;
39742 remap[i + nelt] = i + nelt2;
39743 dremap.perm[i] = i + nelt2;
39744 dremap.perm[i + nelt2] = i + nelt;
39746 if (nelt != 4)
39748 /* shufpd */
39749 dremap.vmode = V2DImode;
39750 dremap.nelt = 2;
39751 dremap.perm[0] = 1;
39752 dremap.perm[1] = 2;
39755 else
39756 return false;
39758 else
39760 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39761 unsigned HOST_WIDE_INT q[8];
39762 unsigned int nonzero_halves[4];
39764 /* Split the two input vectors into 8 quarters. */
39765 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39766 for (i = 1; i < 8; ++i)
39767 q[i] = q[0] << (nelt4 * i);
39768 for (i = 0; i < 4; ++i)
39769 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39771 nonzero_halves[nzcnt] = i;
39772 ++nzcnt;
39775 if (nzcnt == 1)
39777 gcc_assert (d->one_operand_p);
39778 nonzero_halves[1] = nonzero_halves[0];
39779 same_halves = true;
39781 else if (d->one_operand_p)
39783 gcc_assert (nonzero_halves[0] == 0);
39784 gcc_assert (nonzero_halves[1] == 1);
39787 if (nzcnt <= 2)
39789 if (d->perm[0] / nelt2 == nonzero_halves[1])
39791 /* Attempt to increase the likelihood that dfinal
39792 shuffle will be intra-lane. */
39793 char tmph = nonzero_halves[0];
39794 nonzero_halves[0] = nonzero_halves[1];
39795 nonzero_halves[1] = tmph;
39798 /* vperm2f128 or vperm2i128. */
39799 for (i = 0; i < nelt2; ++i)
39801 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39802 remap[i + nonzero_halves[0] * nelt2] = i;
39803 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39804 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39807 if (d->vmode != V8SFmode
39808 && d->vmode != V4DFmode
39809 && d->vmode != V8SImode)
39811 dremap.vmode = V8SImode;
39812 dremap.nelt = 8;
39813 for (i = 0; i < 4; ++i)
39815 dremap.perm[i] = i + nonzero_halves[0] * 4;
39816 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39820 else if (d->one_operand_p)
39821 return false;
39822 else if (TARGET_AVX2
39823 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39825 /* vpunpckl* */
39826 for (i = 0; i < nelt4; ++i)
39828 remap[i] = i * 2;
39829 remap[i + nelt] = i * 2 + 1;
39830 remap[i + nelt2] = i * 2 + nelt2;
39831 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39832 dremap.perm[i * 2] = i;
39833 dremap.perm[i * 2 + 1] = i + nelt;
39834 dremap.perm[i * 2 + nelt2] = i + nelt2;
39835 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39838 else if (TARGET_AVX2
39839 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39841 /* vpunpckh* */
39842 for (i = 0; i < nelt4; ++i)
39844 remap[i + nelt4] = i * 2;
39845 remap[i + nelt + nelt4] = i * 2 + 1;
39846 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39847 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39848 dremap.perm[i * 2] = i + nelt4;
39849 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39850 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39851 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39854 else
39855 return false;
39858 /* Use the remapping array set up above to move the elements from their
39859 swizzled locations into their final destinations. */
39860 dfinal = *d;
39861 for (i = 0; i < nelt; ++i)
39863 unsigned e = remap[d->perm[i]];
39864 gcc_assert (e < nelt);
39865 /* If same_halves is true, both halves of the remapped vector are the
39866 same. Avoid cross-lane accesses if possible. */
39867 if (same_halves && i >= nelt2)
39869 gcc_assert (e < nelt2);
39870 dfinal.perm[i] = e + nelt2;
39872 else
39873 dfinal.perm[i] = e;
39875 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39876 dfinal.op1 = dfinal.op0;
39877 dfinal.one_operand_p = true;
39878 dremap.target = dfinal.op0;
39880 /* Test if the final remap can be done with a single insn. For V4SFmode or
39881 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39882 start_sequence ();
39883 ok = expand_vec_perm_1 (&dfinal);
39884 seq = get_insns ();
39885 end_sequence ();
39887 if (!ok)
39888 return false;
39890 if (d->testing_p)
39891 return true;
39893 if (dremap.vmode != dfinal.vmode)
39895 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39896 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39897 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39900 ok = expand_vec_perm_1 (&dremap);
39901 gcc_assert (ok);
39903 emit_insn (seq);
39904 return true;
39907 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39908 a single vector cross-lane permutation into vpermq followed
39909 by any of the single insn permutations. */
39911 static bool
39912 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39914 struct expand_vec_perm_d dremap, dfinal;
39915 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39916 unsigned contents[2];
39917 bool ok;
39919 if (!(TARGET_AVX2
39920 && (d->vmode == V32QImode || d->vmode == V16HImode)
39921 && d->one_operand_p))
39922 return false;
39924 contents[0] = 0;
39925 contents[1] = 0;
39926 for (i = 0; i < nelt2; ++i)
39928 contents[0] |= 1u << (d->perm[i] / nelt4);
39929 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39932 for (i = 0; i < 2; ++i)
39934 unsigned int cnt = 0;
39935 for (j = 0; j < 4; ++j)
39936 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39937 return false;
39940 if (d->testing_p)
39941 return true;
39943 dremap = *d;
39944 dremap.vmode = V4DImode;
39945 dremap.nelt = 4;
39946 dremap.target = gen_reg_rtx (V4DImode);
39947 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39948 dremap.op1 = dremap.op0;
39949 dremap.one_operand_p = true;
39950 for (i = 0; i < 2; ++i)
39952 unsigned int cnt = 0;
39953 for (j = 0; j < 4; ++j)
39954 if ((contents[i] & (1u << j)) != 0)
39955 dremap.perm[2 * i + cnt++] = j;
39956 for (; cnt < 2; ++cnt)
39957 dremap.perm[2 * i + cnt] = 0;
39960 dfinal = *d;
39961 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39962 dfinal.op1 = dfinal.op0;
39963 dfinal.one_operand_p = true;
39964 for (i = 0, j = 0; i < nelt; ++i)
39966 if (i == nelt2)
39967 j = 2;
39968 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39969 if ((d->perm[i] / nelt4) == dremap.perm[j])
39971 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39972 dfinal.perm[i] |= nelt4;
39973 else
39974 gcc_unreachable ();
39977 ok = expand_vec_perm_1 (&dremap);
39978 gcc_assert (ok);
39980 ok = expand_vec_perm_1 (&dfinal);
39981 gcc_assert (ok);
39983 return true;
39986 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39987 a vector permutation using two instructions, vperm2f128 resp.
39988 vperm2i128 followed by any single in-lane permutation. */
39990 static bool
39991 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39993 struct expand_vec_perm_d dfirst, dsecond;
39994 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39995 bool ok;
39997 if (!TARGET_AVX
39998 || GET_MODE_SIZE (d->vmode) != 32
39999 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40000 return false;
40002 dsecond = *d;
40003 dsecond.one_operand_p = false;
40004 dsecond.testing_p = true;
40006 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40007 immediate. For perm < 16 the second permutation uses
40008 d->op0 as first operand, for perm >= 16 it uses d->op1
40009 as first operand. The second operand is the result of
40010 vperm2[fi]128. */
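/* Worked example for V4DFmode: perm == 6 (binary 0110) encodes the
   vperm2f128 immediate ((6 << 2) | 6) & 0x33 == 0x12, i.e. dfirst selects
   { op1-low-lane, op0-high-lane }; dsecond then only needs an in-lane
   shuffle of that result against d->op0.  */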
40011 for (perm = 0; perm < 32; perm++)
40013 /* Ignore permutations which do not move anything cross-lane. */
40014 if (perm < 16)
40016 /* The second shuffle for e.g. V4DFmode has
40017 0123 and ABCD operands.
40018 Ignore AB23, as 23 is already in the second lane
40019 of the first operand. */
40020 if ((perm & 0xc) == (1 << 2)) continue;
40021 /* And 01CD, as 01 is in the first lane of the first
40022 operand. */
40023 if ((perm & 3) == 0) continue;
40024 /* And 4567, as then the vperm2[fi]128 doesn't change
40025 anything on the original 4567 second operand. */
40026 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40028 else
40030 /* The second shuffle for e.g. V4DFmode has
40031 4567 and ABCD operands.
40032 Ignore AB67, as 67 is already in the second lane
40033 of the first operand. */
40034 if ((perm & 0xc) == (3 << 2)) continue;
40035 /* And 45CD, as 45 is in the first lane of the first
40036 operand. */
40037 if ((perm & 3) == 2) continue;
40038 /* And 0123, as then the vperm2[fi]128 doesn't change
40039 anything on the original 0123 first operand. */
40040 if ((perm & 0xf) == (1 << 2)) continue;
40043 for (i = 0; i < nelt; i++)
40045 j = d->perm[i] / nelt2;
40046 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40047 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40048 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40049 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40050 else
40051 break;
40054 if (i == nelt)
40056 start_sequence ();
40057 ok = expand_vec_perm_1 (&dsecond);
40058 end_sequence ();
40060 else
40061 ok = false;
40063 if (ok)
40065 if (d->testing_p)
40066 return true;
40068 /* Found a usable second shuffle. dfirst will be
40069 vperm2f128 on d->op0 and d->op1. */
40070 dsecond.testing_p = false;
40071 dfirst = *d;
40072 dfirst.target = gen_reg_rtx (d->vmode);
40073 for (i = 0; i < nelt; i++)
40074 dfirst.perm[i] = (i & (nelt2 - 1))
40075 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40077 ok = expand_vec_perm_1 (&dfirst);
40078 gcc_assert (ok);
40080 /* And dsecond is some single insn shuffle, taking
40081 d->op0 and result of vperm2f128 (if perm < 16) or
40082 d->op1 and result of vperm2f128 (otherwise). */
40083 dsecond.op1 = dfirst.target;
40084 if (perm >= 16)
40085 dsecond.op0 = dfirst.op1;
40087 ok = expand_vec_perm_1 (&dsecond);
40088 gcc_assert (ok);
40090 return true;
40093 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40094 if (d->one_operand_p)
40095 return false;
40098 return false;
40101 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40102 a two vector permutation using 2 intra-lane interleave insns
40103 and cross-lane shuffle for 32-byte vectors. */
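/* E.g. a two-operand V8SImode permutation { 0, 8, 1, 9, 2, 10, 3, 11 }
   (interleaving the low halves of op0 and op1 across both lanes) passes
   the check below and is expanded through gen_vec_interleave_lowv8si.  */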
40105 static bool
40106 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40108 unsigned i, nelt;
40109 rtx (*gen) (rtx, rtx, rtx);
40111 if (d->one_operand_p)
40112 return false;
40113 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40115 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40117 else
40118 return false;
40120 nelt = d->nelt;
40121 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40122 return false;
40123 for (i = 0; i < nelt; i += 2)
40124 if (d->perm[i] != d->perm[0] + i / 2
40125 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40126 return false;
40128 if (d->testing_p)
40129 return true;
40131 switch (d->vmode)
40133 case V32QImode:
40134 if (d->perm[0])
40135 gen = gen_vec_interleave_highv32qi;
40136 else
40137 gen = gen_vec_interleave_lowv32qi;
40138 break;
40139 case V16HImode:
40140 if (d->perm[0])
40141 gen = gen_vec_interleave_highv16hi;
40142 else
40143 gen = gen_vec_interleave_lowv16hi;
40144 break;
40145 case V8SImode:
40146 if (d->perm[0])
40147 gen = gen_vec_interleave_highv8si;
40148 else
40149 gen = gen_vec_interleave_lowv8si;
40150 break;
40151 case V4DImode:
40152 if (d->perm[0])
40153 gen = gen_vec_interleave_highv4di;
40154 else
40155 gen = gen_vec_interleave_lowv4di;
40156 break;
40157 case V8SFmode:
40158 if (d->perm[0])
40159 gen = gen_vec_interleave_highv8sf;
40160 else
40161 gen = gen_vec_interleave_lowv8sf;
40162 break;
40163 case V4DFmode:
40164 if (d->perm[0])
40165 gen = gen_vec_interleave_highv4df;
40166 else
40167 gen = gen_vec_interleave_lowv4df;
40168 break;
40169 default:
40170 gcc_unreachable ();
40173 emit_insn (gen (d->target, d->op0, d->op1));
40174 return true;
40177 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40178 a single vector permutation using a single intra-lane vector
40179 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40180 the non-swapped and swapped vectors together. */
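/* Worked example: the one-operand V4DFmode permutation { 2, 1, 0, 3 }.
   dfirst becomes the identity, dsecond swaps the two 128-bit lanes of it
   (giving { 2, 3, 0, 1 }), and msk == 0x5 blends elements 0 and 2 from the
   swapped copy, yielding { 2, 1, 0, 3 } as required.  */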
40182 static bool
40183 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40185 struct expand_vec_perm_d dfirst, dsecond;
40186 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40187 rtx seq;
40188 bool ok;
40189 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40191 if (!TARGET_AVX
40192 || TARGET_AVX2
40193 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40194 || !d->one_operand_p)
40195 return false;
40197 dfirst = *d;
40198 for (i = 0; i < nelt; i++)
40199 dfirst.perm[i] = 0xff;
40200 for (i = 0, msk = 0; i < nelt; i++)
40202 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40203 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40204 return false;
40205 dfirst.perm[j] = d->perm[i];
40206 if (j != i)
40207 msk |= (1 << i);
40209 for (i = 0; i < nelt; i++)
40210 if (dfirst.perm[i] == 0xff)
40211 dfirst.perm[i] = i;
40213 if (!d->testing_p)
40214 dfirst.target = gen_reg_rtx (dfirst.vmode);
40216 start_sequence ();
40217 ok = expand_vec_perm_1 (&dfirst);
40218 seq = get_insns ();
40219 end_sequence ();
40221 if (!ok)
40222 return false;
40224 if (d->testing_p)
40225 return true;
40227 emit_insn (seq);
40229 dsecond = *d;
40230 dsecond.op0 = dfirst.target;
40231 dsecond.op1 = dfirst.target;
40232 dsecond.one_operand_p = true;
40233 dsecond.target = gen_reg_rtx (dsecond.vmode);
40234 for (i = 0; i < nelt; i++)
40235 dsecond.perm[i] = i ^ nelt2;
40237 ok = expand_vec_perm_1 (&dsecond);
40238 gcc_assert (ok);
40240 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40241 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40242 return true;
40245 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40246 permutation using two vperm2f128, followed by a vshufpd insn blending
40247 the two vectors together. */
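/* Worked example: V4DFmode d->perm == { 1, 4, 3, 6 }.  dfirst selects
   { 0, 1, 2, 3 }, dsecond selects { 4, 5, 6, 7 }, and dthird combines them
   with { 1, 4, 3, 6 } (an odd element of dfirst and an even element of
   dsecond in each lane), which a single vshufpd can do.  */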
40249 static bool
40250 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40252 struct expand_vec_perm_d dfirst, dsecond, dthird;
40253 bool ok;
40255 if (!TARGET_AVX || (d->vmode != V4DFmode))
40256 return false;
40258 if (d->testing_p)
40259 return true;
40261 dfirst = *d;
40262 dsecond = *d;
40263 dthird = *d;
40265 dfirst.perm[0] = (d->perm[0] & ~1);
40266 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40267 dfirst.perm[2] = (d->perm[2] & ~1);
40268 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40269 dsecond.perm[0] = (d->perm[1] & ~1);
40270 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40271 dsecond.perm[2] = (d->perm[3] & ~1);
40272 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40273 dthird.perm[0] = (d->perm[0] % 2);
40274 dthird.perm[1] = (d->perm[1] % 2) + 4;
40275 dthird.perm[2] = (d->perm[2] % 2) + 2;
40276 dthird.perm[3] = (d->perm[3] % 2) + 6;
40278 dfirst.target = gen_reg_rtx (dfirst.vmode);
40279 dsecond.target = gen_reg_rtx (dsecond.vmode);
40280 dthird.op0 = dfirst.target;
40281 dthird.op1 = dsecond.target;
40282 dthird.one_operand_p = false;
40284 canonicalize_perm (&dfirst);
40285 canonicalize_perm (&dsecond);
40287 ok = expand_vec_perm_1 (&dfirst)
40288 && expand_vec_perm_1 (&dsecond)
40289 && expand_vec_perm_1 (&dthird);
40291 gcc_assert (ok);
40293 return true;
40296 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40297 permutation with two pshufb insns and an ior. We should have already
40298 failed all two instruction sequences. */
40300 static bool
40301 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40303 rtx rperm[2][16], vperm, l, h, op, m128;
40304 unsigned int i, nelt, eltsz;
40306 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40307 return false;
40308 gcc_assert (!d->one_operand_p);
40310 nelt = d->nelt;
40311 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40313 /* Generate two permutation masks. If the required element is within
40314 the given vector it is shuffled into the proper lane. If the required
40315 element is in the other vector, force a zero into the lane by setting
40316 bit 7 in the permutation mask. */
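/* E.g. if d->perm[0] == 18 for V16QImode (byte 2 of op1), the mask applied
   to op0 gets -128 in byte 0 (forcing zero there) while the mask applied to
   op1 gets 2; or-ing the two pshufb results yields the desired byte.  */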
40317 m128 = GEN_INT (-128);
40318 for (i = 0; i < nelt; ++i)
40320 unsigned j, e = d->perm[i];
40321 unsigned which = (e >= nelt);
40322 if (e >= nelt)
40323 e -= nelt;
40325 for (j = 0; j < eltsz; ++j)
40327 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40328 rperm[1-which][i*eltsz + j] = m128;
40332 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40333 vperm = force_reg (V16QImode, vperm);
40335 l = gen_reg_rtx (V16QImode);
40336 op = gen_lowpart (V16QImode, d->op0);
40337 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40339 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40340 vperm = force_reg (V16QImode, vperm);
40342 h = gen_reg_rtx (V16QImode);
40343 op = gen_lowpart (V16QImode, d->op1);
40344 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40346 op = gen_lowpart (V16QImode, d->target);
40347 emit_insn (gen_iorv16qi3 (op, l, h));
40349 return true;
40352 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
40353 with two vpshufb insns, vpermq and vpor. We should have already failed
40354 all two or three instruction sequences. */
40356 static bool
40357 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40359 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40360 unsigned int i, nelt, eltsz;
40362 if (!TARGET_AVX2
40363 || !d->one_operand_p
40364 || (d->vmode != V32QImode && d->vmode != V16HImode))
40365 return false;
40367 if (d->testing_p)
40368 return true;
40370 nelt = d->nelt;
40371 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40373 /* Generate two permutation masks. If the required element is within
40374 the same lane, it is shuffled in. If the required element is from the
40375 other lane, force a zero by setting bit 7 in the permutation mask.
40376 The other mask has non-negative elements where an element is
40377 requested from the other lane, but also moved to the other lane,
40378 so that the result of vpshufb can have the two V2TImode halves
40379 swapped. */
40380 m128 = GEN_INT (-128);
40381 for (i = 0; i < nelt; ++i)
40383 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40384 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40386 for (j = 0; j < eltsz; ++j)
40388 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40389 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40393 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40394 vperm = force_reg (V32QImode, vperm);
40396 h = gen_reg_rtx (V32QImode);
40397 op = gen_lowpart (V32QImode, d->op0);
40398 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40400 /* Swap the 128-bit lanes of h into hp. */
40401 hp = gen_reg_rtx (V4DImode);
40402 op = gen_lowpart (V4DImode, h);
40403 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40404 const1_rtx));
40406 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40407 vperm = force_reg (V32QImode, vperm);
40409 l = gen_reg_rtx (V32QImode);
40410 op = gen_lowpart (V32QImode, d->op0);
40411 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40413 op = gen_lowpart (V32QImode, d->target);
40414 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40416 return true;
40419 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40420 and extract-odd permutations of two V32QImode and V16QImode operand
40421 with two vpshufb insns, vpor and vpermq. We should have already
40422 failed all two or three instruction sequences. */
40424 static bool
40425 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40427 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40428 unsigned int i, nelt, eltsz;
40430 if (!TARGET_AVX2
40431 || d->one_operand_p
40432 || (d->vmode != V32QImode && d->vmode != V16HImode))
40433 return false;
40435 for (i = 0; i < d->nelt; ++i)
40436 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40437 return false;
40439 if (d->testing_p)
40440 return true;
40442 nelt = d->nelt;
40443 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40445 /* Generate two permutation masks. In the first permutation mask
40446 the first quarter will contain indexes for the first half
40447 of op0, the second quarter will contain bit 7 set, the third quarter
40448 will contain indexes for the second half of op0 and the
40449 last quarter bit 7 set. In the second permutation mask
40450 the first quarter will contain bit 7 set, the second quarter
40451 indexes for the first half of op1, the third quarter bit 7 set
40452 and the last quarter indexes for the second half of op1.
40453 E.g. the first mask for a V32QImode extract-even will be:
40454 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40455 (all values masked with 0xf except for -128) and the second mask
40456 for extract-even will be
40457 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40458 m128 = GEN_INT (-128);
40459 for (i = 0; i < nelt; ++i)
40461 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40462 unsigned which = d->perm[i] >= nelt;
40463 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40465 for (j = 0; j < eltsz; ++j)
40467 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40468 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40472 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40473 vperm = force_reg (V32QImode, vperm);
40475 l = gen_reg_rtx (V32QImode);
40476 op = gen_lowpart (V32QImode, d->op0);
40477 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40479 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40480 vperm = force_reg (V32QImode, vperm);
40482 h = gen_reg_rtx (V32QImode);
40483 op = gen_lowpart (V32QImode, d->op1);
40484 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40486 ior = gen_reg_rtx (V32QImode);
40487 emit_insn (gen_iorv32qi3 (ior, l, h));
40489 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40490 op = gen_lowpart (V4DImode, d->target);
40491 ior = gen_lowpart (V4DImode, ior);
40492 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40493 const1_rtx, GEN_INT (3)));
40495 return true;
40498 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40499 and extract-odd permutations. */
40501 static bool
40502 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40504 rtx t1, t2, t3;
40506 switch (d->vmode)
40508 case V4DFmode:
40509 t1 = gen_reg_rtx (V4DFmode);
40510 t2 = gen_reg_rtx (V4DFmode);
40512 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40513 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40514 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40516 /* Now an unpck[lh]pd will produce the result required. */
40517 if (odd)
40518 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40519 else
40520 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40521 emit_insn (t3);
40522 break;
40524 case V8SFmode:
40526 int mask = odd ? 0xdd : 0x88;
40528 t1 = gen_reg_rtx (V8SFmode);
40529 t2 = gen_reg_rtx (V8SFmode);
40530 t3 = gen_reg_rtx (V8SFmode);
40532 /* Shuffle within the 128-bit lanes to produce:
40533 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40534 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40535 GEN_INT (mask)));
40537 /* Shuffle the lanes around to produce:
40538 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40539 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40540 GEN_INT (0x3)));
40542 /* Shuffle within the 128-bit lanes to produce:
40543 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40544 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40546 /* Shuffle within the 128-bit lanes to produce:
40547 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40548 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40550 /* Shuffle the lanes around to produce:
40551 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40552 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40553 GEN_INT (0x20)));
40555 break;
40557 case V2DFmode:
40558 case V4SFmode:
40559 case V2DImode:
40560 case V4SImode:
40561 /* These are always directly implementable by expand_vec_perm_1. */
40562 gcc_unreachable ();
40564 case V8HImode:
40565 if (TARGET_SSSE3)
40566 return expand_vec_perm_pshufb2 (d);
40567 else
40569 /* We need 2*log2(N)-1 operations to achieve odd/even
40570 with interleave. */
40571 t1 = gen_reg_rtx (V8HImode);
40572 t2 = gen_reg_rtx (V8HImode);
40573 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40574 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40575 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40576 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40577 if (odd)
40578 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40579 else
40580 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40581 emit_insn (t3);
40583 break;
40585 case V16QImode:
40586 if (TARGET_SSSE3)
40587 return expand_vec_perm_pshufb2 (d);
40588 else
40590 t1 = gen_reg_rtx (V16QImode);
40591 t2 = gen_reg_rtx (V16QImode);
40592 t3 = gen_reg_rtx (V16QImode);
40593 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40594 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40595 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40596 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40597 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40598 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40599 if (odd)
40600 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40601 else
40602 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40603 emit_insn (t3);
40605 break;
40607 case V16HImode:
40608 case V32QImode:
40609 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40611 case V4DImode:
40612 if (!TARGET_AVX2)
40614 struct expand_vec_perm_d d_copy = *d;
40615 d_copy.vmode = V4DFmode;
40616 d_copy.target = gen_lowpart (V4DFmode, d->target);
40617 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40618 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40619 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40622 t1 = gen_reg_rtx (V4DImode);
40623 t2 = gen_reg_rtx (V4DImode);
40625 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40626 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40627 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40629 /* Now an vpunpck[lh]qdq will produce the result required. */
40630 if (odd)
40631 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40632 else
40633 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40634 emit_insn (t3);
40635 break;
40637 case V8SImode:
40638 if (!TARGET_AVX2)
40640 struct expand_vec_perm_d d_copy = *d;
40641 d_copy.vmode = V8SFmode;
40642 d_copy.target = gen_lowpart (V8SFmode, d->target);
40643 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40644 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40645 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40648 t1 = gen_reg_rtx (V8SImode);
40649 t2 = gen_reg_rtx (V8SImode);
40651 /* Shuffle the lanes around into
40652 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40653 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40654 gen_lowpart (V4DImode, d->op0),
40655 gen_lowpart (V4DImode, d->op1),
40656 GEN_INT (0x20)));
40657 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40658 gen_lowpart (V4DImode, d->op0),
40659 gen_lowpart (V4DImode, d->op1),
40660 GEN_INT (0x31)));
40662 /* Swap the 2nd and 3rd position in each lane into
40663 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
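/* The pshufd immediate packs four 2-bit element selectors, least
   significant field first, so 0 + 2*4 + 1*16 + 3*64 picks elements
   { 0 2 1 3 } within each 128-bit lane.  */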
40664 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40665 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40666 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40667 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40669 /* Now a vpunpck[lh]qdq will produce
40670 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40671 if (odd)
40672 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40673 gen_lowpart (V4DImode, t1),
40674 gen_lowpart (V4DImode, t2));
40675 else
40676 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40677 gen_lowpart (V4DImode, t1),
40678 gen_lowpart (V4DImode, t2));
40679 emit_insn (t3);
40680 break;
40682 default:
40683 gcc_unreachable ();
40686 return true;
40689 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40690 extract-even and extract-odd permutations. */
40692 static bool
40693 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40695 unsigned i, odd, nelt = d->nelt;
40697 odd = d->perm[0];
40698 if (odd != 0 && odd != 1)
40699 return false;
40701 for (i = 1; i < nelt; ++i)
40702 if (d->perm[i] != 2 * i + odd)
40703 return false;
40705 return expand_vec_perm_even_odd_1 (d, odd);
40708 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40709 permutations. We assume that expand_vec_perm_1 has already failed. */
40711 static bool
40712 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40714 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40715 enum machine_mode vmode = d->vmode;
40716 unsigned char perm2[4];
40717 rtx op0 = d->op0;
40718 bool ok;
40720 switch (vmode)
40722 case V4DFmode:
40723 case V8SFmode:
40724 /* These are special-cased in sse.md so that we can optionally
40725 use the vbroadcast instruction. They expand to two insns
40726 if the input happens to be in a register. */
40727 gcc_unreachable ();
40729 case V2DFmode:
40730 case V2DImode:
40731 case V4SFmode:
40732 case V4SImode:
40733 /* These are always implementable using standard shuffle patterns. */
40734 gcc_unreachable ();
40736 case V8HImode:
40737 case V16QImode:
40738 /* These can be implemented via interleave. We save one insn by
40739 stopping once we have promoted to V4SImode and then using pshufd. */
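/* As a sketch: to broadcast element 5 of a V8HImode vector { a0 .. a7 },
   one interleave-high leaves { a4 a4 a5 a5 a6 a6 a7 a7 }; the a5 pair is
   then V4SImode element 1, which the final pshufd replicates.  */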
40742 rtx dest;
40743 rtx (*gen) (rtx, rtx, rtx)
40744 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40745 : gen_vec_interleave_lowv8hi;
40747 if (elt >= nelt2)
40749 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40750 : gen_vec_interleave_highv8hi;
40751 elt -= nelt2;
40753 nelt2 /= 2;
40755 dest = gen_reg_rtx (vmode);
40756 emit_insn (gen (dest, op0, op0));
40757 vmode = get_mode_wider_vector (vmode);
40758 op0 = gen_lowpart (vmode, dest);
40760 while (vmode != V4SImode);
40762 memset (perm2, elt, 4);
40763 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40764 d->testing_p);
40765 gcc_assert (ok);
40766 return true;
40768 case V32QImode:
40769 case V16HImode:
40770 case V8SImode:
40771 case V4DImode:
40772 /* For AVX2, broadcasts of the first element should have been
40773 handled by vpbroadcast* or vpermq in expand_vec_perm_1. */
40774 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40775 return false;
40777 default:
40778 gcc_unreachable ();
40782 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40783 broadcast permutations. */
40785 static bool
40786 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40788 unsigned i, elt, nelt = d->nelt;
40790 if (!d->one_operand_p)
40791 return false;
40793 elt = d->perm[0];
40794 for (i = 1; i < nelt; ++i)
40795 if (d->perm[i] != elt)
40796 return false;
40798 return expand_vec_perm_broadcast_1 (d);
40801 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40802 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40803 all the shorter instruction sequences. */
40805 static bool
40806 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40808 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40809 unsigned int i, nelt, eltsz;
40810 bool used[4];
40812 if (!TARGET_AVX2
40813 || d->one_operand_p
40814 || (d->vmode != V32QImode && d->vmode != V16HImode))
40815 return false;
40817 if (d->testing_p)
40818 return true;
40820 nelt = d->nelt;
40821 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40823 /* Generate 4 permutation masks. If the required element is within
40824 the same lane, it is shuffled in. If the required element comes from
40825 the other lane, force a zero by setting bit 7 in the permutation mask.
40826 The cross-lane mask has non-negative elements only where an element
40827 is requested from the other lane; such elements are also moved to the
40828 other lane, so that the two V2TImode halves of the vpshufb result
40829 can then be swapped. */
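/* Concretely, rperm[0] and rperm[1] hold the same-lane and cross-lane
   masks applied to d->op0 below, while rperm[2] and rperm[3] hold the
   corresponding masks applied to d->op1.  */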
40830 m128 = GEN_INT (-128);
40831 for (i = 0; i < 32; ++i)
40833 rperm[0][i] = m128;
40834 rperm[1][i] = m128;
40835 rperm[2][i] = m128;
40836 rperm[3][i] = m128;
40838 used[0] = false;
40839 used[1] = false;
40840 used[2] = false;
40841 used[3] = false;
40842 for (i = 0; i < nelt; ++i)
40844 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40845 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40846 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40848 for (j = 0; j < eltsz; ++j)
40849 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40850 used[which] = true;
40853 for (i = 0; i < 2; ++i)
40855 if (!used[2 * i + 1])
40857 h[i] = NULL_RTX;
40858 continue;
40860 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40861 gen_rtvec_v (32, rperm[2 * i + 1]));
40862 vperm = force_reg (V32QImode, vperm);
40863 h[i] = gen_reg_rtx (V32QImode);
40864 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40865 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40868 /* Swap the 128-bit lanes of h[X]. */
40869 for (i = 0; i < 2; ++i)
40871 if (h[i] == NULL_RTX)
40872 continue;
40873 op = gen_reg_rtx (V4DImode);
40874 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40875 const2_rtx, GEN_INT (3), const0_rtx,
40876 const1_rtx));
40877 h[i] = gen_lowpart (V32QImode, op);
40880 for (i = 0; i < 2; ++i)
40882 if (!used[2 * i])
40884 l[i] = NULL_RTX;
40885 continue;
40887 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40888 vperm = force_reg (V32QImode, vperm);
40889 l[i] = gen_reg_rtx (V32QImode);
40890 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40891 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40894 for (i = 0; i < 2; ++i)
40896 if (h[i] && l[i])
40898 op = gen_reg_rtx (V32QImode);
40899 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40900 l[i] = op;
40902 else if (h[i])
40903 l[i] = h[i];
40906 gcc_assert (l[0] && l[1]);
40907 op = gen_lowpart (V32QImode, d->target);
40908 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40909 return true;
40912 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40913 With all of the interface bits taken care of, perform the expansion
40914 in D and return true on success. */
40916 static bool
40917 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40919 /* Try a single instruction expansion. */
40920 if (expand_vec_perm_1 (d))
40921 return true;
40923 /* Try sequences of two instructions. */
40925 if (expand_vec_perm_pshuflw_pshufhw (d))
40926 return true;
40928 if (expand_vec_perm_palignr (d))
40929 return true;
40931 if (expand_vec_perm_interleave2 (d))
40932 return true;
40934 if (expand_vec_perm_broadcast (d))
40935 return true;
40937 if (expand_vec_perm_vpermq_perm_1 (d))
40938 return true;
40940 if (expand_vec_perm_vperm2f128 (d))
40941 return true;
40943 /* Try sequences of three instructions. */
40945 if (expand_vec_perm_2vperm2f128_vshuf (d))
40946 return true;
40948 if (expand_vec_perm_pshufb2 (d))
40949 return true;
40951 if (expand_vec_perm_interleave3 (d))
40952 return true;
40954 if (expand_vec_perm_vperm2f128_vblend (d))
40955 return true;
40957 /* Try sequences of four instructions. */
40959 if (expand_vec_perm_vpshufb2_vpermq (d))
40960 return true;
40962 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40963 return true;
40965 /* ??? Look for narrow permutations whose element orderings would
40966 allow the promotion to a wider mode. */
40968 /* ??? Look for sequences of interleave or a wider permute that place
40969 the data into the correct lanes for a half-vector shuffle like
40970 pshuf[lh]w or vpermilps. */
40972 /* ??? Look for sequences of interleave that produce the desired results.
40973 The combinatorics of punpck[lh] get pretty ugly... */
40975 if (expand_vec_perm_even_odd (d))
40976 return true;
40978 /* Even longer sequences. */
40979 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40980 return true;
40982 return false;
40985 /* If a permutation only uses one operand, make it clear. Returns true
40986 if the permutation references both operands. */
40988 static bool
40989 canonicalize_perm (struct expand_vec_perm_d *d)
40991 int i, which, nelt = d->nelt;
40993 for (i = which = 0; i < nelt; ++i)
40994 which |= (d->perm[i] < nelt ? 1 : 2);
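/* WHICH is now a mask: bit 0 is set if some element selects from op0
   (index < nelt), bit 1 if some element selects from op1.  */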
40996 d->one_operand_p = true;
40997 switch (which)
40999 default:
41000 gcc_unreachable();
41002 case 3:
41003 if (!rtx_equal_p (d->op0, d->op1))
41005 d->one_operand_p = false;
41006 break;
41008 /* The elements of PERM do not suggest that only the first operand
41009 is used, but both operands are identical. Allow easier matching
41010 of the permutation by folding the permutation into the single
41011 input vector. */
41012 /* FALLTHRU */
41014 case 2:
41015 for (i = 0; i < nelt; ++i)
41016 d->perm[i] &= nelt - 1;
41017 d->op0 = d->op1;
41018 break;
41020 case 1:
41021 d->op1 = d->op0;
41022 break;
41025 return (which == 3);
41028 bool
41029 ix86_expand_vec_perm_const (rtx operands[4])
41031 struct expand_vec_perm_d d;
41032 unsigned char perm[MAX_VECT_LEN];
41033 int i, nelt;
41034 bool two_args;
41035 rtx sel;
41037 d.target = operands[0];
41038 d.op0 = operands[1];
41039 d.op1 = operands[2];
41040 sel = operands[3];
41042 d.vmode = GET_MODE (d.target);
41043 gcc_assert (VECTOR_MODE_P (d.vmode));
41044 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41045 d.testing_p = false;
41047 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41048 gcc_assert (XVECLEN (sel, 0) == nelt);
41049 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41051 for (i = 0; i < nelt; ++i)
41053 rtx e = XVECEXP (sel, 0, i);
41054 int ei = INTVAL (e) & (2 * nelt - 1);
41055 d.perm[i] = ei;
41056 perm[i] = ei;
41059 two_args = canonicalize_perm (&d);
41061 if (ix86_expand_vec_perm_const_1 (&d))
41062 return true;
41064 /* If the selector says both arguments are needed, but the operands are the
41065 same, the above tried to expand with one_operand_p and a flattened selector.
41066 If that didn't work, retry without one_operand_p; that is how we succeeded
41067 during testing. */
41068 if (two_args && d.one_operand_p)
41070 d.one_operand_p = false;
41071 memcpy (d.perm, perm, sizeof (perm));
41072 return ix86_expand_vec_perm_const_1 (&d);
41075 return false;
41078 /* Implement targetm.vectorize.vec_perm_const_ok. */
41080 static bool
41081 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41082 const unsigned char *sel)
41084 struct expand_vec_perm_d d;
41085 unsigned int i, nelt, which;
41086 bool ret;
41088 d.vmode = vmode;
41089 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41090 d.testing_p = true;
41092 /* Given sufficient ISA support we can just return true here
41093 for selected vector modes. */
41094 if (GET_MODE_SIZE (d.vmode) == 16)
41096 /* All implementable with a single vpperm insn. */
41097 if (TARGET_XOP)
41098 return true;
41099 /* All implementable with 2 pshufb + 1 ior. */
41100 if (TARGET_SSSE3)
41101 return true;
41102 /* All implementable with shufpd or unpck[lh]pd. */
41103 if (d.nelt == 2)
41104 return true;
41107 /* Extract the values from the vector CST into the permutation
41108 array in D. */
41109 memcpy (d.perm, sel, nelt);
41110 for (i = which = 0; i < nelt; ++i)
41112 unsigned char e = d.perm[i];
41113 gcc_assert (e < 2 * nelt);
41114 which |= (e < nelt ? 1 : 2);
41117 /* If all elements are from the second vector, fold them to the first. */
41118 if (which == 2)
41119 for (i = 0; i < nelt; ++i)
41120 d.perm[i] -= nelt;
41122 /* Check whether the mask can be applied to the vector type. */
41123 d.one_operand_p = (which != 3);
41125 /* Implementable with shufps or pshufd. */
41126 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41127 return true;
41129 /* Otherwise we have to go through the motions and see if we can
41130 figure out how to generate the requested permutation. */
41131 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41132 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41133 if (!d.one_operand_p)
41134 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41136 start_sequence ();
41137 ret = ix86_expand_vec_perm_const_1 (&d);
41138 end_sequence ();
41140 return ret;
41143 void
41144 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41146 struct expand_vec_perm_d d;
41147 unsigned i, nelt;
41149 d.target = targ;
41150 d.op0 = op0;
41151 d.op1 = op1;
41152 d.vmode = GET_MODE (targ);
41153 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41154 d.one_operand_p = false;
41155 d.testing_p = false;
41157 for (i = 0; i < nelt; ++i)
41158 d.perm[i] = i * 2 + odd;
41160 /* We'll either be able to implement the permutation directly... */
41161 if (expand_vec_perm_1 (&d))
41162 return;
41164 /* ... or we use the special-case patterns. */
41165 expand_vec_perm_even_odd_1 (&d, odd);
41168 static void
41169 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41171 struct expand_vec_perm_d d;
41172 unsigned i, nelt, base;
41173 bool ok;
41175 d.target = targ;
41176 d.op0 = op0;
41177 d.op1 = op1;
41178 d.vmode = GET_MODE (targ);
41179 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41180 d.one_operand_p = false;
41181 d.testing_p = false;
41183 base = high_p ? nelt / 2 : 0;
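/* E.g. for a low interleave of V4SImode this builds the selector
   { 0 4 1 5 }.  */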
41184 for (i = 0; i < nelt / 2; ++i)
41186 d.perm[i * 2] = i + base;
41187 d.perm[i * 2 + 1] = i + base + nelt;
41190 /* Note that for AVX this isn't one instruction. */
41191 ok = ix86_expand_vec_perm_const_1 (&d);
41192 gcc_assert (ok);
41196 /* Expand a vector operation CODE for a V*QImode in terms of the
41197 same operation on V*HImode. */
41199 void
41200 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41202 enum machine_mode qimode = GET_MODE (dest);
41203 enum machine_mode himode;
41204 rtx (*gen_il) (rtx, rtx, rtx);
41205 rtx (*gen_ih) (rtx, rtx, rtx);
41206 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41207 struct expand_vec_perm_d d;
41208 bool ok, full_interleave;
41209 bool uns_p = false;
41210 int i;
41212 switch (qimode)
41214 case V16QImode:
41215 himode = V8HImode;
41216 gen_il = gen_vec_interleave_lowv16qi;
41217 gen_ih = gen_vec_interleave_highv16qi;
41218 break;
41219 case V32QImode:
41220 himode = V16HImode;
41221 gen_il = gen_avx2_interleave_lowv32qi;
41222 gen_ih = gen_avx2_interleave_highv32qi;
41223 break;
41224 default:
41225 gcc_unreachable ();
41228 op2_l = op2_h = op2;
41229 switch (code)
41231 case MULT:
41232 /* Unpack data such that we've got a source byte in each low byte of
41233 each word. We don't care what goes into the high byte of each word.
41234 Rather than trying to get zero in there, most convenient is to let
41235 it be a copy of the low byte. */
41236 op2_l = gen_reg_rtx (qimode);
41237 op2_h = gen_reg_rtx (qimode);
41238 emit_insn (gen_il (op2_l, op2, op2));
41239 emit_insn (gen_ih (op2_h, op2, op2));
41240 /* FALLTHRU */
41242 op1_l = gen_reg_rtx (qimode);
41243 op1_h = gen_reg_rtx (qimode);
41244 emit_insn (gen_il (op1_l, op1, op1));
41245 emit_insn (gen_ih (op1_h, op1, op1));
41246 full_interleave = qimode == V16QImode;
41247 break;
41249 case ASHIFT:
41250 case LSHIFTRT:
41251 uns_p = true;
41252 /* FALLTHRU */
41253 case ASHIFTRT:
41254 op1_l = gen_reg_rtx (himode);
41255 op1_h = gen_reg_rtx (himode);
41256 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41257 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41258 full_interleave = true;
41259 break;
41260 default:
41261 gcc_unreachable ();
41264 /* Perform the operation. */
41265 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41266 1, OPTAB_DIRECT);
41267 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41268 1, OPTAB_DIRECT);
41269 gcc_assert (res_l && res_h);
41271 /* Merge the data back into the right place. */
41272 d.target = dest;
41273 d.op0 = gen_lowpart (qimode, res_l);
41274 d.op1 = gen_lowpart (qimode, res_h);
41275 d.vmode = qimode;
41276 d.nelt = GET_MODE_NUNITS (qimode);
41277 d.one_operand_p = false;
41278 d.testing_p = false;
41280 if (full_interleave)
41282 /* For SSE2, we used a full interleave, so the desired
41283 results are in the even elements. */
41284 for (i = 0; i < 32; ++i)
41285 d.perm[i] = i * 2;
41287 else
41289 /* For AVX, the interleave used above was not cross-lane. So the
41290 extraction is of the even elements, but with the second and third quarters
41291 swapped. Happily, that is even one insn shorter than a plain even extraction. */
41292 for (i = 0; i < 32; ++i)
41293 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
41296 ok = ix86_expand_vec_perm_const_1 (&d);
41297 gcc_assert (ok);
41299 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41300 gen_rtx_fmt_ee (code, qimode, op1, op2));
41303 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41304 if op is CONST_VECTOR with all odd elements equal to their
41305 preceding element. */
41307 static bool
41308 const_vector_equal_evenodd_p (rtx op)
41310 enum machine_mode mode = GET_MODE (op);
41311 int i, nunits = GET_MODE_NUNITS (mode);
41312 if (GET_CODE (op) != CONST_VECTOR
41313 || nunits != CONST_VECTOR_NUNITS (op))
41314 return false;
41315 for (i = 0; i < nunits; i += 2)
41316 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41317 return false;
41318 return true;
41321 void
41322 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41323 bool uns_p, bool odd_p)
41325 enum machine_mode mode = GET_MODE (op1);
41326 enum machine_mode wmode = GET_MODE (dest);
41327 rtx x;
41328 rtx orig_op1 = op1, orig_op2 = op2;
41330 if (!nonimmediate_operand (op1, mode))
41331 op1 = force_reg (mode, op1);
41332 if (!nonimmediate_operand (op2, mode))
41333 op2 = force_reg (mode, op2);
41335 /* We only play even/odd games with vectors of SImode. */
41336 gcc_assert (mode == V4SImode || mode == V8SImode);
41338 /* If we're looking for the odd results, shift those members down to
41339 the even slots. For some cpus this is faster than a PSHUFD. */
41340 if (odd_p)
41342 /* For XOP use vpmacsdqh, but only for smult, as it is only
41343 signed. */
41344 if (TARGET_XOP && mode == V4SImode && !uns_p)
41346 x = force_reg (wmode, CONST0_RTX (wmode));
41347 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41348 return;
41351 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41352 if (!const_vector_equal_evenodd_p (orig_op1))
41353 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41354 x, NULL, 1, OPTAB_DIRECT);
41355 if (!const_vector_equal_evenodd_p (orig_op2))
41356 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41357 x, NULL, 1, OPTAB_DIRECT);
41358 op1 = gen_lowpart (mode, op1);
41359 op2 = gen_lowpart (mode, op2);
41362 if (mode == V8SImode)
41364 if (uns_p)
41365 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41366 else
41367 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41369 else if (uns_p)
41370 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41371 else if (TARGET_SSE4_1)
41372 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41373 else
41375 rtx s1, s2, t0, t1, t2;
41377 /* The easiest way to implement this without PMULDQ is to go through
41378 the motions as if we are performing a full 64-bit multiply, except
41379 that we need to do less shuffling of the elements. */
41381 /* Compute the sign-extension, aka highparts, of the two operands. */
41382 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41383 op1, pc_rtx, pc_rtx);
41384 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41385 op2, pc_rtx, pc_rtx);
41387 /* Multiply LO(A) * HI(B), and vice-versa. */
41388 t1 = gen_reg_rtx (wmode);
41389 t2 = gen_reg_rtx (wmode);
41390 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41391 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41393 /* Multiply LO(A) * LO(B). */
41394 t0 = gen_reg_rtx (wmode);
41395 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41397 /* Combine and shift the highparts into place. */
41398 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41399 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41400 1, OPTAB_DIRECT);
41402 /* Combine high and low parts. */
41403 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41404 return;
41406 emit_insn (x);
41409 void
41410 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41411 bool uns_p, bool high_p)
41413 enum machine_mode wmode = GET_MODE (dest);
41414 enum machine_mode mode = GET_MODE (op1);
41415 rtx t1, t2, t3, t4, mask;
41417 switch (mode)
41419 case V4SImode:
41420 t1 = gen_reg_rtx (mode);
41421 t2 = gen_reg_rtx (mode);
41422 if (TARGET_XOP && !uns_p)
41424 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41425 shuffle the elements once so that all elements are in the right
41426 place for immediate use: { A C B D }. */
41427 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41428 const1_rtx, GEN_INT (3)));
41429 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41430 const1_rtx, GEN_INT (3)));
41432 else
41434 /* Put the elements into place for the multiply. */
41435 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41436 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41437 high_p = false;
41439 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41440 break;
41442 case V8SImode:
41443 /* Shuffle the elements between the lanes. After this we
41444 have { A B E F | C D G H } for each operand. */
41445 t1 = gen_reg_rtx (V4DImode);
41446 t2 = gen_reg_rtx (V4DImode);
41447 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41448 const0_rtx, const2_rtx,
41449 const1_rtx, GEN_INT (3)));
41450 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41451 const0_rtx, const2_rtx,
41452 const1_rtx, GEN_INT (3)));
41454 /* Shuffle the elements within the lanes. After this we
41455 have { A A B B | C C D D } or { E E F F | G G H H }. */
41456 t3 = gen_reg_rtx (V8SImode);
41457 t4 = gen_reg_rtx (V8SImode);
41458 mask = GEN_INT (high_p
41459 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41460 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
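/* The pshufd immediate duplicates either the high pair { 2 2 3 3 }
   or the low pair { 0 0 1 1 } of elements within each lane.  */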
41461 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41462 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41464 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41465 break;
41467 case V8HImode:
41468 case V16HImode:
41469 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41470 uns_p, OPTAB_DIRECT);
41471 t2 = expand_binop (mode,
41472 uns_p ? umul_highpart_optab : smul_highpart_optab,
41473 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41474 gcc_assert (t1 && t2);
41476 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41477 break;
41479 case V16QImode:
41480 case V32QImode:
41481 t1 = gen_reg_rtx (wmode);
41482 t2 = gen_reg_rtx (wmode);
41483 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41484 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41486 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41487 break;
41489 default:
41490 gcc_unreachable ();
41494 void
41495 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41497 rtx res_1, res_2;
41499 res_1 = gen_reg_rtx (V4SImode);
41500 res_2 = gen_reg_rtx (V4SImode);
41501 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41502 op1, op2, true, false);
41503 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41504 op1, op2, true, true);
41506 /* Move the results in element 2 down to element 1; we don't care
41507 what goes in elements 2 and 3. Then we can merge the parts
41508 back together with an interleave.
41510 Note that two other sequences were tried:
41511 (1) Use interleaves at the start instead of psrldq, which allows
41512 us to use a single shufps to merge things back at the end.
41513 (2) Use shufps here to combine the two vectors, then pshufd to
41514 put the elements in the correct order.
41515 In both cases the cost of the reformatting stall was too high
41516 and the overall sequence slower. */
41518 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41519 const0_rtx, const0_rtx));
41520 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41521 const0_rtx, const0_rtx));
41522 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41524 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41527 void
41528 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41530 enum machine_mode mode = GET_MODE (op0);
41531 rtx t1, t2, t3, t4, t5, t6;
41533 if (TARGET_XOP && mode == V2DImode)
41535 /* op1: A,B,C,D, op2: E,F,G,H */
41536 op1 = gen_lowpart (V4SImode, op1);
41537 op2 = gen_lowpart (V4SImode, op2);
41539 t1 = gen_reg_rtx (V4SImode);
41540 t2 = gen_reg_rtx (V4SImode);
41541 t3 = gen_reg_rtx (V2DImode);
41542 t4 = gen_reg_rtx (V2DImode);
41544 /* t1: B,A,D,C */
41545 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41546 GEN_INT (1),
41547 GEN_INT (0),
41548 GEN_INT (3),
41549 GEN_INT (2)));
41551 /* t2: (B*E),(A*F),(D*G),(C*H) */
41552 emit_insn (gen_mulv4si3 (t2, t1, op2));
41554 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41555 emit_insn (gen_xop_phadddq (t3, t2));
41557 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41558 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41560 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41561 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41563 else
41565 enum machine_mode nmode;
41566 rtx (*umul) (rtx, rtx, rtx);
41568 if (mode == V2DImode)
41570 umul = gen_vec_widen_umult_even_v4si;
41571 nmode = V4SImode;
41573 else if (mode == V4DImode)
41575 umul = gen_vec_widen_umult_even_v8si;
41576 nmode = V8SImode;
41578 else
41579 gcc_unreachable ();
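/* Compute op1 * op2 as
   lo(op1)*lo(op2) + ((lo(op1)*hi(op2) + hi(op1)*lo(op2)) << 32),
   using only widening unsigned 32x32->64 multiplies.  */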
41582 /* Multiply low parts. */
41583 t1 = gen_reg_rtx (mode);
41584 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41586 /* Shift input vectors right 32 bits so we can multiply high parts. */
41587 t6 = GEN_INT (32);
41588 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41589 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41591 /* Multiply high parts by low parts. */
41592 t4 = gen_reg_rtx (mode);
41593 t5 = gen_reg_rtx (mode);
41594 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41595 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41597 /* Combine and shift the highparts back. */
41598 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41599 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41601 /* Combine high and low parts. */
41602 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41605 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41606 gen_rtx_MULT (mode, op1, op2));
41609 /* Expand an insert into a vector register through pinsr insn.
41610 Return true if successful. */
41612 bool
41613 ix86_expand_pinsr (rtx *operands)
41615 rtx dst = operands[0];
41616 rtx src = operands[3];
41618 unsigned int size = INTVAL (operands[1]);
41619 unsigned int pos = INTVAL (operands[2]);
41621 if (GET_CODE (dst) == SUBREG)
41623 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41624 dst = SUBREG_REG (dst);
41627 if (GET_CODE (src) == SUBREG)
41628 src = SUBREG_REG (src);
41630 switch (GET_MODE (dst))
41632 case V16QImode:
41633 case V8HImode:
41634 case V4SImode:
41635 case V2DImode:
41637 enum machine_mode srcmode, dstmode;
41638 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41640 srcmode = mode_for_size (size, MODE_INT, 0);
41642 switch (srcmode)
41644 case QImode:
41645 if (!TARGET_SSE4_1)
41646 return false;
41647 dstmode = V16QImode;
41648 pinsr = gen_sse4_1_pinsrb;
41649 break;
41651 case HImode:
41652 if (!TARGET_SSE2)
41653 return false;
41654 dstmode = V8HImode;
41655 pinsr = gen_sse2_pinsrw;
41656 break;
41658 case SImode:
41659 if (!TARGET_SSE4_1)
41660 return false;
41661 dstmode = V4SImode;
41662 pinsr = gen_sse4_1_pinsrd;
41663 break;
41665 case DImode:
41666 gcc_assert (TARGET_64BIT);
41667 if (!TARGET_SSE4_1)
41668 return false;
41669 dstmode = V2DImode;
41670 pinsr = gen_sse4_1_pinsrq;
41671 break;
41673 default:
41674 return false;
41677 dst = gen_lowpart (dstmode, dst);
41678 src = gen_lowpart (srcmode, src);
41680 pos /= size;
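/* POS is now an element index rather than a bit offset; the pinsr
   patterns select the element to replace via a one-hot vec_merge mask,
   hence the 1 << pos below.  */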
41682 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41683 return true;
41686 default:
41687 return false;
41691 /* This function returns the calling-ABI-specific va_list type node.
41692 It returns the va_list type specific to FNDECL. */
41694 static tree
41695 ix86_fn_abi_va_list (tree fndecl)
41697 if (!TARGET_64BIT)
41698 return va_list_type_node;
41699 gcc_assert (fndecl != NULL_TREE);
41701 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41702 return ms_va_list_type_node;
41703 else
41704 return sysv_va_list_type_node;
41707 /* Returns the canonical va_list type specified by TYPE. If there
41708 is no valid TYPE provided, it returns NULL_TREE. */
41710 static tree
41711 ix86_canonical_va_list_type (tree type)
41713 tree wtype, htype;
41715 /* Resolve references and pointers to va_list type. */
41716 if (TREE_CODE (type) == MEM_REF)
41717 type = TREE_TYPE (type);
41718 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41719 type = TREE_TYPE (type);
41720 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41721 type = TREE_TYPE (type);
41723 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41725 wtype = va_list_type_node;
41726 gcc_assert (wtype != NULL_TREE);
41727 htype = type;
41728 if (TREE_CODE (wtype) == ARRAY_TYPE)
41730 /* If va_list is an array type, the argument may have decayed
41731 to a pointer type, e.g. by being passed to another function.
41732 In that case, unwrap both types so that we can compare the
41733 underlying records. */
41734 if (TREE_CODE (htype) == ARRAY_TYPE
41735 || POINTER_TYPE_P (htype))
41737 wtype = TREE_TYPE (wtype);
41738 htype = TREE_TYPE (htype);
41741 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41742 return va_list_type_node;
41743 wtype = sysv_va_list_type_node;
41744 gcc_assert (wtype != NULL_TREE);
41745 htype = type;
41746 if (TREE_CODE (wtype) == ARRAY_TYPE)
41748 /* If va_list is an array type, the argument may have decayed
41749 to a pointer type, e.g. by being passed to another function.
41750 In that case, unwrap both types so that we can compare the
41751 underlying records. */
41752 if (TREE_CODE (htype) == ARRAY_TYPE
41753 || POINTER_TYPE_P (htype))
41755 wtype = TREE_TYPE (wtype);
41756 htype = TREE_TYPE (htype);
41759 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41760 return sysv_va_list_type_node;
41761 wtype = ms_va_list_type_node;
41762 gcc_assert (wtype != NULL_TREE);
41763 htype = type;
41764 if (TREE_CODE (wtype) == ARRAY_TYPE)
41766 /* If va_list is an array type, the argument may have decayed
41767 to a pointer type, e.g. by being passed to another function.
41768 In that case, unwrap both types so that we can compare the
41769 underlying records. */
41770 if (TREE_CODE (htype) == ARRAY_TYPE
41771 || POINTER_TYPE_P (htype))
41773 wtype = TREE_TYPE (wtype);
41774 htype = TREE_TYPE (htype);
41777 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41778 return ms_va_list_type_node;
41779 return NULL_TREE;
41781 return std_canonical_va_list_type (type);
41784 /* Iterate through the target-specific builtin types for va_list.
41785 IDX denotes the iterator, *PTREE is set to the result type of
41786 the va_list builtin, and *PNAME to its internal type.
41787 Returns zero if there is no element for this index, otherwise
41788 IDX should be increased upon the next call.
41789 Note, do not iterate a base builtin's name like __builtin_va_list.
41790 Used from c_common_nodes_and_builtins. */
41792 static int
41793 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41795 if (TARGET_64BIT)
41797 switch (idx)
41799 default:
41800 break;
41802 case 0:
41803 *ptree = ms_va_list_type_node;
41804 *pname = "__builtin_ms_va_list";
41805 return 1;
41807 case 1:
41808 *ptree = sysv_va_list_type_node;
41809 *pname = "__builtin_sysv_va_list";
41810 return 1;
41814 return 0;
41817 #undef TARGET_SCHED_DISPATCH
41818 #define TARGET_SCHED_DISPATCH has_dispatch
41819 #undef TARGET_SCHED_DISPATCH_DO
41820 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41821 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41822 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41823 #undef TARGET_SCHED_REORDER
41824 #define TARGET_SCHED_REORDER ix86_sched_reorder
41825 #undef TARGET_SCHED_ADJUST_PRIORITY
41826 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41827 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41828 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41829 ix86_dependencies_evaluation_hook
41831 /* The size of the dispatch window is the total number of bytes of
41832 object code allowed in a window. */
41833 #define DISPATCH_WINDOW_SIZE 16
41835 /* Number of dispatch windows considered for scheduling. */
41836 #define MAX_DISPATCH_WINDOWS 3
41838 /* Maximum number of instructions in a window. */
41839 #define MAX_INSN 4
41841 /* Maximum number of immediate operands in a window. */
41842 #define MAX_IMM 4
41844 /* Maximum number of immediate bits allowed in a window. */
41845 #define MAX_IMM_SIZE 128
41847 /* Maximum number of 32 bit immediates allowed in a window. */
41848 #define MAX_IMM_32 4
41850 /* Maximum number of 64 bit immediates allowed in a window. */
41851 #define MAX_IMM_64 2
41853 /* Maximum total of loads or prefetches allowed in a window. */
41854 #define MAX_LOAD 2
41856 /* Maximum total of stores allowed in a window. */
41857 #define MAX_STORE 1
41859 #undef BIG
41860 #define BIG 100
41863 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41864 enum dispatch_group {
41865 disp_no_group = 0,
41866 disp_load,
41867 disp_store,
41868 disp_load_store,
41869 disp_prefetch,
41870 disp_imm,
41871 disp_imm_32,
41872 disp_imm_64,
41873 disp_branch,
41874 disp_cmp,
41875 disp_jcc,
41876 disp_last
41879 /* Number of allowable groups in a dispatch window. It is an array
41880 indexed by the dispatch_group enum. 100 is used as a big number
41881 because the number of these kinds of operations has no effect on
41882 the dispatch window, but we still need entries for them in
41883 the table. */
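/* Read together with the enum above, this allows per window at most
   2 loads, 1 store, 1 load-store, 2 prefetches, 4 immediate operands
   (at most 4 of 32 bits or 2 of 64 bits) and 1 branch; compares and
   jccs are effectively unrestricted here.  */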
41884 static unsigned int num_allowable_groups[disp_last] = {
41885 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41888 char group_name[disp_last + 1][16] = {
41889 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41890 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41891 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41894 /* Instruction path. */
41895 enum insn_path {
41896 no_path = 0,
41897 path_single, /* Single micro op. */
41898 path_double, /* Double micro op. */
41899 path_multi, /* Instructions with more than 2 micro ops. */
41900 last_path
41903 /* sched_insn_info defines a window to the instructions scheduled in
41904 the basic block. It contains a pointer to the insn_info table and
41905 the instruction scheduled.
41907 Windows are allocated for each basic block and are linked
41908 together. */
41909 typedef struct sched_insn_info_s {
41910 rtx insn;
41911 enum dispatch_group group;
41912 enum insn_path path;
41913 int byte_len;
41914 int imm_bytes;
41915 } sched_insn_info;
41917 /* Linked list of dispatch windows. This is a two way list of
41918 dispatch windows of a basic block. It contains information about
41919 the number of uops in the window and the total number of
41920 instructions and of bytes in the object code for this dispatch
41921 window. */
41922 typedef struct dispatch_windows_s {
41923 int num_insn; /* Number of insn in the window. */
41924 int num_uops; /* Number of uops in the window. */
41925 int window_size; /* Number of bytes in the window. */
41926 int window_num; /* Window number, either 0 or 1. */
41927 int num_imm; /* Number of immediates in an insn. */
41928 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41929 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41930 int imm_size; /* Total immediates in the window. */
41931 int num_loads; /* Total memory loads in the window. */
41932 int num_stores; /* Total memory stores in the window. */
41933 int violation; /* Violation exists in window. */
41934 sched_insn_info *window; /* Pointer to the window. */
41935 struct dispatch_windows_s *next;
41936 struct dispatch_windows_s *prev;
41937 } dispatch_windows;
41939 /* Immediate values used in an insn. */
41940 typedef struct imm_info_s
41942 int imm;
41943 int imm32;
41944 int imm64;
41945 } imm_info;
41947 static dispatch_windows *dispatch_window_list;
41948 static dispatch_windows *dispatch_window_list1;
41950 /* Get dispatch group of insn. */
41952 static enum dispatch_group
41953 get_mem_group (rtx insn)
41955 enum attr_memory memory;
41957 if (INSN_CODE (insn) < 0)
41958 return disp_no_group;
41959 memory = get_attr_memory (insn);
41960 if (memory == MEMORY_STORE)
41961 return disp_store;
41963 if (memory == MEMORY_LOAD)
41964 return disp_load;
41966 if (memory == MEMORY_BOTH)
41967 return disp_load_store;
41969 return disp_no_group;
41972 /* Return true if insn is a compare instruction. */
41974 static bool
41975 is_cmp (rtx insn)
41977 enum attr_type type;
41979 type = get_attr_type (insn);
41980 return (type == TYPE_TEST
41981 || type == TYPE_ICMP
41982 || type == TYPE_FCMP
41983 || GET_CODE (PATTERN (insn)) == COMPARE);
41986 /* Return true if a dispatch violation was encountered. */
41988 static bool
41989 dispatch_violation (void)
41991 if (dispatch_window_list->next)
41992 return dispatch_window_list->next->violation;
41993 return dispatch_window_list->violation;
41996 /* Return true if insn is a branch instruction. */
41998 static bool
41999 is_branch (rtx insn)
42001 return (CALL_P (insn) || JUMP_P (insn));
42004 /* Return true if insn is a prefetch instruction. */
42006 static bool
42007 is_prefetch (rtx insn)
42009 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42012 /* This function initializes a dispatch window and the list container holding a
42013 pointer to the window. */
42015 static void
42016 init_window (int window_num)
42018 int i;
42019 dispatch_windows *new_list;
42021 if (window_num == 0)
42022 new_list = dispatch_window_list;
42023 else
42024 new_list = dispatch_window_list1;
42026 new_list->num_insn = 0;
42027 new_list->num_uops = 0;
42028 new_list->window_size = 0;
42029 new_list->next = NULL;
42030 new_list->prev = NULL;
42031 new_list->window_num = window_num;
42032 new_list->num_imm = 0;
42033 new_list->num_imm_32 = 0;
42034 new_list->num_imm_64 = 0;
42035 new_list->imm_size = 0;
42036 new_list->num_loads = 0;
42037 new_list->num_stores = 0;
42038 new_list->violation = false;
42040 for (i = 0; i < MAX_INSN; i++)
42042 new_list->window[i].insn = NULL;
42043 new_list->window[i].group = disp_no_group;
42044 new_list->window[i].path = no_path;
42045 new_list->window[i].byte_len = 0;
42046 new_list->window[i].imm_bytes = 0;
42048 return;
42051 /* This function allocates and initializes a dispatch window and the
42052 list container holding a pointer to the window. */
42054 static dispatch_windows *
42055 allocate_window (void)
42057 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42058 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42060 return new_list;
42063 /* This routine initializes the dispatch scheduling information. It
42064 initiates building dispatch scheduler tables and constructs the
42065 first dispatch window. */
42067 static void
42068 init_dispatch_sched (void)
42070 /* Allocate a dispatch list and a window. */
42071 dispatch_window_list = allocate_window ();
42072 dispatch_window_list1 = allocate_window ();
42073 init_window (0);
42074 init_window (1);
42077 /* This function returns true if a branch is detected. End of a basic block
42078 does not have to be a branch, but here we assume only branches end a
42079 window. */
42081 static bool
42082 is_end_basic_block (enum dispatch_group group)
42084 return group == disp_branch;
42087 /* This function is called when the end of a window processing is reached. */
42089 static void
42090 process_end_window (void)
42092 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42093 if (dispatch_window_list->next)
42095 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42096 gcc_assert (dispatch_window_list->window_size
42097 + dispatch_window_list1->window_size <= 48);
42098 init_window (1);
42100 init_window (0);
42103 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42104 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42105 for 48 bytes of instructions. Note that these windows are not dispatch
42106 windows whose size is DISPATCH_WINDOW_SIZE. */
42108 static dispatch_windows *
42109 allocate_next_window (int window_num)
42111 if (window_num == 0)
42113 if (dispatch_window_list->next)
42114 init_window (1);
42115 init_window (0);
42116 return dispatch_window_list;
42119 dispatch_window_list->next = dispatch_window_list1;
42120 dispatch_window_list1->prev = dispatch_window_list;
42122 return dispatch_window_list1;
42125 /* Increment the number of immediate operands of an instruction. */
42127 static int
42128 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42130 if (*in_rtx == 0)
42131 return 0;
42133 switch ( GET_CODE (*in_rtx))
42135 case CONST:
42136 case SYMBOL_REF:
42137 case CONST_INT:
42138 (imm_values->imm)++;
42139 if (x86_64_immediate_operand (*in_rtx, SImode))
42140 (imm_values->imm32)++;
42141 else
42142 (imm_values->imm64)++;
42143 break;
42145 case CONST_DOUBLE:
42146 (imm_values->imm)++;
42147 (imm_values->imm64)++;
42148 break;
42150 case CODE_LABEL:
42151 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42153 (imm_values->imm)++;
42154 (imm_values->imm32)++;
42156 break;
42158 default:
42159 break;
42162 return 0;
42165 /* Compute number of immediate operands of an instruction. */
42167 static void
42168 find_constant (rtx in_rtx, imm_info *imm_values)
42170 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42171 (rtx_function) find_constant_1, (void *) imm_values);
42174 /* Return the total size of the immediate operands of an instruction along
42175 with the number of corresponding immediate operands. It initializes its
42176 parameters to zero before calling FIND_CONSTANT.
42177 INSN is the input instruction. IMM is the total number of immediates.
42178 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42179 bit immediates. */
42181 static int
42182 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42184 imm_info imm_values = {0, 0, 0};
42186 find_constant (insn, &imm_values);
42187 *imm = imm_values.imm;
42188 *imm32 = imm_values.imm32;
42189 *imm64 = imm_values.imm64;
42190 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
42193 /* This function indicates if an operand of an instruction is an
42194 immediate. */
42196 static bool
42197 has_immediate (rtx insn)
42199 int num_imm_operand;
42200 int num_imm32_operand;
42201 int num_imm64_operand;
42203 if (insn)
42204 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42205 &num_imm64_operand);
42206 return false;
42209 /* Return single or double path for instructions. */
42211 static enum insn_path
42212 get_insn_path (rtx insn)
42214 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42216 if ((int)path == 0)
42217 return path_single;
42219 if ((int)path == 1)
42220 return path_double;
42222 return path_multi;
42225 /* Return insn dispatch group. */
42227 static enum dispatch_group
42228 get_insn_group (rtx insn)
42230 enum dispatch_group group = get_mem_group (insn);
42231 if (group)
42232 return group;
42234 if (is_branch (insn))
42235 return disp_branch;
42237 if (is_cmp (insn))
42238 return disp_cmp;
42240 if (has_immediate (insn))
42241 return disp_imm;
42243 if (is_prefetch (insn))
42244 return disp_prefetch;
42246 return disp_no_group;
42249 /* Count number of GROUP restricted instructions in a dispatch
42250 window WINDOW_LIST. */
42252 static int
42253 count_num_restricted (rtx insn, dispatch_windows *window_list)
42255 enum dispatch_group group = get_insn_group (insn);
42256 int imm_size;
42257 int num_imm_operand;
42258 int num_imm32_operand;
42259 int num_imm64_operand;
42261 if (group == disp_no_group)
42262 return 0;
42264 if (group == disp_imm)
42266 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42267 &num_imm64_operand);
42268 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42269 || num_imm_operand + window_list->num_imm > MAX_IMM
42270 || (num_imm32_operand > 0
42271 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42272 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42273 || (num_imm64_operand > 0
42274 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42275 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42276 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42277 && num_imm64_operand > 0
42278 && ((window_list->num_imm_64 > 0
42279 && window_list->num_insn >= 2)
42280 || window_list->num_insn >= 3)))
42281 return BIG;
42283 return 1;
42286 if ((group == disp_load_store
42287 && (window_list->num_loads >= MAX_LOAD
42288 || window_list->num_stores >= MAX_STORE))
42289 || ((group == disp_load
42290 || group == disp_prefetch)
42291 && window_list->num_loads >= MAX_LOAD)
42292 || (group == disp_store
42293 && window_list->num_stores >= MAX_STORE))
42294 return BIG;
42296 return 1;
42299 /* This function returns true if insn satisfies dispatch rules on the
42300 last window scheduled. */
42302 static bool
42303 fits_dispatch_window (rtx insn)
42305 dispatch_windows *window_list = dispatch_window_list;
42306 dispatch_windows *window_list_next = dispatch_window_list->next;
42307 unsigned int num_restrict;
42308 enum dispatch_group group = get_insn_group (insn);
42309 enum insn_path path = get_insn_path (insn);
42310 int sum;
42312 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
42313 instructions should be given the lowest priority in the
42314 scheduling process in the Haifa scheduler to make sure they will be
42315 scheduled in the same dispatch window as the reference to them. */
42316 if (group == disp_jcc || group == disp_cmp)
42317 return false;
42319 /* Check nonrestricted. */
42320 if (group == disp_no_group || group == disp_branch)
42321 return true;
42323 /* Get last dispatch window. */
42324 if (window_list_next)
42325 window_list = window_list_next;
42327 if (window_list->window_num == 1)
42329 sum = window_list->prev->window_size + window_list->window_size;
42331 if (sum == 32
42332 || (min_insn_size (insn) + sum) >= 48)
42333 /* Window 1 is full. Go for next window. */
42334 return true;
42337 num_restrict = count_num_restricted (insn, window_list);
42339 if (num_restrict > num_allowable_groups[group])
42340 return false;
42342 /* See if it fits in the first window. */
42343 if (window_list->window_num == 0)
42345 /* The first window should have only single- and double-path
42346 uops. */
42347 if (path == path_double
42348 && (window_list->num_uops + 2) > MAX_INSN)
42349 return false;
42350 else if (path != path_single)
42351 return false;
42353 return true;
42356 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42357 dispatch window WINDOW_LIST. */
42359 static void
42360 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42362 int byte_len = min_insn_size (insn);
42363 int num_insn = window_list->num_insn;
42364 int imm_size;
42365 sched_insn_info *window = window_list->window;
42366 enum dispatch_group group = get_insn_group (insn);
42367 enum insn_path path = get_insn_path (insn);
42368 int num_imm_operand;
42369 int num_imm32_operand;
42370 int num_imm64_operand;
42372 if (!window_list->violation && group != disp_cmp
42373 && !fits_dispatch_window (insn))
42374 window_list->violation = true;
42376 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42377 &num_imm64_operand);
42379 /* Initialize window with new instruction. */
42380 window[num_insn].insn = insn;
42381 window[num_insn].byte_len = byte_len;
42382 window[num_insn].group = group;
42383 window[num_insn].path = path;
42384 window[num_insn].imm_bytes = imm_size;
42386 window_list->window_size += byte_len;
42387 window_list->num_insn = num_insn + 1;
42388 window_list->num_uops = window_list->num_uops + num_uops;
42389 window_list->imm_size += imm_size;
42390 window_list->num_imm += num_imm_operand;
42391 window_list->num_imm_32 += num_imm32_operand;
42392 window_list->num_imm_64 += num_imm64_operand;
42394 if (group == disp_store)
42395 window_list->num_stores += 1;
42396 else if (group == disp_load
42397 || group == disp_prefetch)
42398 window_list->num_loads += 1;
42399 else if (group == disp_load_store)
42401 window_list->num_stores += 1;
42402 window_list->num_loads += 1;
42406 /* Adds a scheduled instruction, INSN, to the current dispatch window.
42407 If the total bytes of instructions or the number of instructions in
42408 the window exceeds the allowable limit, it allocates a new window. */
42410 static void
42411 add_to_dispatch_window (rtx insn)
42413 int byte_len;
42414 dispatch_windows *window_list;
42415 dispatch_windows *next_list;
42416 dispatch_windows *window0_list;
42417 enum insn_path path;
42418 enum dispatch_group insn_group;
42419 bool insn_fits;
42420 int num_insn;
42421 int num_uops;
42422 int window_num;
42423 int insn_num_uops;
42424 int sum;
42426 if (INSN_CODE (insn) < 0)
42427 return;
42429 byte_len = min_insn_size (insn);
42430 window_list = dispatch_window_list;
42431 next_list = window_list->next;
42432 path = get_insn_path (insn);
42433 insn_group = get_insn_group (insn);
42435 /* Get the last dispatch window. */
42436 if (next_list)
42437 window_list = dispatch_window_list->next;
42439 if (path == path_single)
42440 insn_num_uops = 1;
42441 else if (path == path_double)
42442 insn_num_uops = 2;
42443 else
42444 insn_num_uops = (int) path;
42446 /* If the current window is full, get a new window.
42447 Window number zero is full if MAX_INSN uops are scheduled in it.
42448 Window number one is full if window zero's bytes plus window
42449 one's bytes equal 32, or if the bytes of the new instruction added
42450 to the total make it greater than 48, or if it already has MAX_INSN
42451 instructions in it. */
42452 num_insn = window_list->num_insn;
42453 num_uops = window_list->num_uops;
42454 window_num = window_list->window_num;
42455 insn_fits = fits_dispatch_window (insn);
42457 if (num_insn >= MAX_INSN
42458 || num_uops + insn_num_uops > MAX_INSN
42459 || !(insn_fits))
42461 window_num = ~window_num & 1;
42462 window_list = allocate_next_window (window_num);
42465 if (window_num == 0)
42467 add_insn_window (insn, window_list, insn_num_uops);
42468 if (window_list->num_insn >= MAX_INSN
42469 && insn_group == disp_branch)
42471 process_end_window ();
42472 return;
42475 else if (window_num == 1)
42477 window0_list = window_list->prev;
42478 sum = window0_list->window_size + window_list->window_size;
42479 if (sum == 32
42480 || (byte_len + sum) >= 48)
42482 process_end_window ();
42483 window_list = dispatch_window_list;
42486 add_insn_window (insn, window_list, insn_num_uops);
42488 else
42489 gcc_unreachable ();
42491 if (is_end_basic_block (insn_group))
42493 /* The end of the basic block has been reached; do the end-of-basic-block processing. */
42494 process_end_window ();
42495 return;
42499 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42501 DEBUG_FUNCTION static void
42502 debug_dispatch_window_file (FILE *file, int window_num)
42504 dispatch_windows *list;
42505 int i;
42507 if (window_num == 0)
42508 list = dispatch_window_list;
42509 else
42510 list = dispatch_window_list1;
42512 fprintf (file, "Window #%d:\n", list->window_num);
42513 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42514 list->num_insn, list->num_uops, list->window_size);
42515 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42516 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42518 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42519 list->num_stores);
42520 fprintf (file, " insn info:\n");
42522 for (i = 0; i < MAX_INSN; i++)
42524 if (!list->window[i].insn)
42525 break;
42526 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42527 i, group_name[list->window[i].group],
42528 i, (void *)list->window[i].insn,
42529 i, list->window[i].path,
42530 i, list->window[i].byte_len,
42531 i, list->window[i].imm_bytes);
42535 /* Print to stdout a dispatch window. */
42537 DEBUG_FUNCTION void
42538 debug_dispatch_window (int window_num)
42540 debug_dispatch_window_file (stdout, window_num);
42543 /* Print INSN dispatch information to FILE. */
42545 DEBUG_FUNCTION static void
42546 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42548 int byte_len;
42549 enum insn_path path;
42550 enum dispatch_group group;
42551 int imm_size;
42552 int num_imm_operand;
42553 int num_imm32_operand;
42554 int num_imm64_operand;
42556 if (INSN_CODE (insn) < 0)
42557 return;
42559 byte_len = min_insn_size (insn);
42560 path = get_insn_path (insn);
42561 group = get_insn_group (insn);
42562 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42563 &num_imm64_operand);
42565 fprintf (file, " insn info:\n");
42566 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42567 group_name[group], path, byte_len);
42568 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42569 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42572 /* Print to STDOUT the status of the ready list with respect to
42573 dispatch windows. */
42575 DEBUG_FUNCTION void
42576 debug_ready_dispatch (void)
42578 int i;
42579 int no_ready = number_in_ready ();
42581 fprintf (stdout, "Number of ready: %d\n", no_ready);
42583 for (i = 0; i < no_ready; i++)
42584 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42587 /* This routine is the driver of the dispatch scheduler. */
42589 static void
42590 do_dispatch (rtx insn, int mode)
42592 if (mode == DISPATCH_INIT)
42593 init_dispatch_sched ();
42594 else if (mode == ADD_TO_DISPATCH_WINDOW)
42595 add_to_dispatch_window (insn);
42598 /* Return TRUE if Dispatch Scheduling is supported. */
42600 static bool
42601 has_dispatch (rtx insn, int action)
42603 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42604 && flag_dispatch_scheduler)
42605 switch (action)
42607 default:
42608 return false;
42610 case IS_DISPATCH_ON:
42611 return true;
42612 break;
42614 case IS_CMP:
42615 return is_cmp (insn);
42617 case DISPATCH_VIOLATION:
42618 return dispatch_violation ();
42620 case FITS_DISPATCH_WINDOW:
42621 return fits_dispatch_window (insn);
42624 return false;
42627 /* Implementation of the reassociation_width target hook, used by the
42628 reassoc pass to identify the level of parallelism in a reassociated
42629 tree. The statement's tree_code is passed in OPC. The type of its
42630 arguments is passed in MODE.
42632 Currently parallel reassociation is enabled only for Atom
42633 processors and we set the reassociation width to 2
42634 because Atom may issue up to 2 instructions per cycle.
42636 The return value should be adjusted if parallel reassociation is
42637 enabled for other processors. */
42639 static int
42640 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42641 enum machine_mode mode)
42643 int res = 1;
42645 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42646 res = 2;
42647 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42648 res = 2;
42650 return res;
42653 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42654 place emms and femms instructions. */
42656 static enum machine_mode
42657 ix86_preferred_simd_mode (enum machine_mode mode)
42659 if (!TARGET_SSE)
42660 return word_mode;
42662 switch (mode)
42663 {
42664 case QImode:
42665 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42666 case HImode:
42667 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42668 case SImode:
42669 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42670 case DImode:
42671 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42673 case SFmode:
42674 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42675 return V8SFmode;
42676 else
42677 return V4SFmode;
42679 case DFmode:
42680 if (!TARGET_VECTORIZE_DOUBLE)
42681 return word_mode;
42682 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42683 return V4DFmode;
42684 else if (TARGET_SSE2)
42685 return V2DFmode;
42686 /* FALLTHRU */
42688 default:
42689 return word_mode;
42690 }
42691 }
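/* For illustration: when the vectorizer considers the single-precision loop
   below, it asks this hook for SFmode's preferred vector mode and, per the
   code above, is given V8SFmode on AVX targets (unless -mprefer-avx128 is in
   effect) and V4SFmode on plain SSE targets.  A user-level sketch; the
   function name is illustrative only.  */

void
saxpy (float *x, float *y, float a, int n)
{
  int i;

  for (i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}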
42693 /* If AVX is enabled, then try vectorizing with both 256-bit and 128-bit
42694 vectors. */
42696 static unsigned int
42697 ix86_autovectorize_vector_sizes (void)
42698 {
42699 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42700 }
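/* The value returned above is a bit mask of candidate vector sizes in bytes:
   32 | 16 asks the vectorizer to try 256-bit and then 128-bit vectors, while
   0 means only the preferred SIMD mode is tried.  A small sketch of decoding
   such a mask; the helper name is illustrative only.  */

static unsigned int
largest_vector_size (unsigned int sizes_mask)
{
  unsigned int size;

  for (size = 32; size >= 16; size /= 2)
    if (sizes_mask & size)
      return size;
  return 0;
}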
42704 /* Return class of registers which could be used for pseudo of MODE
42705 and of class RCLASS for spilling instead of memory. Return NO_REGS
42706 if it is not possible or not profitable. */
42707 static reg_class_t
42708 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42709 {
42710 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42711 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42712 && INTEGER_CLASS_P (rclass))
42713 return ALL_SSE_REGS;
42714 return NO_REGS;
42715 }
42717 /* Implement targetm.vectorize.init_cost. */
42719 static void *
42720 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42721 {
42722 unsigned *cost = XNEWVEC (unsigned, 3);
42723 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42724 return cost;
42725 }
42727 /* Implement targetm.vectorize.add_stmt_cost. */
42729 static unsigned
42730 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42731 struct _stmt_vec_info *stmt_info, int misalign,
42732 enum vect_cost_model_location where)
42733 {
42734 unsigned *cost = (unsigned *) data;
42735 unsigned retval = 0;
42737 if (flag_vect_cost_model)
42738 {
42739 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42740 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42742 /* Statements in an inner loop relative to the loop being
42743 vectorized are weighted more heavily. The value here is
42744 arbitrary and could potentially be improved with analysis. */
42745 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42746 count *= 50; /* FIXME. */
42748 retval = (unsigned) (count * stmt_cost);
42749 cost[where] += retval;
42750 }
42752 return retval;
42753 }
42755 /* Implement targetm.vectorize.finish_cost. */
42757 static void
42758 ix86_finish_cost (void *data, unsigned *prologue_cost,
42759 unsigned *body_cost, unsigned *epilogue_cost)
42760 {
42761 unsigned *cost = (unsigned *) data;
42762 *prologue_cost = cost[vect_prologue];
42763 *body_cost = cost[vect_body];
42764 *epilogue_cost = cost[vect_epilogue];
42765 }
42767 /* Implement targetm.vectorize.destroy_cost_data. */
42769 static void
42770 ix86_destroy_cost_data (void *data)
42771 {
42772 free (data);
42773 }
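/* A self-contained sketch of how the four hooks above cooperate: init_cost
   hands back a three-entry accumulator, add_stmt_cost charges each statement
   to the prologue, body or epilogue entry (weighting inner-loop body
   statements by the same factor of 50 used above), finish_cost reads the
   totals back out, and destroy_cost_data frees the accumulator.  The names
   below are local to the sketch, not part of the vectorizer API.  */

enum sketch_cost_entry { SKETCH_PROLOGUE, SKETCH_BODY, SKETCH_EPILOGUE };

static unsigned
sketch_add_stmt_cost (unsigned cost[3], int count, int stmt_cost,
		      enum sketch_cost_entry entry, int in_inner_loop)
{
  unsigned retval;

  if (entry == SKETCH_BODY && in_inner_loop)
    count *= 50;		/* mirror the arbitrary inner-loop weighting */
  retval = (unsigned) (count * stmt_cost);
  cost[entry] += retval;
  return retval;
}

static void
sketch_finish_cost (const unsigned cost[3], unsigned *prologue_cost,
		    unsigned *body_cost, unsigned *epilogue_cost)
{
  *prologue_cost = cost[SKETCH_PROLOGUE];
  *body_cost = cost[SKETCH_BODY];
  *epilogue_cost = cost[SKETCH_EPILOGUE];
}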
42775 /* Validate target specific memory model bits in VAL. */
42777 static unsigned HOST_WIDE_INT
42778 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42779 {
42780 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42781 bool strong;
42783 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42784 |MEMMODEL_MASK)
42785 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42786 {
42787 warning (OPT_Winvalid_memory_model,
42788 "Unknown architecture specific memory model");
42789 return MEMMODEL_SEQ_CST;
42790 }
42791 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42792 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42793 {
42794 warning (OPT_Winvalid_memory_model,
42795 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42796 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42797 }
42798 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42799 {
42800 warning (OPT_Winvalid_memory_model,
42801 "HLE_RELEASE not used with RELEASE or stronger memory model");
42802 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42803 }
42804 return val;
42805 }
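/* The IX86_HLE_ACQUIRE and IX86_HLE_RELEASE bits validated above correspond
   to the __ATOMIC_HLE_ACQUIRE and __ATOMIC_HLE_RELEASE flags that a user can
   OR into the memory model argument of the __atomic built-ins.  A minimal
   user-level sketch of an elided spin lock, assuming an x86 target built
   with -mhle; the variable and function names are illustrative only.  */

static volatile int hle_lock;

void
hle_spin_acquire (void)
{
  /* Spin until the exchange observes 0; with HLE the lock write may be
     elided and the critical section executed transactionally.  */
  while (__atomic_exchange_n (&hle_lock, 1,
			      __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    ;
}

void
hle_spin_release (void)
{
  __atomic_store_n (&hle_lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}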
42807 /* Initialize the GCC target structure. */
42808 #undef TARGET_RETURN_IN_MEMORY
42809 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42811 #undef TARGET_LEGITIMIZE_ADDRESS
42812 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42814 #undef TARGET_ATTRIBUTE_TABLE
42815 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42816 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
42817 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
42818 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42819 # undef TARGET_MERGE_DECL_ATTRIBUTES
42820 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42821 #endif
42823 #undef TARGET_COMP_TYPE_ATTRIBUTES
42824 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42826 #undef TARGET_INIT_BUILTINS
42827 #define TARGET_INIT_BUILTINS ix86_init_builtins
42828 #undef TARGET_BUILTIN_DECL
42829 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42830 #undef TARGET_EXPAND_BUILTIN
42831 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42833 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42834 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42835 ix86_builtin_vectorized_function
42837 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42838 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42840 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42841 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42843 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42844 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42846 #undef TARGET_BUILTIN_RECIPROCAL
42847 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42849 #undef TARGET_ASM_FUNCTION_EPILOGUE
42850 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42852 #undef TARGET_ENCODE_SECTION_INFO
42853 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42854 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42855 #else
42856 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42857 #endif
42859 #undef TARGET_ASM_OPEN_PAREN
42860 #define TARGET_ASM_OPEN_PAREN ""
42861 #undef TARGET_ASM_CLOSE_PAREN
42862 #define TARGET_ASM_CLOSE_PAREN ""
42864 #undef TARGET_ASM_BYTE_OP
42865 #define TARGET_ASM_BYTE_OP ASM_BYTE
42867 #undef TARGET_ASM_ALIGNED_HI_OP
42868 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42869 #undef TARGET_ASM_ALIGNED_SI_OP
42870 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42871 #ifdef ASM_QUAD
42872 #undef TARGET_ASM_ALIGNED_DI_OP
42873 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42874 #endif
42876 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42877 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42879 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42880 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42882 #undef TARGET_ASM_UNALIGNED_HI_OP
42883 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42884 #undef TARGET_ASM_UNALIGNED_SI_OP
42885 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42886 #undef TARGET_ASM_UNALIGNED_DI_OP
42887 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42889 #undef TARGET_PRINT_OPERAND
42890 #define TARGET_PRINT_OPERAND ix86_print_operand
42891 #undef TARGET_PRINT_OPERAND_ADDRESS
42892 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42893 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42894 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42895 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42896 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42898 #undef TARGET_SCHED_INIT_GLOBAL
42899 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42900 #undef TARGET_SCHED_ADJUST_COST
42901 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42902 #undef TARGET_SCHED_ISSUE_RATE
42903 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42904 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42905 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42906 ia32_multipass_dfa_lookahead
42908 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42909 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42911 #undef TARGET_MEMMODEL_CHECK
42912 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42914 #ifdef HAVE_AS_TLS
42915 #undef TARGET_HAVE_TLS
42916 #define TARGET_HAVE_TLS true
42917 #endif
42918 #undef TARGET_CANNOT_FORCE_CONST_MEM
42919 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42920 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42921 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42923 #undef TARGET_DELEGITIMIZE_ADDRESS
42924 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42926 #undef TARGET_MS_BITFIELD_LAYOUT_P
42927 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42929 #if TARGET_MACHO
42930 #undef TARGET_BINDS_LOCAL_P
42931 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42932 #endif
42933 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42934 #undef TARGET_BINDS_LOCAL_P
42935 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42936 #endif
42938 #undef TARGET_ASM_OUTPUT_MI_THUNK
42939 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42940 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42941 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42943 #undef TARGET_ASM_FILE_START
42944 #define TARGET_ASM_FILE_START x86_file_start
42946 #undef TARGET_OPTION_OVERRIDE
42947 #define TARGET_OPTION_OVERRIDE ix86_option_override
42949 #undef TARGET_REGISTER_MOVE_COST
42950 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42951 #undef TARGET_MEMORY_MOVE_COST
42952 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42953 #undef TARGET_RTX_COSTS
42954 #define TARGET_RTX_COSTS ix86_rtx_costs
42955 #undef TARGET_ADDRESS_COST
42956 #define TARGET_ADDRESS_COST ix86_address_cost
42958 #undef TARGET_FIXED_CONDITION_CODE_REGS
42959 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42960 #undef TARGET_CC_MODES_COMPATIBLE
42961 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42963 #undef TARGET_MACHINE_DEPENDENT_REORG
42964 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42966 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42967 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42969 #undef TARGET_BUILD_BUILTIN_VA_LIST
42970 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42972 #undef TARGET_FOLD_BUILTIN
42973 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42975 #undef TARGET_COMPARE_VERSION_PRIORITY
42976 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42978 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42979 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42980 ix86_generate_version_dispatcher_body
42982 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42983 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42984 ix86_get_function_versions_dispatcher
42986 #undef TARGET_ENUM_VA_LIST_P
42987 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42989 #undef TARGET_FN_ABI_VA_LIST
42990 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42992 #undef TARGET_CANONICAL_VA_LIST_TYPE
42993 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42995 #undef TARGET_EXPAND_BUILTIN_VA_START
42996 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42998 #undef TARGET_MD_ASM_CLOBBERS
42999 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43001 #undef TARGET_PROMOTE_PROTOTYPES
43002 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43003 #undef TARGET_STRUCT_VALUE_RTX
43004 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
43005 #undef TARGET_SETUP_INCOMING_VARARGS
43006 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
43007 #undef TARGET_MUST_PASS_IN_STACK
43008 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
43009 #undef TARGET_FUNCTION_ARG_ADVANCE
43010 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
43011 #undef TARGET_FUNCTION_ARG
43012 #define TARGET_FUNCTION_ARG ix86_function_arg
43013 #undef TARGET_FUNCTION_ARG_BOUNDARY
43014 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
43015 #undef TARGET_PASS_BY_REFERENCE
43016 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
43017 #undef TARGET_INTERNAL_ARG_POINTER
43018 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
43019 #undef TARGET_UPDATE_STACK_BOUNDARY
43020 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
43021 #undef TARGET_GET_DRAP_RTX
43022 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
43023 #undef TARGET_STRICT_ARGUMENT_NAMING
43024 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
43025 #undef TARGET_STATIC_CHAIN
43026 #define TARGET_STATIC_CHAIN ix86_static_chain
43027 #undef TARGET_TRAMPOLINE_INIT
43028 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
43029 #undef TARGET_RETURN_POPS_ARGS
43030 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
43032 #undef TARGET_LEGITIMATE_COMBINED_INSN
43033 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
43035 #undef TARGET_ASAN_SHADOW_OFFSET
43036 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
43038 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
43039 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
43041 #undef TARGET_SCALAR_MODE_SUPPORTED_P
43042 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
43044 #undef TARGET_VECTOR_MODE_SUPPORTED_P
43045 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
43047 #undef TARGET_C_MODE_FOR_SUFFIX
43048 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
43050 #ifdef HAVE_AS_TLS
43051 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
43052 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
43053 #endif
43055 #ifdef SUBTARGET_INSERT_ATTRIBUTES
43056 #undef TARGET_INSERT_ATTRIBUTES
43057 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
43058 #endif
43060 #undef TARGET_MANGLE_TYPE
43061 #define TARGET_MANGLE_TYPE ix86_mangle_type
43063 #if !TARGET_MACHO
43064 #undef TARGET_STACK_PROTECT_FAIL
43065 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
43066 #endif
43068 #undef TARGET_FUNCTION_VALUE
43069 #define TARGET_FUNCTION_VALUE ix86_function_value
43071 #undef TARGET_FUNCTION_VALUE_REGNO_P
43072 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
43074 #undef TARGET_PROMOTE_FUNCTION_MODE
43075 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
43077 #undef TARGET_MEMBER_TYPE_FORCES_BLK
43078 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
43080 #undef TARGET_INSTANTIATE_DECLS
43081 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
43083 #undef TARGET_SECONDARY_RELOAD
43084 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
43086 #undef TARGET_CLASS_MAX_NREGS
43087 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
43089 #undef TARGET_PREFERRED_RELOAD_CLASS
43090 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
43091 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
43092 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
43093 #undef TARGET_CLASS_LIKELY_SPILLED_P
43094 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
43096 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
43097 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
43098 ix86_builtin_vectorization_cost
43099 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
43100 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
43101 ix86_vectorize_vec_perm_const_ok
43102 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
43103 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
43104 ix86_preferred_simd_mode
43105 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
43106 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
43107 ix86_autovectorize_vector_sizes
43108 #undef TARGET_VECTORIZE_INIT_COST
43109 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
43110 #undef TARGET_VECTORIZE_ADD_STMT_COST
43111 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
43112 #undef TARGET_VECTORIZE_FINISH_COST
43113 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
43114 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
43115 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
43117 #undef TARGET_SET_CURRENT_FUNCTION
43118 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
43120 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
43121 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
43123 #undef TARGET_OPTION_SAVE
43124 #define TARGET_OPTION_SAVE ix86_function_specific_save
43126 #undef TARGET_OPTION_RESTORE
43127 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
43129 #undef TARGET_OPTION_PRINT
43130 #define TARGET_OPTION_PRINT ix86_function_specific_print
43132 #undef TARGET_OPTION_FUNCTION_VERSIONS
43133 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
43135 #undef TARGET_CAN_INLINE_P
43136 #define TARGET_CAN_INLINE_P ix86_can_inline_p
43138 #undef TARGET_EXPAND_TO_RTL_HOOK
43139 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
43141 #undef TARGET_LEGITIMATE_ADDRESS_P
43142 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
43144 #undef TARGET_LRA_P
43145 #define TARGET_LRA_P hook_bool_void_true
43147 #undef TARGET_REGISTER_PRIORITY
43148 #define TARGET_REGISTER_PRIORITY ix86_register_priority
43150 #undef TARGET_REGISTER_USAGE_LEVELING_P
43151 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
43153 #undef TARGET_LEGITIMATE_CONSTANT_P
43154 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
43156 #undef TARGET_FRAME_POINTER_REQUIRED
43157 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
43159 #undef TARGET_CAN_ELIMINATE
43160 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
43162 #undef TARGET_EXTRA_LIVE_ON_ENTRY
43163 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
43165 #undef TARGET_ASM_CODE_END
43166 #define TARGET_ASM_CODE_END ix86_code_end
43168 #undef TARGET_CONDITIONAL_REGISTER_USAGE
43169 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
43171 #if TARGET_MACHO
43172 #undef TARGET_INIT_LIBFUNCS
43173 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
43174 #endif
43176 #undef TARGET_SPILL_CLASS
43177 #define TARGET_SPILL_CLASS ix86_spill_class
43179 struct gcc_target targetm = TARGET_INITIALIZER;
43181 #include "gt-i386.h"