gcc/config/i386/i386.c  ([official-gcc.git])
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "context.h"
65 #include "pass_manager.h"
67 static rtx legitimize_dllimport_symbol (rtx, bool);
68 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
69 static rtx legitimize_pe_coff_symbol (rtx, bool);
71 #ifndef CHECK_STACK_LIMIT
72 #define CHECK_STACK_LIMIT (-1)
73 #endif
75 /* Return index of given mode in mult and division cost tables. */
76 #define MODE_INDEX(mode) \
77 ((mode) == QImode ? 0 \
78 : (mode) == HImode ? 1 \
79 : (mode) == SImode ? 2 \
80 : (mode) == DImode ? 3 \
81 : 4)
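/* Illustrative sketch (not part of the original file): how MODE_INDEX is
   used.  MODE_INDEX (QImode) == 0, MODE_INDEX (SImode) == 2, and any mode
   other than QI/HI/SI/DImode lands in the trailing "other" slot 4, so the
   macro selects an entry in the 5-element multiply/divide cost arrays
   initialized below.  The field name mult_init here is an assumption about
   struct processor_costs; the lookup pattern is the point.  */
#if 0
static int
example_mult_start_cost (enum machine_mode mode,
                         const struct processor_costs *cost)
{
  /* Index the per-mode multiply-start cost table by mode class.  */
  return cost->mult_init[MODE_INDEX (mode)];  /* field name assumed */
}
#endif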
83 /* Processor costs (relative to an add) */
84 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
85 #define COSTS_N_BYTES(N) ((N) * 2)
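/* A worked example of the size-cost scale assumed above: COSTS_N_INSNS (1)
   is 4 and an add is 2 bytes, so COSTS_N_BYTES (2) == 4 makes a 2-byte add
   cost exactly one "insn" unit, and COSTS_N_BYTES (3) == 6 charges a 3-byte
   instruction 1.5 times an add.  This keeps the size-tuning (-Os) tables
   below on the same numeric scale as the speed tables that use
   COSTS_N_INSNS.  */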
87 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
89 static stringop_algs ix86_size_memcpy[2] = {
90 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
92 static stringop_algs ix86_size_memset[2] = {
93 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
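/* How to read these tables (a hedged summary inferred from the
   initializers rather than stated here): each inner entry is
   {max_size, algorithm, noalign}, with max_size == -1 meaning "every
   remaining size", and the first algorithm named in each initializer is
   the one used when the length is not known at compile time.  So when
   tuning for size, both memcpy and memset are expanded as
   "rep movsb"/"rep stosb" no matter the length.  DUMMY_STRINGOP_ALGS above
   is the placeholder for the second, 64-bit slot of tunings that never run
   in 64-bit mode.  */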
96 const
97 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
98 COSTS_N_BYTES (2), /* cost of an add instruction */
99 COSTS_N_BYTES (3), /* cost of a lea instruction */
100 COSTS_N_BYTES (2), /* variable shift costs */
101 COSTS_N_BYTES (3), /* constant shift costs */
102 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
103 COSTS_N_BYTES (3), /* HI */
104 COSTS_N_BYTES (3), /* SI */
105 COSTS_N_BYTES (3), /* DI */
106 COSTS_N_BYTES (5)}, /* other */
107 0, /* cost of multiply per each bit set */
108 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
109 COSTS_N_BYTES (3), /* HI */
110 COSTS_N_BYTES (3), /* SI */
111 COSTS_N_BYTES (3), /* DI */
112 COSTS_N_BYTES (5)}, /* other */
113 COSTS_N_BYTES (3), /* cost of movsx */
114 COSTS_N_BYTES (3), /* cost of movzx */
115 0, /* "large" insn */
116 2, /* MOVE_RATIO */
117 2, /* cost for loading QImode using movzbl */
118 {2, 2, 2}, /* cost of loading integer registers
119 in QImode, HImode and SImode.
120 Relative to reg-reg move (2). */
121 {2, 2, 2}, /* cost of storing integer registers */
122 2, /* cost of reg,reg fld/fst */
123 {2, 2, 2}, /* cost of loading fp registers
124 in SFmode, DFmode and XFmode */
125 {2, 2, 2}, /* cost of storing fp registers
126 in SFmode, DFmode and XFmode */
127 3, /* cost of moving MMX register */
128 {3, 3}, /* cost of loading MMX registers
129 in SImode and DImode */
130 {3, 3}, /* cost of storing MMX registers
131 in SImode and DImode */
132 3, /* cost of moving SSE register */
133 {3, 3, 3}, /* cost of loading SSE registers
134 in SImode, DImode and TImode */
135 {3, 3, 3}, /* cost of storing SSE registers
136 in SImode, DImode and TImode */
137 3, /* MMX or SSE register to integer */
138 0, /* size of l1 cache */
139 0, /* size of l2 cache */
140 0, /* size of prefetch block */
141 0, /* number of parallel prefetches */
142 2, /* Branch cost */
143 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
144 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
145 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
146 COSTS_N_BYTES (2), /* cost of FABS instruction. */
147 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
148 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
149 ix86_size_memcpy,
150 ix86_size_memset,
151 1, /* scalar_stmt_cost. */
152 1, /* scalar load_cost. */
153 1, /* scalar_store_cost. */
154 1, /* vec_stmt_cost. */
155 1, /* vec_to_scalar_cost. */
156 1, /* scalar_to_vec_cost. */
157 1, /* vec_align_load_cost. */
158 1, /* vec_unalign_load_cost. */
159 1, /* vec_store_cost. */
160 1, /* cond_taken_branch_cost. */
 161 1, /* cond_not_taken_branch_cost. */
 162 };
164 /* Processor costs (relative to an add) */
165 static stringop_algs i386_memcpy[2] = {
166 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
167 DUMMY_STRINGOP_ALGS};
168 static stringop_algs i386_memset[2] = {
169 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
170 DUMMY_STRINGOP_ALGS};
172 static const
173 struct processor_costs i386_cost = { /* 386 specific costs */
174 COSTS_N_INSNS (1), /* cost of an add instruction */
175 COSTS_N_INSNS (1), /* cost of a lea instruction */
176 COSTS_N_INSNS (3), /* variable shift costs */
177 COSTS_N_INSNS (2), /* constant shift costs */
178 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
179 COSTS_N_INSNS (6), /* HI */
180 COSTS_N_INSNS (6), /* SI */
181 COSTS_N_INSNS (6), /* DI */
182 COSTS_N_INSNS (6)}, /* other */
183 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
184 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
185 COSTS_N_INSNS (23), /* HI */
186 COSTS_N_INSNS (23), /* SI */
187 COSTS_N_INSNS (23), /* DI */
188 COSTS_N_INSNS (23)}, /* other */
189 COSTS_N_INSNS (3), /* cost of movsx */
190 COSTS_N_INSNS (2), /* cost of movzx */
191 15, /* "large" insn */
192 3, /* MOVE_RATIO */
193 4, /* cost for loading QImode using movzbl */
194 {2, 4, 2}, /* cost of loading integer registers
195 in QImode, HImode and SImode.
196 Relative to reg-reg move (2). */
197 {2, 4, 2}, /* cost of storing integer registers */
198 2, /* cost of reg,reg fld/fst */
199 {8, 8, 8}, /* cost of loading fp registers
200 in SFmode, DFmode and XFmode */
201 {8, 8, 8}, /* cost of storing fp registers
202 in SFmode, DFmode and XFmode */
203 2, /* cost of moving MMX register */
204 {4, 8}, /* cost of loading MMX registers
205 in SImode and DImode */
206 {4, 8}, /* cost of storing MMX registers
207 in SImode and DImode */
208 2, /* cost of moving SSE register */
209 {4, 8, 16}, /* cost of loading SSE registers
210 in SImode, DImode and TImode */
211 {4, 8, 16}, /* cost of storing SSE registers
212 in SImode, DImode and TImode */
213 3, /* MMX or SSE register to integer */
214 0, /* size of l1 cache */
215 0, /* size of l2 cache */
216 0, /* size of prefetch block */
217 0, /* number of parallel prefetches */
218 1, /* Branch cost */
219 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
220 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
221 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
222 COSTS_N_INSNS (22), /* cost of FABS instruction. */
223 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
224 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
225 i386_memcpy,
226 i386_memset,
227 1, /* scalar_stmt_cost. */
228 1, /* scalar load_cost. */
229 1, /* scalar_store_cost. */
230 1, /* vec_stmt_cost. */
231 1, /* vec_to_scalar_cost. */
232 1, /* scalar_to_vec_cost. */
233 1, /* vec_align_load_cost. */
234 2, /* vec_unalign_load_cost. */
235 1, /* vec_store_cost. */
236 3, /* cond_taken_branch_cost. */
 237 1, /* cond_not_taken_branch_cost. */
 238 };
240 static stringop_algs i486_memcpy[2] = {
241 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
242 DUMMY_STRINGOP_ALGS};
243 static stringop_algs i486_memset[2] = {
244 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
245 DUMMY_STRINGOP_ALGS};
247 static const
248 struct processor_costs i486_cost = { /* 486 specific costs */
249 COSTS_N_INSNS (1), /* cost of an add instruction */
250 COSTS_N_INSNS (1), /* cost of a lea instruction */
251 COSTS_N_INSNS (3), /* variable shift costs */
252 COSTS_N_INSNS (2), /* constant shift costs */
253 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
254 COSTS_N_INSNS (12), /* HI */
255 COSTS_N_INSNS (12), /* SI */
256 COSTS_N_INSNS (12), /* DI */
257 COSTS_N_INSNS (12)}, /* other */
258 1, /* cost of multiply per each bit set */
259 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
260 COSTS_N_INSNS (40), /* HI */
261 COSTS_N_INSNS (40), /* SI */
262 COSTS_N_INSNS (40), /* DI */
263 COSTS_N_INSNS (40)}, /* other */
264 COSTS_N_INSNS (3), /* cost of movsx */
265 COSTS_N_INSNS (2), /* cost of movzx */
266 15, /* "large" insn */
267 3, /* MOVE_RATIO */
268 4, /* cost for loading QImode using movzbl */
269 {2, 4, 2}, /* cost of loading integer registers
270 in QImode, HImode and SImode.
271 Relative to reg-reg move (2). */
272 {2, 4, 2}, /* cost of storing integer registers */
273 2, /* cost of reg,reg fld/fst */
274 {8, 8, 8}, /* cost of loading fp registers
275 in SFmode, DFmode and XFmode */
276 {8, 8, 8}, /* cost of storing fp registers
277 in SFmode, DFmode and XFmode */
278 2, /* cost of moving MMX register */
279 {4, 8}, /* cost of loading MMX registers
280 in SImode and DImode */
281 {4, 8}, /* cost of storing MMX registers
282 in SImode and DImode */
283 2, /* cost of moving SSE register */
284 {4, 8, 16}, /* cost of loading SSE registers
285 in SImode, DImode and TImode */
286 {4, 8, 16}, /* cost of storing SSE registers
287 in SImode, DImode and TImode */
288 3, /* MMX or SSE register to integer */
289 4, /* size of l1 cache. 486 has 8kB cache
290 shared for code and data, so 4kB is
291 not really precise. */
292 4, /* size of l2 cache */
293 0, /* size of prefetch block */
294 0, /* number of parallel prefetches */
295 1, /* Branch cost */
296 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
297 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
298 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
299 COSTS_N_INSNS (3), /* cost of FABS instruction. */
300 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
301 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
302 i486_memcpy,
303 i486_memset,
304 1, /* scalar_stmt_cost. */
305 1, /* scalar load_cost. */
306 1, /* scalar_store_cost. */
307 1, /* vec_stmt_cost. */
308 1, /* vec_to_scalar_cost. */
309 1, /* scalar_to_vec_cost. */
310 1, /* vec_align_load_cost. */
311 2, /* vec_unalign_load_cost. */
312 1, /* vec_store_cost. */
313 3, /* cond_taken_branch_cost. */
 314 1, /* cond_not_taken_branch_cost. */
 315 };
317 static stringop_algs pentium_memcpy[2] = {
318 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
319 DUMMY_STRINGOP_ALGS};
320 static stringop_algs pentium_memset[2] = {
321 {libcall, {{-1, rep_prefix_4_byte, false}}},
322 DUMMY_STRINGOP_ALGS};
324 static const
325 struct processor_costs pentium_cost = {
326 COSTS_N_INSNS (1), /* cost of an add instruction */
327 COSTS_N_INSNS (1), /* cost of a lea instruction */
328 COSTS_N_INSNS (4), /* variable shift costs */
329 COSTS_N_INSNS (1), /* constant shift costs */
330 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
331 COSTS_N_INSNS (11), /* HI */
332 COSTS_N_INSNS (11), /* SI */
333 COSTS_N_INSNS (11), /* DI */
334 COSTS_N_INSNS (11)}, /* other */
335 0, /* cost of multiply per each bit set */
336 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
337 COSTS_N_INSNS (25), /* HI */
338 COSTS_N_INSNS (25), /* SI */
339 COSTS_N_INSNS (25), /* DI */
340 COSTS_N_INSNS (25)}, /* other */
341 COSTS_N_INSNS (3), /* cost of movsx */
342 COSTS_N_INSNS (2), /* cost of movzx */
343 8, /* "large" insn */
344 6, /* MOVE_RATIO */
345 6, /* cost for loading QImode using movzbl */
346 {2, 4, 2}, /* cost of loading integer registers
347 in QImode, HImode and SImode.
348 Relative to reg-reg move (2). */
349 {2, 4, 2}, /* cost of storing integer registers */
350 2, /* cost of reg,reg fld/fst */
351 {2, 2, 6}, /* cost of loading fp registers
352 in SFmode, DFmode and XFmode */
353 {4, 4, 6}, /* cost of storing fp registers
354 in SFmode, DFmode and XFmode */
355 8, /* cost of moving MMX register */
356 {8, 8}, /* cost of loading MMX registers
357 in SImode and DImode */
358 {8, 8}, /* cost of storing MMX registers
359 in SImode and DImode */
360 2, /* cost of moving SSE register */
361 {4, 8, 16}, /* cost of loading SSE registers
362 in SImode, DImode and TImode */
363 {4, 8, 16}, /* cost of storing SSE registers
364 in SImode, DImode and TImode */
365 3, /* MMX or SSE register to integer */
366 8, /* size of l1 cache. */
367 8, /* size of l2 cache */
368 0, /* size of prefetch block */
369 0, /* number of parallel prefetches */
370 2, /* Branch cost */
371 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
372 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
373 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
374 COSTS_N_INSNS (1), /* cost of FABS instruction. */
375 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
376 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
377 pentium_memcpy,
378 pentium_memset,
379 1, /* scalar_stmt_cost. */
380 1, /* scalar load_cost. */
381 1, /* scalar_store_cost. */
382 1, /* vec_stmt_cost. */
383 1, /* vec_to_scalar_cost. */
384 1, /* scalar_to_vec_cost. */
385 1, /* vec_align_load_cost. */
386 2, /* vec_unalign_load_cost. */
387 1, /* vec_store_cost. */
388 3, /* cond_taken_branch_cost. */
 389 1, /* cond_not_taken_branch_cost. */
 390 };
392 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 393 (we ensure the alignment). For small blocks an inline loop is still a
 394 noticeable win, for bigger blocks either rep movsl or rep movsb is the
 395 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
396 but after 4K the difference is down in the noise. */
397 static stringop_algs pentiumpro_memcpy[2] = {
398 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
399 {8192, rep_prefix_4_byte, false},
400 {-1, rep_prefix_1_byte, false}}},
401 DUMMY_STRINGOP_ALGS};
402 static stringop_algs pentiumpro_memset[2] = {
403 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
404 {8192, rep_prefix_4_byte, false},
405 {-1, libcall, false}}},
406 DUMMY_STRINGOP_ALGS};
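/* Reading pentiumpro_memcpy against the comment above: known sizes up to
   128 bytes use an inline loop, up to 1024 bytes an unrolled loop, up to
   8192 bytes "rep movsl", and anything larger "rep movsb"; when the size is
   unknown at compile time the leading rep_prefix_4_byte entry ("rep movsl")
   is used.  pentiumpro_memset is similar but skips the small inline-loop
   tier and ends in a libcall for large blocks.  */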
407 static const
408 struct processor_costs pentiumpro_cost = {
409 COSTS_N_INSNS (1), /* cost of an add instruction */
410 COSTS_N_INSNS (1), /* cost of a lea instruction */
411 COSTS_N_INSNS (1), /* variable shift costs */
412 COSTS_N_INSNS (1), /* constant shift costs */
413 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
414 COSTS_N_INSNS (4), /* HI */
415 COSTS_N_INSNS (4), /* SI */
416 COSTS_N_INSNS (4), /* DI */
417 COSTS_N_INSNS (4)}, /* other */
418 0, /* cost of multiply per each bit set */
419 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
420 COSTS_N_INSNS (17), /* HI */
421 COSTS_N_INSNS (17), /* SI */
422 COSTS_N_INSNS (17), /* DI */
423 COSTS_N_INSNS (17)}, /* other */
424 COSTS_N_INSNS (1), /* cost of movsx */
425 COSTS_N_INSNS (1), /* cost of movzx */
426 8, /* "large" insn */
427 6, /* MOVE_RATIO */
428 2, /* cost for loading QImode using movzbl */
429 {4, 4, 4}, /* cost of loading integer registers
430 in QImode, HImode and SImode.
431 Relative to reg-reg move (2). */
432 {2, 2, 2}, /* cost of storing integer registers */
433 2, /* cost of reg,reg fld/fst */
434 {2, 2, 6}, /* cost of loading fp registers
435 in SFmode, DFmode and XFmode */
436 {4, 4, 6}, /* cost of storing fp registers
437 in SFmode, DFmode and XFmode */
438 2, /* cost of moving MMX register */
439 {2, 2}, /* cost of loading MMX registers
440 in SImode and DImode */
441 {2, 2}, /* cost of storing MMX registers
442 in SImode and DImode */
443 2, /* cost of moving SSE register */
444 {2, 2, 8}, /* cost of loading SSE registers
445 in SImode, DImode and TImode */
446 {2, 2, 8}, /* cost of storing SSE registers
447 in SImode, DImode and TImode */
448 3, /* MMX or SSE register to integer */
449 8, /* size of l1 cache. */
450 256, /* size of l2 cache */
451 32, /* size of prefetch block */
452 6, /* number of parallel prefetches */
453 2, /* Branch cost */
454 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
455 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
456 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
457 COSTS_N_INSNS (2), /* cost of FABS instruction. */
458 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
459 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
460 pentiumpro_memcpy,
461 pentiumpro_memset,
462 1, /* scalar_stmt_cost. */
463 1, /* scalar load_cost. */
464 1, /* scalar_store_cost. */
465 1, /* vec_stmt_cost. */
466 1, /* vec_to_scalar_cost. */
467 1, /* scalar_to_vec_cost. */
468 1, /* vec_align_load_cost. */
469 2, /* vec_unalign_load_cost. */
470 1, /* vec_store_cost. */
471 3, /* cond_taken_branch_cost. */
 472 1, /* cond_not_taken_branch_cost. */
 473 };
475 static stringop_algs geode_memcpy[2] = {
476 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
477 DUMMY_STRINGOP_ALGS};
478 static stringop_algs geode_memset[2] = {
479 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
480 DUMMY_STRINGOP_ALGS};
481 static const
482 struct processor_costs geode_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (1), /* cost of a lea instruction */
485 COSTS_N_INSNS (2), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (4), /* HI */
489 COSTS_N_INSNS (7), /* SI */
490 COSTS_N_INSNS (7), /* DI */
491 COSTS_N_INSNS (7)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (23), /* HI */
495 COSTS_N_INSNS (39), /* SI */
496 COSTS_N_INSNS (39), /* DI */
497 COSTS_N_INSNS (39)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
501 4, /* MOVE_RATIO */
502 1, /* cost for loading QImode using movzbl */
503 {1, 1, 1}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {1, 1, 1}, /* cost of storing integer registers */
507 1, /* cost of reg,reg fld/fst */
508 {1, 1, 1}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {4, 6, 6}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
513 1, /* cost of moving MMX register */
514 {1, 1}, /* cost of loading MMX registers
515 in SImode and DImode */
516 {1, 1}, /* cost of storing MMX registers
517 in SImode and DImode */
518 1, /* cost of moving SSE register */
519 {1, 1, 1}, /* cost of loading SSE registers
520 in SImode, DImode and TImode */
521 {1, 1, 1}, /* cost of storing SSE registers
522 in SImode, DImode and TImode */
523 1, /* MMX or SSE register to integer */
524 64, /* size of l1 cache. */
525 128, /* size of l2 cache. */
526 32, /* size of prefetch block */
527 1, /* number of parallel prefetches */
528 1, /* Branch cost */
529 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
530 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
531 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
532 COSTS_N_INSNS (1), /* cost of FABS instruction. */
533 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
534 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
535 geode_memcpy,
536 geode_memset,
537 1, /* scalar_stmt_cost. */
538 1, /* scalar load_cost. */
539 1, /* scalar_store_cost. */
540 1, /* vec_stmt_cost. */
541 1, /* vec_to_scalar_cost. */
542 1, /* scalar_to_vec_cost. */
543 1, /* vec_align_load_cost. */
544 2, /* vec_unalign_load_cost. */
545 1, /* vec_store_cost. */
546 3, /* cond_taken_branch_cost. */
 547 1, /* cond_not_taken_branch_cost. */
 548 };
550 static stringop_algs k6_memcpy[2] = {
551 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
552 DUMMY_STRINGOP_ALGS};
553 static stringop_algs k6_memset[2] = {
554 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
555 DUMMY_STRINGOP_ALGS};
556 static const
557 struct processor_costs k6_cost = {
558 COSTS_N_INSNS (1), /* cost of an add instruction */
559 COSTS_N_INSNS (2), /* cost of a lea instruction */
560 COSTS_N_INSNS (1), /* variable shift costs */
561 COSTS_N_INSNS (1), /* constant shift costs */
562 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
563 COSTS_N_INSNS (3), /* HI */
564 COSTS_N_INSNS (3), /* SI */
565 COSTS_N_INSNS (3), /* DI */
566 COSTS_N_INSNS (3)}, /* other */
567 0, /* cost of multiply per each bit set */
568 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
569 COSTS_N_INSNS (18), /* HI */
570 COSTS_N_INSNS (18), /* SI */
571 COSTS_N_INSNS (18), /* DI */
572 COSTS_N_INSNS (18)}, /* other */
573 COSTS_N_INSNS (2), /* cost of movsx */
574 COSTS_N_INSNS (2), /* cost of movzx */
575 8, /* "large" insn */
576 4, /* MOVE_RATIO */
577 3, /* cost for loading QImode using movzbl */
578 {4, 5, 4}, /* cost of loading integer registers
579 in QImode, HImode and SImode.
580 Relative to reg-reg move (2). */
581 {2, 3, 2}, /* cost of storing integer registers */
582 4, /* cost of reg,reg fld/fst */
583 {6, 6, 6}, /* cost of loading fp registers
584 in SFmode, DFmode and XFmode */
585 {4, 4, 4}, /* cost of storing fp registers
586 in SFmode, DFmode and XFmode */
587 2, /* cost of moving MMX register */
588 {2, 2}, /* cost of loading MMX registers
589 in SImode and DImode */
590 {2, 2}, /* cost of storing MMX registers
591 in SImode and DImode */
592 2, /* cost of moving SSE register */
593 {2, 2, 8}, /* cost of loading SSE registers
594 in SImode, DImode and TImode */
595 {2, 2, 8}, /* cost of storing SSE registers
596 in SImode, DImode and TImode */
597 6, /* MMX or SSE register to integer */
598 32, /* size of l1 cache. */
599 32, /* size of l2 cache. Some models
600 have integrated l2 cache, but
601 optimizing for k6 is not important
602 enough to worry about that. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (2), /* cost of FABS instruction. */
610 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
612 k6_memcpy,
613 k6_memset,
614 1, /* scalar_stmt_cost. */
615 1, /* scalar load_cost. */
616 1, /* scalar_store_cost. */
617 1, /* vec_stmt_cost. */
618 1, /* vec_to_scalar_cost. */
619 1, /* scalar_to_vec_cost. */
620 1, /* vec_align_load_cost. */
621 2, /* vec_unalign_load_cost. */
622 1, /* vec_store_cost. */
623 3, /* cond_taken_branch_cost. */
 624 1, /* cond_not_taken_branch_cost. */
 625 };
 627 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
628 compared to K8. Alignment becomes important after 8 bytes for memcpy and
629 128 bytes for memset. */
630 static stringop_algs athlon_memcpy[2] = {
631 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633 static stringop_algs athlon_memset[2] = {
634 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
636 static const
637 struct processor_costs athlon_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (5), /* HI */
644 COSTS_N_INSNS (5), /* SI */
645 COSTS_N_INSNS (5), /* DI */
646 COSTS_N_INSNS (5)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (26), /* HI */
650 COSTS_N_INSNS (42), /* SI */
651 COSTS_N_INSNS (74), /* DI */
652 COSTS_N_INSNS (74)}, /* other */
653 COSTS_N_INSNS (1), /* cost of movsx */
654 COSTS_N_INSNS (1), /* cost of movzx */
655 8, /* "large" insn */
656 9, /* MOVE_RATIO */
657 4, /* cost for loading QImode using movzbl */
658 {3, 4, 3}, /* cost of loading integer registers
659 in QImode, HImode and SImode.
660 Relative to reg-reg move (2). */
661 {3, 4, 3}, /* cost of storing integer registers */
662 4, /* cost of reg,reg fld/fst */
663 {4, 4, 12}, /* cost of loading fp registers
664 in SFmode, DFmode and XFmode */
665 {6, 6, 8}, /* cost of storing fp registers
666 in SFmode, DFmode and XFmode */
667 2, /* cost of moving MMX register */
668 {4, 4}, /* cost of loading MMX registers
669 in SImode and DImode */
670 {4, 4}, /* cost of storing MMX registers
671 in SImode and DImode */
672 2, /* cost of moving SSE register */
673 {4, 4, 6}, /* cost of loading SSE registers
674 in SImode, DImode and TImode */
675 {4, 4, 5}, /* cost of storing SSE registers
676 in SImode, DImode and TImode */
677 5, /* MMX or SSE register to integer */
678 64, /* size of l1 cache. */
679 256, /* size of l2 cache. */
680 64, /* size of prefetch block */
681 6, /* number of parallel prefetches */
682 5, /* Branch cost */
683 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
684 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
685 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
686 COSTS_N_INSNS (2), /* cost of FABS instruction. */
687 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
688 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
689 athlon_memcpy,
690 athlon_memset,
691 1, /* scalar_stmt_cost. */
692 1, /* scalar load_cost. */
693 1, /* scalar_store_cost. */
694 1, /* vec_stmt_cost. */
695 1, /* vec_to_scalar_cost. */
696 1, /* scalar_to_vec_cost. */
697 1, /* vec_align_load_cost. */
698 2, /* vec_unalign_load_cost. */
699 1, /* vec_store_cost. */
700 3, /* cond_taken_branch_cost. */
 701 1, /* cond_not_taken_branch_cost. */
 702 };
 704 /* K8 has an optimized REP instruction for medium sized blocks, but for very
 705 small blocks it is better to use a loop. For large blocks, libcall can
 706 do nontemporal accesses and beat inline considerably. */
707 static stringop_algs k8_memcpy[2] = {
708 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
709 {-1, rep_prefix_4_byte, false}}},
710 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
711 {-1, libcall, false}}}};
712 static stringop_algs k8_memset[2] = {
713 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
714 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
715 {libcall, {{48, unrolled_loop, false},
716 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
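/* An assumed reading of the two rows (based on how these two-element arrays
   are indexed when 64-bit code generation is selected): the first
   initializer is the 32-bit tuning, the second the 64-bit one.  In 64-bit
   mode, for example, a known-size memcpy of up to 16 bytes becomes an
   inline loop, up to 8192 bytes "rep movsq", and larger copies go to the
   memcpy libcall, which can use nontemporal stores as the comment above
   notes.  */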
717 static const
718 struct processor_costs k8_cost = {
719 COSTS_N_INSNS (1), /* cost of an add instruction */
720 COSTS_N_INSNS (2), /* cost of a lea instruction */
721 COSTS_N_INSNS (1), /* variable shift costs */
722 COSTS_N_INSNS (1), /* constant shift costs */
723 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
724 COSTS_N_INSNS (4), /* HI */
725 COSTS_N_INSNS (3), /* SI */
726 COSTS_N_INSNS (4), /* DI */
727 COSTS_N_INSNS (5)}, /* other */
728 0, /* cost of multiply per each bit set */
729 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
730 COSTS_N_INSNS (26), /* HI */
731 COSTS_N_INSNS (42), /* SI */
732 COSTS_N_INSNS (74), /* DI */
733 COSTS_N_INSNS (74)}, /* other */
734 COSTS_N_INSNS (1), /* cost of movsx */
735 COSTS_N_INSNS (1), /* cost of movzx */
736 8, /* "large" insn */
737 9, /* MOVE_RATIO */
738 4, /* cost for loading QImode using movzbl */
739 {3, 4, 3}, /* cost of loading integer registers
740 in QImode, HImode and SImode.
741 Relative to reg-reg move (2). */
742 {3, 4, 3}, /* cost of storing integer registers */
743 4, /* cost of reg,reg fld/fst */
744 {4, 4, 12}, /* cost of loading fp registers
745 in SFmode, DFmode and XFmode */
746 {6, 6, 8}, /* cost of storing fp registers
747 in SFmode, DFmode and XFmode */
748 2, /* cost of moving MMX register */
749 {3, 3}, /* cost of loading MMX registers
750 in SImode and DImode */
751 {4, 4}, /* cost of storing MMX registers
752 in SImode and DImode */
753 2, /* cost of moving SSE register */
754 {4, 3, 6}, /* cost of loading SSE registers
755 in SImode, DImode and TImode */
756 {4, 4, 5}, /* cost of storing SSE registers
757 in SImode, DImode and TImode */
758 5, /* MMX or SSE register to integer */
759 64, /* size of l1 cache. */
760 512, /* size of l2 cache. */
761 64, /* size of prefetch block */
762 /* New AMD processors never drop prefetches; if they cannot be performed
763 immediately, they are queued. We set number of simultaneous prefetches
764 to a large constant to reflect this (it probably is not a good idea not
765 to limit number of prefetches at all, as their execution also takes some
766 time). */
767 100, /* number of parallel prefetches */
768 3, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
776 k8_memcpy,
777 k8_memset,
778 4, /* scalar_stmt_cost. */
779 2, /* scalar load_cost. */
780 2, /* scalar_store_cost. */
781 5, /* vec_stmt_cost. */
782 0, /* vec_to_scalar_cost. */
783 2, /* scalar_to_vec_cost. */
784 2, /* vec_align_load_cost. */
785 3, /* vec_unalign_load_cost. */
786 3, /* vec_store_cost. */
787 3, /* cond_taken_branch_cost. */
 788 2, /* cond_not_taken_branch_cost. */
 789 };
 791 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
 792 very small blocks it is better to use a loop. For large blocks, libcall can
 793 do nontemporal accesses and beat inline considerably. */
794 static stringop_algs amdfam10_memcpy[2] = {
795 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
796 {-1, rep_prefix_4_byte, false}}},
797 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
798 {-1, libcall, false}}}};
799 static stringop_algs amdfam10_memset[2] = {
800 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
801 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
802 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
803 {-1, libcall, false}}}};
804 struct processor_costs amdfam10_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (35), /* HI */
817 COSTS_N_INSNS (51), /* SI */
818 COSTS_N_INSNS (83), /* DI */
819 COSTS_N_INSNS (83)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 4, 3}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 3, /* MMX or SSE register to integer */
845 /* On K8:
846 MOVD reg64, xmmreg Double FSTORE 4
847 MOVD reg32, xmmreg Double FSTORE 4
848 On AMDFAM10:
849 MOVD reg64, xmmreg Double FADD 3
850 1/1 1/1
851 MOVD reg32, xmmreg Double FADD 3
852 1/1 1/1 */
853 64, /* size of l1 cache. */
854 512, /* size of l2 cache. */
855 64, /* size of prefetch block */
856 /* New AMD processors never drop prefetches; if they cannot be performed
857 immediately, they are queued. We set number of simultaneous prefetches
858 to a large constant to reflect this (it probably is not a good idea not
859 to limit number of prefetches at all, as their execution also takes some
860 time). */
861 100, /* number of parallel prefetches */
862 2, /* Branch cost */
863 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
864 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
865 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
866 COSTS_N_INSNS (2), /* cost of FABS instruction. */
867 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
868 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
870 amdfam10_memcpy,
871 amdfam10_memset,
872 4, /* scalar_stmt_cost. */
873 2, /* scalar load_cost. */
874 2, /* scalar_store_cost. */
875 6, /* vec_stmt_cost. */
876 0, /* vec_to_scalar_cost. */
877 2, /* scalar_to_vec_cost. */
878 2, /* vec_align_load_cost. */
879 2, /* vec_unalign_load_cost. */
880 2, /* vec_store_cost. */
881 2, /* cond_taken_branch_cost. */
 882 1, /* cond_not_taken_branch_cost. */
 883 };
 885 /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
 886 very small blocks it is better to use a loop. For large blocks, libcall
 887 can do nontemporal accesses and beat inline considerably. */
888 static stringop_algs bdver1_memcpy[2] = {
889 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
890 {-1, rep_prefix_4_byte, false}}},
891 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
892 {-1, libcall, false}}}};
893 static stringop_algs bdver1_memset[2] = {
894 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
895 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
896 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
899 const struct processor_costs bdver1_cost = {
900 COSTS_N_INSNS (1), /* cost of an add instruction */
901 COSTS_N_INSNS (1), /* cost of a lea instruction */
902 COSTS_N_INSNS (1), /* variable shift costs */
903 COSTS_N_INSNS (1), /* constant shift costs */
904 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
905 COSTS_N_INSNS (4), /* HI */
906 COSTS_N_INSNS (4), /* SI */
907 COSTS_N_INSNS (6), /* DI */
908 COSTS_N_INSNS (6)}, /* other */
909 0, /* cost of multiply per each bit set */
910 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
911 COSTS_N_INSNS (35), /* HI */
912 COSTS_N_INSNS (51), /* SI */
913 COSTS_N_INSNS (83), /* DI */
914 COSTS_N_INSNS (83)}, /* other */
915 COSTS_N_INSNS (1), /* cost of movsx */
916 COSTS_N_INSNS (1), /* cost of movzx */
917 8, /* "large" insn */
918 9, /* MOVE_RATIO */
919 4, /* cost for loading QImode using movzbl */
920 {5, 5, 4}, /* cost of loading integer registers
921 in QImode, HImode and SImode.
922 Relative to reg-reg move (2). */
923 {4, 4, 4}, /* cost of storing integer registers */
924 2, /* cost of reg,reg fld/fst */
925 {5, 5, 12}, /* cost of loading fp registers
926 in SFmode, DFmode and XFmode */
927 {4, 4, 8}, /* cost of storing fp registers
928 in SFmode, DFmode and XFmode */
929 2, /* cost of moving MMX register */
930 {4, 4}, /* cost of loading MMX registers
931 in SImode and DImode */
932 {4, 4}, /* cost of storing MMX registers
933 in SImode and DImode */
934 2, /* cost of moving SSE register */
935 {4, 4, 4}, /* cost of loading SSE registers
936 in SImode, DImode and TImode */
937 {4, 4, 4}, /* cost of storing SSE registers
938 in SImode, DImode and TImode */
939 2, /* MMX or SSE register to integer */
940 /* On K8:
941 MOVD reg64, xmmreg Double FSTORE 4
942 MOVD reg32, xmmreg Double FSTORE 4
943 On AMDFAM10:
944 MOVD reg64, xmmreg Double FADD 3
945 1/1 1/1
946 MOVD reg32, xmmreg Double FADD 3
947 1/1 1/1 */
948 16, /* size of l1 cache. */
949 2048, /* size of l2 cache. */
950 64, /* size of prefetch block */
951 /* New AMD processors never drop prefetches; if they cannot be performed
952 immediately, they are queued. We set number of simultaneous prefetches
953 to a large constant to reflect this (it probably is not a good idea not
954 to limit number of prefetches at all, as their execution also takes some
955 time). */
956 100, /* number of parallel prefetches */
957 2, /* Branch cost */
958 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
959 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
960 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
961 COSTS_N_INSNS (2), /* cost of FABS instruction. */
962 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
963 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
965 bdver1_memcpy,
966 bdver1_memset,
967 6, /* scalar_stmt_cost. */
968 4, /* scalar load_cost. */
969 4, /* scalar_store_cost. */
970 6, /* vec_stmt_cost. */
971 0, /* vec_to_scalar_cost. */
972 2, /* scalar_to_vec_cost. */
973 4, /* vec_align_load_cost. */
974 4, /* vec_unalign_load_cost. */
975 4, /* vec_store_cost. */
976 2, /* cond_taken_branch_cost. */
 977 1, /* cond_not_taken_branch_cost. */
 978 };
 980 /* BDVER2 has an optimized REP instruction for medium sized blocks, but for
 981 very small blocks it is better to use a loop. For large blocks, libcall
 982 can do nontemporal accesses and beat inline considerably. */
984 static stringop_algs bdver2_memcpy[2] = {
985 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
986 {-1, rep_prefix_4_byte, false}}},
987 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
989 static stringop_algs bdver2_memset[2] = {
990 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
991 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
992 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
993 {-1, libcall, false}}}};
995 const struct processor_costs bdver2_cost = {
996 COSTS_N_INSNS (1), /* cost of an add instruction */
997 COSTS_N_INSNS (1), /* cost of a lea instruction */
998 COSTS_N_INSNS (1), /* variable shift costs */
999 COSTS_N_INSNS (1), /* constant shift costs */
1000 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1001 COSTS_N_INSNS (4), /* HI */
1002 COSTS_N_INSNS (4), /* SI */
1003 COSTS_N_INSNS (6), /* DI */
1004 COSTS_N_INSNS (6)}, /* other */
1005 0, /* cost of multiply per each bit set */
1006 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1007 COSTS_N_INSNS (35), /* HI */
1008 COSTS_N_INSNS (51), /* SI */
1009 COSTS_N_INSNS (83), /* DI */
1010 COSTS_N_INSNS (83)}, /* other */
1011 COSTS_N_INSNS (1), /* cost of movsx */
1012 COSTS_N_INSNS (1), /* cost of movzx */
1013 8, /* "large" insn */
1014 9, /* MOVE_RATIO */
1015 4, /* cost for loading QImode using movzbl */
1016 {5, 5, 4}, /* cost of loading integer registers
1017 in QImode, HImode and SImode.
1018 Relative to reg-reg move (2). */
1019 {4, 4, 4}, /* cost of storing integer registers */
1020 2, /* cost of reg,reg fld/fst */
1021 {5, 5, 12}, /* cost of loading fp registers
1022 in SFmode, DFmode and XFmode */
1023 {4, 4, 8}, /* cost of storing fp registers
1024 in SFmode, DFmode and XFmode */
1025 2, /* cost of moving MMX register */
1026 {4, 4}, /* cost of loading MMX registers
1027 in SImode and DImode */
1028 {4, 4}, /* cost of storing MMX registers
1029 in SImode and DImode */
1030 2, /* cost of moving SSE register */
1031 {4, 4, 4}, /* cost of loading SSE registers
1032 in SImode, DImode and TImode */
1033 {4, 4, 4}, /* cost of storing SSE registers
1034 in SImode, DImode and TImode */
1035 2, /* MMX or SSE register to integer */
1036 /* On K8:
1037 MOVD reg64, xmmreg Double FSTORE 4
1038 MOVD reg32, xmmreg Double FSTORE 4
1039 On AMDFAM10:
1040 MOVD reg64, xmmreg Double FADD 3
1041 1/1 1/1
1042 MOVD reg32, xmmreg Double FADD 3
1043 1/1 1/1 */
1044 16, /* size of l1 cache. */
1045 2048, /* size of l2 cache. */
1046 64, /* size of prefetch block */
1047 /* New AMD processors never drop prefetches; if they cannot be performed
1048 immediately, they are queued. We set number of simultaneous prefetches
1049 to a large constant to reflect this (it probably is not a good idea not
1050 to limit number of prefetches at all, as their execution also takes some
1051 time). */
1052 100, /* number of parallel prefetches */
1053 2, /* Branch cost */
1054 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1055 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1056 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1057 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1058 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1059 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1061 bdver2_memcpy,
1062 bdver2_memset,
1063 6, /* scalar_stmt_cost. */
1064 4, /* scalar load_cost. */
1065 4, /* scalar_store_cost. */
1066 6, /* vec_stmt_cost. */
1067 0, /* vec_to_scalar_cost. */
1068 2, /* scalar_to_vec_cost. */
1069 4, /* vec_align_load_cost. */
1070 4, /* vec_unalign_load_cost. */
1071 4, /* vec_store_cost. */
1072 2, /* cond_taken_branch_cost. */
 1073 1, /* cond_not_taken_branch_cost. */
 1074 };
 1077 /* BDVER3 has an optimized REP instruction for medium sized blocks, but for
 1078 very small blocks it is better to use a loop. For large blocks, libcall
 1079 can do nontemporal accesses and beat inline considerably. */
1080 static stringop_algs bdver3_memcpy[2] = {
1081 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1082 {-1, rep_prefix_4_byte, false}}},
1083 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1085 static stringop_algs bdver3_memset[2] = {
1086 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1087 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1088 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1089 {-1, libcall, false}}}};
1090 struct processor_costs bdver3_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (1), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (4), /* SI */
1098 COSTS_N_INSNS (6), /* DI */
1099 COSTS_N_INSNS (6)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (35), /* HI */
1103 COSTS_N_INSNS (51), /* SI */
1104 COSTS_N_INSNS (83), /* DI */
1105 COSTS_N_INSNS (83)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {5, 5, 4}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {4, 4, 4}, /* cost of storing integer registers */
1115 2, /* cost of reg,reg fld/fst */
1116 {5, 5, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {4, 4, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {4, 4}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 4, 4}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 4}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 2, /* MMX or SSE register to integer */
1131 16, /* size of l1 cache. */
1132 2048, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 2, /* Branch cost */
1141 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1148 bdver3_memcpy,
1149 bdver3_memset,
1150 6, /* scalar_stmt_cost. */
1151 4, /* scalar load_cost. */
1152 4, /* scalar_store_cost. */
1153 6, /* vec_stmt_cost. */
1154 0, /* vec_to_scalar_cost. */
1155 2, /* scalar_to_vec_cost. */
1156 4, /* vec_align_load_cost. */
1157 4, /* vec_unalign_load_cost. */
1158 4, /* vec_store_cost. */
1159 2, /* cond_taken_branch_cost. */
 1160 1, /* cond_not_taken_branch_cost. */
 1161 };
 1163 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
 1164 very small blocks it is better to use a loop. For large blocks, libcall can
 1165 do nontemporal accesses and beat inline considerably. */
1166 static stringop_algs btver1_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs btver1_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 const struct processor_costs btver1_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (2), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (3), /* SI */
1184 COSTS_N_INSNS (4), /* DI */
1185 COSTS_N_INSNS (5)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {3, 4, 3}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {3, 4, 3}, /* cost of storing integer registers */
1201 4, /* cost of reg,reg fld/fst */
1202 {4, 4, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {6, 6, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {3, 3}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 3}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 5}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 3, /* MMX or SSE register to integer */
1217 /* On K8:
1218 MOVD reg64, xmmreg Double FSTORE 4
1219 MOVD reg32, xmmreg Double FSTORE 4
1220 On AMDFAM10:
1221 MOVD reg64, xmmreg Double FADD 3
1222 1/1 1/1
1223 MOVD reg32, xmmreg Double FADD 3
1224 1/1 1/1 */
1225 32, /* size of l1 cache. */
1226 512, /* size of l2 cache. */
1227 64, /* size of prefetch block */
1228 100, /* number of parallel prefetches */
1229 2, /* Branch cost */
1230 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1231 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1232 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1233 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1234 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1235 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1237 btver1_memcpy,
1238 btver1_memset,
1239 4, /* scalar_stmt_cost. */
1240 2, /* scalar load_cost. */
1241 2, /* scalar_store_cost. */
1242 6, /* vec_stmt_cost. */
1243 0, /* vec_to_scalar_cost. */
1244 2, /* scalar_to_vec_cost. */
1245 2, /* vec_align_load_cost. */
1246 2, /* vec_unalign_load_cost. */
1247 2, /* vec_store_cost. */
1248 2, /* cond_taken_branch_cost. */
 1249 1, /* cond_not_taken_branch_cost. */
 1250 };
1252 static stringop_algs btver2_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs btver2_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 const struct processor_costs btver2_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (2), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (3), /* SI */
1270 COSTS_N_INSNS (4), /* DI */
1271 COSTS_N_INSNS (5)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {3, 4, 3}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {3, 4, 3}, /* cost of storing integer registers */
1287 4, /* cost of reg,reg fld/fst */
1288 {4, 4, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {6, 6, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {3, 3}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 3}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 5}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 3, /* MMX or SSE register to integer */
1303 /* On K8:
1304 MOVD reg64, xmmreg Double FSTORE 4
1305 MOVD reg32, xmmreg Double FSTORE 4
1306 On AMDFAM10:
1307 MOVD reg64, xmmreg Double FADD 3
1308 1/1 1/1
1309 MOVD reg32, xmmreg Double FADD 3
1310 1/1 1/1 */
1311 32, /* size of l1 cache. */
1312 2048, /* size of l2 cache. */
1313 64, /* size of prefetch block */
1314 100, /* number of parallel prefetches */
1315 2, /* Branch cost */
1316 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1317 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1318 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1319 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1320 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1321 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1322 btver2_memcpy,
1323 btver2_memset,
1324 4, /* scalar_stmt_cost. */
1325 2, /* scalar load_cost. */
1326 2, /* scalar_store_cost. */
1327 6, /* vec_stmt_cost. */
1328 0, /* vec_to_scalar_cost. */
1329 2, /* scalar_to_vec_cost. */
1330 2, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 2, /* vec_store_cost. */
1333 2, /* cond_taken_branch_cost. */
 1334 1, /* cond_not_taken_branch_cost. */
 1335 };
1337 static stringop_algs pentium4_memcpy[2] = {
1338 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1339 DUMMY_STRINGOP_ALGS};
1340 static stringop_algs pentium4_memset[2] = {
1341 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1342 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1343 DUMMY_STRINGOP_ALGS};
1345 static const
1346 struct processor_costs pentium4_cost = {
1347 COSTS_N_INSNS (1), /* cost of an add instruction */
1348 COSTS_N_INSNS (3), /* cost of a lea instruction */
1349 COSTS_N_INSNS (4), /* variable shift costs */
1350 COSTS_N_INSNS (4), /* constant shift costs */
1351 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1352 COSTS_N_INSNS (15), /* HI */
1353 COSTS_N_INSNS (15), /* SI */
1354 COSTS_N_INSNS (15), /* DI */
1355 COSTS_N_INSNS (15)}, /* other */
1356 0, /* cost of multiply per each bit set */
1357 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1358 COSTS_N_INSNS (56), /* HI */
1359 COSTS_N_INSNS (56), /* SI */
1360 COSTS_N_INSNS (56), /* DI */
1361 COSTS_N_INSNS (56)}, /* other */
1362 COSTS_N_INSNS (1), /* cost of movsx */
1363 COSTS_N_INSNS (1), /* cost of movzx */
1364 16, /* "large" insn */
1365 6, /* MOVE_RATIO */
1366 2, /* cost for loading QImode using movzbl */
1367 {4, 5, 4}, /* cost of loading integer registers
1368 in QImode, HImode and SImode.
1369 Relative to reg-reg move (2). */
1370 {2, 3, 2}, /* cost of storing integer registers */
1371 2, /* cost of reg,reg fld/fst */
1372 {2, 2, 6}, /* cost of loading fp registers
1373 in SFmode, DFmode and XFmode */
1374 {4, 4, 6}, /* cost of storing fp registers
1375 in SFmode, DFmode and XFmode */
1376 2, /* cost of moving MMX register */
1377 {2, 2}, /* cost of loading MMX registers
1378 in SImode and DImode */
1379 {2, 2}, /* cost of storing MMX registers
1380 in SImode and DImode */
1381 12, /* cost of moving SSE register */
1382 {12, 12, 12}, /* cost of loading SSE registers
1383 in SImode, DImode and TImode */
1384 {2, 2, 8}, /* cost of storing SSE registers
1385 in SImode, DImode and TImode */
1386 10, /* MMX or SSE register to integer */
1387 8, /* size of l1 cache. */
1388 256, /* size of l2 cache. */
1389 64, /* size of prefetch block */
1390 6, /* number of parallel prefetches */
1391 2, /* Branch cost */
1392 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1393 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1394 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1395 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1396 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1397 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1398 pentium4_memcpy,
1399 pentium4_memset,
1400 1, /* scalar_stmt_cost. */
1401 1, /* scalar load_cost. */
1402 1, /* scalar_store_cost. */
1403 1, /* vec_stmt_cost. */
1404 1, /* vec_to_scalar_cost. */
1405 1, /* scalar_to_vec_cost. */
1406 1, /* vec_align_load_cost. */
1407 2, /* vec_unalign_load_cost. */
1408 1, /* vec_store_cost. */
1409 3, /* cond_taken_branch_cost. */
 1410 1, /* cond_not_taken_branch_cost. */
 1411 };
1413 static stringop_algs nocona_memcpy[2] = {
1414 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1415 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1416 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1418 static stringop_algs nocona_memset[2] = {
1419 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1420 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1421 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1422 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1424 static const
1425 struct processor_costs nocona_cost = {
1426 COSTS_N_INSNS (1), /* cost of an add instruction */
1427 COSTS_N_INSNS (1), /* cost of a lea instruction */
1428 COSTS_N_INSNS (1), /* variable shift costs */
1429 COSTS_N_INSNS (1), /* constant shift costs */
1430 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1431 COSTS_N_INSNS (10), /* HI */
1432 COSTS_N_INSNS (10), /* SI */
1433 COSTS_N_INSNS (10), /* DI */
1434 COSTS_N_INSNS (10)}, /* other */
1435 0, /* cost of multiply per each bit set */
1436 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1437 COSTS_N_INSNS (66), /* HI */
1438 COSTS_N_INSNS (66), /* SI */
1439 COSTS_N_INSNS (66), /* DI */
1440 COSTS_N_INSNS (66)}, /* other */
1441 COSTS_N_INSNS (1), /* cost of movsx */
1442 COSTS_N_INSNS (1), /* cost of movzx */
1443 16, /* "large" insn */
1444 17, /* MOVE_RATIO */
1445 4, /* cost for loading QImode using movzbl */
1446 {4, 4, 4}, /* cost of loading integer registers
1447 in QImode, HImode and SImode.
1448 Relative to reg-reg move (2). */
1449 {4, 4, 4}, /* cost of storing integer registers */
1450 3, /* cost of reg,reg fld/fst */
1451 {12, 12, 12}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode */
1453 {4, 4, 4}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode */
1455 6, /* cost of moving MMX register */
1456 {12, 12}, /* cost of loading MMX registers
1457 in SImode and DImode */
1458 {12, 12}, /* cost of storing MMX registers
1459 in SImode and DImode */
1460 6, /* cost of moving SSE register */
1461 {12, 12, 12}, /* cost of loading SSE registers
1462 in SImode, DImode and TImode */
1463 {12, 12, 12}, /* cost of storing SSE registers
1464 in SImode, DImode and TImode */
1465 8, /* MMX or SSE register to integer */
1466 8, /* size of l1 cache. */
1467 1024, /* size of l2 cache. */
1468 128, /* size of prefetch block */
1469 8, /* number of parallel prefetches */
1470 1, /* Branch cost */
1471 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1472 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1473 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1474 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1475 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1476 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1477 nocona_memcpy,
1478 nocona_memset,
1479 1, /* scalar_stmt_cost. */
1480 1, /* scalar load_cost. */
1481 1, /* scalar_store_cost. */
1482 1, /* vec_stmt_cost. */
1483 1, /* vec_to_scalar_cost. */
1484 1, /* scalar_to_vec_cost. */
1485 1, /* vec_align_load_cost. */
1486 2, /* vec_unalign_load_cost. */
1487 1, /* vec_store_cost. */
1488 3, /* cond_taken_branch_cost. */
1489 1, /* cond_not_taken_branch_cost. */
1492 static stringop_algs atom_memcpy[2] = {
1493 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1494 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1495 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1496 static stringop_algs atom_memset[2] = {
1497 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1498 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1499 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1500 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1501 static const
1502 struct processor_costs atom_cost = {
1503 COSTS_N_INSNS (1), /* cost of an add instruction */
1504 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1505 COSTS_N_INSNS (1), /* variable shift costs */
1506 COSTS_N_INSNS (1), /* constant shift costs */
1507 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1508 COSTS_N_INSNS (4), /* HI */
1509 COSTS_N_INSNS (3), /* SI */
1510 COSTS_N_INSNS (4), /* DI */
1511 COSTS_N_INSNS (2)}, /* other */
1512 0, /* cost of multiply per each bit set */
1513 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1514 COSTS_N_INSNS (26), /* HI */
1515 COSTS_N_INSNS (42), /* SI */
1516 COSTS_N_INSNS (74), /* DI */
1517 COSTS_N_INSNS (74)}, /* other */
1518 COSTS_N_INSNS (1), /* cost of movsx */
1519 COSTS_N_INSNS (1), /* cost of movzx */
1520 8, /* "large" insn */
1521 17, /* MOVE_RATIO */
1522 4, /* cost for loading QImode using movzbl */
1523 {4, 4, 4}, /* cost of loading integer registers
1524 in QImode, HImode and SImode.
1525 Relative to reg-reg move (2). */
1526 {4, 4, 4}, /* cost of storing integer registers */
1527 4, /* cost of reg,reg fld/fst */
1528 {12, 12, 12}, /* cost of loading fp registers
1529 in SFmode, DFmode and XFmode */
1530 {6, 6, 8}, /* cost of storing fp registers
1531 in SFmode, DFmode and XFmode */
1532 2, /* cost of moving MMX register */
1533 {8, 8}, /* cost of loading MMX registers
1534 in SImode and DImode */
1535 {8, 8}, /* cost of storing MMX registers
1536 in SImode and DImode */
1537 2, /* cost of moving SSE register */
1538 {8, 8, 8}, /* cost of loading SSE registers
1539 in SImode, DImode and TImode */
1540 {8, 8, 8}, /* cost of storing SSE registers
1541 in SImode, DImode and TImode */
1542 5, /* MMX or SSE register to integer */
1543 32, /* size of l1 cache. */
1544 256, /* size of l2 cache. */
1545 64, /* size of prefetch block */
1546 6, /* number of parallel prefetches */
1547 3, /* Branch cost */
1548 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1549 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1550 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1551 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1552 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1553 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1554 atom_memcpy,
1555 atom_memset,
1556 1, /* scalar_stmt_cost. */
1557 1, /* scalar load_cost. */
1558 1, /* scalar_store_cost. */
1559 1, /* vec_stmt_cost. */
1560 1, /* vec_to_scalar_cost. */
1561 1, /* scalar_to_vec_cost. */
1562 1, /* vec_align_load_cost. */
1563 2, /* vec_unalign_load_cost. */
1564 1, /* vec_store_cost. */
1565 3, /* cond_taken_branch_cost. */
1566 1, /* cond_not_taken_branch_cost. */
1569 static stringop_algs slm_memcpy[2] = {
1570 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1571 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1572 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1573 static stringop_algs slm_memset[2] = {
1574 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1575 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1576 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1577 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1578 static const
1579 struct processor_costs slm_cost = {
1580 COSTS_N_INSNS (1), /* cost of an add instruction */
1581 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1582 COSTS_N_INSNS (1), /* variable shift costs */
1583 COSTS_N_INSNS (1), /* constant shift costs */
1584 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1585 COSTS_N_INSNS (4), /* HI */
1586 COSTS_N_INSNS (3), /* SI */
1587 COSTS_N_INSNS (4), /* DI */
1588 COSTS_N_INSNS (2)}, /* other */
1589 0, /* cost of multiply per each bit set */
1590 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1591 COSTS_N_INSNS (26), /* HI */
1592 COSTS_N_INSNS (42), /* SI */
1593 COSTS_N_INSNS (74), /* DI */
1594 COSTS_N_INSNS (74)}, /* other */
1595 COSTS_N_INSNS (1), /* cost of movsx */
1596 COSTS_N_INSNS (1), /* cost of movzx */
1597 8, /* "large" insn */
1598 17, /* MOVE_RATIO */
1599 4, /* cost for loading QImode using movzbl */
1600 {4, 4, 4}, /* cost of loading integer registers
1601 in QImode, HImode and SImode.
1602 Relative to reg-reg move (2). */
1603 {4, 4, 4}, /* cost of storing integer registers */
1604 4, /* cost of reg,reg fld/fst */
1605 {12, 12, 12}, /* cost of loading fp registers
1606 in SFmode, DFmode and XFmode */
1607 {6, 6, 8}, /* cost of storing fp registers
1608 in SFmode, DFmode and XFmode */
1609 2, /* cost of moving MMX register */
1610 {8, 8}, /* cost of loading MMX registers
1611 in SImode and DImode */
1612 {8, 8}, /* cost of storing MMX registers
1613 in SImode and DImode */
1614 2, /* cost of moving SSE register */
1615 {8, 8, 8}, /* cost of loading SSE registers
1616 in SImode, DImode and TImode */
1617 {8, 8, 8}, /* cost of storing SSE registers
1618 in SImode, DImode and TImode */
1619 5, /* MMX or SSE register to integer */
1620 32, /* size of l1 cache. */
1621 256, /* size of l2 cache. */
1622 64, /* size of prefetch block */
1623 6, /* number of parallel prefetches */
1624 3, /* Branch cost */
1625 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1626 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1627 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1628 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1629 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1630 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1631 slm_memcpy,
1632 slm_memset,
1633 1, /* scalar_stmt_cost. */
1634 1, /* scalar load_cost. */
1635 1, /* scalar_store_cost. */
1636 1, /* vec_stmt_cost. */
1637 1, /* vec_to_scalar_cost. */
1638 1, /* scalar_to_vec_cost. */
1639 1, /* vec_align_load_cost. */
1640 2, /* vec_unalign_load_cost. */
1641 1, /* vec_store_cost. */
1642 3, /* cond_taken_branch_cost. */
1643 1, /* cond_not_taken_branch_cost. */
1646 /* Generic should produce code tuned for Core-i7 (and newer chips)
1647 and btver1 (and newer chips). */
1649 static stringop_algs generic_memcpy[2] = {
1650 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1651 {-1, libcall, false}}},
1652 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1653 {-1, libcall, false}}}};
1654 static stringop_algs generic_memset[2] = {
1655 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1656 {-1, libcall, false}}},
1657 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1658 {-1, libcall, false}}}};
1659 static const
1660 struct processor_costs generic_cost = {
1661 COSTS_N_INSNS (1), /* cost of an add instruction */
1662 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1663 this cost, however, our current implementation of synth_mult results in
1664 the use of unnecessary temporary registers, causing regressions on several
1665 SPECfp benchmarks. */
1666 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1667 COSTS_N_INSNS (1), /* variable shift costs */
1668 COSTS_N_INSNS (1), /* constant shift costs */
1669 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1670 COSTS_N_INSNS (4), /* HI */
1671 COSTS_N_INSNS (3), /* SI */
1672 COSTS_N_INSNS (4), /* DI */
1673 COSTS_N_INSNS (2)}, /* other */
1674 0, /* cost of multiply per each bit set */
1675 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1676 COSTS_N_INSNS (26), /* HI */
1677 COSTS_N_INSNS (42), /* SI */
1678 COSTS_N_INSNS (74), /* DI */
1679 COSTS_N_INSNS (74)}, /* other */
1680 COSTS_N_INSNS (1), /* cost of movsx */
1681 COSTS_N_INSNS (1), /* cost of movzx */
1682 8, /* "large" insn */
1683 17, /* MOVE_RATIO */
1684 4, /* cost for loading QImode using movzbl */
1685 {4, 4, 4}, /* cost of loading integer registers
1686 in QImode, HImode and SImode.
1687 Relative to reg-reg move (2). */
1688 {4, 4, 4}, /* cost of storing integer registers */
1689 4, /* cost of reg,reg fld/fst */
1690 {12, 12, 12}, /* cost of loading fp registers
1691 in SFmode, DFmode and XFmode */
1692 {6, 6, 8}, /* cost of storing fp registers
1693 in SFmode, DFmode and XFmode */
1694 2, /* cost of moving MMX register */
1695 {8, 8}, /* cost of loading MMX registers
1696 in SImode and DImode */
1697 {8, 8}, /* cost of storing MMX registers
1698 in SImode and DImode */
1699 2, /* cost of moving SSE register */
1700 {8, 8, 8}, /* cost of loading SSE registers
1701 in SImode, DImode and TImode */
1702 {8, 8, 8}, /* cost of storing SSE registers
1703 in SImode, DImode and TImode */
1704 5, /* MMX or SSE register to integer */
1705 32, /* size of l1 cache. */
1706 512, /* size of l2 cache. */
1707 64, /* size of prefetch block */
1708 6, /* number of parallel prefetches */
1709 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1710 value is increased to the perhaps more appropriate value of 5. */
1711 3, /* Branch cost */
1712 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1713 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1714 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1715 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1716 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1717 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1718 generic_memcpy,
1719 generic_memset,
1720 1, /* scalar_stmt_cost. */
1721 1, /* scalar load_cost. */
1722 1, /* scalar_store_cost. */
1723 1, /* vec_stmt_cost. */
1724 1, /* vec_to_scalar_cost. */
1725 1, /* scalar_to_vec_cost. */
1726 1, /* vec_align_load_cost. */
1727 2, /* vec_unalign_load_cost. */
1728 1, /* vec_store_cost. */
1729 3, /* cond_taken_branch_cost. */
1730 1, /* cond_not_taken_branch_cost. */
1733 /* core_cost should produce code tuned for the Core family of CPUs. */
1734 static stringop_algs core_memcpy[2] = {
1735 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1736 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1737 {-1, libcall, false}}}};
1738 static stringop_algs core_memset[2] = {
1739 {libcall, {{6, loop_1_byte, true},
1740 {24, loop, true},
1741 {8192, rep_prefix_4_byte, true},
1742 {-1, libcall, false}}},
1743 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1744 {-1, libcall, false}}}};
1746 static const
1747 struct processor_costs core_cost = {
1748 COSTS_N_INSNS (1), /* cost of an add instruction */
1749 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1750 this cost, however, our current implementation of synth_mult results in
1751 the use of unnecessary temporary registers, causing regressions on several
1752 SPECfp benchmarks. */
1753 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1754 COSTS_N_INSNS (1), /* variable shift costs */
1755 COSTS_N_INSNS (1), /* constant shift costs */
1756 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1757 COSTS_N_INSNS (4), /* HI */
1758 COSTS_N_INSNS (3), /* SI */
1759 COSTS_N_INSNS (4), /* DI */
1760 COSTS_N_INSNS (2)}, /* other */
1761 0, /* cost of multiply per each bit set */
1762 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1763 COSTS_N_INSNS (26), /* HI */
1764 COSTS_N_INSNS (42), /* SI */
1765 COSTS_N_INSNS (74), /* DI */
1766 COSTS_N_INSNS (74)}, /* other */
1767 COSTS_N_INSNS (1), /* cost of movsx */
1768 COSTS_N_INSNS (1), /* cost of movzx */
1769 8, /* "large" insn */
1770 17, /* MOVE_RATIO */
1771 4, /* cost for loading QImode using movzbl */
1772 {4, 4, 4}, /* cost of loading integer registers
1773 in QImode, HImode and SImode.
1774 Relative to reg-reg move (2). */
1775 {4, 4, 4}, /* cost of storing integer registers */
1776 4, /* cost of reg,reg fld/fst */
1777 {12, 12, 12}, /* cost of loading fp registers
1778 in SFmode, DFmode and XFmode */
1779 {6, 6, 8}, /* cost of storing fp registers
1780 in SFmode, DFmode and XFmode */
1781 2, /* cost of moving MMX register */
1782 {8, 8}, /* cost of loading MMX registers
1783 in SImode and DImode */
1784 {8, 8}, /* cost of storing MMX registers
1785 in SImode and DImode */
1786 2, /* cost of moving SSE register */
1787 {8, 8, 8}, /* cost of loading SSE registers
1788 in SImode, DImode and TImode */
1789 {8, 8, 8}, /* cost of storing SSE registers
1790 in SImode, DImode and TImode */
1791 5, /* MMX or SSE register to integer */
1792 64, /* size of l1 cache. */
1793 512, /* size of l2 cache. */
1794 64, /* size of prefetch block */
1795 6, /* number of parallel prefetches */
1796 /* FIXME perhaps more appropriate value is 5. */
1797 3, /* Branch cost */
1798 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1799 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1800 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1801 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1802 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1803 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1804 core_memcpy,
1805 core_memset,
1806 1, /* scalar_stmt_cost. */
1807 1, /* scalar load_cost. */
1808 1, /* scalar_store_cost. */
1809 1, /* vec_stmt_cost. */
1810 1, /* vec_to_scalar_cost. */
1811 1, /* scalar_to_vec_cost. */
1812 1, /* vec_align_load_cost. */
1813 2, /* vec_unalign_load_cost. */
1814 1, /* vec_store_cost. */
1815 3, /* cond_taken_branch_cost. */
1816 1, /* cond_not_taken_branch_cost. */
1820 /* Set by -mtune. */
1821 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1823 /* Set by -mtune or -Os. */
1824 const struct processor_costs *ix86_cost = &pentium_cost;
1826 /* Processor feature/optimization bitmasks. */
1827 #define m_386 (1<<PROCESSOR_I386)
1828 #define m_486 (1<<PROCESSOR_I486)
1829 #define m_PENT (1<<PROCESSOR_PENTIUM)
1830 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1831 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1832 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1833 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1834 #define m_CORE2 (1<<PROCESSOR_CORE2)
1835 #define m_COREI7 (1<<PROCESSOR_COREI7)
1836 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1837 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1838 #define m_ATOM (1<<PROCESSOR_ATOM)
1839 #define m_SLM (1<<PROCESSOR_SLM)
1841 #define m_GEODE (1<<PROCESSOR_GEODE)
1842 #define m_K6 (1<<PROCESSOR_K6)
1843 #define m_K6_GEODE (m_K6 | m_GEODE)
1844 #define m_K8 (1<<PROCESSOR_K8)
1845 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1846 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1847 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1848 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1849 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1850 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1851 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1852 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1853 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1854 #define m_BTVER (m_BTVER1 | m_BTVER2)
1855 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1857 #define m_GENERIC (1<<PROCESSOR_GENERIC)
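/* A tuning selector in x86-tune.def is a mask built from the bits above; for
   instance, a selector of (m_P4_NOCONA | m_CORE_ALL | m_GENERIC) enables a
   feature for the Pentium 4/Nocona, Core and generic tunings.  The feature is
   active for the current tuning when (selector & (1u << ix86_tune)) is
   nonzero, as computed in set_ix86_tune_features below.  */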
1859 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1860 #undef DEF_TUNE
1861 #define DEF_TUNE(tune, name, selector) name,
1862 #include "x86-tune.def"
1863 #undef DEF_TUNE
1866 /* Feature tests against the various tunings. */
1867 unsigned char ix86_tune_features[X86_TUNE_LAST];
1869 /* Feature tests against the various tunings used to create ix86_tune_features
1870 based on the processor mask. */
1871 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1872 #undef DEF_TUNE
1873 #define DEF_TUNE(tune, name, selector) selector,
1874 #include "x86-tune.def"
1875 #undef DEF_TUNE
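/* Note that x86-tune.def is included twice with different DEF_TUNE expansions:
   once above to collect the feature names and once here to collect the
   per-processor selector masks, so the two arrays stay in sync by
   construction.  */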
1878 /* Feature tests against the various architecture variations. */
1879 unsigned char ix86_arch_features[X86_ARCH_LAST];
1881 /* Feature tests against the various architecture variations, used to create
1882 ix86_arch_features based on the processor mask. */
1883 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1884 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1885 ~(m_386 | m_486 | m_PENT | m_K6),
1887 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1888 ~m_386,
1890 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1891 ~(m_386 | m_486),
1893 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1894 ~m_386,
1896 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1897 ~m_386,
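/* A negated mask such as ~(m_386 | m_486 | m_PENT | m_K6) enables the
   capability for every processor except the ones listed.  The bit for the
   selected processor is extracted when ix86_arch_features is filled in,
   in the same way that set_ix86_tune_features fills in ix86_tune_features
   below.  */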
1900 /* If the average insn count for a single function invocation is
1901 lower than this constant, emit fast (but longer) prologue and
1902 epilogue code. */
1903 #define FAST_PROLOGUE_INSN_COUNT 20
1905 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
1906 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1907 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1908 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1910 /* Array of the smallest class containing reg number REGNO, indexed by
1911 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1913 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1915 /* ax, dx, cx, bx */
1916 AREG, DREG, CREG, BREG,
1917 /* si, di, bp, sp */
1918 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1919 /* FP registers */
1920 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1921 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1922 /* arg pointer */
1923 NON_Q_REGS,
1924 /* flags, fpsr, fpcr, frame */
1925 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1926 /* SSE registers */
1927 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1928 SSE_REGS, SSE_REGS,
1929 /* MMX registers */
1930 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1931 MMX_REGS, MMX_REGS,
1932 /* REX registers */
1933 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1934 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1935 /* SSE REX registers */
1936 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1937 SSE_REGS, SSE_REGS,
1938 /* AVX-512 SSE registers */
1939 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1940 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1941 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1942 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1943 /* Mask registers. */
1944 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1945 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1948 /* The "default" register map used in 32bit mode. */
1950 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1952 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1953 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1954 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1955 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1956 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1957 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1958 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1959 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
1960 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
1961 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
1964 /* The "default" register map used in 64bit mode. */
1966 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1968 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1969 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1970 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1971 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1972 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1973 8,9,10,11,12,13,14,15, /* extended integer registers */
1974 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1975 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
1976 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
1977 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
1980 /* Define the register numbers to be used in Dwarf debugging information.
1981 The SVR4 reference port C compiler uses the following register numbers
1982 in its Dwarf output code:
1983 0 for %eax (gcc regno = 0)
1984 1 for %ecx (gcc regno = 2)
1985 2 for %edx (gcc regno = 1)
1986 3 for %ebx (gcc regno = 3)
1987 4 for %esp (gcc regno = 7)
1988 5 for %ebp (gcc regno = 6)
1989 6 for %esi (gcc regno = 4)
1990 7 for %edi (gcc regno = 5)
1991 The following three DWARF register numbers are never generated by
1992 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1993 believes these numbers have these meanings.
1994 8 for %eip (no gcc equivalent)
1995 9 for %eflags (gcc regno = 17)
1996 10 for %trapno (no gcc equivalent)
1997 It is not at all clear how we should number the FP stack registers
1998 for the x86 architecture. If the version of SDB on x86/svr4 were
1999 a bit less brain dead with respect to floating-point then we would
2000 have a precedent to follow with respect to DWARF register numbers
2001 for x86 FP registers, but the SDB on x86/svr4 is so completely
2002 broken with respect to FP registers that it is hardly worth thinking
2003 of it as something to strive for compatibility with.
2004 The version of x86/svr4 SDB I have at the moment does (partially)
2005 seem to believe that DWARF register number 11 is associated with
2006 the x86 register %st(0), but that's about all. Higher DWARF
2007 register numbers don't seem to be associated with anything in
2008 particular, and even for DWARF regno 11, SDB only seems to under-
2009 stand that it should say that a variable lives in %st(0) (when
2010 asked via an `=' command) if we said it was in DWARF regno 11,
2011 but SDB still prints garbage when asked for the value of the
2012 variable in question (via a `/' command).
2013 (Also note that the labels SDB prints for various FP stack regs
2014 when doing an `x' command are all wrong.)
2015 Note that these problems generally don't affect the native SVR4
2016 C compiler because it doesn't allow the use of -O with -g and
2017 because when it is *not* optimizing, it allocates a memory
2018 location for each floating-point variable, and the memory
2019 location is what gets described in the DWARF AT_location
2020 attribute for the variable in question.
2021 Regardless of the severe mental illness of the x86/svr4 SDB, we
2022 do something sensible here and we use the following DWARF
2023 register numbers. Note that these are all stack-top-relative
2024 numbers.
2025 11 for %st(0) (gcc regno = 8)
2026 12 for %st(1) (gcc regno = 9)
2027 13 for %st(2) (gcc regno = 10)
2028 14 for %st(3) (gcc regno = 11)
2029 15 for %st(4) (gcc regno = 12)
2030 16 for %st(5) (gcc regno = 13)
2031 17 for %st(6) (gcc regno = 14)
2032 18 for %st(7) (gcc regno = 15)
2034 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2036 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2037 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2038 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2039 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2040 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2041 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2042 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2043 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2044 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2045 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2048 /* Define parameter passing and return registers. */
2050 static int const x86_64_int_parameter_registers[6] =
2052 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2055 static int const x86_64_ms_abi_int_parameter_registers[4] =
2057 CX_REG, DX_REG, R8_REG, R9_REG
2060 static int const x86_64_int_return_registers[4] =
2062 AX_REG, DX_REG, DI_REG, SI_REG
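/* For example, a SysV ABI call f (a, b, c) with three integer arguments
   passes them in %rdi, %rsi and %rdx (DI_REG, SI_REG, DX_REG), whereas the
   same call under the MS ABI uses %rcx, %rdx and %r8.  */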
2065 /* Additional registers that are clobbered by SysV calls made from MS ABI code. */
2067 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2069 SI_REG, DI_REG,
2070 XMM6_REG, XMM7_REG,
2071 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2072 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2075 /* Define the structure for the machine field in struct function. */
2077 struct GTY(()) stack_local_entry {
2078 unsigned short mode;
2079 unsigned short n;
2080 rtx rtl;
2081 struct stack_local_entry *next;
2084 /* Structure describing stack frame layout.
2085 Stack grows downward:
2087 [arguments]
2088 <- ARG_POINTER
2089 saved pc
2091 saved static chain if ix86_static_chain_on_stack
2093 saved frame pointer if frame_pointer_needed
2094 <- HARD_FRAME_POINTER
2095 [saved regs]
2096 <- regs_save_offset
2097 [padding0]
2099 [saved SSE regs]
2100 <- sse_regs_save_offset
2101 [padding1] |
2102 | <- FRAME_POINTER
2103 [va_arg registers] |
2105 [frame] |
2107 [padding2] | = to_allocate
2108 <- STACK_POINTER
2110 struct ix86_frame
2112 int nsseregs;
2113 int nregs;
2114 int va_arg_size;
2115 int red_zone_size;
2116 int outgoing_arguments_size;
2118 /* The offsets relative to ARG_POINTER. */
2119 HOST_WIDE_INT frame_pointer_offset;
2120 HOST_WIDE_INT hard_frame_pointer_offset;
2121 HOST_WIDE_INT stack_pointer_offset;
2122 HOST_WIDE_INT hfp_save_offset;
2123 HOST_WIDE_INT reg_save_offset;
2124 HOST_WIDE_INT sse_reg_save_offset;
2126 /* When save_regs_using_mov is set, emit prologue using
2127 move instead of push instructions. */
2128 bool save_regs_using_mov;
2131 /* Which cpu are we scheduling for. */
2132 enum attr_cpu ix86_schedule;
2134 /* Which cpu are we optimizing for. */
2135 enum processor_type ix86_tune;
2137 /* Which instruction set architecture to use. */
2138 enum processor_type ix86_arch;
2140 /* True if processor has SSE prefetch instruction. */
2141 unsigned char x86_prefetch_sse;
2143 /* -mstackrealign option */
2144 static const char ix86_force_align_arg_pointer_string[]
2145 = "force_align_arg_pointer";
2147 static rtx (*ix86_gen_leave) (void);
2148 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2149 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2150 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2151 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2152 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2153 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2154 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2155 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2156 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2157 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2158 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2160 /* Preferred alignment for stack boundary in bits. */
2161 unsigned int ix86_preferred_stack_boundary;
2163 /* Alignment for incoming stack boundary in bits specified at
2164 command line. */
2165 static unsigned int ix86_user_incoming_stack_boundary;
2167 /* Default alignment for incoming stack boundary in bits. */
2168 static unsigned int ix86_default_incoming_stack_boundary;
2170 /* Alignment for incoming stack boundary in bits. */
2171 unsigned int ix86_incoming_stack_boundary;
2173 /* Calling abi specific va_list type nodes. */
2174 static GTY(()) tree sysv_va_list_type_node;
2175 static GTY(()) tree ms_va_list_type_node;
2177 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2178 char internal_label_prefix[16];
2179 int internal_label_prefix_len;
2181 /* Fence to use after loop using movnt. */
2182 tree x86_mfence;
2184 /* Register class used for passing a given 64-bit part of the argument.
2185 These represent classes as documented by the psABI, with the exception
2186 of the SSESF and SSEDF classes, which are basically the SSE class; GCC
2187 just uses SFmode or DFmode moves instead of DImode moves to avoid
2189 reformatting penalties.  Similarly we play games with INTEGERSI_CLASS
2190 to use cheaper SImode moves whenever possible (the upper half does contain padding). */
2191 enum x86_64_reg_class
2193 X86_64_NO_CLASS,
2194 X86_64_INTEGER_CLASS,
2195 X86_64_INTEGERSI_CLASS,
2196 X86_64_SSE_CLASS,
2197 X86_64_SSESF_CLASS,
2198 X86_64_SSEDF_CLASS,
2199 X86_64_SSEUP_CLASS,
2200 X86_64_X87_CLASS,
2201 X86_64_X87UP_CLASS,
2202 X86_64_COMPLEX_X87_CLASS,
2203 X86_64_MEMORY_CLASS
2206 #define MAX_CLASSES 4
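/* For instance, a structure such as struct { double d; int i; } occupies two
   such 64-bit parts: the first is classified SSEDF (it holds a single double)
   and the second INTEGERSI (the int occupies the lower half, the rest is
   padding), so the structure is passed in one SSE register and one integer
   register.  */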
2208 /* Table of constants used by fldpi, fldln2, etc.... */
2209 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2210 static bool ext_80387_constants_init = 0;
2213 static struct machine_function * ix86_init_machine_status (void);
2214 static rtx ix86_function_value (const_tree, const_tree, bool);
2215 static bool ix86_function_value_regno_p (const unsigned int);
2216 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2217 const_tree);
2218 static rtx ix86_static_chain (const_tree, bool);
2219 static int ix86_function_regparm (const_tree, const_tree);
2220 static void ix86_compute_frame_layout (struct ix86_frame *);
2221 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2222 rtx, rtx, int);
2223 static void ix86_add_new_builtins (HOST_WIDE_INT);
2224 static tree ix86_canonical_va_list_type (tree);
2225 static void predict_jump (int);
2226 static unsigned int split_stack_prologue_scratch_regno (void);
2227 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2229 enum ix86_function_specific_strings
2231 IX86_FUNCTION_SPECIFIC_ARCH,
2232 IX86_FUNCTION_SPECIFIC_TUNE,
2233 IX86_FUNCTION_SPECIFIC_MAX
2236 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2237 const char *, enum fpmath_unit, bool);
2238 static void ix86_function_specific_save (struct cl_target_option *,
2239 struct gcc_options *opts);
2240 static void ix86_function_specific_restore (struct gcc_options *opts,
2241 struct cl_target_option *);
2242 static void ix86_function_specific_print (FILE *, int,
2243 struct cl_target_option *);
2244 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2245 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2246 struct gcc_options *,
2247 struct gcc_options *,
2248 struct gcc_options *);
2249 static bool ix86_can_inline_p (tree, tree);
2250 static void ix86_set_current_function (tree);
2251 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2253 static enum calling_abi ix86_function_abi (const_tree);
2256 #ifndef SUBTARGET32_DEFAULT_CPU
2257 #define SUBTARGET32_DEFAULT_CPU "i386"
2258 #endif
2260 /* Whether -mtune= or -march= were specified */
2261 static int ix86_tune_defaulted;
2262 static int ix86_arch_specified;
2264 /* Vectorization library interface and handlers. */
2265 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2267 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2268 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2270 /* Processor target table, indexed by processor number */
2271 struct ptt
2273 const struct processor_costs *cost; /* Processor costs */
2274 const int align_loop; /* Default alignments. */
2275 const int align_loop_max_skip;
2276 const int align_jump;
2277 const int align_jump_max_skip;
2278 const int align_func;
2281 static const struct ptt processor_target_table[PROCESSOR_max] =
2283 {&i386_cost, 4, 3, 4, 3, 4},
2284 {&i486_cost, 16, 15, 16, 15, 16},
2285 {&pentium_cost, 16, 7, 16, 7, 16},
2286 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2287 {&geode_cost, 0, 0, 0, 0, 0},
2288 {&k6_cost, 32, 7, 32, 7, 32},
2289 {&athlon_cost, 16, 7, 16, 7, 16},
2290 {&pentium4_cost, 0, 0, 0, 0, 0},
2291 {&k8_cost, 16, 7, 16, 7, 16},
2292 {&nocona_cost, 0, 0, 0, 0, 0},
2293 /* Core 2 */
2294 {&core_cost, 16, 10, 16, 10, 16},
2295 /* Core i7 */
2296 {&core_cost, 16, 10, 16, 10, 16},
2297 /* Core avx2 */
2298 {&core_cost, 16, 10, 16, 10, 16},
2299 {&generic_cost, 16, 10, 16, 10, 16},
2300 {&amdfam10_cost, 32, 24, 32, 7, 32},
2301 {&bdver1_cost, 16, 10, 16, 7, 11},
2302 {&bdver2_cost, 16, 10, 16, 7, 11},
2303 {&bdver3_cost, 16, 10, 16, 7, 11},
2304 {&btver1_cost, 16, 10, 16, 7, 11},
2305 {&btver2_cost, 16, 10, 16, 7, 11},
2306 {&atom_cost, 16, 15, 16, 7, 16},
2307 {&slm_cost, 16, 15, 16, 7, 16}
2310 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2312 "generic",
2313 "i386",
2314 "i486",
2315 "pentium",
2316 "pentium-mmx",
2317 "pentiumpro",
2318 "pentium2",
2319 "pentium3",
2320 "pentium4",
2321 "pentium-m",
2322 "prescott",
2323 "nocona",
2324 "core2",
2325 "corei7",
2326 "core-avx2",
2327 "atom",
2328 "slm",
2329 "geode",
2330 "k6",
2331 "k6-2",
2332 "k6-3",
2333 "athlon",
2334 "athlon-4",
2335 "k8",
2336 "amdfam10",
2337 "bdver1",
2338 "bdver2",
2339 "bdver3",
2340 "btver1",
2341 "btver2"
2344 static bool
2345 gate_insert_vzeroupper (void)
2347 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2350 static unsigned int
2351 rest_of_handle_insert_vzeroupper (void)
2353 int i;
2355 /* vzeroupper instructions are inserted immediately after reload to
2356 account for possible spills from 256-bit registers.  The pass
2357 reuses the mode switching infrastructure by re-running the mode
2358 insertion pass, so disable entities that have already been processed. */
2359 for (i = 0; i < MAX_386_ENTITIES; i++)
2360 ix86_optimize_mode_switching[i] = 0;
2362 ix86_optimize_mode_switching[AVX_U128] = 1;
2364 /* Call optimize_mode_switching. */
2365 g->get_passes ()->execute_pass_mode_switching ();
2366 return 0;
2369 namespace {
2371 const pass_data pass_data_insert_vzeroupper =
2373 RTL_PASS, /* type */
2374 "vzeroupper", /* name */
2375 OPTGROUP_NONE, /* optinfo_flags */
2376 true, /* has_gate */
2377 true, /* has_execute */
2378 TV_NONE, /* tv_id */
2379 0, /* properties_required */
2380 0, /* properties_provided */
2381 0, /* properties_destroyed */
2382 0, /* todo_flags_start */
2383 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2386 class pass_insert_vzeroupper : public rtl_opt_pass
2388 public:
2389 pass_insert_vzeroupper(gcc::context *ctxt)
2390 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2393 /* opt_pass methods: */
2394 bool gate () { return gate_insert_vzeroupper (); }
2395 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2397 }; // class pass_insert_vzeroupper
2399 } // anon namespace
2401 rtl_opt_pass *
2402 make_pass_insert_vzeroupper (gcc::context *ctxt)
2404 return new pass_insert_vzeroupper (ctxt);
2407 /* Return true if a red-zone is in use. */
2409 static inline bool
2410 ix86_using_red_zone (void)
2412 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2415 /* Return a string that documents the current -m options. The caller is
2416 responsible for freeing the string. */
2418 static char *
2419 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2420 const char *tune, enum fpmath_unit fpmath,
2421 bool add_nl_p)
2423 struct ix86_target_opts
2425 const char *option; /* option string */
2426 HOST_WIDE_INT mask; /* isa mask options */
2429 /* This table is ordered so that options like -msse4.2 that imply
2430 preceding options are matched first. */
2431 static struct ix86_target_opts isa_opts[] =
2433 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2434 { "-mfma", OPTION_MASK_ISA_FMA },
2435 { "-mxop", OPTION_MASK_ISA_XOP },
2436 { "-mlwp", OPTION_MASK_ISA_LWP },
2437 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2438 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2439 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2440 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2441 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2442 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2443 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2444 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2445 { "-msse3", OPTION_MASK_ISA_SSE3 },
2446 { "-msse2", OPTION_MASK_ISA_SSE2 },
2447 { "-msse", OPTION_MASK_ISA_SSE },
2448 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2449 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2450 { "-mmmx", OPTION_MASK_ISA_MMX },
2451 { "-mabm", OPTION_MASK_ISA_ABM },
2452 { "-mbmi", OPTION_MASK_ISA_BMI },
2453 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2454 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2455 { "-mhle", OPTION_MASK_ISA_HLE },
2456 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2457 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2458 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2459 { "-madx", OPTION_MASK_ISA_ADX },
2460 { "-mtbm", OPTION_MASK_ISA_TBM },
2461 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2462 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2463 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2464 { "-maes", OPTION_MASK_ISA_AES },
2465 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2466 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2467 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2468 { "-mf16c", OPTION_MASK_ISA_F16C },
2469 { "-mrtm", OPTION_MASK_ISA_RTM },
2470 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2471 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2474 /* Flag options. */
2475 static struct ix86_target_opts flag_opts[] =
2477 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2478 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2479 { "-m80387", MASK_80387 },
2480 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2481 { "-malign-double", MASK_ALIGN_DOUBLE },
2482 { "-mcld", MASK_CLD },
2483 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2484 { "-mieee-fp", MASK_IEEE_FP },
2485 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2486 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2487 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2488 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2489 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2490 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2491 { "-mno-red-zone", MASK_NO_RED_ZONE },
2492 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2493 { "-mrecip", MASK_RECIP },
2494 { "-mrtd", MASK_RTD },
2495 { "-msseregparm", MASK_SSEREGPARM },
2496 { "-mstack-arg-probe", MASK_STACK_PROBE },
2497 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2498 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2499 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2500 { "-mvzeroupper", MASK_VZEROUPPER },
2501 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2502 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2503 { "-mprefer-avx128", MASK_PREFER_AVX128},
2506 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2508 char isa_other[40];
2509 char target_other[40];
2510 unsigned num = 0;
2511 unsigned i, j;
2512 char *ret;
2513 char *ptr;
2514 size_t len;
2515 size_t line_len;
2516 size_t sep_len;
2517 const char *abi;
2519 memset (opts, '\0', sizeof (opts));
2521 /* Add -march= option. */
2522 if (arch)
2524 opts[num][0] = "-march=";
2525 opts[num++][1] = arch;
2528 /* Add -mtune= option. */
2529 if (tune)
2531 opts[num][0] = "-mtune=";
2532 opts[num++][1] = tune;
2535 /* Add -m32/-m64/-mx32. */
2536 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2538 if ((isa & OPTION_MASK_ABI_64) != 0)
2539 abi = "-m64";
2540 else
2541 abi = "-mx32";
2542 isa &= ~ (OPTION_MASK_ISA_64BIT
2543 | OPTION_MASK_ABI_64
2544 | OPTION_MASK_ABI_X32);
2546 else
2547 abi = "-m32";
2548 opts[num++][0] = abi;
2550 /* Pick out the options in isa options. */
2551 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2553 if ((isa & isa_opts[i].mask) != 0)
2555 opts[num++][0] = isa_opts[i].option;
2556 isa &= ~ isa_opts[i].mask;
2560 if (isa && add_nl_p)
2562 opts[num++][0] = isa_other;
2563 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2564 isa);
2567 /* Add flag options. */
2568 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2570 if ((flags & flag_opts[i].mask) != 0)
2572 opts[num++][0] = flag_opts[i].option;
2573 flags &= ~ flag_opts[i].mask;
2577 if (flags && add_nl_p)
2579 opts[num++][0] = target_other;
2580 sprintf (target_other, "(other flags: %#x)", flags);
2583 /* Add -fpmath= option. */
2584 if (fpmath)
2586 opts[num][0] = "-mfpmath=";
2587 switch ((int) fpmath)
2589 case FPMATH_387:
2590 opts[num++][1] = "387";
2591 break;
2593 case FPMATH_SSE:
2594 opts[num++][1] = "sse";
2595 break;
2597 case FPMATH_387 | FPMATH_SSE:
2598 opts[num++][1] = "sse+387";
2599 break;
2601 default:
2602 gcc_unreachable ();
2606 /* Any options? */
2607 if (num == 0)
2608 return NULL;
2610 gcc_assert (num < ARRAY_SIZE (opts));
2612 /* Size the string. */
2613 len = 0;
2614 sep_len = (add_nl_p) ? 3 : 1;
2615 for (i = 0; i < num; i++)
2617 len += sep_len;
2618 for (j = 0; j < 2; j++)
2619 if (opts[i][j])
2620 len += strlen (opts[i][j]);
2623 /* Build the string. */
2624 ret = ptr = (char *) xmalloc (len);
2625 line_len = 0;
2627 for (i = 0; i < num; i++)
2629 size_t len2[2];
2631 for (j = 0; j < 2; j++)
2632 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2634 if (i != 0)
2636 *ptr++ = ' ';
2637 line_len++;
2639 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2641 *ptr++ = '\\';
2642 *ptr++ = '\n';
2643 line_len = 0;
2647 for (j = 0; j < 2; j++)
2648 if (opts[i][j])
2650 memcpy (ptr, opts[i][j], len2[j]);
2651 ptr += len2[j];
2652 line_len += len2[j];
2656 *ptr = '\0';
2657 gcc_assert (ret + len >= ptr);
2659 return ret;
2662 /* Return true if profiling code should be emitted before the
2663 prologue; otherwise return false.
2664 Note: for x86 this is the case when -mfentry is in effect. */
2665 static bool
2666 ix86_profile_before_prologue (void)
2668 return flag_fentry != 0;
2671 /* Function that is callable from the debugger to print the current
2672 options. */
2673 void ATTRIBUTE_UNUSED
2674 ix86_debug_options (void)
2676 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2677 ix86_arch_string, ix86_tune_string,
2678 ix86_fpmath, true);
2680 if (opts)
2682 fprintf (stderr, "%s\n\n", opts);
2683 free (opts);
2685 else
2686 fputs ("<no options>\n\n", stderr);
2688 return;
2691 static const char *stringop_alg_names[] = {
2692 #define DEF_ENUM
2693 #define DEF_ALG(alg, name) #name,
2694 #include "stringop.def"
2695 #undef DEF_ENUM
2696 #undef DEF_ALG
2699 /* Parse the parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2700 The string has the following form (or is a comma-separated list of such entries):
2702 strategy_alg:max_size:[align|noalign]
2704 where the full size range for the strategy is either [0, max_size] or
2705 [min_size, max_size], in which min_size is max_size + 1 of the
2706 preceding range.  The last size range must have max_size == -1.
2708 Examples:
2711 -mmemcpy-strategy=libcall:-1:noalign
2713 this is equivalent (for known-size memcpy) to -mstringop-strategy=libcall
2717 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2719 This tells the compiler to use the following strategy for memset:
2720 1) when the expected size is in [1, 16], use the rep_8byte strategy;
2721 2) when the size is in [17, 2048], use vector_loop;
2722 3) when the size is > 2048, use libcall. */
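/* As an illustration, the -mmemset-strategy example above parses into three
   stringop_size_range entries:
     { max =   16, alg = rep_8byte,   noalign = true  }
     { max = 2048, alg = vector_loop, noalign = false }
     { max =   -1, alg = libcall,     noalign = true  }
   which then override the first three entries of the default stringop_algs
   table for memset.  */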
2724 struct stringop_size_range
2726 int max;
2727 stringop_alg alg;
2728 bool noalign;
2731 static void
2732 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2734 const struct stringop_algs *default_algs;
2735 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2736 char *curr_range_str, *next_range_str;
2737 int i = 0, n = 0;
2739 if (is_memset)
2740 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2741 else
2742 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2744 curr_range_str = strategy_str;
2748 int maxs;
2749 stringop_alg alg;
2750 char alg_name[128];
2751 char align[16];
2752 next_range_str = strchr (curr_range_str, ',');
2753 if (next_range_str)
2754 *next_range_str++ = '\0';
2756 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2757 alg_name, &maxs, align))
2759 error ("wrong arg %s to option %s", curr_range_str,
2760 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2761 return;
2764 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2766 error ("size ranges of option %s should be increasing",
2767 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2768 return;
2771 for (i = 0; i < last_alg; i++)
2773 if (!strcmp (alg_name, stringop_alg_names[i]))
2775 alg = (stringop_alg) i;
2776 break;
2780 if (i == last_alg)
2782 error ("wrong stringop strategy name %s specified for option %s",
2783 alg_name,
2784 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2785 return;
2788 input_ranges[n].max = maxs;
2789 input_ranges[n].alg = alg;
2790 if (!strcmp (align, "align"))
2791 input_ranges[n].noalign = false;
2792 else if (!strcmp (align, "noalign"))
2793 input_ranges[n].noalign = true;
2794 else
2796 error ("unknown alignment %s specified for option %s",
2797 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2798 return;
2800 n++;
2801 curr_range_str = next_range_str;
2803 while (curr_range_str);
2805 if (input_ranges[n - 1].max != -1)
2807 error ("the max value for the last size range should be -1"
2808 " for option %s",
2809 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2810 return;
2813 if (n > MAX_STRINGOP_ALGS)
2815 error ("too many size ranges specified in option %s",
2816 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2817 return;
2820 /* Now override the default algs array. */
2821 for (i = 0; i < n; i++)
2823 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2824 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2825 = input_ranges[i].alg;
2826 *const_cast<int *>(&default_algs->size[i].noalign)
2827 = input_ranges[i].noalign;
2832 /* Parse the -mtune-ctrl= option.  When DUMP is true,
2833 print the features that are explicitly set. */
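/* For example (illustrative feature names), -mtune-ctrl=feature1,^feature2
   sets "feature1" and clears "feature2"; valid names are those listed in
   ix86_tune_feature_names[], i.e. the names from x86-tune.def.  */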
2835 static void
2836 parse_mtune_ctrl_str (bool dump)
2838 if (!ix86_tune_ctrl_string)
2839 return;
2841 char *next_feature_string = NULL;
2842 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2843 char *orig = curr_feature_string;
2844 int i;
2847 bool clear = false;
2849 next_feature_string = strchr (curr_feature_string, ',');
2850 if (next_feature_string)
2851 *next_feature_string++ = '\0';
2852 if (*curr_feature_string == '^')
2854 curr_feature_string++;
2855 clear = true;
2857 for (i = 0; i < X86_TUNE_LAST; i++)
2859 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2861 ix86_tune_features[i] = !clear;
2862 if (dump)
2863 fprintf (stderr, "Explicitly %s feature %s\n",
2864 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2865 break;
2868 if (i == X86_TUNE_LAST)
2869 error ("Unknown parameter to option -mtune-ctrl: %s",
2870 clear ? curr_feature_string - 1 : curr_feature_string);
2871 curr_feature_string = next_feature_string;
2873 while (curr_feature_string);
2874 free (orig);
2877 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2878 processor type. */
2880 static void
2881 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2883 unsigned int ix86_tune_mask = 1u << ix86_tune;
2884 int i;
2886 for (i = 0; i < X86_TUNE_LAST; ++i)
2888 if (ix86_tune_no_default)
2889 ix86_tune_features[i] = 0;
2890 else
2891 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2894 if (dump)
2896 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2897 for (i = 0; i < X86_TUNE_LAST; i++)
2898 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2899 ix86_tune_features[i] ? "on" : "off");
2902 parse_mtune_ctrl_str (dump);
2906 /* Override various settings based on options. If MAIN_ARGS_P, the
2907 options are from the command line, otherwise they are from
2908 attributes. */
2910 static void
2911 ix86_option_override_internal (bool main_args_p,
2912 struct gcc_options *opts,
2913 struct gcc_options *opts_set)
2915 int i;
2916 unsigned int ix86_arch_mask;
2917 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
2918 const char *prefix;
2919 const char *suffix;
2920 const char *sw;
2922 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2923 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2924 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2925 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2926 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2927 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2928 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2929 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2930 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2931 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2932 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2933 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2934 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2935 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2936 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2937 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2938 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2939 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2940 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2941 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2942 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2943 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2944 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2945 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2946 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2947 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2948 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2949 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2950 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2951 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2952 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2953 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2954 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2955 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2956 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2957 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2958 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2959 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2960 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2961 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2962 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
2963 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
2964 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
2965 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
2967 /* If this reaches 64, we need to widen the flags field in struct pta below. */
2969 static struct pta
2971 const char *const name; /* processor name or nickname. */
2972 const enum processor_type processor;
2973 const enum attr_cpu schedule;
2974 const unsigned HOST_WIDE_INT flags;
2976 const processor_alias_table[] =
2978 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2979 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2980 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2981 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2982 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2983 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2984 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2985 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2986 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2987 PTA_MMX | PTA_SSE | PTA_FXSR},
2988 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2989 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2990 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2991 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2992 PTA_MMX | PTA_SSE | PTA_FXSR},
2993 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2994 PTA_MMX | PTA_SSE | PTA_FXSR},
2995 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2996 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2997 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2998 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2999 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3000 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3001 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3002 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3003 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3004 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3005 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3006 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3007 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3008 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3009 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3010 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3011 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3012 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3013 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3014 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3015 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3016 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3017 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3018 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3019 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3020 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3021 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3022 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3023 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3024 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3025 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3026 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3027 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3028 | PTA_XSAVEOPT},
3029 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3030 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3031 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3032 {"slm", PROCESSOR_SLM, CPU_SLM,
3033 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3034 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3035 | PTA_FXSR},
3036 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3037 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3038 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3039 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3040 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3041 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3042 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3043 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3044 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3045 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3046 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3047 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3048 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3049 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3050 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3051 {"x86-64", PROCESSOR_K8, CPU_K8,
3052 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3053 {"k8", PROCESSOR_K8, CPU_K8,
3054 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3055 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3056 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3057 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3058 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3059 {"opteron", PROCESSOR_K8, CPU_K8,
3060 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3061 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3062 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3063 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3064 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3065 {"athlon64", PROCESSOR_K8, CPU_K8,
3066 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3067 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3068 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3069 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3070 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3071 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3072 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3073 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3074 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3075 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3076 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3077 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3078 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3079 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3080 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3081 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3082 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3083 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3084 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3085 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3086 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3087 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3088 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3089 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3090 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3091 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3092 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3093 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3094 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3095 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3096 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3097 | PTA_XSAVEOPT | PTA_FSGSBASE},
3098 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3099 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3100 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3101 | PTA_FXSR | PTA_XSAVE},
3102 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3103 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3104 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3105 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3106 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3107 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3109 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3110 PTA_64BIT
3111 | PTA_HLE /* flags are only used for -march switch. */ },
3114 /* -mrecip options. */
3115 static struct
3116 {
3117 const char *string; /* option name */
3118 unsigned int mask; /* mask bits to set */
3119 }
3120 const recip_options[] =
3121 {
3122 { "all", RECIP_MASK_ALL },
3123 { "none", RECIP_MASK_NONE },
3124 { "div", RECIP_MASK_DIV },
3125 { "sqrt", RECIP_MASK_SQRT },
3126 { "vec-div", RECIP_MASK_VEC_DIV },
3127 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3128 };
3130 int const pta_size = ARRAY_SIZE (processor_alias_table);
3132 /* Set up prefix/suffix so the error messages refer to either the command
3133 line argument, or the attribute(target). */
3134 if (main_args_p)
3136 prefix = "-m";
3137 suffix = "";
3138 sw = "switch";
3140 else
3142 prefix = "option(\"";
3143 suffix = "\")";
3144 sw = "attribute";
3147 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3148 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3149 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3150 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3151 #ifdef TARGET_BI_ARCH
3152 else
3154 #if TARGET_BI_ARCH == 1
3155 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3156 is on and OPTION_MASK_ABI_X32 is off. We turn off
3157 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3158 -mx32. */
3159 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3160 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3161 #else
3162 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3163 on and OPTION_MASK_ABI_64 is off. We turn off
3164 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3165 -m64. */
3166 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3167 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3168 #endif
3170 #endif
3172 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3174 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3175 OPTION_MASK_ABI_64 for TARGET_X32. */
3176 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3177 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3179 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3181 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3182 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3183 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3184 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3187 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3188 SUBTARGET_OVERRIDE_OPTIONS;
3189 #endif
3191 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3192 SUBSUBTARGET_OVERRIDE_OPTIONS;
3193 #endif
3195 /* -fPIC is the default for x86_64. */
3196 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3197 opts->x_flag_pic = 2;
3199 /* Need to check -mtune=generic first. */
3200 if (opts->x_ix86_tune_string)
3202 if (!strcmp (opts->x_ix86_tune_string, "generic")
3203 || !strcmp (opts->x_ix86_tune_string, "i686")
3204 /* As special support for cross compilers we read -mtune=native
3205 as -mtune=generic. With native compilers we won't see the
3206 -mtune=native, as it was changed by the driver. */
3207 || !strcmp (opts->x_ix86_tune_string, "native"))
3209 opts->x_ix86_tune_string = "generic";
3211 /* If this call is for setting the option attribute, allow the
3212 generic that was previously set. */
3213 else if (!main_args_p
3214 && !strcmp (opts->x_ix86_tune_string, "generic"))
3216 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3217 error ("bad value (%s) for %stune=%s %s",
3218 opts->x_ix86_tune_string, prefix, suffix, sw);
3219 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3220 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3221 "%stune=k8%s or %stune=generic%s instead as appropriate",
3222 prefix, suffix, prefix, suffix, prefix, suffix);
3224 else
3226 if (opts->x_ix86_arch_string)
3227 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3228 if (!opts->x_ix86_tune_string)
3230 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3231 ix86_tune_defaulted = 1;
3234 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3235 or defaulted. We need to use a sensible tune option. */
3236 if (!strcmp (opts->x_ix86_tune_string, "generic")
3237 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3238 || !strcmp (opts->x_ix86_tune_string, "i686"))
3240 opts->x_ix86_tune_string = "generic";
3244 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3245 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3247 /* rep; movq isn't available in 32-bit code. */
3248 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3249 opts->x_ix86_stringop_alg = no_stringop;
3252 if (!opts->x_ix86_arch_string)
3253 opts->x_ix86_arch_string
3254 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3255 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3256 else
3257 ix86_arch_specified = 1;
3259 if (opts_set->x_ix86_pmode)
3261 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3262 && opts->x_ix86_pmode == PMODE_SI)
3263 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3264 && opts->x_ix86_pmode == PMODE_DI))
3265 error ("address mode %qs not supported in the %s bit mode",
3266 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3267 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3269 else
3270 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3271 ? PMODE_DI : PMODE_SI;
3273 if (!opts_set->x_ix86_abi)
3274 opts->x_ix86_abi = DEFAULT_ABI;
3276 /* For targets using ms ABI enable ms-extensions, if not
3277 explicitly turned off. For non-ms ABI we turn off this
3278 option. */
3279 if (!opts_set->x_flag_ms_extensions)
3280 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3282 if (opts_set->x_ix86_cmodel)
3284 switch (opts->x_ix86_cmodel)
3286 case CM_SMALL:
3287 case CM_SMALL_PIC:
3288 if (opts->x_flag_pic)
3289 opts->x_ix86_cmodel = CM_SMALL_PIC;
3290 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3291 error ("code model %qs not supported in the %s bit mode",
3292 "small", "32");
3293 break;
3295 case CM_MEDIUM:
3296 case CM_MEDIUM_PIC:
3297 if (opts->x_flag_pic)
3298 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3299 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3300 error ("code model %qs not supported in the %s bit mode",
3301 "medium", "32");
3302 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3303 error ("code model %qs not supported in x32 mode",
3304 "medium");
3305 break;
3307 case CM_LARGE:
3308 case CM_LARGE_PIC:
3309 if (opts->x_flag_pic)
3310 opts->x_ix86_cmodel = CM_LARGE_PIC;
3311 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3312 error ("code model %qs not supported in the %s bit mode",
3313 "large", "32");
3314 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3315 error ("code model %qs not supported in x32 mode",
3316 "large");
3317 break;
3319 case CM_32:
3320 if (opts->x_flag_pic)
3321 error ("code model %s does not support PIC mode", "32");
3322 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3323 error ("code model %qs not supported in the %s bit mode",
3324 "32", "64");
3325 break;
3327 case CM_KERNEL:
3328 if (opts->x_flag_pic)
3330 error ("code model %s does not support PIC mode", "kernel");
3331 opts->x_ix86_cmodel = CM_32;
3333 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3334 error ("code model %qs not supported in the %s bit mode",
3335 "kernel", "32");
3336 break;
3338 default:
3339 gcc_unreachable ();
3342 else
3344 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3345 use of rip-relative addressing. This eliminates fixups that
3346 would otherwise be needed if this object is to be placed in a
3347 DLL, and is essentially just as efficient as direct addressing. */
3348 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3349 && (TARGET_RDOS || TARGET_PECOFF))
3350 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3351 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3352 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3353 else
3354 opts->x_ix86_cmodel = CM_32;
3356 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3358 error ("-masm=intel not supported in this configuration");
3359 opts->x_ix86_asm_dialect = ASM_ATT;
3361 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3362 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3363 sorry ("%i-bit mode not compiled in",
3364 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3366 for (i = 0; i < pta_size; i++)
3367 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3369 ix86_schedule = processor_alias_table[i].schedule;
3370 ix86_arch = processor_alias_table[i].processor;
3371 /* Default cpu tuning to the architecture. */
3372 ix86_tune = ix86_arch;
3374 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3375 && !(processor_alias_table[i].flags & PTA_64BIT))
3376 error ("CPU you selected does not support x86-64 "
3377 "instruction set");
3379 if (processor_alias_table[i].flags & PTA_MMX
3380 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3381 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3382 if (processor_alias_table[i].flags & PTA_3DNOW
3383 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3384 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3385 if (processor_alias_table[i].flags & PTA_3DNOW_A
3386 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3387 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3388 if (processor_alias_table[i].flags & PTA_SSE
3389 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3390 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3391 if (processor_alias_table[i].flags & PTA_SSE2
3392 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3393 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3394 if (processor_alias_table[i].flags & PTA_SSE3
3395 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3396 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3397 if (processor_alias_table[i].flags & PTA_SSSE3
3398 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3399 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3400 if (processor_alias_table[i].flags & PTA_SSE4_1
3401 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3402 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3403 if (processor_alias_table[i].flags & PTA_SSE4_2
3404 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3405 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3406 if (processor_alias_table[i].flags & PTA_AVX
3407 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3408 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3409 if (processor_alias_table[i].flags & PTA_AVX2
3410 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3411 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3412 if (processor_alias_table[i].flags & PTA_FMA
3413 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3414 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3415 if (processor_alias_table[i].flags & PTA_SSE4A
3416 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3417 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3418 if (processor_alias_table[i].flags & PTA_FMA4
3419 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3420 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3421 if (processor_alias_table[i].flags & PTA_XOP
3422 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3423 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3424 if (processor_alias_table[i].flags & PTA_LWP
3425 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3426 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3427 if (processor_alias_table[i].flags & PTA_ABM
3428 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3429 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3430 if (processor_alias_table[i].flags & PTA_BMI
3431 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3432 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3433 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3434 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3435 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
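/* Note: PTA_ABM is accepted in the LZCNT test above and in the POPCNT
   test further below because AMD's ABM extension provides both the LZCNT
   and POPCNT instructions. */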
3436 if (processor_alias_table[i].flags & PTA_TBM
3437 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3438 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3439 if (processor_alias_table[i].flags & PTA_BMI2
3440 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3441 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3442 if (processor_alias_table[i].flags & PTA_CX16
3443 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3444 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3445 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3446 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3447 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3448 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3449 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3450 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3451 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3452 if (processor_alias_table[i].flags & PTA_MOVBE
3453 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3454 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3455 if (processor_alias_table[i].flags & PTA_AES
3456 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3457 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3458 if (processor_alias_table[i].flags & PTA_PCLMUL
3459 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3460 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3461 if (processor_alias_table[i].flags & PTA_FSGSBASE
3462 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3463 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3464 if (processor_alias_table[i].flags & PTA_RDRND
3465 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3466 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3467 if (processor_alias_table[i].flags & PTA_F16C
3468 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3469 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3470 if (processor_alias_table[i].flags & PTA_RTM
3471 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3472 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3473 if (processor_alias_table[i].flags & PTA_HLE
3474 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3475 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3476 if (processor_alias_table[i].flags & PTA_PRFCHW
3477 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3478 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3479 if (processor_alias_table[i].flags & PTA_RDSEED
3480 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3481 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3482 if (processor_alias_table[i].flags & PTA_ADX
3483 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3484 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3485 if (processor_alias_table[i].flags & PTA_FXSR
3486 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3487 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3488 if (processor_alias_table[i].flags & PTA_XSAVE
3489 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3490 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3491 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3492 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3493 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3494 if (processor_alias_table[i].flags & PTA_AVX512F
3495 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3496 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3497 if (processor_alias_table[i].flags & PTA_AVX512ER
3498 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3499 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3500 if (processor_alias_table[i].flags & PTA_AVX512PF
3501 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3502 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3503 if (processor_alias_table[i].flags & PTA_AVX512CD
3504 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3505 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3506 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3507 x86_prefetch_sse = true;
3509 break;
3512 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3513 error ("generic CPU can be used only for %stune=%s %s",
3514 prefix, suffix, sw);
3515 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3516 error ("bad value (%s) for %sarch=%s %s",
3517 opts->x_ix86_arch_string, prefix, suffix, sw);
3519 ix86_arch_mask = 1u << ix86_arch;
3520 for (i = 0; i < X86_ARCH_LAST; ++i)
3521 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3523 for (i = 0; i < pta_size; i++)
3524 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3526 ix86_schedule = processor_alias_table[i].schedule;
3527 ix86_tune = processor_alias_table[i].processor;
3528 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3530 if (!(processor_alias_table[i].flags & PTA_64BIT))
3532 if (ix86_tune_defaulted)
3534 opts->x_ix86_tune_string = "x86-64";
3535 for (i = 0; i < pta_size; i++)
3536 if (! strcmp (opts->x_ix86_tune_string,
3537 processor_alias_table[i].name))
3538 break;
3539 ix86_schedule = processor_alias_table[i].schedule;
3540 ix86_tune = processor_alias_table[i].processor;
3542 else
3543 error ("CPU you selected does not support x86-64 "
3544 "instruction set");
3547 /* Intel CPUs have always interpreted SSE prefetch instructions as
3548 NOPs; so, we can enable SSE prefetch instructions even when
3549 -mtune (rather than -march) points us to a processor that has them.
3550 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3551 higher processors. */
3552 if (TARGET_CMOV
3553 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3554 x86_prefetch_sse = true;
3555 break;
3558 if (ix86_tune_specified && i == pta_size)
3559 error ("bad value (%s) for %stune=%s %s",
3560 opts->x_ix86_tune_string, prefix, suffix, sw);
3562 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3564 #ifndef USE_IX86_FRAME_POINTER
3565 #define USE_IX86_FRAME_POINTER 0
3566 #endif
3568 #ifndef USE_X86_64_FRAME_POINTER
3569 #define USE_X86_64_FRAME_POINTER 0
3570 #endif
3572 /* Set the default values for switches whose default depends on TARGET_64BIT
3573 in case they weren't overwritten by command line options. */
3574 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3576 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3577 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3578 if (opts->x_flag_asynchronous_unwind_tables == 2)
3579 opts->x_flag_unwind_tables
3580 = opts->x_flag_asynchronous_unwind_tables = 1;
3581 if (opts->x_flag_pcc_struct_return == 2)
3582 opts->x_flag_pcc_struct_return = 0;
3584 else
3586 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3587 opts->x_flag_omit_frame_pointer
3588 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3589 if (opts->x_flag_asynchronous_unwind_tables == 2)
3590 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3591 if (opts->x_flag_pcc_struct_return == 2)
3592 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3595 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3596 if (opts->x_optimize_size)
3597 ix86_cost = &ix86_size_cost;
3598 else
3599 ix86_cost = ix86_tune_cost;
3601 /* Arrange to set up i386_stack_locals for all functions. */
3602 init_machine_status = ix86_init_machine_status;
3604 /* Validate -mregparm= value. */
3605 if (opts_set->x_ix86_regparm)
3607 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3608 warning (0, "-mregparm is ignored in 64-bit mode");
3609 if (opts->x_ix86_regparm > REGPARM_MAX)
3611 error ("-mregparm=%d is not between 0 and %d",
3612 opts->x_ix86_regparm, REGPARM_MAX);
3613 opts->x_ix86_regparm = 0;
3616 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3617 opts->x_ix86_regparm = REGPARM_MAX;
3619 /* Default align_* from the processor table. */
3620 if (opts->x_align_loops == 0)
3622 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3623 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3625 if (opts->x_align_jumps == 0)
3627 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3628 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3630 if (opts->x_align_functions == 0)
3632 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3635 /* Provide default for -mbranch-cost= value. */
3636 if (!opts_set->x_ix86_branch_cost)
3637 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3639 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3641 opts->x_target_flags
3642 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3644 /* Enable by default the SSE and MMX builtins. Do allow the user to
3645 explicitly disable any of these. In particular, disabling SSE and
3646 MMX for kernel code is extremely useful. */
3647 if (!ix86_arch_specified)
3648 opts->x_ix86_isa_flags
3649 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3650 | TARGET_SUBTARGET64_ISA_DEFAULT)
3651 & ~opts->x_ix86_isa_flags_explicit);
3653 if (TARGET_RTD_P (opts->x_target_flags))
3654 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3656 else
3658 opts->x_target_flags
3659 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3661 if (!ix86_arch_specified)
3662 opts->x_ix86_isa_flags
3663 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3665 /* i386 ABI does not specify red zone. It still makes sense to use it
3666 when the programmer takes care to keep the stack from being destroyed. */
3667 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3668 opts->x_target_flags |= MASK_NO_RED_ZONE;
3671 /* Keep nonleaf frame pointers. */
3672 if (opts->x_flag_omit_frame_pointer)
3673 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3674 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3675 opts->x_flag_omit_frame_pointer = 1;
3677 /* If we're doing fast math, we don't care about comparison order
3678 wrt NaNs. This lets us use a shorter comparison sequence. */
3679 if (opts->x_flag_finite_math_only)
3680 opts->x_target_flags &= ~MASK_IEEE_FP;
3682 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3683 since the insns won't need emulation. */
3684 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3685 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3687 /* Likewise, if the target doesn't have a 387, or we've specified
3688 software floating point, don't use 387 inline intrinsics. */
3689 if (!TARGET_80387_P (opts->x_target_flags))
3690 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3692 /* Turn on MMX builtins for -msse. */
3693 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3694 opts->x_ix86_isa_flags
3695 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3697 /* Enable SSE prefetch. */
3698 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3699 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3700 x86_prefetch_sse = true;
3702 /* Enable prefetch{,w} instructions for -m3dnow. */
3703 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3704 opts->x_ix86_isa_flags
3705 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3707 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3708 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3709 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3710 opts->x_ix86_isa_flags
3711 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3713 /* Enable lzcnt instruction for -mabm. */
3714 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
3715 opts->x_ix86_isa_flags
3716 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3718 /* Validate -mpreferred-stack-boundary= value or default it to
3719 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3720 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3721 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3723 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3724 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3725 int max = (TARGET_SEH ? 4 : 12);
3727 if (opts->x_ix86_preferred_stack_boundary_arg < min
3728 || opts->x_ix86_preferred_stack_boundary_arg > max)
3730 if (min == max)
3731 error ("-mpreferred-stack-boundary is not supported "
3732 "for this target");
3733 else
3734 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3735 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3737 else
3738 ix86_preferred_stack_boundary
3739 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
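/* For example, -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 128 bits, i.e. a 16-byte-aligned stack, which matches the alignment the
   x86-64 ABI requires. */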
3742 /* Set the default value for -mstackrealign. */
3743 if (opts->x_ix86_force_align_arg_pointer == -1)
3744 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3746 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3748 /* Validate -mincoming-stack-boundary= value or default it to
3749 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3750 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3751 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3753 if (ix86_incoming_stack_boundary_arg
3754 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3755 || ix86_incoming_stack_boundary_arg > 12)
3756 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3757 ix86_incoming_stack_boundary_arg,
3758 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3759 else
3761 ix86_user_incoming_stack_boundary
3762 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3763 ix86_incoming_stack_boundary
3764 = ix86_user_incoming_stack_boundary;
3768 /* Accept -msseregparm only if at least SSE support is enabled. */
3769 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3770 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3771 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3773 if (opts_set->x_ix86_fpmath)
3775 if (opts->x_ix86_fpmath & FPMATH_SSE)
3777 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3779 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3780 opts->x_ix86_fpmath = FPMATH_387;
3782 else if ((opts->x_ix86_fpmath & FPMATH_387)
3783 && !TARGET_80387_P (opts->x_target_flags))
3785 warning (0, "387 instruction set disabled, using SSE arithmetics");
3786 opts->x_ix86_fpmath = FPMATH_SSE;
3790 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3791 fpmath=387. The latter is however the default on many targets, since the
3792 extra 80-bit precision of temporaries is considered to be part of the ABI.
3793 Overwrite the default at least for -ffast-math.
3794 TODO: -mfpmath=both seems to produce similarly performing code with
3795 slightly smaller binaries. It is however not clear if register allocation
3796 is ready for this setting.
3797 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
3798 codegen. We may switch to 387 with -ffast-math for size optimized
3799 functions. */
3800 else if (fast_math_flags_set_p (&global_options)
3801 && TARGET_SSE2)
3802 ix86_fpmath = FPMATH_SSE;
3803 else
3804 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3806 /* If the i387 is disabled, then do not return values in it. */
3807 if (!TARGET_80387_P (opts->x_target_flags))
3808 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3810 /* Use external vectorized library in vectorizing intrinsics. */
3811 if (opts_set->x_ix86_veclibabi_type)
3812 switch (opts->x_ix86_veclibabi_type)
3814 case ix86_veclibabi_type_svml:
3815 ix86_veclib_handler = ix86_veclibabi_svml;
3816 break;
3818 case ix86_veclibabi_type_acml:
3819 ix86_veclib_handler = ix86_veclibabi_acml;
3820 break;
3822 default:
3823 gcc_unreachable ();
3826 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3827 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3828 && !opts->x_optimize_size)
3829 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3831 /* If stack probes are required, the space used for large function
3832 arguments on the stack must also be probed, so enable
3833 -maccumulate-outgoing-args so this happens in the prologue. */
3834 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3835 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3837 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3838 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3839 "for correctness", prefix, suffix);
3840 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3843 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3845 char *p;
3846 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3847 p = strchr (internal_label_prefix, 'X');
3848 internal_label_prefix_len = p - internal_label_prefix;
3849 *p = '\0';
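/* The label is requested with prefix "LX", so the generated string contains
   a literal 'X'; the characters before that 'X' form the target's internal
   label prefix, and overwriting the 'X' with a NUL terminates the prefix. */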
3852 /* When scheduling description is not available, disable scheduler pass
3853 so it won't slow down the compilation and make x87 code slower. */
3854 if (!TARGET_SCHEDULE)
3855 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3857 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3858 ix86_tune_cost->simultaneous_prefetches,
3859 opts->x_param_values,
3860 opts_set->x_param_values);
3861 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3862 ix86_tune_cost->prefetch_block,
3863 opts->x_param_values,
3864 opts_set->x_param_values);
3865 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3866 ix86_tune_cost->l1_cache_size,
3867 opts->x_param_values,
3868 opts_set->x_param_values);
3869 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3870 ix86_tune_cost->l2_cache_size,
3871 opts->x_param_values,
3872 opts_set->x_param_values);
3874 /* Enable sw prefetching at -O3 for CPUs where prefetching is helpful. */
3875 if (opts->x_flag_prefetch_loop_arrays < 0
3876 && HAVE_prefetch
3877 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3878 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3879 opts->x_flag_prefetch_loop_arrays = 1;
3881 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3882 can be optimized to ap = __builtin_next_arg (0). */
3883 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3884 targetm.expand_builtin_va_start = NULL;
3886 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3888 ix86_gen_leave = gen_leave_rex64;
3889 if (Pmode == DImode)
3891 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3892 ix86_gen_tls_local_dynamic_base_64
3893 = gen_tls_local_dynamic_base_64_di;
3895 else
3897 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3898 ix86_gen_tls_local_dynamic_base_64
3899 = gen_tls_local_dynamic_base_64_si;
3902 else
3903 ix86_gen_leave = gen_leave;
3905 if (Pmode == DImode)
3907 ix86_gen_add3 = gen_adddi3;
3908 ix86_gen_sub3 = gen_subdi3;
3909 ix86_gen_sub3_carry = gen_subdi3_carry;
3910 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3911 ix86_gen_andsp = gen_anddi3;
3912 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3913 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3914 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3915 ix86_gen_monitor = gen_sse3_monitor_di;
3917 else
3919 ix86_gen_add3 = gen_addsi3;
3920 ix86_gen_sub3 = gen_subsi3;
3921 ix86_gen_sub3_carry = gen_subsi3_carry;
3922 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3923 ix86_gen_andsp = gen_andsi3;
3924 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3925 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3926 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3927 ix86_gen_monitor = gen_sse3_monitor_si;
3930 #ifdef USE_IX86_CLD
3931 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3932 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3933 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
3934 #endif
3936 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
3938 if (opts->x_flag_fentry > 0)
3939 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3940 "with -fpic");
3941 opts->x_flag_fentry = 0;
3943 else if (TARGET_SEH)
3945 if (opts->x_flag_fentry == 0)
3946 sorry ("-mno-fentry isn%'t compatible with SEH");
3947 opts->x_flag_fentry = 1;
3949 else if (opts->x_flag_fentry < 0)
3951 #if defined(PROFILE_BEFORE_PROLOGUE)
3952 opts->x_flag_fentry = 1;
3953 #else
3954 opts->x_flag_fentry = 0;
3955 #endif
3958 /* When not optimizing for size, enable vzeroupper optimization for
3959 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3960 AVX unaligned load/store. */
3961 if (!opts->x_optimize_size)
3963 if (flag_expensive_optimizations
3964 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
3965 opts->x_target_flags |= MASK_VZEROUPPER;
3966 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
3967 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3968 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3969 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
3970 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3971 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3972 /* Enable 128-bit AVX instruction generation
3973 for the auto-vectorizer. */
3974 if (TARGET_AVX128_OPTIMAL
3975 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
3976 opts->x_target_flags |= MASK_PREFER_AVX128;
3979 if (opts->x_ix86_recip_name)
3981 char *p = ASTRDUP (opts->x_ix86_recip_name);
3982 char *q;
3983 unsigned int mask, i;
3984 bool invert;
3986 while ((q = strtok (p, ",")) != NULL)
3988 p = NULL;
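/* p is cleared above so that subsequent strtok calls continue scanning
   the same comma-separated -mrecip= string. */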
3989 if (*q == '!')
3991 invert = true;
3992 q++;
3994 else
3995 invert = false;
3997 if (!strcmp (q, "default"))
3998 mask = RECIP_MASK_ALL;
3999 else
4001 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4002 if (!strcmp (q, recip_options[i].string))
4004 mask = recip_options[i].mask;
4005 break;
4008 if (i == ARRAY_SIZE (recip_options))
4010 error ("unknown option for -mrecip=%s", q);
4011 invert = false;
4012 mask = RECIP_MASK_NONE;
4016 opts->x_recip_mask_explicit |= mask;
4017 if (invert)
4018 opts->x_recip_mask &= ~mask;
4019 else
4020 opts->x_recip_mask |= mask;
4024 if (TARGET_RECIP_P (opts->x_target_flags))
4025 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4026 else if (opts_set->x_target_flags & MASK_RECIP)
4027 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4029 /* Default long double to 64-bit for Bionic. */
4030 if (TARGET_HAS_BIONIC
4031 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4032 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4034 /* Save the initial options in case the user specifies function specific
4035 options later. */
4036 if (main_args_p)
4037 target_option_default_node = target_option_current_node
4038 = build_target_option_node (opts);
4040 /* Handle stack protector */
4041 if (!opts_set->x_ix86_stack_protector_guard)
4042 opts->x_ix86_stack_protector_guard
4043 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4045 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4046 if (opts->x_ix86_tune_memcpy_strategy)
4048 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4049 ix86_parse_stringop_strategy_string (str, false);
4050 free (str);
4053 if (opts->x_ix86_tune_memset_strategy)
4055 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4056 ix86_parse_stringop_strategy_string (str, true);
4057 free (str);
4061 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4063 static void
4064 ix86_option_override (void)
4066 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4067 static struct register_pass_info insert_vzeroupper_info
4068 = { pass_insert_vzeroupper, "reload",
4069 1, PASS_POS_INSERT_AFTER
4072 ix86_option_override_internal (true, &global_options, &global_options_set);
4075 /* This needs to be done at start up. It's convenient to do it here. */
4076 register_pass (&insert_vzeroupper_info);
4079 /* Update register usage after having seen the compiler flags. */
4081 static void
4082 ix86_conditional_register_usage (void)
4084 int i, c_mask;
4085 unsigned int j;
4087 /* The PIC register, if it exists, is fixed. */
4088 j = PIC_OFFSET_TABLE_REGNUM;
4089 if (j != INVALID_REGNUM)
4090 fixed_regs[j] = call_used_regs[j] = 1;
4092 /* For 32-bit targets, squash the REX registers. */
4093 if (! TARGET_64BIT)
4095 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4096 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4097 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4098 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4099 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4100 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4103 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4104 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4105 : TARGET_64BIT ? (1 << 2)
4106 : (1 << 1));
4108 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4110 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4112 /* Set/reset conditionally defined registers from
4113 CALL_USED_REGISTERS initializer. */
4114 if (call_used_regs[i] > 1)
4115 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
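/* Entries greater than 1 in the CALL_USED_REGISTERS initializer encode a
   per-ABI bit mask rather than a plain boolean; the register is treated as
   call-used only if the bit selected by c_mask above is set. */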
4117 /* Calculate registers of CLOBBERED_REGS register set
4118 as call used registers from GENERAL_REGS register set. */
4119 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4120 && call_used_regs[i])
4121 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4124 /* If MMX is disabled, squash the registers. */
4125 if (! TARGET_MMX)
4126 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4127 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4128 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4130 /* If SSE is disabled, squash the registers. */
4131 if (! TARGET_SSE)
4132 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4133 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4134 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4136 /* If the FPU is disabled, squash the registers. */
4137 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4138 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4139 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4140 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4142 /* If AVX512F is disabled, squash the registers. */
4143 if (! TARGET_AVX512F)
4145 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4146 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4148 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4149 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4154 /* Save the current options */
4156 static void
4157 ix86_function_specific_save (struct cl_target_option *ptr,
4158 struct gcc_options *opts)
4160 ptr->arch = ix86_arch;
4161 ptr->schedule = ix86_schedule;
4162 ptr->tune = ix86_tune;
4163 ptr->branch_cost = ix86_branch_cost;
4164 ptr->tune_defaulted = ix86_tune_defaulted;
4165 ptr->arch_specified = ix86_arch_specified;
4166 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4167 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4168 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4170 /* The fields are char but the variables are not; make sure the
4171 values fit in the fields. */
4172 gcc_assert (ptr->arch == ix86_arch);
4173 gcc_assert (ptr->schedule == ix86_schedule);
4174 gcc_assert (ptr->tune == ix86_tune);
4175 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4178 /* Restore the current options */
4180 static void
4181 ix86_function_specific_restore (struct gcc_options *opts,
4182 struct cl_target_option *ptr)
4184 enum processor_type old_tune = ix86_tune;
4185 enum processor_type old_arch = ix86_arch;
4186 unsigned int ix86_arch_mask;
4187 int i;
4189 ix86_arch = (enum processor_type) ptr->arch;
4190 ix86_schedule = (enum attr_cpu) ptr->schedule;
4191 ix86_tune = (enum processor_type) ptr->tune;
4192 opts->x_ix86_branch_cost = ptr->branch_cost;
4193 ix86_tune_defaulted = ptr->tune_defaulted;
4194 ix86_arch_specified = ptr->arch_specified;
4195 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4196 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4197 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4199 /* Recreate the arch feature tests if the arch changed */
4200 if (old_arch != ix86_arch)
4202 ix86_arch_mask = 1u << ix86_arch;
4203 for (i = 0; i < X86_ARCH_LAST; ++i)
4204 ix86_arch_features[i]
4205 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4208 /* Recreate the tune optimization tests */
4209 if (old_tune != ix86_tune)
4210 set_ix86_tune_features (ix86_tune, false);
4213 /* Print the current options */
4215 static void
4216 ix86_function_specific_print (FILE *file, int indent,
4217 struct cl_target_option *ptr)
4219 char *target_string
4220 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4221 NULL, NULL, ptr->x_ix86_fpmath, false);
4223 fprintf (file, "%*sarch = %d (%s)\n",
4224 indent, "",
4225 ptr->arch,
4226 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4227 ? cpu_names[ptr->arch]
4228 : "<unknown>"));
4230 fprintf (file, "%*stune = %d (%s)\n",
4231 indent, "",
4232 ptr->tune,
4233 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4234 ? cpu_names[ptr->tune]
4235 : "<unknown>"));
4237 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4239 if (target_string)
4241 fprintf (file, "%*s%s\n", indent, "", target_string);
4242 free (target_string);
4247 /* Inner function to process the attribute((target(...))), take an argument and
4248 set the current options from the argument. If we have a list, recursively go
4249 over the list. */
4251 static bool
4252 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4253 struct gcc_options *opts,
4254 struct gcc_options *opts_set,
4255 struct gcc_options *enum_opts_set)
4257 char *next_optstr;
4258 bool ret = true;
4260 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4261 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4262 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4263 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4264 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4266 enum ix86_opt_type
4268 ix86_opt_unknown,
4269 ix86_opt_yes,
4270 ix86_opt_no,
4271 ix86_opt_str,
4272 ix86_opt_enum,
4273 ix86_opt_isa
4276 static const struct
4278 const char *string;
4279 size_t len;
4280 enum ix86_opt_type type;
4281 int opt;
4282 int mask;
4283 } attrs[] = {
4284 /* isa options */
4285 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4286 IX86_ATTR_ISA ("abm", OPT_mabm),
4287 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4288 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4289 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4290 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4291 IX86_ATTR_ISA ("aes", OPT_maes),
4292 IX86_ATTR_ISA ("avx", OPT_mavx),
4293 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4294 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4295 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4296 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4297 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4298 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4299 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4300 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4301 IX86_ATTR_ISA ("sse", OPT_msse),
4302 IX86_ATTR_ISA ("sse2", OPT_msse2),
4303 IX86_ATTR_ISA ("sse3", OPT_msse3),
4304 IX86_ATTR_ISA ("sse4", OPT_msse4),
4305 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4306 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4307 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4308 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4309 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4310 IX86_ATTR_ISA ("fma", OPT_mfma),
4311 IX86_ATTR_ISA ("xop", OPT_mxop),
4312 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4313 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4314 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4315 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4316 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4317 IX86_ATTR_ISA ("hle", OPT_mhle),
4318 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4319 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4320 IX86_ATTR_ISA ("adx", OPT_madx),
4321 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4322 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4323 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4325 /* enum options */
4326 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4328 /* string options */
4329 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4330 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4332 /* flag options */
4333 IX86_ATTR_YES ("cld",
4334 OPT_mcld,
4335 MASK_CLD),
4337 IX86_ATTR_NO ("fancy-math-387",
4338 OPT_mfancy_math_387,
4339 MASK_NO_FANCY_MATH_387),
4341 IX86_ATTR_YES ("ieee-fp",
4342 OPT_mieee_fp,
4343 MASK_IEEE_FP),
4345 IX86_ATTR_YES ("inline-all-stringops",
4346 OPT_minline_all_stringops,
4347 MASK_INLINE_ALL_STRINGOPS),
4349 IX86_ATTR_YES ("inline-stringops-dynamically",
4350 OPT_minline_stringops_dynamically,
4351 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4353 IX86_ATTR_NO ("align-stringops",
4354 OPT_mno_align_stringops,
4355 MASK_NO_ALIGN_STRINGOPS),
4357 IX86_ATTR_YES ("recip",
4358 OPT_mrecip,
4359 MASK_RECIP),
4363 /* If this is a list, recurse to get the options. */
4364 if (TREE_CODE (args) == TREE_LIST)
4366 bool ret = true;
4368 for (; args; args = TREE_CHAIN (args))
4369 if (TREE_VALUE (args)
4370 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4371 p_strings, opts, opts_set,
4372 enum_opts_set))
4373 ret = false;
4375 return ret;
4378 else if (TREE_CODE (args) != STRING_CST)
4380 error ("attribute %<target%> argument not a string");
4381 return false;
4384 /* Handle multiple arguments separated by commas. */
4385 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4387 while (next_optstr && *next_optstr != '\0')
4389 char *p = next_optstr;
4390 char *orig_p = p;
4391 char *comma = strchr (next_optstr, ',');
4392 const char *opt_string;
4393 size_t len, opt_len;
4394 int opt;
4395 bool opt_set_p;
4396 char ch;
4397 unsigned i;
4398 enum ix86_opt_type type = ix86_opt_unknown;
4399 int mask = 0;
4401 if (comma)
4403 *comma = '\0';
4404 len = comma - next_optstr;
4405 next_optstr = comma + 1;
4407 else
4409 len = strlen (p);
4410 next_optstr = NULL;
4413 /* Recognize no-xxx. */
4414 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4416 opt_set_p = false;
4417 p += 3;
4418 len -= 3;
4420 else
4421 opt_set_p = true;
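/* String and enum options ("arch=", "tune=", "fpmath=") are matched as a
   prefix below, since a value follows the '='; ISA and flag options must
   match the attribute string exactly. */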
4423 /* Find the option. */
4424 ch = *p;
4425 opt = N_OPTS;
4426 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4428 type = attrs[i].type;
4429 opt_len = attrs[i].len;
4430 if (ch == attrs[i].string[0]
4431 && ((type != ix86_opt_str && type != ix86_opt_enum)
4432 ? len == opt_len
4433 : len > opt_len)
4434 && memcmp (p, attrs[i].string, opt_len) == 0)
4436 opt = attrs[i].opt;
4437 mask = attrs[i].mask;
4438 opt_string = attrs[i].string;
4439 break;
4443 /* Process the option. */
4444 if (opt == N_OPTS)
4446 error ("attribute(target(\"%s\")) is unknown", orig_p);
4447 ret = false;
4450 else if (type == ix86_opt_isa)
4452 struct cl_decoded_option decoded;
4454 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4455 ix86_handle_option (opts, opts_set,
4456 &decoded, input_location);
4459 else if (type == ix86_opt_yes || type == ix86_opt_no)
4461 if (type == ix86_opt_no)
4462 opt_set_p = !opt_set_p;
4464 if (opt_set_p)
4465 opts->x_target_flags |= mask;
4466 else
4467 opts->x_target_flags &= ~mask;
4470 else if (type == ix86_opt_str)
4472 if (p_strings[opt])
4474 error ("option(\"%s\") was already specified", opt_string);
4475 ret = false;
4477 else
4478 p_strings[opt] = xstrdup (p + opt_len);
4481 else if (type == ix86_opt_enum)
4483 bool arg_ok;
4484 int value;
4486 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4487 if (arg_ok)
4488 set_option (opts, enum_opts_set, opt, value,
4489 p + opt_len, DK_UNSPECIFIED, input_location,
4490 global_dc);
4491 else
4493 error ("attribute(target(\"%s\")) is unknown", orig_p);
4494 ret = false;
4498 else
4499 gcc_unreachable ();
4502 return ret;
4505 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4507 tree
4508 ix86_valid_target_attribute_tree (tree args,
4509 struct gcc_options *opts,
4510 struct gcc_options *opts_set)
4512 const char *orig_arch_string = ix86_arch_string;
4513 const char *orig_tune_string = ix86_tune_string;
4514 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4515 int orig_tune_defaulted = ix86_tune_defaulted;
4516 int orig_arch_specified = ix86_arch_specified;
4517 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4518 tree t = NULL_TREE;
4519 int i;
4520 struct cl_target_option *def
4521 = TREE_TARGET_OPTION (target_option_default_node);
4522 struct gcc_options enum_opts_set;
4524 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4526 /* Process each of the options on the chain. */
4527 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4528 opts_set, &enum_opts_set))
4529 return error_mark_node;
4531 /* If the changed options are different from the default, rerun
4532 ix86_option_override_internal, and then save the options away.
4533 The string options are attribute options, and will be undone
4534 when we copy the save structure. */
4535 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4536 || opts->x_target_flags != def->x_target_flags
4537 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4538 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4539 || enum_opts_set.x_ix86_fpmath)
4541 /* If we are using the default tune= or arch=, undo the string assigned,
4542 and use the default. */
4543 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4544 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4545 else if (!orig_arch_specified)
4546 opts->x_ix86_arch_string = NULL;
4548 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4549 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4550 else if (orig_tune_defaulted)
4551 opts->x_ix86_tune_string = NULL;
4553 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4554 if (enum_opts_set.x_ix86_fpmath)
4555 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4556 else if (!TARGET_64BIT && TARGET_SSE)
4558 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4559 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4562 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4563 ix86_option_override_internal (false, opts, opts_set);
4565 /* Add any builtin functions with the new isa if any. */
4566 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4568 /* Save the current options unless we are validating options for
4569 #pragma. */
4570 t = build_target_option_node (opts);
4572 opts->x_ix86_arch_string = orig_arch_string;
4573 opts->x_ix86_tune_string = orig_tune_string;
4574 opts_set->x_ix86_fpmath = orig_fpmath_set;
4576 /* Free up memory allocated to hold the strings */
4577 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4578 free (option_strings[i]);
4581 return t;
4584 /* Hook to validate attribute((target("string"))). */
4586 static bool
4587 ix86_valid_target_attribute_p (tree fndecl,
4588 tree ARG_UNUSED (name),
4589 tree args,
4590 int ARG_UNUSED (flags))
4592 struct gcc_options func_options;
4593 tree new_target, new_optimize;
4594 bool ret = true;
4596 /* attribute((target("default"))) does nothing, beyond
4597 affecting multi-versioning. */
4598 if (TREE_VALUE (args)
4599 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4600 && TREE_CHAIN (args) == NULL_TREE
4601 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4602 return true;
4604 tree old_optimize = build_optimization_node (&global_options);
4606 /* Get the optimization options of the current function. */
4607 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4609 if (!func_optimize)
4610 func_optimize = old_optimize;
4612 /* Init func_options. */
4613 memset (&func_options, 0, sizeof (func_options));
4614 init_options_struct (&func_options, NULL);
4615 lang_hooks.init_options_struct (&func_options);
4617 cl_optimization_restore (&func_options,
4618 TREE_OPTIMIZATION (func_optimize));
4620 /* Initialize func_options to the default before its target options can
4621 be set. */
4622 cl_target_option_restore (&func_options,
4623 TREE_TARGET_OPTION (target_option_default_node));
4625 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4626 &global_options_set);
4628 new_optimize = build_optimization_node (&func_options);
4630 if (new_target == error_mark_node)
4631 ret = false;
4633 else if (fndecl && new_target)
4635 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4637 if (old_optimize != new_optimize)
4638 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4641 return ret;
4645 /* Hook to determine if one function can safely inline another. */
4647 static bool
4648 ix86_can_inline_p (tree caller, tree callee)
4650 bool ret = false;
4651 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4652 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4654 /* If callee has no option attributes, then it is ok to inline. */
4655 if (!callee_tree)
4656 ret = true;
4658 /* If caller has no option attributes, but callee does then it is not ok to
4659 inline. */
4660 else if (!caller_tree)
4661 ret = false;
4663 else
4665 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4666 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4668 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4669 function can inline an SSE2 function but an SSE2 function can't inline an SSE4
4670 function. */
4671 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4672 != callee_opts->x_ix86_isa_flags)
4673 ret = false;
4675 /* See if we have the same non-isa options. */
4676 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4677 ret = false;
4679 /* See if arch, tune, etc. are the same. */
4680 else if (caller_opts->arch != callee_opts->arch)
4681 ret = false;
4683 else if (caller_opts->tune != callee_opts->tune)
4684 ret = false;
4686 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4687 ret = false;
4689 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4690 ret = false;
4692 else
4693 ret = true;
4696 return ret;
4700 /* Remember the last target of ix86_set_current_function. */
4701 static GTY(()) tree ix86_previous_fndecl;
4703 /* Invalidate ix86_previous_fndecl cache. */
4704 void
4705 ix86_reset_previous_fndecl (void)
4707 ix86_previous_fndecl = NULL_TREE;
4710 /* Establish appropriate back-end context for processing the function
4711 FNDECL. The argument might be NULL to indicate processing at top
4712 level, outside of any function scope. */
4713 static void
4714 ix86_set_current_function (tree fndecl)
4716 /* Only change the context if the function changes. This hook is called
4717 several times in the course of compiling a function, and we don't want to
4718 slow things down too much or call target_reinit when it isn't safe. */
4719 if (fndecl && fndecl != ix86_previous_fndecl)
4721 tree old_tree = (ix86_previous_fndecl
4722 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4723 : NULL_TREE);
4725 tree new_tree = (fndecl
4726 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4727 : NULL_TREE);
4729 ix86_previous_fndecl = fndecl;
4730 if (old_tree == new_tree)
4733 else if (new_tree)
4735 cl_target_option_restore (&global_options,
4736 TREE_TARGET_OPTION (new_tree));
4737 target_reinit ();
4740 else if (old_tree)
4742 struct cl_target_option *def
4743 = TREE_TARGET_OPTION (target_option_current_node);
4745 cl_target_option_restore (&global_options, def);
4746 target_reinit ();
4752 /* Return true if this goes in large data/bss. */
4754 static bool
4755 ix86_in_large_data_p (tree exp)
4757 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4758 return false;
4760 /* Functions are never large data. */
4761 if (TREE_CODE (exp) == FUNCTION_DECL)
4762 return false;
4764 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4766 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4767 if (strcmp (section, ".ldata") == 0
4768 || strcmp (section, ".lbss") == 0)
4769 return true;
4770 return false;
4772 else
4774 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4776 /* If this is an incomplete type with size 0, then we can't put it
4777 in data because it might be too big when completed. */
4778 if (!size || size > ix86_section_threshold)
4779 return true;
4782 return false;
4785 /* Switch to the appropriate section for output of DECL.
4786 DECL is either a `VAR_DECL' node or a constant of some sort.
4787 RELOC indicates whether forming the initial value of DECL requires
4788 link-time relocations. */
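/* Illustrative note (hypothetical example): under -mcmodel=medium, objects
   larger than the -mlarge-data-threshold= limit are routed into the
   ".l"-prefixed sections chosen below, e.g.

     static double big[1 << 20];              goes to .lbss
     static double tab[1 << 20] = { 1.0 };    goes to .ldata

   while small objects keep the default ELF sections.  */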
4790 ATTRIBUTE_UNUSED static section *
4791 x86_64_elf_select_section (tree decl, int reloc,
4792 unsigned HOST_WIDE_INT align)
4794 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4795 && ix86_in_large_data_p (decl))
4797 const char *sname = NULL;
4798 unsigned int flags = SECTION_WRITE;
4799 switch (categorize_decl_for_section (decl, reloc))
4801 case SECCAT_DATA:
4802 sname = ".ldata";
4803 break;
4804 case SECCAT_DATA_REL:
4805 sname = ".ldata.rel";
4806 break;
4807 case SECCAT_DATA_REL_LOCAL:
4808 sname = ".ldata.rel.local";
4809 break;
4810 case SECCAT_DATA_REL_RO:
4811 sname = ".ldata.rel.ro";
4812 break;
4813 case SECCAT_DATA_REL_RO_LOCAL:
4814 sname = ".ldata.rel.ro.local";
4815 break;
4816 case SECCAT_BSS:
4817 sname = ".lbss";
4818 flags |= SECTION_BSS;
4819 break;
4820 case SECCAT_RODATA:
4821 case SECCAT_RODATA_MERGE_STR:
4822 case SECCAT_RODATA_MERGE_STR_INIT:
4823 case SECCAT_RODATA_MERGE_CONST:
4824 sname = ".lrodata";
4825 flags = 0;
4826 break;
4827 case SECCAT_SRODATA:
4828 case SECCAT_SDATA:
4829 case SECCAT_SBSS:
4830 gcc_unreachable ();
4831 case SECCAT_TEXT:
4832 case SECCAT_TDATA:
4833 case SECCAT_TBSS:
4834 /* We don't split these for the medium model. Place them into
4835 default sections and hope for the best. */
4836 break;
4838 if (sname)
4840 /* We might get called with string constants, but get_named_section
4841 doesn't like them as they are not DECLs. Also, we need to set
4842 flags in that case. */
4843 if (!DECL_P (decl))
4844 return get_section (sname, flags, NULL);
4845 return get_named_section (decl, sname, reloc);
4848 return default_elf_select_section (decl, reloc, align);
4851 /* Select a set of attributes for section NAME based on the properties
4852 of DECL and whether or not RELOC indicates that DECL's initializer
4853 might contain runtime relocations. */
4855 static unsigned int ATTRIBUTE_UNUSED
4856 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4858 unsigned int flags = default_section_type_flags (decl, name, reloc);
4860 if (decl == NULL_TREE
4861 && (strcmp (name, ".ldata.rel.ro") == 0
4862 || strcmp (name, ".ldata.rel.ro.local") == 0))
4863 flags |= SECTION_RELRO;
4865 if (strcmp (name, ".lbss") == 0
4866 || strncmp (name, ".lbss.", 6) == 0
4867 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4868 flags |= SECTION_BSS;
4870 return flags;
4873 /* Build up a unique section name, expressed as a
4874 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4875 RELOC indicates whether the initial value of EXP requires
4876 link-time relocations. */
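/* Illustrative note (hypothetical object name): with -fdata-sections and
   -mcmodel=medium, a large object "tbl" gets a unique section name built
   from the prefixes below, e.g. ".ldata.tbl", or ".gnu.linkonce.ld.tbl"
   when one-only semantics are needed and COMDAT groups are unavailable.  */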
4878 static void ATTRIBUTE_UNUSED
4879 x86_64_elf_unique_section (tree decl, int reloc)
4881 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4882 && ix86_in_large_data_p (decl))
4884 const char *prefix = NULL;
4885 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4886 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4888 switch (categorize_decl_for_section (decl, reloc))
4890 case SECCAT_DATA:
4891 case SECCAT_DATA_REL:
4892 case SECCAT_DATA_REL_LOCAL:
4893 case SECCAT_DATA_REL_RO:
4894 case SECCAT_DATA_REL_RO_LOCAL:
4895 prefix = one_only ? ".ld" : ".ldata";
4896 break;
4897 case SECCAT_BSS:
4898 prefix = one_only ? ".lb" : ".lbss";
4899 break;
4900 case SECCAT_RODATA:
4901 case SECCAT_RODATA_MERGE_STR:
4902 case SECCAT_RODATA_MERGE_STR_INIT:
4903 case SECCAT_RODATA_MERGE_CONST:
4904 prefix = one_only ? ".lr" : ".lrodata";
4905 break;
4906 case SECCAT_SRODATA:
4907 case SECCAT_SDATA:
4908 case SECCAT_SBSS:
4909 gcc_unreachable ();
4910 case SECCAT_TEXT:
4911 case SECCAT_TDATA:
4912 case SECCAT_TBSS:
4913 /* We don't split these for the medium model. Place them into
4914 default sections and hope for the best. */
4915 break;
4917 if (prefix)
4919 const char *name, *linkonce;
4920 char *string;
4922 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4923 name = targetm.strip_name_encoding (name);
4925 /* If we're using one_only, then there needs to be a .gnu.linkonce
4926 prefix to the section name. */
4927 linkonce = one_only ? ".gnu.linkonce" : "";
4929 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4931 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4932 return;
4935 default_unique_section (decl, reloc);
4938 #ifdef COMMON_ASM_OP
4939 /* This says how to output assembler code to declare an
4940 uninitialized external linkage data object.
4942 For medium model x86-64 we need to use the .largecomm directive for
4943 large objects. */
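/* Illustrative output (hypothetical symbol and sizes): for a 1 MiB common
   symbol "buf" with 32-byte alignment under -mcmodel=medium this emits

     .largecomm buf,1048576,32

   whereas a small object keeps the ordinary .comm directive.  */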
4944 void
4945 x86_elf_aligned_common (FILE *file,
4946 const char *name, unsigned HOST_WIDE_INT size,
4947 int align)
4949 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4950 && size > (unsigned int)ix86_section_threshold)
4951 fputs (".largecomm\t", file);
4952 else
4953 fputs (COMMON_ASM_OP, file);
4954 assemble_name (file, name);
4955 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4956 size, align / BITS_PER_UNIT);
4958 #endif
4960 /* Utility function for targets to use in implementing
4961 ASM_OUTPUT_ALIGNED_BSS. */
4963 void
4964 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4965 const char *name, unsigned HOST_WIDE_INT size,
4966 int align)
4968 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4969 && size > (unsigned int)ix86_section_threshold)
4970 switch_to_section (get_named_section (decl, ".lbss", 0));
4971 else
4972 switch_to_section (bss_section);
4973 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4974 #ifdef ASM_DECLARE_OBJECT_NAME
4975 last_assemble_variable_decl = decl;
4976 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4977 #else
4978 /* Standard thing is just output label for the object. */
4979 ASM_OUTPUT_LABEL (file, name);
4980 #endif /* ASM_DECLARE_OBJECT_NAME */
4981 ASM_OUTPUT_SKIP (file, size ? size : 1);
4984 /* Decide whether we must probe the stack before any space allocation
4985 on this target. It's essentially TARGET_STACK_PROBE except when
4986 -fstack-check causes the stack to be already probed differently. */
4988 bool
4989 ix86_target_stack_probe (void)
4991 /* Do not probe the stack twice if static stack checking is enabled. */
4992 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4993 return false;
4995 return TARGET_STACK_PROBE;
4998 /* Decide whether we can make a sibling call to a function. DECL is the
4999 declaration of the function being targeted by the call and EXP is the
5000 CALL_EXPR representing the call. */
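/* Illustrative sketch of the first rejection below (hypothetical example):
   in 32-bit PIC code,

     extern int bar (int);
     int foo (int x) { return bar (x); }

   the tail call to bar cannot become a sibcall, because the call goes
   through the PLT and the PLT sequence requires %ebx to hold the GOT
   pointer.  */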
5002 static bool
5003 ix86_function_ok_for_sibcall (tree decl, tree exp)
5005 tree type, decl_or_type;
5006 rtx a, b;
5008 /* If we are generating position-independent code, we cannot sibcall
5009 optimize any indirect call, or a direct call to a global function,
5010 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5011 if (!TARGET_MACHO
5012 && !TARGET_64BIT
5013 && flag_pic
5014 && (!decl || !targetm.binds_local_p (decl)))
5015 return false;
5017 /* If we need to align the outgoing stack, then sibcalling would
5018 unalign the stack, which may break the called function. */
5019 if (ix86_minimum_incoming_stack_boundary (true)
5020 < PREFERRED_STACK_BOUNDARY)
5021 return false;
5023 if (decl)
5025 decl_or_type = decl;
5026 type = TREE_TYPE (decl);
5028 else
5030 /* We're looking at the CALL_EXPR, we need the type of the function. */
5031 type = CALL_EXPR_FN (exp); /* pointer expression */
5032 type = TREE_TYPE (type); /* pointer type */
5033 type = TREE_TYPE (type); /* function type */
5034 decl_or_type = type;
5037 /* Check that the return value locations are the same. Like
5038 if we are returning floats on the 80387 register stack, we cannot
5039 make a sibcall from a function that doesn't return a float to a
5040 function that does or, conversely, from a function that does return
5041 a float to a function that doesn't; the necessary stack adjustment
5042 would not be executed. This is also the place we notice
5043 differences in the return value ABI. Note that it is ok for one
5044 of the functions to have void return type as long as the return
5045 value of the other is passed in a register. */
5046 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5047 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5048 cfun->decl, false);
5049 if (STACK_REG_P (a) || STACK_REG_P (b))
5051 if (!rtx_equal_p (a, b))
5052 return false;
5054 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5056 else if (!rtx_equal_p (a, b))
5057 return false;
5059 if (TARGET_64BIT)
5061 /* The SYSV ABI has more call-clobbered registers;
5062 disallow sibcalls from MS to SYSV. */
5063 if (cfun->machine->call_abi == MS_ABI
5064 && ix86_function_type_abi (type) == SYSV_ABI)
5065 return false;
5067 else
5069 /* If this call is indirect, we'll need to be able to use a
5070 call-clobbered register for the address of the target function.
5071 Make sure that all such registers are not used for passing
5072 parameters. Note that DLLIMPORT functions are indirect. */
5073 if (!decl
5074 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5076 if (ix86_function_regparm (type, NULL) >= 3)
5078 /* ??? Need to count the actual number of registers to be used,
5079 not the possible number of registers. Fix later. */
5080 return false;
5085 /* Otherwise okay. That also includes certain types of indirect calls. */
5086 return true;
5089 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5090 and "sseregparm" calling convention attributes;
5091 arguments as in struct attribute_spec.handler. */
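/* Illustrative sketch (hypothetical declarations): the handler below
   accepts

     int f (int, int) __attribute__ ((regparm (3)));

   rejects conflicting combinations such as

     int g (int, int) __attribute__ ((fastcall, regparm (2)));

   warns when the regparm argument exceeds REGPARM_MAX, and on 64-bit
   targets ignores these attributes with a warning unless the MS ABI is
   being emulated.  */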
5093 static tree
5094 ix86_handle_cconv_attribute (tree *node, tree name,
5095 tree args,
5096 int flags ATTRIBUTE_UNUSED,
5097 bool *no_add_attrs)
5099 if (TREE_CODE (*node) != FUNCTION_TYPE
5100 && TREE_CODE (*node) != METHOD_TYPE
5101 && TREE_CODE (*node) != FIELD_DECL
5102 && TREE_CODE (*node) != TYPE_DECL)
5104 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5105 name);
5106 *no_add_attrs = true;
5107 return NULL_TREE;
5110 /* Can combine regparm with all attributes but fastcall, and thiscall. */
5111 if (is_attribute_p ("regparm", name))
5113 tree cst;
5115 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5117 error ("fastcall and regparm attributes are not compatible");
5120 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5122 error ("regparam and thiscall attributes are not compatible");
5125 cst = TREE_VALUE (args);
5126 if (TREE_CODE (cst) != INTEGER_CST)
5128 warning (OPT_Wattributes,
5129 "%qE attribute requires an integer constant argument",
5130 name);
5131 *no_add_attrs = true;
5133 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5135 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5136 name, REGPARM_MAX);
5137 *no_add_attrs = true;
5140 return NULL_TREE;
5143 if (TARGET_64BIT)
5145 /* Do not warn when emulating the MS ABI. */
5146 if ((TREE_CODE (*node) != FUNCTION_TYPE
5147 && TREE_CODE (*node) != METHOD_TYPE)
5148 || ix86_function_type_abi (*node) != MS_ABI)
5149 warning (OPT_Wattributes, "%qE attribute ignored",
5150 name);
5151 *no_add_attrs = true;
5152 return NULL_TREE;
5155 /* Can combine fastcall only with sseregparm. */
5156 if (is_attribute_p ("fastcall", name))
5158 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5160 error ("fastcall and cdecl attributes are not compatible");
5162 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5164 error ("fastcall and stdcall attributes are not compatible");
5166 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5168 error ("fastcall and regparm attributes are not compatible");
5170 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5172 error ("fastcall and thiscall attributes are not compatible");
5176 /* Can combine stdcall with regparm and
5177 sseregparm. */
5178 else if (is_attribute_p ("stdcall", name))
5180 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5182 error ("stdcall and cdecl attributes are not compatible");
5184 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5186 error ("stdcall and fastcall attributes are not compatible");
5188 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5190 error ("stdcall and thiscall attributes are not compatible");
5194 /* Can combine cdecl with regparm and sseregparm. */
5195 else if (is_attribute_p ("cdecl", name))
5197 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5199 error ("stdcall and cdecl attributes are not compatible");
5201 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5203 error ("fastcall and cdecl attributes are not compatible");
5205 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5207 error ("cdecl and thiscall attributes are not compatible");
5210 else if (is_attribute_p ("thiscall", name))
5212 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5213 warning (OPT_Wattributes, "%qE attribute is used for a non-class method",
5214 name);
5215 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5217 error ("stdcall and thiscall attributes are not compatible");
5219 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5221 error ("fastcall and thiscall attributes are not compatible");
5223 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5225 error ("cdecl and thiscall attributes are not compatible");
5229 /* Can combine sseregparm with all attributes. */
5231 return NULL_TREE;
5234 /* The transactional memory builtins are implicitly regparm or fastcall
5235 depending on the ABI. Override the generic do-nothing attribute that
5236 these builtins were declared with, and replace it with one of the two
5237 attributes that we expect elsewhere. */
5239 static tree
5240 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5241 tree args ATTRIBUTE_UNUSED,
5242 int flags, bool *no_add_attrs)
5244 tree alt;
5246 /* In no case do we want to add the placeholder attribute. */
5247 *no_add_attrs = true;
5249 /* The 64-bit ABI is unchanged for transactional memory. */
5250 if (TARGET_64BIT)
5251 return NULL_TREE;
5253 /* ??? Is there a better way to validate 32-bit windows? We have
5254 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5255 if (CHECK_STACK_LIMIT > 0)
5256 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5257 else
5259 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5260 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5262 decl_attributes (node, alt, flags);
5264 return NULL_TREE;
5267 /* This function determines from TYPE the calling-convention. */
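/* Illustrative note (hypothetical prototype): for 32-bit code,

     int __attribute__ ((stdcall)) f (int);

   yields IX86_CALLCVT_STDCALL; with -mrtd and no attribute, non-variadic
   functions also default to stdcall, while variadic functions always fall
   back to cdecl.  */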
5269 unsigned int
5270 ix86_get_callcvt (const_tree type)
5272 unsigned int ret = 0;
5273 bool is_stdarg;
5274 tree attrs;
5276 if (TARGET_64BIT)
5277 return IX86_CALLCVT_CDECL;
5279 attrs = TYPE_ATTRIBUTES (type);
5280 if (attrs != NULL_TREE)
5282 if (lookup_attribute ("cdecl", attrs))
5283 ret |= IX86_CALLCVT_CDECL;
5284 else if (lookup_attribute ("stdcall", attrs))
5285 ret |= IX86_CALLCVT_STDCALL;
5286 else if (lookup_attribute ("fastcall", attrs))
5287 ret |= IX86_CALLCVT_FASTCALL;
5288 else if (lookup_attribute ("thiscall", attrs))
5289 ret |= IX86_CALLCVT_THISCALL;
5291 /* Regparm isn't allowed for thiscall and fastcall. */
5292 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5294 if (lookup_attribute ("regparm", attrs))
5295 ret |= IX86_CALLCVT_REGPARM;
5296 if (lookup_attribute ("sseregparm", attrs))
5297 ret |= IX86_CALLCVT_SSEREGPARM;
5300 if (IX86_BASE_CALLCVT(ret) != 0)
5301 return ret;
5304 is_stdarg = stdarg_p (type);
5305 if (TARGET_RTD && !is_stdarg)
5306 return IX86_CALLCVT_STDCALL | ret;
5308 if (ret != 0
5309 || is_stdarg
5310 || TREE_CODE (type) != METHOD_TYPE
5311 || ix86_function_type_abi (type) != MS_ABI)
5312 return IX86_CALLCVT_CDECL | ret;
5314 return IX86_CALLCVT_THISCALL;
5317 /* Return 0 if the attributes for two types are incompatible, 1 if they
5318 are compatible, and 2 if they are nearly compatible (which causes a
5319 warning to be generated). */
5321 static int
5322 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5324 unsigned int ccvt1, ccvt2;
5326 if (TREE_CODE (type1) != FUNCTION_TYPE
5327 && TREE_CODE (type1) != METHOD_TYPE)
5328 return 1;
5330 ccvt1 = ix86_get_callcvt (type1);
5331 ccvt2 = ix86_get_callcvt (type2);
5332 if (ccvt1 != ccvt2)
5333 return 0;
5334 if (ix86_function_regparm (type1, NULL)
5335 != ix86_function_regparm (type2, NULL))
5336 return 0;
5338 return 1;
5341 /* Return the regparm value for a function with the indicated TYPE and DECL.
5342 DECL may be NULL when calling function indirectly
5343 or considering a libcall. */
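/* Illustrative worked example (hypothetical declaration): in 32-bit code

     int f (int a, int b, int c) __attribute__ ((regparm (3)));

   yields 3; a fastcall type yields 2 and a thiscall type yields 1.  A
   plain local function compiled with optimization may be promoted to a
   register convention automatically, minus any registers claimed by fixed
   register variables.  */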
5345 static int
5346 ix86_function_regparm (const_tree type, const_tree decl)
5348 tree attr;
5349 int regparm;
5350 unsigned int ccvt;
5352 if (TARGET_64BIT)
5353 return (ix86_function_type_abi (type) == SYSV_ABI
5354 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5355 ccvt = ix86_get_callcvt (type);
5356 regparm = ix86_regparm;
5358 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5360 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5361 if (attr)
5363 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5364 return regparm;
5367 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5368 return 2;
5369 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5370 return 1;
5372 /* Use register calling convention for local functions when possible. */
5373 if (decl
5374 && TREE_CODE (decl) == FUNCTION_DECL
5375 && optimize
5376 && !(profile_flag && !flag_fentry))
5378 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5379 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5380 if (i && i->local && i->can_change_signature)
5382 int local_regparm, globals = 0, regno;
5384 /* Make sure no regparm register is taken by a
5385 fixed register variable. */
5386 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5387 if (fixed_regs[local_regparm])
5388 break;
5390 /* We don't want to use regparm(3) for nested functions as
5391 these use a static chain pointer in the third argument. */
5392 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5393 local_regparm = 2;
5395 /* In 32-bit mode save a register for the split stack. */
5396 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5397 local_regparm = 2;
5399 /* Each fixed register usage increases register pressure,
5400 so fewer registers should be used for argument passing.
5401 This functionality can be overridden by an explicit
5402 regparm value. */
5403 for (regno = AX_REG; regno <= DI_REG; regno++)
5404 if (fixed_regs[regno])
5405 globals++;
5407 local_regparm
5408 = globals < local_regparm ? local_regparm - globals : 0;
5410 if (local_regparm > regparm)
5411 regparm = local_regparm;
5415 return regparm;
5418 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5419 DFmode (2) arguments in SSE registers for a function with the
5420 indicated TYPE and DECL. DECL may be NULL when calling function
5421 indirectly or considering a libcall. Otherwise return 0. */
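/* Illustrative note (hypothetical declaration): a 32-bit prototype such as

     double f (float, double) __attribute__ ((sseregparm));

   has its float and double arguments passed in SSE registers when SSE is
   enabled; without SSE the attribute provokes the hard error emitted
   below.  */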
5423 static int
5424 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5426 gcc_assert (!TARGET_64BIT);
5428 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5429 by the sseregparm attribute. */
5430 if (TARGET_SSEREGPARM
5431 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5433 if (!TARGET_SSE)
5435 if (warn)
5437 if (decl)
5438 error ("calling %qD with attribute sseregparm without "
5439 "SSE/SSE2 enabled", decl);
5440 else
5441 error ("calling %qT with attribute sseregparm without "
5442 "SSE/SSE2 enabled", type);
5444 return 0;
5447 return 2;
5450 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5451 (and DFmode for SSE2) arguments in SSE registers. */
5452 if (decl && TARGET_SSE_MATH && optimize
5453 && !(profile_flag && !flag_fentry))
5455 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5456 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5457 if (i && i->local && i->can_change_signature)
5458 return TARGET_SSE2 ? 2 : 1;
5461 return 0;
5464 /* Return true if EAX is live at the start of the function. Used by
5465 ix86_expand_prologue to determine if we need special help before
5466 calling allocate_stack_worker. */
5468 static bool
5469 ix86_eax_live_at_start_p (void)
5471 /* Cheat. Don't bother working forward from ix86_function_regparm
5472 to the function type to whether an actual argument is located in
5473 eax. Instead just look at cfg info, which is still close enough
5474 to correct at this point. This gives false positives for broken
5475 functions that might use uninitialized data that happens to be
5476 allocated in eax, but who cares? */
5477 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5480 static bool
5481 ix86_keep_aggregate_return_pointer (tree fntype)
5483 tree attr;
5485 if (!TARGET_64BIT)
5487 attr = lookup_attribute ("callee_pop_aggregate_return",
5488 TYPE_ATTRIBUTES (fntype));
5489 if (attr)
5490 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5492 /* For 32-bit MS-ABI the default is to keep aggregate
5493 return pointer. */
5494 if (ix86_function_type_abi (fntype) == MS_ABI)
5495 return true;
5497 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5500 /* Value is the number of bytes of arguments automatically
5501 popped when returning from a subroutine call.
5502 FUNDECL is the declaration node of the function (as a tree),
5503 FUNTYPE is the data type of the function (as a tree),
5504 or for a library call it is an identifier node for the subroutine name.
5505 SIZE is the number of bytes of arguments passed on the stack.
5507 On the 80386, the RTD insn may be used to pop them if the number
5508 of args is fixed, but if the number is variable then the caller
5509 must pop them all. RTD can't be used for library calls now
5510 because the library is compiled with the Unix compiler.
5511 Use of RTD is a selectable option, since it is incompatible with
5512 standard Unix calling sequences. If the option is not selected,
5513 the caller must always pop the args.
5515 The attribute stdcall is equivalent to RTD on a per module basis. */
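/* Illustrative worked example (hypothetical declaration): a 32-bit

     void __attribute__ ((stdcall)) f (int a, int b, int c);

   receives 12 bytes of stack arguments and pops them itself, so this hook
   returns 12 and the callee returns with "ret $12"; a variadic or plain
   cdecl function yields 0 and the caller pops.  */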
5517 static int
5518 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5520 unsigned int ccvt;
5522 /* None of the 64-bit ABIs pop arguments. */
5523 if (TARGET_64BIT)
5524 return 0;
5526 ccvt = ix86_get_callcvt (funtype);
5528 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5529 | IX86_CALLCVT_THISCALL)) != 0
5530 && ! stdarg_p (funtype))
5531 return size;
5533 /* Lose any fake structure return argument if it is passed on the stack. */
5534 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5535 && !ix86_keep_aggregate_return_pointer (funtype))
5537 int nregs = ix86_function_regparm (funtype, fundecl);
5538 if (nregs == 0)
5539 return GET_MODE_SIZE (Pmode);
5542 return 0;
5545 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5547 static bool
5548 ix86_legitimate_combined_insn (rtx insn)
5550 /* Check operand constraints in case hard registers were propagated
5551 into insn pattern. This check prevents combine pass from
5552 generating insn patterns with invalid hard register operands.
5553 These invalid insns can eventually confuse reload to error out
5554 with a spill failure. See also PRs 46829 and 46843. */
5555 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5557 int i;
5559 extract_insn (insn);
5560 preprocess_constraints ();
5562 for (i = 0; i < recog_data.n_operands; i++)
5564 rtx op = recog_data.operand[i];
5565 enum machine_mode mode = GET_MODE (op);
5566 struct operand_alternative *op_alt;
5567 int offset = 0;
5568 bool win;
5569 int j;
5571 /* A unary operator may be accepted by the predicate, but it
5572 is irrelevant for matching constraints. */
5573 if (UNARY_P (op))
5574 op = XEXP (op, 0);
5576 if (GET_CODE (op) == SUBREG)
5578 if (REG_P (SUBREG_REG (op))
5579 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5580 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5581 GET_MODE (SUBREG_REG (op)),
5582 SUBREG_BYTE (op),
5583 GET_MODE (op));
5584 op = SUBREG_REG (op);
5587 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5588 continue;
5590 op_alt = recog_op_alt[i];
5592 /* Operand has no constraints, anything is OK. */
5593 win = !recog_data.n_alternatives;
5595 for (j = 0; j < recog_data.n_alternatives; j++)
5597 if (op_alt[j].anything_ok
5598 || (op_alt[j].matches != -1
5599 && operands_match_p
5600 (recog_data.operand[i],
5601 recog_data.operand[op_alt[j].matches]))
5602 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5604 win = true;
5605 break;
5609 if (!win)
5610 return false;
5614 return true;
5617 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
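/* Illustrative note: AddressSanitizer maps an application address A to its
   shadow byte at (A >> 3) + offset, so with the non-Darwin LP64 value
   below the shadow of A is read at (A >> 3) + 0x7fff8000.  */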
5619 static unsigned HOST_WIDE_INT
5620 ix86_asan_shadow_offset (void)
5622 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5623 : HOST_WIDE_INT_C (0x7fff8000))
5624 : (HOST_WIDE_INT_1 << 29);
5627 /* Argument support functions. */
5629 /* Return true when register may be used to pass function parameters. */
5630 bool
5631 ix86_function_arg_regno_p (int regno)
5633 int i;
5634 const int *parm_regs;
5636 if (!TARGET_64BIT)
5638 if (TARGET_MACHO)
5639 return (regno < REGPARM_MAX
5640 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5641 else
5642 return (regno < REGPARM_MAX
5643 || (TARGET_MMX && MMX_REGNO_P (regno)
5644 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5645 || (TARGET_SSE && SSE_REGNO_P (regno)
5646 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5649 if (TARGET_SSE && SSE_REGNO_P (regno)
5650 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5651 return true;
5653 /* TODO: The function should depend on current function ABI but
5654 builtins.c would need updating then. Therefore we use the
5655 default ABI. */
5657 /* RAX is used as hidden argument to va_arg functions. */
5658 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5659 return true;
5661 if (ix86_abi == MS_ABI)
5662 parm_regs = x86_64_ms_abi_int_parameter_registers;
5663 else
5664 parm_regs = x86_64_int_parameter_registers;
5665 for (i = 0; i < (ix86_abi == MS_ABI
5666 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5667 if (regno == parm_regs[i])
5668 return true;
5669 return false;
5672 /* Return true if we do not know how to pass TYPE solely in registers. */
5674 static bool
5675 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5677 if (must_pass_in_stack_var_size_or_pad (mode, type))
5678 return true;
5680 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5681 The layout_type routine is crafty and tries to trick us into passing
5682 currently unsupported vector types on the stack by using TImode. */
5683 return (!TARGET_64BIT && mode == TImode
5684 && type && TREE_CODE (type) != VECTOR_TYPE);
5687 /* Return the size, in bytes, of the area reserved for arguments passed
5688 in registers for the function represented by FNDECL, depending on the
5689 ABI format used. */
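/* Illustrative note: under the 64-bit MS ABI the caller always reserves a
   32-byte home area above the return address, enough to spill the four
   register arguments RCX, RDX, R8 and R9; the SysV ABI reserves nothing,
   hence the 32/0 split below.  */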
5691 ix86_reg_parm_stack_space (const_tree fndecl)
5693 enum calling_abi call_abi = SYSV_ABI;
5694 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5695 call_abi = ix86_function_abi (fndecl);
5696 else
5697 call_abi = ix86_function_type_abi (fndecl);
5698 if (TARGET_64BIT && call_abi == MS_ABI)
5699 return 32;
5700 return 0;
5703 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5704 call abi used. */
5705 enum calling_abi
5706 ix86_function_type_abi (const_tree fntype)
5708 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5710 enum calling_abi abi = ix86_abi;
5711 if (abi == SYSV_ABI)
5713 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5714 abi = MS_ABI;
5716 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5717 abi = SYSV_ABI;
5718 return abi;
5720 return ix86_abi;
5723 /* We add this as a workaround in order to use libc_has_function
5724 hook in i386.md. */
5725 bool
5726 ix86_libc_has_function (enum function_class fn_class)
5728 return targetm.libc_has_function (fn_class);
5731 static bool
5732 ix86_function_ms_hook_prologue (const_tree fn)
5734 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5736 if (decl_function_context (fn) != NULL_TREE)
5737 error_at (DECL_SOURCE_LOCATION (fn),
5738 "ms_hook_prologue is not compatible with nested function");
5739 else
5740 return true;
5742 return false;
5745 static enum calling_abi
5746 ix86_function_abi (const_tree fndecl)
5748 if (! fndecl)
5749 return ix86_abi;
5750 return ix86_function_type_abi (TREE_TYPE (fndecl));
5753 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5754 call abi used. */
5755 enum calling_abi
5756 ix86_cfun_abi (void)
5758 if (! cfun)
5759 return ix86_abi;
5760 return cfun->machine->call_abi;
5763 /* Write the extra assembler code needed to declare a function properly. */
5765 void
5766 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5767 tree decl)
5769 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5771 if (is_ms_hook)
5773 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5774 unsigned int filler_cc = 0xcccccccc;
5776 for (i = 0; i < filler_count; i += 4)
5777 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5780 #ifdef SUBTARGET_ASM_UNWIND_INIT
5781 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5782 #endif
5784 ASM_OUTPUT_LABEL (asm_out_file, fname);
5786 /* Output magic byte marker, if hot-patch attribute is set. */
5787 if (is_ms_hook)
5789 if (TARGET_64BIT)
5791 /* leaq [%rsp + 0], %rsp */
5792 asm_fprintf (asm_out_file, ASM_BYTE
5793 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5795 else
5797 /* movl.s %edi, %edi
5798 push %ebp
5799 movl.s %esp, %ebp */
5800 asm_fprintf (asm_out_file, ASM_BYTE
5801 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5806 /* regclass.c */
5807 extern void init_regs (void);
5809 /* Implementation of call abi switching target hook. Specific to FNDECL
5810 the specific call register sets are set. See also
5811 ix86_conditional_register_usage for more details. */
5812 void
5813 ix86_call_abi_override (const_tree fndecl)
5815 if (fndecl == NULL_TREE)
5816 cfun->machine->call_abi = ix86_abi;
5817 else
5818 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5821 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5822 expensive re-initialization of init_regs each time we switch function context
5823 since this is needed only during RTL expansion. */
5824 static void
5825 ix86_maybe_switch_abi (void)
5827 if (TARGET_64BIT &&
5828 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5829 reinit_regs ();
5832 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5833 for a call to a function whose data type is FNTYPE.
5834 For a library call, FNTYPE is 0. */
5836 void
5837 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5838 tree fntype, /* tree ptr for function decl */
5839 rtx libname, /* SYMBOL_REF of library name or 0 */
5840 tree fndecl,
5841 int caller)
5843 struct cgraph_local_info *i;
5845 memset (cum, 0, sizeof (*cum));
5847 if (fndecl)
5849 i = cgraph_local_info (fndecl);
5850 cum->call_abi = ix86_function_abi (fndecl);
5852 else
5854 i = NULL;
5855 cum->call_abi = ix86_function_type_abi (fntype);
5858 cum->caller = caller;
5860 /* Set up the number of registers to use for passing arguments. */
5862 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5863 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5864 "or subtarget optimization implying it");
5865 cum->nregs = ix86_regparm;
5866 if (TARGET_64BIT)
5868 cum->nregs = (cum->call_abi == SYSV_ABI
5869 ? X86_64_REGPARM_MAX
5870 : X86_64_MS_REGPARM_MAX);
5872 if (TARGET_SSE)
5874 cum->sse_nregs = SSE_REGPARM_MAX;
5875 if (TARGET_64BIT)
5877 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5878 ? X86_64_SSE_REGPARM_MAX
5879 : X86_64_MS_SSE_REGPARM_MAX);
5882 if (TARGET_MMX)
5883 cum->mmx_nregs = MMX_REGPARM_MAX;
5884 cum->warn_avx = true;
5885 cum->warn_sse = true;
5886 cum->warn_mmx = true;
5888 /* Because the type might mismatch between caller and callee, we need to
5889 use the actual type of the function for local calls.
5890 FIXME: cgraph_analyze can be told to actually record whether a function
5891 uses va_start, so for local functions maybe_vaarg can be made more
5892 aggressive, helping K&R code.
5893 FIXME: once the type system is fixed, we won't need this code anymore. */
5894 if (i && i->local && i->can_change_signature)
5895 fntype = TREE_TYPE (fndecl);
5896 cum->maybe_vaarg = (fntype
5897 ? (!prototype_p (fntype) || stdarg_p (fntype))
5898 : !libname);
5900 if (!TARGET_64BIT)
5902 /* If there are variable arguments, then we won't pass anything
5903 in registers in 32-bit mode. */
5904 if (stdarg_p (fntype))
5906 cum->nregs = 0;
5907 cum->sse_nregs = 0;
5908 cum->mmx_nregs = 0;
5909 cum->warn_avx = 0;
5910 cum->warn_sse = 0;
5911 cum->warn_mmx = 0;
5912 return;
5915 /* Use ecx and edx registers if function has fastcall attribute,
5916 else look for regparm information. */
5917 if (fntype)
5919 unsigned int ccvt = ix86_get_callcvt (fntype);
5920 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5922 cum->nregs = 1;
5923 cum->fastcall = 1; /* Same first register as in fastcall. */
5925 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5927 cum->nregs = 2;
5928 cum->fastcall = 1;
5930 else
5931 cum->nregs = ix86_function_regparm (fntype, fndecl);
5934 /* Set up the number of SSE registers used for passing SFmode
5935 and DFmode arguments. Warn for mismatching ABI. */
5936 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5940 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5941 But in the case of vector types, it is some vector mode.
5943 When we have only some of our vector isa extensions enabled, then there
5944 are some modes for which vector_mode_supported_p is false. For these
5945 modes, the generic vector support in gcc will choose some non-vector mode
5946 in order to implement the type. By computing the natural mode, we'll
5947 select the proper ABI location for the operand and not depend on whatever
5948 the middle-end decides to do with these vector types.
5950 The middle-end can't deal with vector types > 16 bytes. In this
5951 case, we return the original mode and warn of the ABI change if CUM isn't
5952 NULL. */
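/* Illustrative example (hypothetical typedef): for

     typedef int v8si __attribute__ ((vector_size (32)));

   the natural mode is V8SImode when AVX is enabled; without AVX the loop
   below still identifies V8SImode, but the function returns the type's
   original non-vector mode and warns, at most once, that the ABI of such
   arguments changes.  */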
5954 static enum machine_mode
5955 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5957 enum machine_mode mode = TYPE_MODE (type);
5959 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5961 HOST_WIDE_INT size = int_size_in_bytes (type);
5962 if ((size == 8 || size == 16 || size == 32)
5963 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5964 && TYPE_VECTOR_SUBPARTS (type) > 1)
5966 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5968 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5969 mode = MIN_MODE_VECTOR_FLOAT;
5970 else
5971 mode = MIN_MODE_VECTOR_INT;
5973 /* Get the mode which has this inner mode and number of units. */
5974 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5975 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5976 && GET_MODE_INNER (mode) == innermode)
5978 if (size == 32 && !TARGET_AVX)
5980 static bool warnedavx;
5982 if (cum
5983 && !warnedavx
5984 && cum->warn_avx)
5986 warnedavx = true;
5987 warning (0, "AVX vector argument without AVX "
5988 "enabled changes the ABI");
5990 return TYPE_MODE (type);
5992 else if ((size == 8 || size == 16) && !TARGET_SSE)
5994 static bool warnedsse;
5996 if (cum
5997 && !warnedsse
5998 && cum->warn_sse)
6000 warnedsse = true;
6001 warning (0, "SSE vector argument without SSE "
6002 "enabled changes the ABI");
6004 return mode;
6006 else
6007 return mode;
6010 gcc_unreachable ();
6014 return mode;
6017 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6018 this may not agree with the mode that the type system has chosen for the
6019 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6020 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6022 static rtx
6023 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6024 unsigned int regno)
6026 rtx tmp;
6028 if (orig_mode != BLKmode)
6029 tmp = gen_rtx_REG (orig_mode, regno);
6030 else
6032 tmp = gen_rtx_REG (mode, regno);
6033 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6034 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6037 return tmp;
6040 /* x86-64 register passing implementation. See the x86-64 ABI for details. The goal
6041 of this code is to classify each 8 bytes of an incoming argument by register
6042 class and assign registers accordingly. */
6044 /* Return the union class of CLASS1 and CLASS2.
6045 See the x86-64 PS ABI for details. */
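/* Illustrative example: merging X86_64_INTEGERSI_CLASS with
   X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS, which is how a
   hypothetical struct { int i; float f; } gets its single eightbyte
   classified as integer and passed in one general register.  */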
6047 static enum x86_64_reg_class
6048 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6050 /* Rule #1: If both classes are equal, this is the resulting class. */
6051 if (class1 == class2)
6052 return class1;
6054 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6055 the other class. */
6056 if (class1 == X86_64_NO_CLASS)
6057 return class2;
6058 if (class2 == X86_64_NO_CLASS)
6059 return class1;
6061 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6062 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6063 return X86_64_MEMORY_CLASS;
6065 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6066 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6067 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6068 return X86_64_INTEGERSI_CLASS;
6069 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6070 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6071 return X86_64_INTEGER_CLASS;
6073 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6074 MEMORY is used. */
6075 if (class1 == X86_64_X87_CLASS
6076 || class1 == X86_64_X87UP_CLASS
6077 || class1 == X86_64_COMPLEX_X87_CLASS
6078 || class2 == X86_64_X87_CLASS
6079 || class2 == X86_64_X87UP_CLASS
6080 || class2 == X86_64_COMPLEX_X87_CLASS)
6081 return X86_64_MEMORY_CLASS;
6083 /* Rule #6: Otherwise class SSE is used. */
6084 return X86_64_SSE_CLASS;
6087 /* Classify the argument of type TYPE and mode MODE.
6088 CLASSES will be filled by the register class used to pass each word
6089 of the operand. The number of words is returned. In case the parameter
6090 should be passed in memory, 0 is returned. As a special case for zero
6091 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6093 BIT_OFFSET is used internally for handling records and specifies the
6094 offset in bits modulo 256 to avoid overflow cases.
6096 See the x86-64 PS ABI for details. */
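/* Illustrative worked example (hypothetical struct): on x86-64

     struct s { double d; int i; };

   occupies 16 bytes; the first eightbyte classifies as SSE (the double)
   and the second as INTEGER (the int), so the argument travels in one SSE
   register and one general register.  A struct containing a long double
   classifies as X87/X87UP and is therefore passed in memory.  */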
6099 static int
6100 classify_argument (enum machine_mode mode, const_tree type,
6101 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6103 HOST_WIDE_INT bytes =
6104 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6105 int words
6106 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6108 /* Variable sized entities are always passed/returned in memory. */
6109 if (bytes < 0)
6110 return 0;
6112 if (mode != VOIDmode
6113 && targetm.calls.must_pass_in_stack (mode, type))
6114 return 0;
6116 if (type && AGGREGATE_TYPE_P (type))
6118 int i;
6119 tree field;
6120 enum x86_64_reg_class subclasses[MAX_CLASSES];
6122 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6123 if (bytes > 32)
6124 return 0;
6126 for (i = 0; i < words; i++)
6127 classes[i] = X86_64_NO_CLASS;
6129 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
6130 signal the memory class, so handle this as a special case. */
6131 if (!words)
6133 classes[0] = X86_64_NO_CLASS;
6134 return 1;
6137 /* Classify each field of record and merge classes. */
6138 switch (TREE_CODE (type))
6140 case RECORD_TYPE:
6141 /* And now merge the fields of structure. */
6142 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6144 if (TREE_CODE (field) == FIELD_DECL)
6146 int num;
6148 if (TREE_TYPE (field) == error_mark_node)
6149 continue;
6151 /* Bitfields are always classified as integer. Handle them
6152 early, since later code would consider them to be
6153 misaligned integers. */
6154 if (DECL_BIT_FIELD (field))
6156 for (i = (int_bit_position (field)
6157 + (bit_offset % 64)) / 8 / 8;
6158 i < ((int_bit_position (field) + (bit_offset % 64))
6159 + tree_low_cst (DECL_SIZE (field), 0)
6160 + 63) / 8 / 8; i++)
6161 classes[i] =
6162 merge_classes (X86_64_INTEGER_CLASS,
6163 classes[i]);
6165 else
6167 int pos;
6169 type = TREE_TYPE (field);
6171 /* Flexible array member is ignored. */
6172 if (TYPE_MODE (type) == BLKmode
6173 && TREE_CODE (type) == ARRAY_TYPE
6174 && TYPE_SIZE (type) == NULL_TREE
6175 && TYPE_DOMAIN (type) != NULL_TREE
6176 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6177 == NULL_TREE))
6179 static bool warned;
6181 if (!warned && warn_psabi)
6183 warned = true;
6184 inform (input_location,
6185 "the ABI of passing struct with"
6186 " a flexible array member has"
6187 " changed in GCC 4.4");
6189 continue;
6191 num = classify_argument (TYPE_MODE (type), type,
6192 subclasses,
6193 (int_bit_position (field)
6194 + bit_offset) % 256);
6195 if (!num)
6196 return 0;
6197 pos = (int_bit_position (field)
6198 + (bit_offset % 64)) / 8 / 8;
6199 for (i = 0; i < num && (i + pos) < words; i++)
6200 classes[i + pos] =
6201 merge_classes (subclasses[i], classes[i + pos]);
6205 break;
6207 case ARRAY_TYPE:
6208 /* Arrays are handled as small records. */
6210 int num;
6211 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6212 TREE_TYPE (type), subclasses, bit_offset);
6213 if (!num)
6214 return 0;
6216 /* The partial classes are now full classes. */
6217 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6218 subclasses[0] = X86_64_SSE_CLASS;
6219 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6220 && !((bit_offset % 64) == 0 && bytes == 4))
6221 subclasses[0] = X86_64_INTEGER_CLASS;
6223 for (i = 0; i < words; i++)
6224 classes[i] = subclasses[i % num];
6226 break;
6228 case UNION_TYPE:
6229 case QUAL_UNION_TYPE:
6230 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6232 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6234 if (TREE_CODE (field) == FIELD_DECL)
6236 int num;
6238 if (TREE_TYPE (field) == error_mark_node)
6239 continue;
6241 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6242 TREE_TYPE (field), subclasses,
6243 bit_offset);
6244 if (!num)
6245 return 0;
6246 for (i = 0; i < num; i++)
6247 classes[i] = merge_classes (subclasses[i], classes[i]);
6250 break;
6252 default:
6253 gcc_unreachable ();
6256 if (words > 2)
6258 /* When the size exceeds 16 bytes, if the first eightbyte isn't
6259 X86_64_SSE_CLASS or any of the others isn't
6260 X86_64_SSEUP_CLASS, everything should be passed in
6261 memory. */
6262 if (classes[0] != X86_64_SSE_CLASS)
6263 return 0;
6265 for (i = 1; i < words; i++)
6266 if (classes[i] != X86_64_SSEUP_CLASS)
6267 return 0;
6270 /* Final merger cleanup. */
6271 for (i = 0; i < words; i++)
6273 /* If one class is MEMORY, everything should be passed in
6274 memory. */
6275 if (classes[i] == X86_64_MEMORY_CLASS)
6276 return 0;
6278 /* X86_64_SSEUP_CLASS should always be preceded by
6279 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6280 if (classes[i] == X86_64_SSEUP_CLASS
6281 && classes[i - 1] != X86_64_SSE_CLASS
6282 && classes[i - 1] != X86_64_SSEUP_CLASS)
6284 /* The first one should never be X86_64_SSEUP_CLASS. */
6285 gcc_assert (i != 0);
6286 classes[i] = X86_64_SSE_CLASS;
6289 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6290 everything should be passed in memory. */
6291 if (classes[i] == X86_64_X87UP_CLASS
6292 && (classes[i - 1] != X86_64_X87_CLASS))
6294 static bool warned;
6296 /* The first one should never be X86_64_X87UP_CLASS. */
6297 gcc_assert (i != 0);
6298 if (!warned && warn_psabi)
6300 warned = true;
6301 inform (input_location,
6302 "the ABI of passing union with long double"
6303 " has changed in GCC 4.4");
6305 return 0;
6308 return words;
6311 /* Compute the alignment needed. We align all types to natural boundaries with
6312 the exception of XFmode, which is aligned to 64 bits. */
6313 if (mode != VOIDmode && mode != BLKmode)
6315 int mode_alignment = GET_MODE_BITSIZE (mode);
6317 if (mode == XFmode)
6318 mode_alignment = 128;
6319 else if (mode == XCmode)
6320 mode_alignment = 256;
6321 if (COMPLEX_MODE_P (mode))
6322 mode_alignment /= 2;
6323 /* Misaligned fields are always returned in memory. */
6324 if (bit_offset % mode_alignment)
6325 return 0;
6328 /* for V1xx modes, just use the base mode */
6329 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6330 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6331 mode = GET_MODE_INNER (mode);
6333 /* Classification of atomic types. */
6334 switch (mode)
6336 case SDmode:
6337 case DDmode:
6338 classes[0] = X86_64_SSE_CLASS;
6339 return 1;
6340 case TDmode:
6341 classes[0] = X86_64_SSE_CLASS;
6342 classes[1] = X86_64_SSEUP_CLASS;
6343 return 2;
6344 case DImode:
6345 case SImode:
6346 case HImode:
6347 case QImode:
6348 case CSImode:
6349 case CHImode:
6350 case CQImode:
6352 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6354 if (size <= 32)
6356 classes[0] = X86_64_INTEGERSI_CLASS;
6357 return 1;
6359 else if (size <= 64)
6361 classes[0] = X86_64_INTEGER_CLASS;
6362 return 1;
6364 else if (size <= 64+32)
6366 classes[0] = X86_64_INTEGER_CLASS;
6367 classes[1] = X86_64_INTEGERSI_CLASS;
6368 return 2;
6370 else if (size <= 64+64)
6372 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6373 return 2;
6375 else
6376 gcc_unreachable ();
6378 case CDImode:
6379 case TImode:
6380 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6381 return 2;
6382 case COImode:
6383 case OImode:
6384 /* OImode shouldn't be used directly. */
6385 gcc_unreachable ();
6386 case CTImode:
6387 return 0;
6388 case SFmode:
6389 if (!(bit_offset % 64))
6390 classes[0] = X86_64_SSESF_CLASS;
6391 else
6392 classes[0] = X86_64_SSE_CLASS;
6393 return 1;
6394 case DFmode:
6395 classes[0] = X86_64_SSEDF_CLASS;
6396 return 1;
6397 case XFmode:
6398 classes[0] = X86_64_X87_CLASS;
6399 classes[1] = X86_64_X87UP_CLASS;
6400 return 2;
6401 case TFmode:
6402 classes[0] = X86_64_SSE_CLASS;
6403 classes[1] = X86_64_SSEUP_CLASS;
6404 return 2;
6405 case SCmode:
6406 classes[0] = X86_64_SSE_CLASS;
6407 if (!(bit_offset % 64))
6408 return 1;
6409 else
6411 static bool warned;
6413 if (!warned && warn_psabi)
6415 warned = true;
6416 inform (input_location,
6417 "the ABI of passing structure with complex float"
6418 " member has changed in GCC 4.4");
6420 classes[1] = X86_64_SSESF_CLASS;
6421 return 2;
6423 case DCmode:
6424 classes[0] = X86_64_SSEDF_CLASS;
6425 classes[1] = X86_64_SSEDF_CLASS;
6426 return 2;
6427 case XCmode:
6428 classes[0] = X86_64_COMPLEX_X87_CLASS;
6429 return 1;
6430 case TCmode:
6431 /* This mode is larger than 16 bytes. */
6432 return 0;
6433 case V8SFmode:
6434 case V8SImode:
6435 case V32QImode:
6436 case V16HImode:
6437 case V4DFmode:
6438 case V4DImode:
6439 classes[0] = X86_64_SSE_CLASS;
6440 classes[1] = X86_64_SSEUP_CLASS;
6441 classes[2] = X86_64_SSEUP_CLASS;
6442 classes[3] = X86_64_SSEUP_CLASS;
6443 return 4;
6444 case V4SFmode:
6445 case V4SImode:
6446 case V16QImode:
6447 case V8HImode:
6448 case V2DFmode:
6449 case V2DImode:
6450 classes[0] = X86_64_SSE_CLASS;
6451 classes[1] = X86_64_SSEUP_CLASS;
6452 return 2;
6453 case V1TImode:
6454 case V1DImode:
6455 case V2SFmode:
6456 case V2SImode:
6457 case V4HImode:
6458 case V8QImode:
6459 classes[0] = X86_64_SSE_CLASS;
6460 return 1;
6461 case BLKmode:
6462 case VOIDmode:
6463 return 0;
6464 default:
6465 gcc_assert (VECTOR_MODE_P (mode));
6467 if (bytes > 16)
6468 return 0;
6470 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6472 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6473 classes[0] = X86_64_INTEGERSI_CLASS;
6474 else
6475 classes[0] = X86_64_INTEGER_CLASS;
6476 classes[1] = X86_64_INTEGER_CLASS;
6477 return 1 + (bytes > 8);
6481 /* Examine the argument and return the number of registers required in each
6482 class. Return 0 iff the parameter should be passed in memory. */
6483 static int
6484 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6485 int *int_nregs, int *sse_nregs)
6487 enum x86_64_reg_class regclass[MAX_CLASSES];
6488 int n = classify_argument (mode, type, regclass, 0);
6490 *int_nregs = 0;
6491 *sse_nregs = 0;
6492 if (!n)
6493 return 0;
6494 for (n--; n >= 0; n--)
6495 switch (regclass[n])
6497 case X86_64_INTEGER_CLASS:
6498 case X86_64_INTEGERSI_CLASS:
6499 (*int_nregs)++;
6500 break;
6501 case X86_64_SSE_CLASS:
6502 case X86_64_SSESF_CLASS:
6503 case X86_64_SSEDF_CLASS:
6504 (*sse_nregs)++;
6505 break;
6506 case X86_64_NO_CLASS:
6507 case X86_64_SSEUP_CLASS:
6508 break;
6509 case X86_64_X87_CLASS:
6510 case X86_64_X87UP_CLASS:
6511 if (!in_return)
6512 return 0;
6513 break;
6514 case X86_64_COMPLEX_X87_CLASS:
6515 return in_return ? 2 : 0;
6516 case X86_64_MEMORY_CLASS:
6517 gcc_unreachable ();
6519 return 1;
6522 /* Construct container for the argument used by GCC interface. See
6523 FUNCTION_ARG for the detailed description. */
6525 static rtx
6526 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6527 const_tree type, int in_return, int nintregs, int nsseregs,
6528 const int *intreg, int sse_regno)
6530 /* The following variables hold the static issued_error state. */
6531 static bool issued_sse_arg_error;
6532 static bool issued_sse_ret_error;
6533 static bool issued_x87_ret_error;
6535 enum machine_mode tmpmode;
6536 int bytes =
6537 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6538 enum x86_64_reg_class regclass[MAX_CLASSES];
6539 int n;
6540 int i;
6541 int nexps = 0;
6542 int needed_sseregs, needed_intregs;
6543 rtx exp[MAX_CLASSES];
6544 rtx ret;
6546 n = classify_argument (mode, type, regclass, 0);
6547 if (!n)
6548 return NULL;
6549 if (!examine_argument (mode, type, in_return, &needed_intregs,
6550 &needed_sseregs))
6551 return NULL;
6552 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6553 return NULL;
6555 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6556 some less clueful developer tries to use floating-point anyway. */
6557 if (needed_sseregs && !TARGET_SSE)
6559 if (in_return)
6561 if (!issued_sse_ret_error)
6563 error ("SSE register return with SSE disabled");
6564 issued_sse_ret_error = true;
6567 else if (!issued_sse_arg_error)
6569 error ("SSE register argument with SSE disabled");
6570 issued_sse_arg_error = true;
6572 return NULL;
6575 /* Likewise, error if the ABI requires us to return values in the
6576 x87 registers and the user specified -mno-80387. */
6577 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6578 for (i = 0; i < n; i++)
6579 if (regclass[i] == X86_64_X87_CLASS
6580 || regclass[i] == X86_64_X87UP_CLASS
6581 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6583 if (!issued_x87_ret_error)
6585 error ("x87 register return with x87 disabled");
6586 issued_x87_ret_error = true;
6588 return NULL;
6591 /* First construct simple cases. Avoid SCmode, since we want to use
6592 a single register to pass this type. */
6593 if (n == 1 && mode != SCmode)
6594 switch (regclass[0])
6596 case X86_64_INTEGER_CLASS:
6597 case X86_64_INTEGERSI_CLASS:
6598 return gen_rtx_REG (mode, intreg[0]);
6599 case X86_64_SSE_CLASS:
6600 case X86_64_SSESF_CLASS:
6601 case X86_64_SSEDF_CLASS:
6602 if (mode != BLKmode)
6603 return gen_reg_or_parallel (mode, orig_mode,
6604 SSE_REGNO (sse_regno));
6605 break;
6606 case X86_64_X87_CLASS:
6607 case X86_64_COMPLEX_X87_CLASS:
6608 return gen_rtx_REG (mode, FIRST_STACK_REG);
6609 case X86_64_NO_CLASS:
6610 /* Zero sized array, struct or class. */
6611 return NULL;
6612 default:
6613 gcc_unreachable ();
6615 if (n == 2
6616 && regclass[0] == X86_64_SSE_CLASS
6617 && regclass[1] == X86_64_SSEUP_CLASS
6618 && mode != BLKmode)
6619 return gen_reg_or_parallel (mode, orig_mode,
6620 SSE_REGNO (sse_regno));
6621 if (n == 4
6622 && regclass[0] == X86_64_SSE_CLASS
6623 && regclass[1] == X86_64_SSEUP_CLASS
6624 && regclass[2] == X86_64_SSEUP_CLASS
6625 && regclass[3] == X86_64_SSEUP_CLASS
6626 && mode != BLKmode)
6627 return gen_reg_or_parallel (mode, orig_mode,
6628 SSE_REGNO (sse_regno));
6629 if (n == 2
6630 && regclass[0] == X86_64_X87_CLASS
6631 && regclass[1] == X86_64_X87UP_CLASS)
6632 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6634 if (n == 2
6635 && regclass[0] == X86_64_INTEGER_CLASS
6636 && regclass[1] == X86_64_INTEGER_CLASS
6637 && (mode == CDImode || mode == TImode || mode == TFmode)
6638 && intreg[0] + 1 == intreg[1])
6639 return gen_rtx_REG (mode, intreg[0]);
6641 /* Otherwise figure out the entries of the PARALLEL. */
6642 for (i = 0; i < n; i++)
6644 int pos;
6646 switch (regclass[i])
6648 case X86_64_NO_CLASS:
6649 break;
6650 case X86_64_INTEGER_CLASS:
6651 case X86_64_INTEGERSI_CLASS:
6652 /* Merge TImodes on aligned occasions here too. */
6653 if (i * 8 + 8 > bytes)
6654 tmpmode
6655 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6656 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6657 tmpmode = SImode;
6658 else
6659 tmpmode = DImode;
6660 /* We've requested a size (e.g. 24 bits) for
6661 which there is no integer mode. Use DImode. */
6662 if (tmpmode == BLKmode)
6663 tmpmode = DImode;
6664 exp [nexps++]
6665 = gen_rtx_EXPR_LIST (VOIDmode,
6666 gen_rtx_REG (tmpmode, *intreg),
6667 GEN_INT (i*8));
6668 intreg++;
6669 break;
6670 case X86_64_SSESF_CLASS:
6671 exp [nexps++]
6672 = gen_rtx_EXPR_LIST (VOIDmode,
6673 gen_rtx_REG (SFmode,
6674 SSE_REGNO (sse_regno)),
6675 GEN_INT (i*8));
6676 sse_regno++;
6677 break;
6678 case X86_64_SSEDF_CLASS:
6679 exp [nexps++]
6680 = gen_rtx_EXPR_LIST (VOIDmode,
6681 gen_rtx_REG (DFmode,
6682 SSE_REGNO (sse_regno)),
6683 GEN_INT (i*8));
6684 sse_regno++;
6685 break;
6686 case X86_64_SSE_CLASS:
6687 pos = i;
6688 switch (n)
6690 case 1:
6691 tmpmode = DImode;
6692 break;
6693 case 2:
6694 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6696 tmpmode = TImode;
6697 i++;
6699 else
6700 tmpmode = DImode;
6701 break;
6702 case 4:
6703 gcc_assert (i == 0
6704 && regclass[1] == X86_64_SSEUP_CLASS
6705 && regclass[2] == X86_64_SSEUP_CLASS
6706 && regclass[3] == X86_64_SSEUP_CLASS);
6707 tmpmode = OImode;
6708 i += 3;
6709 break;
6710 default:
6711 gcc_unreachable ();
6713 exp [nexps++]
6714 = gen_rtx_EXPR_LIST (VOIDmode,
6715 gen_rtx_REG (tmpmode,
6716 SSE_REGNO (sse_regno)),
6717 GEN_INT (pos*8));
6718 sse_regno++;
6719 break;
6720 default:
6721 gcc_unreachable ();
6725 /* Empty aligned struct, union or class. */
6726 if (nexps == 0)
6727 return NULL;
6729 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6730 for (i = 0; i < nexps; i++)
6731 XVECEXP (ret, 0, i) = exp [i];
6732 return ret;
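/* For illustration of what construct_container produces (an informal
   sketch, not an exhaustive description): under the SysV x86-64 ABI a
   small aggregate such as

     struct s { long l; double d; };

   classifies as INTEGER + SSE, so for an argument in the first slots
   the function would typically return a PARALLEL along the lines of

     (parallel [(expr_list (reg:DI rdi) (const_int 0))
                (expr_list (reg:DF xmm0) (const_int 8))])

   i.e. the first eightbyte in an integer register and the second in an
   SSE register; the exact registers depend on INTREG and SSE_REGNO.  */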
6735 /* Update the data in CUM to advance over an argument of mode MODE
6736 and data type TYPE. (TYPE is null for libcalls where that information
6737 may not be available.) */
6739 static void
6740 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6741 const_tree type, HOST_WIDE_INT bytes,
6742 HOST_WIDE_INT words)
6744 switch (mode)
6746 default:
6747 break;
6749 case BLKmode:
6750 if (bytes < 0)
6751 break;
6752 /* FALLTHRU */
6754 case DImode:
6755 case SImode:
6756 case HImode:
6757 case QImode:
6758 cum->words += words;
6759 cum->nregs -= words;
6760 cum->regno += words;
6762 if (cum->nregs <= 0)
6764 cum->nregs = 0;
6765 cum->regno = 0;
6767 break;
6769 case OImode:
6770 /* OImode shouldn't be used directly. */
6771 gcc_unreachable ();
6773 case DFmode:
6774 if (cum->float_in_sse < 2)
6775 break;
6776 case SFmode:
6777 if (cum->float_in_sse < 1)
6778 break;
6779 /* FALLTHRU */
6781 case V8SFmode:
6782 case V8SImode:
6783 case V32QImode:
6784 case V16HImode:
6785 case V4DFmode:
6786 case V4DImode:
6787 case TImode:
6788 case V16QImode:
6789 case V8HImode:
6790 case V4SImode:
6791 case V2DImode:
6792 case V4SFmode:
6793 case V2DFmode:
6794 if (!type || !AGGREGATE_TYPE_P (type))
6796 cum->sse_words += words;
6797 cum->sse_nregs -= 1;
6798 cum->sse_regno += 1;
6799 if (cum->sse_nregs <= 0)
6801 cum->sse_nregs = 0;
6802 cum->sse_regno = 0;
6805 break;
6807 case V8QImode:
6808 case V4HImode:
6809 case V2SImode:
6810 case V2SFmode:
6811 case V1TImode:
6812 case V1DImode:
6813 if (!type || !AGGREGATE_TYPE_P (type))
6815 cum->mmx_words += words;
6816 cum->mmx_nregs -= 1;
6817 cum->mmx_regno += 1;
6818 if (cum->mmx_nregs <= 0)
6820 cum->mmx_nregs = 0;
6821 cum->mmx_regno = 0;
6824 break;
6828 static void
6829 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6830 const_tree type, HOST_WIDE_INT words, bool named)
6832 int int_nregs, sse_nregs;
6834 /* Unnamed 256bit vector mode parameters are passed on stack. */
6835 if (!named && VALID_AVX256_REG_MODE (mode))
6836 return;
6838 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6839 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6841 cum->nregs -= int_nregs;
6842 cum->sse_nregs -= sse_nregs;
6843 cum->regno += int_nregs;
6844 cum->sse_regno += sse_nregs;
6846 else
6848 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6849 cum->words = (cum->words + align - 1) & ~(align - 1);
6850 cum->words += words;
6854 static void
6855 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6856 HOST_WIDE_INT words)
6858 /* Otherwise, this should be passed indirect. */
6859 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6861 cum->words += words;
6862 if (cum->nregs > 0)
6864 cum->nregs -= 1;
6865 cum->regno += 1;
6869 /* Update the data in CUM to advance over an argument of mode MODE and
6870 data type TYPE. (TYPE is null for libcalls where that information
6871 may not be available.) */
6873 static void
6874 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6875 const_tree type, bool named)
6877 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6878 HOST_WIDE_INT bytes, words;
6880 if (mode == BLKmode)
6881 bytes = int_size_in_bytes (type);
6882 else
6883 bytes = GET_MODE_SIZE (mode);
6884 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6886 if (type)
6887 mode = type_natural_mode (type, NULL);
6889 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6890 function_arg_advance_ms_64 (cum, bytes, words);
6891 else if (TARGET_64BIT)
6892 function_arg_advance_64 (cum, mode, type, words, named);
6893 else
6894 function_arg_advance_32 (cum, mode, type, bytes, words);
6897 /* Define where to put the arguments to a function.
6898 Value is zero to push the argument on the stack,
6899 or a hard register in which to store the argument.
6901 MODE is the argument's machine mode.
6902 TYPE is the data type of the argument (as a tree).
6903 This is null for libcalls where that information may
6904 not be available.
6905 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6906 the preceding args and about the function being called.
6907 NAMED is nonzero if this argument is a named parameter
6908 (otherwise it is an extra parameter matching an ellipsis). */
6910 static rtx
6911 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6912 enum machine_mode orig_mode, const_tree type,
6913 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6915 static bool warnedsse, warnedmmx;
6917 /* Avoid the AL settings for the Unix64 ABI. */
6918 if (mode == VOIDmode)
6919 return constm1_rtx;
6921 switch (mode)
6923 default:
6924 break;
6926 case BLKmode:
6927 if (bytes < 0)
6928 break;
6929 /* FALLTHRU */
6930 case DImode:
6931 case SImode:
6932 case HImode:
6933 case QImode:
6934 if (words <= cum->nregs)
6936 int regno = cum->regno;
6938 /* Fastcall allocates the first two DWORD (SImode) or
6939 smaller arguments to ECX and EDX if it isn't an
6940 aggregate type. */
6941 if (cum->fastcall)
6943 if (mode == BLKmode
6944 || mode == DImode
6945 || (type && AGGREGATE_TYPE_P (type)))
6946 break;
6948 /* ECX not EAX is the first allocated register. */
6949 if (regno == AX_REG)
6950 regno = CX_REG;
6952 return gen_rtx_REG (mode, regno);
6954 break;
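/* For illustration of the fastcall case handled above (informal): with

     void __attribute__((fastcall)) f (int a, int b, int c);

   the first two SImode-or-smaller arguments A and B are typically
   passed in %ecx and %edx and C goes on the stack, which is what the
   AX_REG -> CX_REG adjustment implements.  */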
6956 case DFmode:
6957 if (cum->float_in_sse < 2)
6958 break;
6959 case SFmode:
6960 if (cum->float_in_sse < 1)
6961 break;
6962 /* FALLTHRU */
6963 case TImode:
6964 /* In 32bit, we pass TImode in xmm registers. */
6965 case V16QImode:
6966 case V8HImode:
6967 case V4SImode:
6968 case V2DImode:
6969 case V4SFmode:
6970 case V2DFmode:
6971 if (!type || !AGGREGATE_TYPE_P (type))
6973 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6975 warnedsse = true;
6976 warning (0, "SSE vector argument without SSE enabled "
6977 "changes the ABI");
6979 if (cum->sse_nregs)
6980 return gen_reg_or_parallel (mode, orig_mode,
6981 cum->sse_regno + FIRST_SSE_REG);
6983 break;
6985 case OImode:
6986 /* OImode shouldn't be used directly. */
6987 gcc_unreachable ();
6989 case V8SFmode:
6990 case V8SImode:
6991 case V32QImode:
6992 case V16HImode:
6993 case V4DFmode:
6994 case V4DImode:
6995 if (!type || !AGGREGATE_TYPE_P (type))
6997 if (cum->sse_nregs)
6998 return gen_reg_or_parallel (mode, orig_mode,
6999 cum->sse_regno + FIRST_SSE_REG);
7001 break;
7003 case V8QImode:
7004 case V4HImode:
7005 case V2SImode:
7006 case V2SFmode:
7007 case V1TImode:
7008 case V1DImode:
7009 if (!type || !AGGREGATE_TYPE_P (type))
7011 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7013 warnedmmx = true;
7014 warning (0, "MMX vector argument without MMX enabled "
7015 "changes the ABI");
7017 if (cum->mmx_nregs)
7018 return gen_reg_or_parallel (mode, orig_mode,
7019 cum->mmx_regno + FIRST_MMX_REG);
7021 break;
7024 return NULL_RTX;
7027 static rtx
7028 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7029 enum machine_mode orig_mode, const_tree type, bool named)
7031 /* Handle a hidden AL argument containing number of registers
7032 for varargs x86-64 functions. */
7033 if (mode == VOIDmode)
7034 return GEN_INT (cum->maybe_vaarg
7035 ? (cum->sse_nregs < 0
7036 ? X86_64_SSE_REGPARM_MAX
7037 : cum->sse_regno)
7038 : -1);
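/* Informally: for a SysV x86-64 varargs call the caller loads %al with
   an upper bound on the number of vector (SSE) registers used, roughly

     movl $1, %eax        # one SSE register used
     call printf          # e.g. printf ("%f\n", 3.14)

   The GEN_INT above communicates that count (or the maximum, when it is
   not known) to the call expander.  */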
7040 switch (mode)
7042 default:
7043 break;
7045 case V8SFmode:
7046 case V8SImode:
7047 case V32QImode:
7048 case V16HImode:
7049 case V4DFmode:
7050 case V4DImode:
7051 /* Unnamed 256bit vector mode parameters are passed on stack. */
7052 if (!named)
7053 return NULL;
7054 break;
7057 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7058 cum->sse_nregs,
7059 &x86_64_int_parameter_registers [cum->regno],
7060 cum->sse_regno);
7063 static rtx
7064 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7065 enum machine_mode orig_mode, bool named,
7066 HOST_WIDE_INT bytes)
7068 unsigned int regno;
7070 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
7071 We use a value of -2 to specify that the current function call is MS_ABI. */
7072 if (mode == VOIDmode)
7073 return GEN_INT (-2);
7075 /* If we've run out of registers, it goes on the stack. */
7076 if (cum->nregs == 0)
7077 return NULL_RTX;
7079 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7081 /* Only floating point modes are passed in anything but integer regs. */
7082 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7084 if (named)
7085 regno = cum->regno + FIRST_SSE_REG;
7086 else
7088 rtx t1, t2;
7090 /* Unnamed floating parameters are passed in both the
7091 SSE and integer registers. */
7092 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7093 t2 = gen_rtx_REG (mode, regno);
7094 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7095 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7096 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7099 /* Handle aggregate types passed in a register. */
7100 if (orig_mode == BLKmode)
7102 if (bytes > 0 && bytes <= 8)
7103 mode = (bytes > 4 ? DImode : SImode);
7104 if (mode == BLKmode)
7105 mode = DImode;
7108 return gen_reg_or_parallel (mode, orig_mode, regno);
7111 /* Return where to put the arguments to a function.
7112 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7114 MODE is the argument's machine mode. TYPE is the data type of the
7115 argument. It is null for libcalls where that information may not be
7116 available. CUM gives information about the preceding args and about
7117 the function being called. NAMED is nonzero if this argument is a
7118 named parameter (otherwise it is an extra parameter matching an
7119 ellipsis). */
7121 static rtx
7122 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7123 const_tree type, bool named)
7125 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7126 enum machine_mode mode = omode;
7127 HOST_WIDE_INT bytes, words;
7128 rtx arg;
7130 if (mode == BLKmode)
7131 bytes = int_size_in_bytes (type);
7132 else
7133 bytes = GET_MODE_SIZE (mode);
7134 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7136 /* To simplify the code below, represent vector types with a vector mode
7137 even if MMX/SSE are not active. */
7138 if (type && TREE_CODE (type) == VECTOR_TYPE)
7139 mode = type_natural_mode (type, cum);
7141 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7142 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7143 else if (TARGET_64BIT)
7144 arg = function_arg_64 (cum, mode, omode, type, named);
7145 else
7146 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7148 return arg;
7151 /* A C expression that indicates when an argument must be passed by
7152 reference. If nonzero for an argument, a copy of that argument is
7153 made in memory and a pointer to the argument is passed instead of
7154 the argument itself. The pointer is passed in whatever way is
7155 appropriate for passing a pointer to that type. */
7157 static bool
7158 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7159 const_tree type, bool named ATTRIBUTE_UNUSED)
7161 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7163 /* See Windows x64 Software Convention. */
7164 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7166 int msize = (int) GET_MODE_SIZE (mode);
7167 if (type)
7169 /* Arrays are passed by reference. */
7170 if (TREE_CODE (type) == ARRAY_TYPE)
7171 return true;
7173 if (AGGREGATE_TYPE_P (type))
7175 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7176 are passed by reference. */
7177 msize = int_size_in_bytes (type);
7181 /* __m128 is passed by reference. */
7182 switch (msize) {
7183 case 1: case 2: case 4: case 8:
7184 break;
7185 default:
7186 return true;
7189 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7190 return 1;
7192 return 0;
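/* Informal summary of the rules above: under the Win64 ABI anything
   that is not exactly 1, 2, 4 or 8 bytes (e.g. __m128, or a 3-byte
   struct) is passed by reference, while under the 64-bit SysV ABI only
   variable-sized types (int_size_in_bytes == -1) are forced indirect
   here; other SysV aggregates are handled by construct_container or
   passed in memory by value.  */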
7195 /* Return true when TYPE should be 128bit aligned for 32bit argument
7196 passing ABI. XXX: This function is obsolete and is only used for
7197 checking psABI compatibility with previous versions of GCC. */
7199 static bool
7200 ix86_compat_aligned_value_p (const_tree type)
7202 enum machine_mode mode = TYPE_MODE (type);
7203 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7204 || mode == TDmode
7205 || mode == TFmode
7206 || mode == TCmode)
7207 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7208 return true;
7209 if (TYPE_ALIGN (type) < 128)
7210 return false;
7212 if (AGGREGATE_TYPE_P (type))
7214 /* Walk the aggregates recursively. */
7215 switch (TREE_CODE (type))
7217 case RECORD_TYPE:
7218 case UNION_TYPE:
7219 case QUAL_UNION_TYPE:
7221 tree field;
7223 /* Walk all the structure fields. */
7224 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7226 if (TREE_CODE (field) == FIELD_DECL
7227 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7228 return true;
7230 break;
7233 case ARRAY_TYPE:
7234 /* Just for use if some languages pass arrays by value. */
7235 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7236 return true;
7237 break;
7239 default:
7240 gcc_unreachable ();
7243 return false;
7246 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7247 XXX: This function is obsolete and is only used for checking psABI
7248 compatibility with previous versions of GCC. */
7250 static unsigned int
7251 ix86_compat_function_arg_boundary (enum machine_mode mode,
7252 const_tree type, unsigned int align)
7254 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7255 natural boundaries. */
7256 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7258 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7259 make an exception for SSE modes since these require 128bit
7260 alignment.
7262 The handling here differs from field_alignment. ICC aligns MMX
7263 arguments to 4 byte boundaries, while structure fields are aligned
7264 to 8 byte boundaries. */
7265 if (!type)
7267 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7268 align = PARM_BOUNDARY;
7270 else
7272 if (!ix86_compat_aligned_value_p (type))
7273 align = PARM_BOUNDARY;
7276 if (align > BIGGEST_ALIGNMENT)
7277 align = BIGGEST_ALIGNMENT;
7278 return align;
7281 /* Return true when TYPE should be 128bit aligned for 32bit argument
7282 passing ABI. */
7284 static bool
7285 ix86_contains_aligned_value_p (const_tree type)
7287 enum machine_mode mode = TYPE_MODE (type);
7289 if (mode == XFmode || mode == XCmode)
7290 return false;
7292 if (TYPE_ALIGN (type) < 128)
7293 return false;
7295 if (AGGREGATE_TYPE_P (type))
7297 /* Walk the aggregates recursively. */
7298 switch (TREE_CODE (type))
7300 case RECORD_TYPE:
7301 case UNION_TYPE:
7302 case QUAL_UNION_TYPE:
7304 tree field;
7306 /* Walk all the structure fields. */
7307 for (field = TYPE_FIELDS (type);
7308 field;
7309 field = DECL_CHAIN (field))
7311 if (TREE_CODE (field) == FIELD_DECL
7312 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7313 return true;
7315 break;
7318 case ARRAY_TYPE:
7319 /* Just for use if some languages pass arrays by value. */
7320 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7321 return true;
7322 break;
7324 default:
7325 gcc_unreachable ();
7328 else
7329 return TYPE_ALIGN (type) >= 128;
7331 return false;
7334 /* Gives the alignment boundary, in bits, of an argument with the
7335 specified mode and type. */
7337 static unsigned int
7338 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7340 unsigned int align;
7341 if (type)
7343 /* Since the main variant type is used for the call, convert
7344 TYPE to its main variant. */
7345 type = TYPE_MAIN_VARIANT (type);
7346 align = TYPE_ALIGN (type);
7348 else
7349 align = GET_MODE_ALIGNMENT (mode);
7350 if (align < PARM_BOUNDARY)
7351 align = PARM_BOUNDARY;
7352 else
7354 static bool warned;
7355 unsigned int saved_align = align;
7357 if (!TARGET_64BIT)
7359 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7360 if (!type)
7362 if (mode == XFmode || mode == XCmode)
7363 align = PARM_BOUNDARY;
7365 else if (!ix86_contains_aligned_value_p (type))
7366 align = PARM_BOUNDARY;
7368 if (align < 128)
7369 align = PARM_BOUNDARY;
7372 if (warn_psabi
7373 && !warned
7374 && align != ix86_compat_function_arg_boundary (mode, type,
7375 saved_align))
7377 warned = true;
7378 inform (input_location,
7379 "The ABI for passing parameters with %d-byte"
7380 " alignment has changed in GCC 4.6",
7381 align / BITS_PER_UNIT);
7385 return align;
7388 /* Return true if N is a possible register number of function value. */
7390 static bool
7391 ix86_function_value_regno_p (const unsigned int regno)
7393 switch (regno)
7395 case AX_REG:
7396 return true;
7398 case FIRST_FLOAT_REG:
7399 /* TODO: The function should depend on current function ABI but
7400 builtins.c would need updating then. Therefore we use the
7401 default ABI. */
7402 if (TARGET_64BIT && ix86_abi == MS_ABI)
7403 return false;
7404 return TARGET_FLOAT_RETURNS_IN_80387;
7406 case FIRST_SSE_REG:
7407 return TARGET_SSE;
7409 case FIRST_MMX_REG:
7410 if (TARGET_MACHO || TARGET_64BIT)
7411 return false;
7412 return TARGET_MMX;
7415 return false;
7418 /* Define how to find the value returned by a function.
7419 VALTYPE is the data type of the value (as a tree).
7420 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7421 otherwise, FUNC is 0. */
7423 static rtx
7424 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7425 const_tree fntype, const_tree fn)
7427 unsigned int regno;
7429 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7430 we normally prevent this case when mmx is not available. However
7431 some ABIs may require the result to be returned like DImode. */
7432 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7433 regno = FIRST_MMX_REG;
7435 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7436 we prevent this case when sse is not available. However some ABIs
7437 may require the result to be returned like integer TImode. */
7438 else if (mode == TImode
7439 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7440 regno = FIRST_SSE_REG;
7442 /* 32-byte vector modes in %ymm0. */
7443 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7444 regno = FIRST_SSE_REG;
7446 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7447 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7448 regno = FIRST_FLOAT_REG;
7449 else
7450 /* Most things go in %eax. */
7451 regno = AX_REG;
7453 /* Override FP return register with %xmm0 for local functions when
7454 SSE math is enabled or for functions with sseregparm attribute. */
7455 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7457 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7458 if ((sse_level >= 1 && mode == SFmode)
7459 || (sse_level == 2 && mode == DFmode))
7460 regno = FIRST_SSE_REG;
7463 /* OImode shouldn't be used directly. */
7464 gcc_assert (mode != OImode);
7466 return gen_rtx_REG (orig_mode, regno);
7469 static rtx
7470 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7471 const_tree valtype)
7473 rtx ret;
7475 /* Handle libcalls, which don't provide a type node. */
7476 if (valtype == NULL)
7478 unsigned int regno;
7480 switch (mode)
7482 case SFmode:
7483 case SCmode:
7484 case DFmode:
7485 case DCmode:
7486 case TFmode:
7487 case SDmode:
7488 case DDmode:
7489 case TDmode:
7490 regno = FIRST_SSE_REG;
7491 break;
7492 case XFmode:
7493 case XCmode:
7494 regno = FIRST_FLOAT_REG;
7495 break;
7496 case TCmode:
7497 return NULL;
7498 default:
7499 regno = AX_REG;
7502 return gen_rtx_REG (mode, regno);
7504 else if (POINTER_TYPE_P (valtype))
7506 /* Pointers are always returned in word_mode. */
7507 mode = word_mode;
7510 ret = construct_container (mode, orig_mode, valtype, 1,
7511 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7512 x86_64_int_return_registers, 0);
7514 /* For zero sized structures, construct_container returns NULL, but we
7515 need to keep the rest of the compiler happy by returning a meaningful value. */
7516 if (!ret)
7517 ret = gen_rtx_REG (orig_mode, AX_REG);
7519 return ret;
7522 static rtx
7523 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7524 const_tree valtype)
7526 unsigned int regno = AX_REG;
7528 if (TARGET_SSE)
7530 switch (GET_MODE_SIZE (mode))
7532 case 16:
7533 if (valtype != NULL_TREE
7534 && !VECTOR_INTEGER_TYPE_P (valtype)
7536 && !INTEGRAL_TYPE_P (valtype)
7537 && !VECTOR_FLOAT_TYPE_P (valtype))
7538 break;
7539 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7540 && !COMPLEX_MODE_P (mode))
7541 regno = FIRST_SSE_REG;
7542 break;
7543 case 8:
7544 case 4:
7545 if (mode == SFmode || mode == DFmode)
7546 regno = FIRST_SSE_REG;
7547 break;
7548 default:
7549 break;
7552 return gen_rtx_REG (orig_mode, regno);
7555 static rtx
7556 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7557 enum machine_mode orig_mode, enum machine_mode mode)
7559 const_tree fn, fntype;
7561 fn = NULL_TREE;
7562 if (fntype_or_decl && DECL_P (fntype_or_decl))
7563 fn = fntype_or_decl;
7564 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7566 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7567 return function_value_ms_64 (orig_mode, mode, valtype);
7568 else if (TARGET_64BIT)
7569 return function_value_64 (orig_mode, mode, valtype);
7570 else
7571 return function_value_32 (orig_mode, mode, fntype, fn);
7574 static rtx
7575 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7576 bool outgoing ATTRIBUTE_UNUSED)
7578 enum machine_mode mode, orig_mode;
7580 orig_mode = TYPE_MODE (valtype);
7581 mode = type_natural_mode (valtype, NULL);
7582 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7585 /* Pointer function arguments and return values are promoted to
7586 word_mode. */
7588 static enum machine_mode
7589 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7590 int *punsignedp, const_tree fntype,
7591 int for_return)
7593 if (type != NULL_TREE && POINTER_TYPE_P (type))
7595 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7596 return word_mode;
7598 return default_promote_function_mode (type, mode, punsignedp, fntype,
7599 for_return);
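/* Note (an assumption about the usual configuration): this promotion
   matters mainly for the x32 ABI, where ptr_mode is SImode but
   word_mode is DImode; 32-bit pointer arguments and return values are
   then zero-extended to 64 bits per POINTERS_EXTEND_UNSIGNED.  On
   plain 32-bit and LP64 targets it is effectively a no-op.  */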
7602 /* Return true if a structure, union or array with MODE containing FIELD
7603 should be accessed using BLKmode. */
7605 static bool
7606 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7608 /* Union with XFmode must be in BLKmode. */
7609 return (mode == XFmode
7610 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7611 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7614 rtx
7615 ix86_libcall_value (enum machine_mode mode)
7617 return ix86_function_value_1 (NULL, NULL, mode, mode);
7620 /* Return true iff type is returned in memory. */
7622 static bool ATTRIBUTE_UNUSED
7623 return_in_memory_32 (const_tree type, enum machine_mode mode)
7625 HOST_WIDE_INT size;
7627 if (mode == BLKmode)
7628 return true;
7630 size = int_size_in_bytes (type);
7632 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7633 return false;
7635 if (VECTOR_MODE_P (mode) || mode == TImode)
7637 /* User-created vectors small enough to fit in EAX. */
7638 if (size < 8)
7639 return false;
7641 /* MMX/3dNow values are returned in MM0,
7642 except when it doesn't exist or the ABI prescribes otherwise. */
7643 if (size == 8)
7644 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7646 /* SSE values are returned in XMM0, except when it doesn't exist. */
7647 if (size == 16)
7648 return !TARGET_SSE;
7650 /* AVX values are returned in YMM0, except when it doesn't exist. */
7651 if (size == 32)
7652 return !TARGET_AVX;
7655 if (mode == XFmode)
7656 return false;
7658 if (size > 12)
7659 return true;
7661 /* OImode shouldn't be used directly. */
7662 gcc_assert (mode != OImode);
7664 return false;
7667 static bool ATTRIBUTE_UNUSED
7668 return_in_memory_64 (const_tree type, enum machine_mode mode)
7670 int needed_intregs, needed_sseregs;
7671 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7674 static bool ATTRIBUTE_UNUSED
7675 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7677 HOST_WIDE_INT size = int_size_in_bytes (type);
7679 /* __m128 is returned in xmm0. */
7680 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7681 || VECTOR_FLOAT_TYPE_P (type))
7682 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7683 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7684 return false;
7686 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7687 return size != 1 && size != 2 && size != 4 && size != 8;
7690 static bool
7691 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7693 #ifdef SUBTARGET_RETURN_IN_MEMORY
7694 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7695 #else
7696 const enum machine_mode mode = type_natural_mode (type, NULL);
7698 if (TARGET_64BIT)
7700 if (ix86_function_type_abi (fntype) == MS_ABI)
7701 return return_in_memory_ms_64 (type, mode);
7702 else
7703 return return_in_memory_64 (type, mode);
7705 else
7706 return return_in_memory_32 (type, mode);
7707 #endif
7710 /* When returning SSE vector types, we have a choice of either
7711 (1) being abi incompatible with a -march switch, or
7712 (2) generating an error.
7713 Given no good solution, I think the safest thing is one warning.
7714 The user won't be able to use -Werror, but....
7716 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7717 called in response to actually generating a caller or callee that
7718 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7719 via aggregate_value_p for general type probing from tree-ssa. */
7721 static rtx
7722 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7724 static bool warnedsse, warnedmmx;
7726 if (!TARGET_64BIT && type)
7728 /* Look at the return type of the function, not the function type. */
7729 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7731 if (!TARGET_SSE && !warnedsse)
7733 if (mode == TImode
7734 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7736 warnedsse = true;
7737 warning (0, "SSE vector return without SSE enabled "
7738 "changes the ABI");
7742 if (!TARGET_MMX && !warnedmmx)
7744 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7746 warnedmmx = true;
7747 warning (0, "MMX vector return without MMX enabled "
7748 "changes the ABI");
7753 return NULL;
7757 /* Create the va_list data type. */
7759 /* Returns the calling convention specific va_list data type.
7760 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7762 static tree
7763 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7765 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7767 /* For i386 we use plain pointer to argument area. */
7768 if (!TARGET_64BIT || abi == MS_ABI)
7769 return build_pointer_type (char_type_node);
7771 record = lang_hooks.types.make_type (RECORD_TYPE);
7772 type_decl = build_decl (BUILTINS_LOCATION,
7773 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7775 f_gpr = build_decl (BUILTINS_LOCATION,
7776 FIELD_DECL, get_identifier ("gp_offset"),
7777 unsigned_type_node);
7778 f_fpr = build_decl (BUILTINS_LOCATION,
7779 FIELD_DECL, get_identifier ("fp_offset"),
7780 unsigned_type_node);
7781 f_ovf = build_decl (BUILTINS_LOCATION,
7782 FIELD_DECL, get_identifier ("overflow_arg_area"),
7783 ptr_type_node);
7784 f_sav = build_decl (BUILTINS_LOCATION,
7785 FIELD_DECL, get_identifier ("reg_save_area"),
7786 ptr_type_node);
7788 va_list_gpr_counter_field = f_gpr;
7789 va_list_fpr_counter_field = f_fpr;
7791 DECL_FIELD_CONTEXT (f_gpr) = record;
7792 DECL_FIELD_CONTEXT (f_fpr) = record;
7793 DECL_FIELD_CONTEXT (f_ovf) = record;
7794 DECL_FIELD_CONTEXT (f_sav) = record;
7796 TYPE_STUB_DECL (record) = type_decl;
7797 TYPE_NAME (record) = type_decl;
7798 TYPE_FIELDS (record) = f_gpr;
7799 DECL_CHAIN (f_gpr) = f_fpr;
7800 DECL_CHAIN (f_fpr) = f_ovf;
7801 DECL_CHAIN (f_ovf) = f_sav;
7803 layout_type (record);
7805 /* The correct type is an array type of one element. */
7806 return build_array_type (record, build_index_type (size_zero_node));
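/* For reference: the record built above corresponds to the SysV AMD64
   ABI's va_list element type, roughly

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

   and va_list itself is a one-element array of it, which is why an
   array type is returned here.  */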
7809 /* Setup the builtin va_list data type and for 64-bit the additional
7810 calling convention specific va_list data types. */
7812 static tree
7813 ix86_build_builtin_va_list (void)
7815 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7817 /* Initialize abi specific va_list builtin types. */
7818 if (TARGET_64BIT)
7820 tree t;
7821 if (ix86_abi == MS_ABI)
7823 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7824 if (TREE_CODE (t) != RECORD_TYPE)
7825 t = build_variant_type_copy (t);
7826 sysv_va_list_type_node = t;
7828 else
7830 t = ret;
7831 if (TREE_CODE (t) != RECORD_TYPE)
7832 t = build_variant_type_copy (t);
7833 sysv_va_list_type_node = t;
7835 if (ix86_abi != MS_ABI)
7837 t = ix86_build_builtin_va_list_abi (MS_ABI);
7838 if (TREE_CODE (t) != RECORD_TYPE)
7839 t = build_variant_type_copy (t);
7840 ms_va_list_type_node = t;
7842 else
7844 t = ret;
7845 if (TREE_CODE (t) != RECORD_TYPE)
7846 t = build_variant_type_copy (t);
7847 ms_va_list_type_node = t;
7851 return ret;
7854 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7856 static void
7857 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7859 rtx save_area, mem;
7860 alias_set_type set;
7861 int i, max;
7863 /* GPR size of varargs save area. */
7864 if (cfun->va_list_gpr_size)
7865 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7866 else
7867 ix86_varargs_gpr_size = 0;
7869 /* FPR size of varargs save area. We don't need it if we don't pass
7870 anything in SSE registers. */
7871 if (TARGET_SSE && cfun->va_list_fpr_size)
7872 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7873 else
7874 ix86_varargs_fpr_size = 0;
7876 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7877 return;
7879 save_area = frame_pointer_rtx;
7880 set = get_varargs_alias_set ();
7882 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7883 if (max > X86_64_REGPARM_MAX)
7884 max = X86_64_REGPARM_MAX;
7886 for (i = cum->regno; i < max; i++)
7888 mem = gen_rtx_MEM (word_mode,
7889 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7890 MEM_NOTRAP_P (mem) = 1;
7891 set_mem_alias_set (mem, set);
7892 emit_move_insn (mem,
7893 gen_rtx_REG (word_mode,
7894 x86_64_int_parameter_registers[i]));
7897 if (ix86_varargs_fpr_size)
7899 enum machine_mode smode;
7900 rtx label, test;
7902 /* Now emit code to save SSE registers. The AX parameter contains number
7903 of SSE parameter registers used to call this function, though all we
7904 actually check here is the zero/non-zero status. */
7906 label = gen_label_rtx ();
7907 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7908 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7909 label));
7911 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7912 we used movdqa (i.e. TImode) instead? Perhaps even better would
7913 be if we could determine the real mode of the data, via a hook
7914 into pass_stdarg. Ignore all that for now. */
7915 smode = V4SFmode;
7916 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7917 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7919 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7920 if (max > X86_64_SSE_REGPARM_MAX)
7921 max = X86_64_SSE_REGPARM_MAX;
7923 for (i = cum->sse_regno; i < max; ++i)
7925 mem = plus_constant (Pmode, save_area,
7926 i * 16 + ix86_varargs_gpr_size);
7927 mem = gen_rtx_MEM (smode, mem);
7928 MEM_NOTRAP_P (mem) = 1;
7929 set_mem_alias_set (mem, set);
7930 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7932 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7935 emit_label (label);
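/* Rough layout of the register save area built above, assuming the
   full X86_64_REGPARM_MAX = 6 / X86_64_SSE_REGPARM_MAX = 8 setup:

     offset   0 ..  47   %rdi %rsi %rdx %rcx %r8 %r9   (8 bytes each)
     offset  48 .. 175   %xmm0 .. %xmm7                (16 bytes each)

   gp_offset and fp_offset in the va_list index into this block.  */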
7939 static void
7940 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7942 alias_set_type set = get_varargs_alias_set ();
7943 int i;
7945 /* Reset to zero, as there might be a sysv vaarg used
7946 before. */
7947 ix86_varargs_gpr_size = 0;
7948 ix86_varargs_fpr_size = 0;
7950 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7952 rtx reg, mem;
7954 mem = gen_rtx_MEM (Pmode,
7955 plus_constant (Pmode, virtual_incoming_args_rtx,
7956 i * UNITS_PER_WORD));
7957 MEM_NOTRAP_P (mem) = 1;
7958 set_mem_alias_set (mem, set);
7960 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7961 emit_move_insn (mem, reg);
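/* Note: on Win64 the caller always reserves 32 bytes of "home" space
   for the four register parameters; the loop above simply spills
   %rcx, %rdx, %r8 and %r9 into those slots so va_arg can walk all
   arguments contiguously in memory.  */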
7965 static void
7966 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7967 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7968 int no_rtl)
7970 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7971 CUMULATIVE_ARGS next_cum;
7972 tree fntype;
7974 /* This argument doesn't appear to be used anymore. Which is good,
7975 because the old code here didn't suppress rtl generation. */
7976 gcc_assert (!no_rtl);
7978 if (!TARGET_64BIT)
7979 return;
7981 fntype = TREE_TYPE (current_function_decl);
7983 /* For varargs, we do not want to skip the dummy va_dcl argument.
7984 For stdargs, we do want to skip the last named argument. */
7985 next_cum = *cum;
7986 if (stdarg_p (fntype))
7987 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7988 true);
7990 if (cum->call_abi == MS_ABI)
7991 setup_incoming_varargs_ms_64 (&next_cum);
7992 else
7993 setup_incoming_varargs_64 (&next_cum);
7996 /* Checks if TYPE is of kind va_list char *. */
7998 static bool
7999 is_va_list_char_pointer (tree type)
8001 tree canonic;
8003 /* For 32-bit it is always true. */
8004 if (!TARGET_64BIT)
8005 return true;
8006 canonic = ix86_canonical_va_list_type (type);
8007 return (canonic == ms_va_list_type_node
8008 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8011 /* Implement va_start. */
8013 static void
8014 ix86_va_start (tree valist, rtx nextarg)
8016 HOST_WIDE_INT words, n_gpr, n_fpr;
8017 tree f_gpr, f_fpr, f_ovf, f_sav;
8018 tree gpr, fpr, ovf, sav, t;
8019 tree type;
8020 rtx ovf_rtx;
8022 if (flag_split_stack
8023 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8025 unsigned int scratch_regno;
8027 /* When we are splitting the stack, we can't refer to the stack
8028 arguments using internal_arg_pointer, because they may be on
8029 the old stack. The split stack prologue will arrange to
8030 leave a pointer to the old stack arguments in a scratch
8031 register, which we here copy to a pseudo-register. The split
8032 stack prologue can't set the pseudo-register directly because
8033 it (the prologue) runs before any registers have been saved. */
8035 scratch_regno = split_stack_prologue_scratch_regno ();
8036 if (scratch_regno != INVALID_REGNUM)
8038 rtx reg, seq;
8040 reg = gen_reg_rtx (Pmode);
8041 cfun->machine->split_stack_varargs_pointer = reg;
8043 start_sequence ();
8044 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8045 seq = get_insns ();
8046 end_sequence ();
8048 push_topmost_sequence ();
8049 emit_insn_after (seq, entry_of_function ());
8050 pop_topmost_sequence ();
8054 /* Only 64bit target needs something special. */
8055 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8057 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8058 std_expand_builtin_va_start (valist, nextarg);
8059 else
8061 rtx va_r, next;
8063 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8064 next = expand_binop (ptr_mode, add_optab,
8065 cfun->machine->split_stack_varargs_pointer,
8066 crtl->args.arg_offset_rtx,
8067 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8068 convert_move (va_r, next, 0);
8070 return;
8073 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8074 f_fpr = DECL_CHAIN (f_gpr);
8075 f_ovf = DECL_CHAIN (f_fpr);
8076 f_sav = DECL_CHAIN (f_ovf);
8078 valist = build_simple_mem_ref (valist);
8079 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8080 /* The following should be folded into the MEM_REF offset. */
8081 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8082 f_gpr, NULL_TREE);
8083 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8084 f_fpr, NULL_TREE);
8085 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8086 f_ovf, NULL_TREE);
8087 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8088 f_sav, NULL_TREE);
8090 /* Count number of gp and fp argument registers used. */
8091 words = crtl->args.info.words;
8092 n_gpr = crtl->args.info.regno;
8093 n_fpr = crtl->args.info.sse_regno;
8095 if (cfun->va_list_gpr_size)
8097 type = TREE_TYPE (gpr);
8098 t = build2 (MODIFY_EXPR, type,
8099 gpr, build_int_cst (type, n_gpr * 8));
8100 TREE_SIDE_EFFECTS (t) = 1;
8101 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8104 if (TARGET_SSE && cfun->va_list_fpr_size)
8106 type = TREE_TYPE (fpr);
8107 t = build2 (MODIFY_EXPR, type, fpr,
8108 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8109 TREE_SIDE_EFFECTS (t) = 1;
8110 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8113 /* Find the overflow area. */
8114 type = TREE_TYPE (ovf);
8115 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8116 ovf_rtx = crtl->args.internal_arg_pointer;
8117 else
8118 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8119 t = make_tree (type, ovf_rtx);
8120 if (words != 0)
8121 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8122 t = build2 (MODIFY_EXPR, type, ovf, t);
8123 TREE_SIDE_EFFECTS (t) = 1;
8124 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8126 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8128 /* Find the register save area.
8129 The function prologue saves it right above the stack frame. */
8130 type = TREE_TYPE (sav);
8131 t = make_tree (type, frame_pointer_rtx);
8132 if (!ix86_varargs_gpr_size)
8133 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8134 t = build2 (MODIFY_EXPR, type, sav, t);
8135 TREE_SIDE_EFFECTS (t) = 1;
8136 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8140 /* Implement va_arg. */
8142 static tree
8143 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8144 gimple_seq *post_p)
8146 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8147 tree f_gpr, f_fpr, f_ovf, f_sav;
8148 tree gpr, fpr, ovf, sav, t;
8149 int size, rsize;
8150 tree lab_false, lab_over = NULL_TREE;
8151 tree addr, t2;
8152 rtx container;
8153 int indirect_p = 0;
8154 tree ptrtype;
8155 enum machine_mode nat_mode;
8156 unsigned int arg_boundary;
8158 /* Only 64bit target needs something special. */
8159 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8160 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8162 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8163 f_fpr = DECL_CHAIN (f_gpr);
8164 f_ovf = DECL_CHAIN (f_fpr);
8165 f_sav = DECL_CHAIN (f_ovf);
8167 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8168 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8169 valist = build_va_arg_indirect_ref (valist);
8170 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8171 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8172 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8174 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8175 if (indirect_p)
8176 type = build_pointer_type (type);
8177 size = int_size_in_bytes (type);
8178 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8180 nat_mode = type_natural_mode (type, NULL);
8181 switch (nat_mode)
8183 case V8SFmode:
8184 case V8SImode:
8185 case V32QImode:
8186 case V16HImode:
8187 case V4DFmode:
8188 case V4DImode:
8189 /* Unnamed 256bit vector mode parameters are passed on stack. */
8190 if (!TARGET_64BIT_MS_ABI)
8192 container = NULL;
8193 break;
8196 default:
8197 container = construct_container (nat_mode, TYPE_MODE (type),
8198 type, 0, X86_64_REGPARM_MAX,
8199 X86_64_SSE_REGPARM_MAX, intreg,
8201 break;
8204 /* Pull the value out of the saved registers. */
8206 addr = create_tmp_var (ptr_type_node, "addr");
8208 if (container)
8210 int needed_intregs, needed_sseregs;
8211 bool need_temp;
8212 tree int_addr, sse_addr;
8214 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8215 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8217 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8219 need_temp = (!REG_P (container)
8220 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8221 || TYPE_ALIGN (type) > 128));
8223 /* In case we are passing a structure, verify that it is a consecutive
8224 block in the register save area. If not, we need to do moves. */
8225 if (!need_temp && !REG_P (container))
8227 /* Verify that all registers are strictly consecutive */
8228 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8230 int i;
8232 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8234 rtx slot = XVECEXP (container, 0, i);
8235 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8236 || INTVAL (XEXP (slot, 1)) != i * 16)
8237 need_temp = 1;
8240 else
8242 int i;
8244 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8246 rtx slot = XVECEXP (container, 0, i);
8247 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8248 || INTVAL (XEXP (slot, 1)) != i * 8)
8249 need_temp = 1;
8253 if (!need_temp)
8255 int_addr = addr;
8256 sse_addr = addr;
8258 else
8260 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8261 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8264 /* First ensure that we fit completely in registers. */
8265 if (needed_intregs)
8267 t = build_int_cst (TREE_TYPE (gpr),
8268 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8269 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8270 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8271 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8272 gimplify_and_add (t, pre_p);
8274 if (needed_sseregs)
8276 t = build_int_cst (TREE_TYPE (fpr),
8277 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8278 + X86_64_REGPARM_MAX * 8);
8279 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8280 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8281 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8282 gimplify_and_add (t, pre_p);
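/* Worked example of the checks above: with needed_intregs == 2 the
   constant is (6 - 2 + 1) * 8 == 40, so the branch to lab_false (the
   stack path) is taken once gp_offset >= 40, i.e. when fewer than two
   of the six GP argument registers remain unconsumed.  The fpr check
   mirrors this with 16-byte slots starting at offset 48.  */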
8285 /* Compute index to start of area used for integer regs. */
8286 if (needed_intregs)
8288 /* int_addr = gpr + sav; */
8289 t = fold_build_pointer_plus (sav, gpr);
8290 gimplify_assign (int_addr, t, pre_p);
8292 if (needed_sseregs)
8294 /* sse_addr = fpr + sav; */
8295 t = fold_build_pointer_plus (sav, fpr);
8296 gimplify_assign (sse_addr, t, pre_p);
8298 if (need_temp)
8300 int i, prev_size = 0;
8301 tree temp = create_tmp_var (type, "va_arg_tmp");
8303 /* addr = &temp; */
8304 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8305 gimplify_assign (addr, t, pre_p);
8307 for (i = 0; i < XVECLEN (container, 0); i++)
8309 rtx slot = XVECEXP (container, 0, i);
8310 rtx reg = XEXP (slot, 0);
8311 enum machine_mode mode = GET_MODE (reg);
8312 tree piece_type;
8313 tree addr_type;
8314 tree daddr_type;
8315 tree src_addr, src;
8316 int src_offset;
8317 tree dest_addr, dest;
8318 int cur_size = GET_MODE_SIZE (mode);
8320 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8321 prev_size = INTVAL (XEXP (slot, 1));
8322 if (prev_size + cur_size > size)
8324 cur_size = size - prev_size;
8325 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8326 if (mode == BLKmode)
8327 mode = QImode;
8329 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8330 if (mode == GET_MODE (reg))
8331 addr_type = build_pointer_type (piece_type);
8332 else
8333 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8334 true);
8335 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8336 true);
8338 if (SSE_REGNO_P (REGNO (reg)))
8340 src_addr = sse_addr;
8341 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8343 else
8345 src_addr = int_addr;
8346 src_offset = REGNO (reg) * 8;
8348 src_addr = fold_convert (addr_type, src_addr);
8349 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8351 dest_addr = fold_convert (daddr_type, addr);
8352 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8353 if (cur_size == GET_MODE_SIZE (mode))
8355 src = build_va_arg_indirect_ref (src_addr);
8356 dest = build_va_arg_indirect_ref (dest_addr);
8358 gimplify_assign (dest, src, pre_p);
8360 else
8362 tree copy
8363 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8364 3, dest_addr, src_addr,
8365 size_int (cur_size));
8366 gimplify_and_add (copy, pre_p);
8368 prev_size += cur_size;
8372 if (needed_intregs)
8374 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8375 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8376 gimplify_assign (gpr, t, pre_p);
8379 if (needed_sseregs)
8381 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8382 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8383 gimplify_assign (fpr, t, pre_p);
8386 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8388 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8391 /* ... otherwise out of the overflow area. */
8393 /* When we align a parameter on the stack for the caller, if its
8394 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8395 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8396 here with the caller. */
8397 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8398 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8399 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8401 /* Care for on-stack alignment if needed. */
8402 if (arg_boundary <= 64 || size == 0)
8403 t = ovf;
8404 else
8406 HOST_WIDE_INT align = arg_boundary / 8;
8407 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8408 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8409 build_int_cst (TREE_TYPE (t), -align));
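/* This is the usual round-up-to-alignment idiom,
   addr = (ovf + align - 1) & -align; e.g. with align == 16 an overflow
   pointer ending in 0x38 becomes one ending in 0x40 before the
   argument is read.  */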
8412 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8413 gimplify_assign (addr, t, pre_p);
8415 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8416 gimplify_assign (unshare_expr (ovf), t, pre_p);
8418 if (container)
8419 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8421 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8422 addr = fold_convert (ptrtype, addr);
8424 if (indirect_p)
8425 addr = build_va_arg_indirect_ref (addr);
8426 return build_va_arg_indirect_ref (addr);
8429 /* Return true if OPNUM's MEM should be matched
8430 in movabs* patterns. */
8432 bool
8433 ix86_check_movabs (rtx insn, int opnum)
8435 rtx set, mem;
8437 set = PATTERN (insn);
8438 if (GET_CODE (set) == PARALLEL)
8439 set = XVECEXP (set, 0, 0);
8440 gcc_assert (GET_CODE (set) == SET);
8441 mem = XEXP (set, opnum);
8442 while (GET_CODE (mem) == SUBREG)
8443 mem = SUBREG_REG (mem);
8444 gcc_assert (MEM_P (mem));
8445 return volatile_ok || !MEM_VOLATILE_P (mem);
8448 /* Initialize the table of extra 80387 mathematical constants. */
8450 static void
8451 init_ext_80387_constants (void)
8453 static const char * cst[5] =
8455 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8456 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8457 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8458 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8459 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8461 int i;
8463 for (i = 0; i < 5; i++)
8465 real_from_string (&ext_80387_constants_table[i], cst[i]);
8466 /* Ensure each constant is rounded to XFmode precision. */
8467 real_convert (&ext_80387_constants_table[i],
8468 XFmode, &ext_80387_constants_table[i]);
8471 ext_80387_constants_init = 1;
8474 /* Return non-zero if the constant is something that
8475 can be loaded with a special instruction. */
8478 standard_80387_constant_p (rtx x)
8480 enum machine_mode mode = GET_MODE (x);
8482 REAL_VALUE_TYPE r;
8484 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8485 return -1;
8487 if (x == CONST0_RTX (mode))
8488 return 1;
8489 if (x == CONST1_RTX (mode))
8490 return 2;
8492 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8494 /* For XFmode constants, try to find a special 80387 instruction when
8495 optimizing for size or on those CPUs that benefit from them. */
8496 if (mode == XFmode
8497 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8499 int i;
8501 if (! ext_80387_constants_init)
8502 init_ext_80387_constants ();
8504 for (i = 0; i < 5; i++)
8505 if (real_identical (&r, &ext_80387_constants_table[i]))
8506 return i + 3;
8509 /* Load of the constant -0.0 or -1.0 will be split as
8510 fldz;fchs or fld1;fchs sequence. */
8511 if (real_isnegzero (&r))
8512 return 8;
8513 if (real_identical (&r, &dconstm1))
8514 return 9;
8516 return 0;
8519 /* Return the opcode of the special instruction to be used to load
8520 the constant X. */
8522 const char *
8523 standard_80387_constant_opcode (rtx x)
8525 switch (standard_80387_constant_p (x))
8527 case 1:
8528 return "fldz";
8529 case 2:
8530 return "fld1";
8531 case 3:
8532 return "fldlg2";
8533 case 4:
8534 return "fldln2";
8535 case 5:
8536 return "fldl2e";
8537 case 6:
8538 return "fldl2t";
8539 case 7:
8540 return "fldpi";
8541 case 8:
8542 case 9:
8543 return "#";
8544 default:
8545 gcc_unreachable ();
8549 /* Return the CONST_DOUBLE representing the 80387 constant that is
8550 loaded by the specified special instruction. The argument IDX
8551 matches the return value from standard_80387_constant_p. */
8554 standard_80387_constant_rtx (int idx)
8556 int i;
8558 if (! ext_80387_constants_init)
8559 init_ext_80387_constants ();
8561 switch (idx)
8563 case 3:
8564 case 4:
8565 case 5:
8566 case 6:
8567 case 7:
8568 i = idx - 3;
8569 break;
8571 default:
8572 gcc_unreachable ();
8575 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8576 XFmode);
8579 /* Return 1 if X is all 0s and 2 if X is all 1s
8580 in a supported SSE/AVX vector mode. */
8583 standard_sse_constant_p (rtx x)
8585 enum machine_mode mode = GET_MODE (x);
8587 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8588 return 1;
8589 if (vector_all_ones_operand (x, mode))
8590 switch (mode)
8592 case V16QImode:
8593 case V8HImode:
8594 case V4SImode:
8595 case V2DImode:
8596 if (TARGET_SSE2)
8597 return 2;
8598 case V32QImode:
8599 case V16HImode:
8600 case V8SImode:
8601 case V4DImode:
8602 if (TARGET_AVX2)
8603 return 2;
8604 default:
8605 break;
8608 return 0;
8611 /* Return the opcode of the special instruction to be used to load
8612 the constant X. */
8614 const char *
8615 standard_sse_constant_opcode (rtx insn, rtx x)
8617 switch (standard_sse_constant_p (x))
8619 case 1:
8620 switch (get_attr_mode (insn))
8622 case MODE_TI:
8623 return "%vpxor\t%0, %d0";
8624 case MODE_V2DF:
8625 return "%vxorpd\t%0, %d0";
8626 case MODE_V4SF:
8627 return "%vxorps\t%0, %d0";
8629 case MODE_OI:
8630 return "vpxor\t%x0, %x0, %x0";
8631 case MODE_V4DF:
8632 return "vxorpd\t%x0, %x0, %x0";
8633 case MODE_V8SF:
8634 return "vxorps\t%x0, %x0, %x0";
8636 default:
8637 break;
8640 case 2:
8641 if (get_attr_mode (insn) == MODE_XI
8642 || get_attr_mode (insn) == MODE_V8DF
8643 || get_attr_mode (insn) == MODE_V16SF)
8644 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8645 if (TARGET_AVX)
8646 return "vpcmpeqd\t%0, %0, %0";
8647 else
8648 return "pcmpeqd\t%0, %0";
8650 default:
8651 break;
8653 gcc_unreachable ();
8658 /* Returns true if OP contains a symbol reference. */
8658 bool
8659 symbolic_reference_mentioned_p (rtx op)
8661 const char *fmt;
8662 int i;
8664 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8665 return true;
8667 fmt = GET_RTX_FORMAT (GET_CODE (op));
8668 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8670 if (fmt[i] == 'E')
8672 int j;
8674 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8675 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8676 return true;
8679 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8680 return true;
8683 return false;
8686 /* Return true if it is appropriate to emit `ret' instructions in the
8687 body of a function. Do this only if the epilogue is simple, needing a
8688 couple of insns. Prior to reloading, we can't tell how many registers
8689 must be saved, so return false then. Return false if there is no frame
8690 marker to de-allocate. */
8692 bool
8693 ix86_can_use_return_insn_p (void)
8695 struct ix86_frame frame;
8697 if (! reload_completed || frame_pointer_needed)
8698 return 0;
8700 /* Don't allow more than 32k pop, since that's all we can do
8701 with one instruction. */
8702 if (crtl->args.pops_args && crtl->args.size >= 32768)
8703 return 0;
8705 ix86_compute_frame_layout (&frame);
8706 return (frame.stack_pointer_offset == UNITS_PER_WORD
8707 && (frame.nregs + frame.nsseregs) == 0);
8710 /* Value should be nonzero if functions must have frame pointers.
8711 Zero means the frame pointer need not be set up (and parms may
8712 be accessed via the stack pointer) in functions that seem suitable. */
8714 static bool
8715 ix86_frame_pointer_required (void)
8717 /* If we accessed previous frames, then the generated code expects
8718 to be able to access the saved ebp value in our frame. */
8719 if (cfun->machine->accesses_prev_frame)
8720 return true;
8722 /* Several x86 os'es need a frame pointer for other reasons,
8723 usually pertaining to setjmp. */
8724 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8725 return true;
8727 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8728 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8729 return true;
8731 /* Win64 SEH: very large frames need a frame pointer, as the maximum
8732 stack allocation is 4GB. */
8733 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8734 return true;
8736 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8737 turns off the frame pointer by default. Turn it back on now if
8738 we've not got a leaf function. */
8739 if (TARGET_OMIT_LEAF_FRAME_POINTER
8740 && (!crtl->is_leaf
8741 || ix86_current_function_calls_tls_descriptor))
8742 return true;
8744 if (crtl->profile && !flag_fentry)
8745 return true;
8747 return false;
8750 /* Record that the current function accesses previous call frames. */
8752 void
8753 ix86_setup_frame_addresses (void)
8755 cfun->machine->accesses_prev_frame = 1;
8758 #ifndef USE_HIDDEN_LINKONCE
8759 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8760 # define USE_HIDDEN_LINKONCE 1
8761 # else
8762 # define USE_HIDDEN_LINKONCE 0
8763 # endif
8764 #endif
8766 static int pic_labels_used;
8768 /* Fills in the label name that should be used for a pc thunk for
8769 the given register. */
8771 static void
8772 get_pc_thunk_name (char name[32], unsigned int regno)
8774 gcc_assert (!TARGET_64BIT);
8776 if (USE_HIDDEN_LINKONCE)
8777 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8778 else
8779 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
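/* For illustration (a sketch of the usual output): for regno == BX_REG
   the thunk emitted by ix86_code_end below is essentially

     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx
             ret

   and a -fpic prologue then typically does

             call    __x86.get_pc_thunk.bx
             addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   to establish the GOT pointer (see output_set_got).  */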
8783 /* This function generates code for -fpic that loads %ebx with
8784 the return address of the caller and then returns. */
8786 static void
8787 ix86_code_end (void)
8789 rtx xops[2];
8790 int regno;
8792 for (regno = AX_REG; regno <= SP_REG; regno++)
8794 char name[32];
8795 tree decl;
8797 if (!(pic_labels_used & (1 << regno)))
8798 continue;
8800 get_pc_thunk_name (name, regno);
8802 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8803 get_identifier (name),
8804 build_function_type_list (void_type_node, NULL_TREE));
8805 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8806 NULL_TREE, void_type_node);
8807 TREE_PUBLIC (decl) = 1;
8808 TREE_STATIC (decl) = 1;
8809 DECL_IGNORED_P (decl) = 1;
8811 #if TARGET_MACHO
8812 if (TARGET_MACHO)
8814 switch_to_section (darwin_sections[text_coal_section]);
8815 fputs ("\t.weak_definition\t", asm_out_file);
8816 assemble_name (asm_out_file, name);
8817 fputs ("\n\t.private_extern\t", asm_out_file);
8818 assemble_name (asm_out_file, name);
8819 putc ('\n', asm_out_file);
8820 ASM_OUTPUT_LABEL (asm_out_file, name);
8821 DECL_WEAK (decl) = 1;
8823 else
8824 #endif
8825 if (USE_HIDDEN_LINKONCE)
8827 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8829 targetm.asm_out.unique_section (decl, 0);
8830 switch_to_section (get_named_section (decl, NULL, 0));
8832 targetm.asm_out.globalize_label (asm_out_file, name);
8833 fputs ("\t.hidden\t", asm_out_file);
8834 assemble_name (asm_out_file, name);
8835 putc ('\n', asm_out_file);
8836 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8838 else
8840 switch_to_section (text_section);
8841 ASM_OUTPUT_LABEL (asm_out_file, name);
8844 DECL_INITIAL (decl) = make_node (BLOCK);
8845 current_function_decl = decl;
8846 init_function_start (decl);
8847 first_function_block_is_cold = false;
8848 /* Make sure unwind info is emitted for the thunk if needed. */
8849 final_start_function (emit_barrier (), asm_out_file, 1);
8851 /* Pad stack IP move with 4 instructions (two NOPs count
8852 as one instruction). */
8853 if (TARGET_PAD_SHORT_FUNCTION)
8855 int i = 8;
8857 while (i--)
8858 fputs ("\tnop\n", asm_out_file);
8861 xops[0] = gen_rtx_REG (Pmode, regno);
8862 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8863 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8864 fputs ("\tret\n", asm_out_file);
8865 final_end_function ();
8866 init_insn_lengths ();
8867 free_after_compilation (cfun);
8868 set_cfun (NULL);
8869 current_function_decl = NULL;
8872 if (flag_split_stack)
8873 file_end_indicate_split_stack ();
8876 /* Emit code for the SET_GOT patterns. */
8878 const char *
8879 output_set_got (rtx dest, rtx label)
8881 rtx xops[3];
8883 xops[0] = dest;
8885 if (TARGET_VXWORKS_RTP && flag_pic)
8887 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8888 xops[2] = gen_rtx_MEM (Pmode,
8889 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8890 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8892 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8893 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8894 an unadorned address. */
8895 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8896 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8897 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8898 return "";
8901 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8903 if (!flag_pic)
8905 if (TARGET_MACHO)
8906 /* We don't need a pic base, we're not producing pic. */
8907 gcc_unreachable ();
8909 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8910 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8911 targetm.asm_out.internal_label (asm_out_file, "L",
8912 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8914 else
8916 char name[32];
8917 get_pc_thunk_name (name, REGNO (dest));
8918 pic_labels_used |= 1 << REGNO (dest);
8920 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8921 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8922 output_asm_insn ("call\t%X2", xops);
8924 #if TARGET_MACHO
8925 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8926 This is what will be referenced by the Mach-O PIC subsystem. */
8927 if (machopic_should_output_picbase_label () || !label)
8928 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8930 /* When we are restoring the pic base at the site of a nonlocal label,
8931 and we decided to emit the pic base above, we will still output a
8932 local label used for calculating the correction offset (even though
8933 the offset will be 0 in that case). */
8934 if (label)
8935 targetm.asm_out.internal_label (asm_out_file, "L",
8936 CODE_LABEL_NUMBER (label));
8937 #endif
8940 if (!TARGET_MACHO)
8941 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8943 return "";
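/* As a rough sketch of the common 32-bit PIC case above (assuming the thunk
   path is taken and DEST is %ebx), the emitted sequence looks like:

        call    __x86.get_pc_thunk.bx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk loads the return address (the address of the add) into
   %ebx and the add then materializes the GOT pointer.  The exact spelling
   depends on the target assembler.  */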
8946 /* Generate a "push" pattern for input ARG. */
8948 static rtx
8949 gen_push (rtx arg)
8951 struct machine_function *m = cfun->machine;
8953 if (m->fs.cfa_reg == stack_pointer_rtx)
8954 m->fs.cfa_offset += UNITS_PER_WORD;
8955 m->fs.sp_offset += UNITS_PER_WORD;
8957 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8958 arg = gen_rtx_REG (word_mode, REGNO (arg));
8960 return gen_rtx_SET (VOIDmode,
8961 gen_rtx_MEM (word_mode,
8962 gen_rtx_PRE_DEC (Pmode,
8963 stack_pointer_rtx)),
8964 arg);
8967 /* Generate a "pop" pattern for input ARG. */
8969 static rtx
8970 gen_pop (rtx arg)
8972 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8973 arg = gen_rtx_REG (word_mode, REGNO (arg));
8975 return gen_rtx_SET (VOIDmode,
8976 arg,
8977 gen_rtx_MEM (word_mode,
8978 gen_rtx_POST_INC (Pmode,
8979 stack_pointer_rtx)));
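/* An illustrative sketch (assuming a 32-bit word_mode): gen_push above builds
   RTL of the form

     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI arg))

   and gen_pop builds the mirror image

     (set (reg:SI arg) (mem:SI (post_inc:SI (reg:SI sp))))

   with gen_push additionally updating the tracked CFA/SP offsets.  */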
8982 /* Return >= 0 if there is an unused call-clobbered register available
8983 for the entire function. */
8985 static unsigned int
8986 ix86_select_alt_pic_regnum (void)
8988 if (crtl->is_leaf
8989 && !crtl->profile
8990 && !ix86_current_function_calls_tls_descriptor)
8992 int i, drap;
8993 /* Can't use the same register for both PIC and DRAP. */
8994 if (crtl->drap_reg)
8995 drap = REGNO (crtl->drap_reg);
8996 else
8997 drap = -1;
8998 for (i = 2; i >= 0; --i)
8999 if (i != drap && !df_regs_ever_live_p (i))
9000 return i;
9003 return INVALID_REGNUM;
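/* Note for the reader: the loop above scans hard register numbers 2..0,
   i.e. (in this backend's register order, an assumption of this note)
   %ecx, %edx and %eax, and picks the first one that is neither the DRAP
   register nor ever live in the function.  */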
9006 /* Return TRUE if we need to save REGNO. */
9008 static bool
9009 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9011 if (pic_offset_table_rtx
9012 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9013 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9014 || crtl->profile
9015 || crtl->calls_eh_return
9016 || crtl->uses_const_pool
9017 || cfun->has_nonlocal_label))
9018 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9020 if (crtl->calls_eh_return && maybe_eh_return)
9022 unsigned i;
9023 for (i = 0; ; i++)
9025 unsigned test = EH_RETURN_DATA_REGNO (i);
9026 if (test == INVALID_REGNUM)
9027 break;
9028 if (test == regno)
9029 return true;
9033 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9034 return true;
9036 return (df_regs_ever_live_p (regno)
9037 && !call_used_regs[regno]
9038 && !fixed_regs[regno]
9039 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9042 /* Return the number of saved general purpose registers. */
9044 static int
9045 ix86_nsaved_regs (void)
9047 int nregs = 0;
9048 int regno;
9050 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9051 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9052 nregs ++;
9053 return nregs;
9056 /* Return the number of saved SSE registers. */
9058 static int
9059 ix86_nsaved_sseregs (void)
9061 int nregs = 0;
9062 int regno;
9064 if (!TARGET_64BIT_MS_ABI)
9065 return 0;
9066 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9067 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9068 nregs ++;
9069 return nregs;
9072 /* Given FROM and TO register numbers, say whether this elimination is
9073 allowed. If stack alignment is needed, we can only replace argument
9074 pointer with hard frame pointer, or replace frame pointer with stack
9075 pointer. Otherwise, frame pointer elimination is automatically
9076 handled and all other eliminations are valid. */
9078 static bool
9079 ix86_can_eliminate (const int from, const int to)
9081 if (stack_realign_fp)
9082 return ((from == ARG_POINTER_REGNUM
9083 && to == HARD_FRAME_POINTER_REGNUM)
9084 || (from == FRAME_POINTER_REGNUM
9085 && to == STACK_POINTER_REGNUM));
9086 else
9087 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9090 /* Return the offset between two registers, one to be eliminated, and the other
9091 its replacement, at the start of a routine. */
9093 HOST_WIDE_INT
9094 ix86_initial_elimination_offset (int from, int to)
9096 struct ix86_frame frame;
9097 ix86_compute_frame_layout (&frame);
9099 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9100 return frame.hard_frame_pointer_offset;
9101 else if (from == FRAME_POINTER_REGNUM
9102 && to == HARD_FRAME_POINTER_REGNUM)
9103 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9104 else
9106 gcc_assert (to == STACK_POINTER_REGNUM);
9108 if (from == ARG_POINTER_REGNUM)
9109 return frame.stack_pointer_offset;
9111 gcc_assert (from == FRAME_POINTER_REGNUM);
9112 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9116 /* In a dynamically-aligned function, we can't know the offset from
9117 stack pointer to frame pointer, so we must ensure that setjmp
9118 eliminates fp against the hard fp (%ebp) rather than trying to
9119 index from %esp up to the top of the frame across a gap that is
9120 of unknown (at compile-time) size. */
9121 static rtx
9122 ix86_builtin_setjmp_frame_value (void)
9124 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9127 /* When using -fsplit-stack, the allocation routines set a field in
9128 the TCB to the bottom of the stack plus this much space, measured
9129 in bytes. */
9131 #define SPLIT_STACK_AVAILABLE 256
9133 /* Fill structure ix86_frame about frame of currently computed function. */
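/* An illustrative sketch of the layout computed below, from higher to lower
   addresses (some areas are absent depending on the function; offsets grow
   downwards, roughly as distances below the CFA):

     return address                        <- offset UNITS_PER_WORD
     [pushed static chain]
     [saved frame pointer]                 <- hfp_save_offset
     saved general purpose registers       <- reg_save_offset
     [aligned SSE register save area]      <- sse_reg_save_offset
     [va_arg register save area]
     local variables                       <- frame_pointer_offset
     [outgoing argument area]
                                           <- stack_pointer_offset

   This is only a reading aid; the authoritative definitions are the
   assignments in the function below.  */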
9135 static void
9136 ix86_compute_frame_layout (struct ix86_frame *frame)
9138 unsigned HOST_WIDE_INT stack_alignment_needed;
9139 HOST_WIDE_INT offset;
9140 unsigned HOST_WIDE_INT preferred_alignment;
9141 HOST_WIDE_INT size = get_frame_size ();
9142 HOST_WIDE_INT to_allocate;
9144 frame->nregs = ix86_nsaved_regs ();
9145 frame->nsseregs = ix86_nsaved_sseregs ();
9147 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9148 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9150 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
9151 except in function prologues and in leaf functions. */
9152 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9153 && (!crtl->is_leaf || cfun->calls_alloca != 0
9154 || ix86_current_function_calls_tls_descriptor))
9156 preferred_alignment = 16;
9157 stack_alignment_needed = 16;
9158 crtl->preferred_stack_boundary = 128;
9159 crtl->stack_alignment_needed = 128;
9162 gcc_assert (!size || stack_alignment_needed);
9163 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9164 gcc_assert (preferred_alignment <= stack_alignment_needed);
9166 /* For SEH we have to limit the amount of code movement into the prologue.
9167 At present we do this via a BLOCKAGE, at which point there's very little
9168 scheduling that can be done, which means that there's very little point
9169 in doing anything except PUSHs. */
9170 if (TARGET_SEH)
9171 cfun->machine->use_fast_prologue_epilogue = false;
9173 /* During reload iterations the number of registers saved can change.
9174 Recompute the value as needed. Do not recompute when the number of registers
9175 didn't change, as reload does multiple calls to the function and does not
9176 expect the decision to change within a single iteration. */
9177 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9178 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9180 int count = frame->nregs;
9181 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9183 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9185 /* The fast prologue uses move instead of push to save registers. This
9186 is significantly longer, but also executes faster as modern hardware
9187 can execute the moves in parallel, but can't do that for push/pop.
9189 Be careful about choosing which prologue to emit: when the function takes
9190 many instructions to execute, we may as well use the slow version, and
9191 likewise when the function is known to be outside a hot spot (this is
9192 known with feedback only). Weight the size of the function by the number
9193 of registers to save, as it is cheap to use one or two push instructions
9194 but very slow to use many of them. */
9195 if (count)
9196 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9197 if (node->frequency < NODE_FREQUENCY_NORMAL
9198 || (flag_branch_probabilities
9199 && node->frequency < NODE_FREQUENCY_HOT))
9200 cfun->machine->use_fast_prologue_epilogue = false;
9201 else
9202 cfun->machine->use_fast_prologue_epilogue
9203 = !expensive_function_p (count);
9206 frame->save_regs_using_mov
9207 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9208 /* If static stack checking is enabled and done with probes,
9209 the registers need to be saved before allocating the frame. */
9210 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9212 /* Skip return address. */
9213 offset = UNITS_PER_WORD;
9215 /* Skip pushed static chain. */
9216 if (ix86_static_chain_on_stack)
9217 offset += UNITS_PER_WORD;
9219 /* Skip saved base pointer. */
9220 if (frame_pointer_needed)
9221 offset += UNITS_PER_WORD;
9222 frame->hfp_save_offset = offset;
9224 /* The traditional frame pointer location is at the top of the frame. */
9225 frame->hard_frame_pointer_offset = offset;
9227 /* Register save area */
9228 offset += frame->nregs * UNITS_PER_WORD;
9229 frame->reg_save_offset = offset;
9231 /* On SEH target, registers are pushed just before the frame pointer
9232 location. */
9233 if (TARGET_SEH)
9234 frame->hard_frame_pointer_offset = offset;
9236 /* Align and set SSE register save area. */
9237 if (frame->nsseregs)
9239 /* The only ABI that has saved SSE registers (Win64) also has a
9240 16-byte aligned default stack, and thus we don't need to be
9241 within the re-aligned local stack frame to save them. */
9242 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9243 offset = (offset + 16 - 1) & -16;
9244 offset += frame->nsseregs * 16;
9246 frame->sse_reg_save_offset = offset;
9248 /* The re-aligned stack starts here. Values before this point are not
9249 directly comparable with values below this point. In order to make
9250 sure that no value happens to be the same before and after, force
9251 the alignment computation below to add a non-zero value. */
9252 if (stack_realign_fp)
9253 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9255 /* Va-arg area */
9256 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9257 offset += frame->va_arg_size;
9259 /* Align start of frame for local function. */
9260 if (stack_realign_fp
9261 || offset != frame->sse_reg_save_offset
9262 || size != 0
9263 || !crtl->is_leaf
9264 || cfun->calls_alloca
9265 || ix86_current_function_calls_tls_descriptor)
9266 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9268 /* Frame pointer points here. */
9269 frame->frame_pointer_offset = offset;
9271 offset += size;
9273 /* Add the outgoing arguments area. It can be skipped if we eliminated
9274 all the function calls as dead code.
9275 Skipping is however impossible when the function calls alloca, since the
9276 alloca expander assumes that the last crtl->outgoing_args_size bytes
9277 of the stack frame are unused. */
9278 if (ACCUMULATE_OUTGOING_ARGS
9279 && (!crtl->is_leaf || cfun->calls_alloca
9280 || ix86_current_function_calls_tls_descriptor))
9282 offset += crtl->outgoing_args_size;
9283 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9285 else
9286 frame->outgoing_arguments_size = 0;
9288 /* Align stack boundary. Only needed if we're calling another function
9289 or using alloca. */
9290 if (!crtl->is_leaf || cfun->calls_alloca
9291 || ix86_current_function_calls_tls_descriptor)
9292 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9294 /* We've reached end of stack frame. */
9295 frame->stack_pointer_offset = offset;
9297 /* The size the prologue needs to allocate. */
9298 to_allocate = offset - frame->sse_reg_save_offset;
9300 if ((!to_allocate && frame->nregs <= 1)
9301 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9302 frame->save_regs_using_mov = false;
9304 if (ix86_using_red_zone ()
9305 && crtl->sp_is_unchanging
9306 && crtl->is_leaf
9307 && !ix86_current_function_calls_tls_descriptor)
9309 frame->red_zone_size = to_allocate;
9310 if (frame->save_regs_using_mov)
9311 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9312 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9313 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9315 else
9316 frame->red_zone_size = 0;
9317 frame->stack_pointer_offset -= frame->red_zone_size;
9319 /* The SEH frame pointer location is near the bottom of the frame.
9320 This is enforced by the fact that the difference between the
9321 stack pointer and the frame pointer is limited to 240 bytes in
9322 the unwind data structure. */
9323 if (TARGET_SEH)
9325 HOST_WIDE_INT diff;
9327 /* If we can leave the frame pointer where it is, do so. Also, returns
9328 the establisher frame for __builtin_frame_address (0). */
9329 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9330 if (diff <= SEH_MAX_FRAME_SIZE
9331 && (diff > 240 || (diff & 15) != 0)
9332 && !crtl->accesses_prior_frames)
9334 /* Ideally we'd determine what portion of the local stack frame
9335 (within the constraint of the lowest 240) is most heavily used.
9336 But without that complication, simply bias the frame pointer
9337 by 128 bytes so as to maximize the amount of the local stack
9338 frame that is addressable with 8-bit offsets. */
9339 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9344 /* This is semi-inlined memory_address_length, but simplified
9345 since we know that we're always dealing with reg+offset, and
9346 to avoid having to create and discard all that rtl. */
9348 static inline int
9349 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9351 int len = 4;
9353 if (offset == 0)
9355 /* EBP and R13 cannot be encoded without an offset. */
9356 len = (regno == BP_REG || regno == R13_REG);
9358 else if (IN_RANGE (offset, -128, 127))
9359 len = 1;
9361 /* ESP and R12 must be encoded with a SIB byte. */
9362 if (regno == SP_REG || regno == R12_REG)
9363 len++;
9365 return len;
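/* A few illustrative values for the encoding-length estimate above (a sketch
   traced from the code, not an exhaustive table): (%eax, 0) -> 0 bytes of
   offset/SIB, (%ebp, 0) -> 1 (forced disp8), (%esp, 0) -> 1 (SIB byte),
   (%eax, 100) -> 1 (disp8), (%eax, 1000) -> 4 (disp32),
   (%esp, 1000) -> 5 (disp32 + SIB).  */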
9368 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9369 The valid base registers are taken from CFUN->MACHINE->FS. */
9371 static rtx
9372 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9374 const struct machine_function *m = cfun->machine;
9375 rtx base_reg = NULL;
9376 HOST_WIDE_INT base_offset = 0;
9378 if (m->use_fast_prologue_epilogue)
9380 /* Choose the base register most likely to allow the most scheduling
9381 opportunities. Generally FP is valid throughout the function,
9382 while DRAP must be reloaded within the epilogue. But choose either
9383 over the SP due to increased encoding size. */
9385 if (m->fs.fp_valid)
9387 base_reg = hard_frame_pointer_rtx;
9388 base_offset = m->fs.fp_offset - cfa_offset;
9390 else if (m->fs.drap_valid)
9392 base_reg = crtl->drap_reg;
9393 base_offset = 0 - cfa_offset;
9395 else if (m->fs.sp_valid)
9397 base_reg = stack_pointer_rtx;
9398 base_offset = m->fs.sp_offset - cfa_offset;
9401 else
9403 HOST_WIDE_INT toffset;
9404 int len = 16, tlen;
9406 /* Choose the base register with the smallest address encoding.
9407 With a tie, choose FP > DRAP > SP. */
9408 if (m->fs.sp_valid)
9410 base_reg = stack_pointer_rtx;
9411 base_offset = m->fs.sp_offset - cfa_offset;
9412 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9414 if (m->fs.drap_valid)
9416 toffset = 0 - cfa_offset;
9417 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9418 if (tlen <= len)
9420 base_reg = crtl->drap_reg;
9421 base_offset = toffset;
9422 len = tlen;
9425 if (m->fs.fp_valid)
9427 toffset = m->fs.fp_offset - cfa_offset;
9428 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9429 if (tlen <= len)
9431 base_reg = hard_frame_pointer_rtx;
9432 base_offset = toffset;
9433 len = tlen;
9437 gcc_assert (base_reg != NULL);
9439 return plus_constant (Pmode, base_reg, base_offset);
9442 /* Emit code to save registers in the prologue. */
9444 static void
9445 ix86_emit_save_regs (void)
9447 unsigned int regno;
9448 rtx insn;
9450 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9451 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9453 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9454 RTX_FRAME_RELATED_P (insn) = 1;
9458 /* Emit a single register save at CFA - CFA_OFFSET. */
9460 static void
9461 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9462 HOST_WIDE_INT cfa_offset)
9464 struct machine_function *m = cfun->machine;
9465 rtx reg = gen_rtx_REG (mode, regno);
9466 rtx mem, addr, base, insn;
9468 addr = choose_baseaddr (cfa_offset);
9469 mem = gen_frame_mem (mode, addr);
9471 /* For SSE saves, we need to indicate the 128-bit alignment. */
9472 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9474 insn = emit_move_insn (mem, reg);
9475 RTX_FRAME_RELATED_P (insn) = 1;
9477 base = addr;
9478 if (GET_CODE (base) == PLUS)
9479 base = XEXP (base, 0);
9480 gcc_checking_assert (REG_P (base));
9482 /* When saving registers into a re-aligned local stack frame, avoid
9483 any tricky guessing by dwarf2out. */
9484 if (m->fs.realigned)
9486 gcc_checking_assert (stack_realign_drap);
9488 if (regno == REGNO (crtl->drap_reg))
9490 /* A bit of a hack. We force the DRAP register to be saved in
9491 the re-aligned stack frame, which provides us with a copy
9492 of the CFA that will last past the prologue. Install it. */
9493 gcc_checking_assert (cfun->machine->fs.fp_valid);
9494 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9495 cfun->machine->fs.fp_offset - cfa_offset);
9496 mem = gen_rtx_MEM (mode, addr);
9497 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9499 else
9501 /* The frame pointer is a stable reference within the
9502 aligned frame. Use it. */
9503 gcc_checking_assert (cfun->machine->fs.fp_valid);
9504 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9505 cfun->machine->fs.fp_offset - cfa_offset);
9506 mem = gen_rtx_MEM (mode, addr);
9507 add_reg_note (insn, REG_CFA_EXPRESSION,
9508 gen_rtx_SET (VOIDmode, mem, reg));
9512 /* The memory may not be relative to the current CFA register,
9513 which means that we may need to generate a new pattern for
9514 use by the unwind info. */
9515 else if (base != m->fs.cfa_reg)
9517 addr = plus_constant (Pmode, m->fs.cfa_reg,
9518 m->fs.cfa_offset - cfa_offset);
9519 mem = gen_rtx_MEM (mode, addr);
9520 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9524 /* Emit code to save registers using MOV insns.
9525 First register is stored at CFA - CFA_OFFSET. */
9526 static void
9527 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9529 unsigned int regno;
9531 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9532 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9534 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9535 cfa_offset -= UNITS_PER_WORD;
9539 /* Emit code to save SSE registers using MOV insns.
9540 First register is stored at CFA - CFA_OFFSET. */
9541 static void
9542 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9544 unsigned int regno;
9546 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9547 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9549 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9550 cfa_offset -= 16;
9554 static GTY(()) rtx queued_cfa_restores;
9556 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9557 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9558 Don't add the note if the previously saved value will be left untouched
9559 within stack red-zone till return, as unwinders can find the same value
9560 in the register and on the stack. */
9562 static void
9563 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9565 if (!crtl->shrink_wrapped
9566 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9567 return;
9569 if (insn)
9571 add_reg_note (insn, REG_CFA_RESTORE, reg);
9572 RTX_FRAME_RELATED_P (insn) = 1;
9574 else
9575 queued_cfa_restores
9576 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9579 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9581 static void
9582 ix86_add_queued_cfa_restore_notes (rtx insn)
9584 rtx last;
9585 if (!queued_cfa_restores)
9586 return;
9587 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9589 XEXP (last, 1) = REG_NOTES (insn);
9590 REG_NOTES (insn) = queued_cfa_restores;
9591 queued_cfa_restores = NULL_RTX;
9592 RTX_FRAME_RELATED_P (insn) = 1;
9595 /* Expand a prologue or epilogue stack adjustment.
9596 The pattern exists to put a dependency on all ebp-based memory accesses.
9597 STYLE should be negative if instructions should be marked as frame related,
9598 zero if the %r11 register is live and cannot be freely used, and positive
9599 otherwise. */
9601 static void
9602 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9603 int style, bool set_cfa)
9605 struct machine_function *m = cfun->machine;
9606 rtx insn;
9607 bool add_frame_related_expr = false;
9609 if (Pmode == SImode)
9610 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9611 else if (x86_64_immediate_operand (offset, DImode))
9612 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9613 else
9615 rtx tmp;
9616 /* r11 is used by indirect sibcall return as well, set before the
9617 epilogue and used after the epilogue. */
9618 if (style)
9619 tmp = gen_rtx_REG (DImode, R11_REG);
9620 else
9622 gcc_assert (src != hard_frame_pointer_rtx
9623 && dest != hard_frame_pointer_rtx);
9624 tmp = hard_frame_pointer_rtx;
9626 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9627 if (style < 0)
9628 add_frame_related_expr = true;
9630 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9633 insn = emit_insn (insn);
9634 if (style >= 0)
9635 ix86_add_queued_cfa_restore_notes (insn);
9637 if (set_cfa)
9639 rtx r;
9641 gcc_assert (m->fs.cfa_reg == src);
9642 m->fs.cfa_offset += INTVAL (offset);
9643 m->fs.cfa_reg = dest;
9645 r = gen_rtx_PLUS (Pmode, src, offset);
9646 r = gen_rtx_SET (VOIDmode, dest, r);
9647 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9648 RTX_FRAME_RELATED_P (insn) = 1;
9650 else if (style < 0)
9652 RTX_FRAME_RELATED_P (insn) = 1;
9653 if (add_frame_related_expr)
9655 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9656 r = gen_rtx_SET (VOIDmode, dest, r);
9657 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9661 if (dest == stack_pointer_rtx)
9663 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9664 bool valid = m->fs.sp_valid;
9666 if (src == hard_frame_pointer_rtx)
9668 valid = m->fs.fp_valid;
9669 ooffset = m->fs.fp_offset;
9671 else if (src == crtl->drap_reg)
9673 valid = m->fs.drap_valid;
9674 ooffset = 0;
9676 else
9678 /* Else there are two possibilities: SP itself, which we set
9679 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9680 taken care of by hand along the eh_return path. */
9681 gcc_checking_assert (src == stack_pointer_rtx
9682 || offset == const0_rtx);
9685 m->fs.sp_offset = ooffset - INTVAL (offset);
9686 m->fs.sp_valid = valid;
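/* As a sketch of the large-offset path above (an illustration, not the
   literal expansion): on x86-64, when OFFSET does not fit in a sign-extended
   32-bit immediate, the adjustment is split into something like

     movabsq  $offset, %r11     # or %rbp when STYLE == 0
     addq     %r11, %rsp

   since no single add instruction takes a 64-bit immediate.  */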
9690 /* Find an available register to be used as the dynamic realign argument
9691 pointer register. Such a register will be written in the prologue and
9692 used at the beginning of the body, so it must not be
9693 1. a parameter passing register.
9694 2. the GOT pointer.
9695 We reuse the static-chain register if it is available. Otherwise, we
9696 use DI for i386 and R13 for x86-64. We chose R13 since it has
9697 shorter encoding.
9699 Return: the regno of the chosen register. */
9701 static unsigned int
9702 find_drap_reg (void)
9704 tree decl = cfun->decl;
9706 if (TARGET_64BIT)
9708 /* Use R13 for a nested function or a function that needs a static chain.
9709 Since a function with a tail call may use any caller-saved
9710 registers in the epilogue, DRAP must not use a caller-saved
9711 register in that case. */
9712 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9713 return R13_REG;
9715 return R10_REG;
9717 else
9719 /* Use DI for a nested function or a function that needs a static chain.
9720 Since a function with a tail call may use any caller-saved
9721 registers in the epilogue, DRAP must not use a caller-saved
9722 register in that case. */
9723 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9724 return DI_REG;
9726 /* Reuse static chain register if it isn't used for parameter
9727 passing. */
9728 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9730 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9731 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9732 return CX_REG;
9734 return DI_REG;
9738 /* Return minimum incoming stack alignment. */
9740 static unsigned int
9741 ix86_minimum_incoming_stack_boundary (bool sibcall)
9743 unsigned int incoming_stack_boundary;
9745 /* Prefer the one specified at command line. */
9746 if (ix86_user_incoming_stack_boundary)
9747 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9748 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9749 if -mstackrealign is used, this isn't a sibcall check, and the
9750 estimated stack alignment is 128 bits. */
9751 else if (!sibcall
9752 && !TARGET_64BIT
9753 && ix86_force_align_arg_pointer
9754 && crtl->stack_alignment_estimated == 128)
9755 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9756 else
9757 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9759 /* Incoming stack alignment can be changed on individual functions
9760 via force_align_arg_pointer attribute. We use the smallest
9761 incoming stack boundary. */
9762 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9763 && lookup_attribute (ix86_force_align_arg_pointer_string,
9764 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9765 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9767 /* The incoming stack frame has to be aligned at least at
9768 parm_stack_boundary. */
9769 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9770 incoming_stack_boundary = crtl->parm_stack_boundary;
9772 /* Stack at entrance of main is aligned by runtime. We use the
9773 smallest incoming stack boundary. */
9774 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9775 && DECL_NAME (current_function_decl)
9776 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9777 && DECL_FILE_SCOPE_P (current_function_decl))
9778 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9780 return incoming_stack_boundary;
9783 /* Update incoming stack boundary and estimated stack alignment. */
9785 static void
9786 ix86_update_stack_boundary (void)
9788 ix86_incoming_stack_boundary
9789 = ix86_minimum_incoming_stack_boundary (false);
9791 /* x86_64 varargs need 16-byte stack alignment for the register save
9792 area. */
9793 if (TARGET_64BIT
9794 && cfun->stdarg
9795 && crtl->stack_alignment_estimated < 128)
9796 crtl->stack_alignment_estimated = 128;
9799 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9800 needed or an rtx for DRAP otherwise. */
9802 static rtx
9803 ix86_get_drap_rtx (void)
9805 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9806 crtl->need_drap = true;
9808 if (stack_realign_drap)
9810 /* Assign DRAP to vDRAP and return vDRAP. */
9811 unsigned int regno = find_drap_reg ();
9812 rtx drap_vreg;
9813 rtx arg_ptr;
9814 rtx seq, insn;
9816 arg_ptr = gen_rtx_REG (Pmode, regno);
9817 crtl->drap_reg = arg_ptr;
9819 start_sequence ();
9820 drap_vreg = copy_to_reg (arg_ptr);
9821 seq = get_insns ();
9822 end_sequence ();
9824 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9825 if (!optimize)
9827 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9828 RTX_FRAME_RELATED_P (insn) = 1;
9830 return drap_vreg;
9832 else
9833 return NULL;
9836 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9838 static rtx
9839 ix86_internal_arg_pointer (void)
9841 return virtual_incoming_args_rtx;
9844 struct scratch_reg {
9845 rtx reg;
9846 bool saved;
9849 /* Return a short-lived scratch register for use on function entry.
9850 In 32-bit mode, it is valid only after the registers are saved
9851 in the prologue. This register must be released by means of
9852 release_scratch_register_on_entry once it is dead. */
9854 static void
9855 get_scratch_register_on_entry (struct scratch_reg *sr)
9857 int regno;
9859 sr->saved = false;
9861 if (TARGET_64BIT)
9863 /* We always use R11 in 64-bit mode. */
9864 regno = R11_REG;
9866 else
9868 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9869 bool fastcall_p
9870 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9871 bool thiscall_p
9872 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9873 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9874 int regparm = ix86_function_regparm (fntype, decl);
9875 int drap_regno
9876 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9878 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9879 for the static chain register. */
9880 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9881 && drap_regno != AX_REG)
9882 regno = AX_REG;
9883 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9884 for the static chain register. */
9885 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9886 regno = AX_REG;
9887 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9888 regno = DX_REG;
9889 /* ecx is the static chain register. */
9890 else if (regparm < 3 && !fastcall_p && !thiscall_p
9891 && !static_chain_p
9892 && drap_regno != CX_REG)
9893 regno = CX_REG;
9894 else if (ix86_save_reg (BX_REG, true))
9895 regno = BX_REG;
9896 /* esi is the static chain register. */
9897 else if (!(regparm == 3 && static_chain_p)
9898 && ix86_save_reg (SI_REG, true))
9899 regno = SI_REG;
9900 else if (ix86_save_reg (DI_REG, true))
9901 regno = DI_REG;
9902 else
9904 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9905 sr->saved = true;
9909 sr->reg = gen_rtx_REG (Pmode, regno);
9910 if (sr->saved)
9912 rtx insn = emit_insn (gen_push (sr->reg));
9913 RTX_FRAME_RELATED_P (insn) = 1;
9917 /* Release a scratch register obtained from the preceding function. */
9919 static void
9920 release_scratch_register_on_entry (struct scratch_reg *sr)
9922 if (sr->saved)
9924 struct machine_function *m = cfun->machine;
9925 rtx x, insn = emit_insn (gen_pop (sr->reg));
9927 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9928 RTX_FRAME_RELATED_P (insn) = 1;
9929 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9930 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9931 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9932 m->fs.sp_offset -= UNITS_PER_WORD;
9936 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
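/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP (12), this is a
   probe every 4096 bytes; the exact value is a per-target assumption.  */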
9938 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9940 static void
9941 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9943 /* We skip the probe for the first interval + a small dope of 4 words and
9944 probe that many bytes past the specified size to maintain a protection
9945 area at the bottom of the stack. */
9946 const int dope = 4 * UNITS_PER_WORD;
9947 rtx size_rtx = GEN_INT (size), last;
9949 /* See if we have a constant small number of probes to generate. If so,
9950 that's the easy case. The run-time loop is made up of 11 insns in the
9951 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9952 for n # of intervals. */
9953 if (size <= 5 * PROBE_INTERVAL)
9955 HOST_WIDE_INT i, adjust;
9956 bool first_probe = true;
9958 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9959 values of N from 1 until it exceeds SIZE. If only one probe is
9960 needed, this will not generate any code. Then adjust and probe
9961 to PROBE_INTERVAL + SIZE. */
9962 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9964 if (first_probe)
9966 adjust = 2 * PROBE_INTERVAL + dope;
9967 first_probe = false;
9969 else
9970 adjust = PROBE_INTERVAL;
9972 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9973 plus_constant (Pmode, stack_pointer_rtx,
9974 -adjust)));
9975 emit_stack_probe (stack_pointer_rtx);
9978 if (first_probe)
9979 adjust = size + PROBE_INTERVAL + dope;
9980 else
9981 adjust = size + PROBE_INTERVAL - i;
9983 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9984 plus_constant (Pmode, stack_pointer_rtx,
9985 -adjust)));
9986 emit_stack_probe (stack_pointer_rtx);
9988 /* Adjust back to account for the additional first interval. */
9989 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9990 plus_constant (Pmode, stack_pointer_rtx,
9991 PROBE_INTERVAL + dope)));
9994 /* Otherwise, do the same as above, but in a loop. Note that we must be
9995 extra careful with variables wrapping around because we might be at
9996 the very top (or the very bottom) of the address space and we have
9997 to be able to handle this case properly; in particular, we use an
9998 equality test for the loop condition. */
9999 else
10001 HOST_WIDE_INT rounded_size;
10002 struct scratch_reg sr;
10004 get_scratch_register_on_entry (&sr);
10007 /* Step 1: round SIZE to the previous multiple of the interval. */
10009 rounded_size = size & -PROBE_INTERVAL;
10012 /* Step 2: compute initial and final value of the loop counter. */
10014 /* SP = SP_0 + PROBE_INTERVAL. */
10015 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10016 plus_constant (Pmode, stack_pointer_rtx,
10017 - (PROBE_INTERVAL + dope))));
10019 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10020 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10021 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10022 gen_rtx_PLUS (Pmode, sr.reg,
10023 stack_pointer_rtx)));
10026 /* Step 3: the loop
10028 while (SP != LAST_ADDR)
10030 SP = SP + PROBE_INTERVAL
10031 probe at SP
10034 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10035 values of N from 1 until it is equal to ROUNDED_SIZE. */
10037 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10040 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10041 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10043 if (size != rounded_size)
10045 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10046 plus_constant (Pmode, stack_pointer_rtx,
10047 rounded_size - size)));
10048 emit_stack_probe (stack_pointer_rtx);
10051 /* Adjust back to account for the additional first interval. */
10052 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10053 plus_constant (Pmode, stack_pointer_rtx,
10054 PROBE_INTERVAL + dope)));
10056 release_scratch_register_on_entry (&sr);
10059 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10061 /* Even if the stack pointer isn't the CFA register, we need to correctly
10062 describe the adjustments made to it, in particular differentiate the
10063 frame-related ones from the frame-unrelated ones. */
10064 if (size > 0)
10066 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10067 XVECEXP (expr, 0, 0)
10068 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10069 plus_constant (Pmode, stack_pointer_rtx, -size));
10070 XVECEXP (expr, 0, 1)
10071 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10072 plus_constant (Pmode, stack_pointer_rtx,
10073 PROBE_INTERVAL + dope + size));
10074 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10075 RTX_FRAME_RELATED_P (last) = 1;
10077 cfun->machine->fs.sp_offset += size;
10080 /* Make sure nothing is scheduled before we are done. */
10081 emit_insn (gen_blockage ());
10084 /* Adjust the stack pointer up to REG while probing it. */
10086 const char *
10087 output_adjust_stack_and_probe (rtx reg)
10089 static int labelno = 0;
10090 char loop_lab[32], end_lab[32];
10091 rtx xops[2];
10093 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10094 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10096 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10098 /* Jump to END_LAB if SP == LAST_ADDR. */
10099 xops[0] = stack_pointer_rtx;
10100 xops[1] = reg;
10101 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10102 fputs ("\tje\t", asm_out_file);
10103 assemble_name_raw (asm_out_file, end_lab);
10104 fputc ('\n', asm_out_file);
10106 /* SP = SP + PROBE_INTERVAL. */
10107 xops[1] = GEN_INT (PROBE_INTERVAL);
10108 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10110 /* Probe at SP. */
10111 xops[1] = const0_rtx;
10112 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10114 fprintf (asm_out_file, "\tjmp\t");
10115 assemble_name_raw (asm_out_file, loop_lab);
10116 fputc ('\n', asm_out_file);
10118 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10120 return "";
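/* For reference, the loop emitted above looks roughly like this on a 32-bit
   target with a 4096-byte probe interval (register and labels are
   illustrative only):

   .LPSRL0:
        cmpl    %eax, %esp
        je      .LPSRE0
        subl    $4096, %esp
        orl     $0, (%esp)
        jmp     .LPSRL0
   .LPSRE0:

   i.e. it moves the stack pointer down one interval at a time, touching each
   new page, until it reaches the precomputed last address held in REG.  */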
10123 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10124 inclusive. These are offsets from the current stack pointer. */
10126 static void
10127 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10129 /* See if we have a constant small number of probes to generate. If so,
10130 that's the easy case. The run-time loop is made up of 7 insns in the
10131 generic case while the compile-time loop is made up of n insns for n #
10132 of intervals. */
10133 if (size <= 7 * PROBE_INTERVAL)
10135 HOST_WIDE_INT i;
10137 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10138 it exceeds SIZE. If only one probe is needed, this will not
10139 generate any code. Then probe at FIRST + SIZE. */
10140 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10141 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10142 -(first + i)));
10144 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10145 -(first + size)));
10148 /* Otherwise, do the same as above, but in a loop. Note that we must be
10149 extra careful with variables wrapping around because we might be at
10150 the very top (or the very bottom) of the address space and we have
10151 to be able to handle this case properly; in particular, we use an
10152 equality test for the loop condition. */
10153 else
10155 HOST_WIDE_INT rounded_size, last;
10156 struct scratch_reg sr;
10158 get_scratch_register_on_entry (&sr);
10161 /* Step 1: round SIZE to the previous multiple of the interval. */
10163 rounded_size = size & -PROBE_INTERVAL;
10166 /* Step 2: compute initial and final value of the loop counter. */
10168 /* TEST_OFFSET = FIRST. */
10169 emit_move_insn (sr.reg, GEN_INT (-first));
10171 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10172 last = first + rounded_size;
10175 /* Step 3: the loop
10177 while (TEST_ADDR != LAST_ADDR)
10179 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10180 probe at TEST_ADDR
10183 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10184 until it is equal to ROUNDED_SIZE. */
10186 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10189 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10190 that SIZE is equal to ROUNDED_SIZE. */
10192 if (size != rounded_size)
10193 emit_stack_probe (plus_constant (Pmode,
10194 gen_rtx_PLUS (Pmode,
10195 stack_pointer_rtx,
10196 sr.reg),
10197 rounded_size - size));
10199 release_scratch_register_on_entry (&sr);
10202 /* Make sure nothing is scheduled before we are done. */
10203 emit_insn (gen_blockage ());
10206 /* Probe a range of stack addresses from REG to END, inclusive. These are
10207 offsets from the current stack pointer. */
10209 const char *
10210 output_probe_stack_range (rtx reg, rtx end)
10212 static int labelno = 0;
10213 char loop_lab[32], end_lab[32];
10214 rtx xops[3];
10216 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10217 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10219 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10221 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10222 xops[0] = reg;
10223 xops[1] = end;
10224 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10225 fputs ("\tje\t", asm_out_file);
10226 assemble_name_raw (asm_out_file, end_lab);
10227 fputc ('\n', asm_out_file);
10229 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10230 xops[1] = GEN_INT (PROBE_INTERVAL);
10231 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10233 /* Probe at TEST_ADDR. */
10234 xops[0] = stack_pointer_rtx;
10235 xops[1] = reg;
10236 xops[2] = const0_rtx;
10237 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10239 fprintf (asm_out_file, "\tjmp\t");
10240 assemble_name_raw (asm_out_file, loop_lab);
10241 fputc ('\n', asm_out_file);
10243 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10245 return "";
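/* For reference, this variant keeps the stack pointer fixed and instead walks
   a (negative) probe offset held in REG; a rough sketch of the emitted loop
   on a 32-bit target, with a 4096-byte interval and %eax as REG:

   .LPSRL1:
        cmpl    $-8192, %eax            # TEST offset vs. LAST offset (END)
        je      .LPSRE1
        subl    $4096, %eax
        orl     $0, (%esp,%eax)
        jmp     .LPSRL1
   .LPSRE1:

   The constants and register choice are illustrative only.  */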
10248 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10249 to be generated in correct form. */
10250 static void
10251 ix86_finalize_stack_realign_flags (void)
10253 /* Check whether stack realignment is really needed after reload, and
10254 store the result in cfun. */
10255 unsigned int incoming_stack_boundary
10256 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10257 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10258 unsigned int stack_realign = (incoming_stack_boundary
10259 < (crtl->is_leaf
10260 ? crtl->max_used_stack_slot_alignment
10261 : crtl->stack_alignment_needed));
10263 if (crtl->stack_realign_finalized)
10265 /* After stack_realign_needed is finalized, we can no longer
10266 change it. */
10267 gcc_assert (crtl->stack_realign_needed == stack_realign);
10268 return;
10271 /* If the only reason for frame_pointer_needed is that we conservatively
10272 assumed stack realignment might be needed, but in the end nothing that
10273 needed the stack alignment had been spilled, clear frame_pointer_needed
10274 and say we don't need stack realignment. */
10275 if (stack_realign
10276 && !crtl->need_drap
10277 && frame_pointer_needed
10278 && crtl->is_leaf
10279 && flag_omit_frame_pointer
10280 && crtl->sp_is_unchanging
10281 && !ix86_current_function_calls_tls_descriptor
10282 && !crtl->accesses_prior_frames
10283 && !cfun->calls_alloca
10284 && !crtl->calls_eh_return
10285 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10286 && !ix86_frame_pointer_required ()
10287 && get_frame_size () == 0
10288 && ix86_nsaved_sseregs () == 0
10289 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10291 HARD_REG_SET set_up_by_prologue, prologue_used;
10292 basic_block bb;
10294 CLEAR_HARD_REG_SET (prologue_used);
10295 CLEAR_HARD_REG_SET (set_up_by_prologue);
10296 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10297 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10298 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10299 HARD_FRAME_POINTER_REGNUM);
10300 FOR_EACH_BB (bb)
10302 rtx insn;
10303 FOR_BB_INSNS (bb, insn)
10304 if (NONDEBUG_INSN_P (insn)
10305 && requires_stack_frame_p (insn, prologue_used,
10306 set_up_by_prologue))
10308 crtl->stack_realign_needed = stack_realign;
10309 crtl->stack_realign_finalized = true;
10310 return;
10314 frame_pointer_needed = false;
10315 stack_realign = false;
10316 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10317 crtl->stack_alignment_needed = incoming_stack_boundary;
10318 crtl->stack_alignment_estimated = incoming_stack_boundary;
10319 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10320 crtl->preferred_stack_boundary = incoming_stack_boundary;
10321 df_finish_pass (true);
10322 df_scan_alloc (NULL);
10323 df_scan_blocks ();
10324 df_compute_regs_ever_live (true);
10325 df_analyze ();
10328 crtl->stack_realign_needed = stack_realign;
10329 crtl->stack_realign_finalized = true;
10332 /* Expand the prologue into a bunch of separate insns. */
10334 void
10335 ix86_expand_prologue (void)
10337 struct machine_function *m = cfun->machine;
10338 rtx insn, t;
10339 bool pic_reg_used;
10340 struct ix86_frame frame;
10341 HOST_WIDE_INT allocate;
10342 bool int_registers_saved;
10343 bool sse_registers_saved;
10345 ix86_finalize_stack_realign_flags ();
10347 /* DRAP should not coexist with stack_realign_fp */
10348 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10350 memset (&m->fs, 0, sizeof (m->fs));
10352 /* Initialize CFA state for before the prologue. */
10353 m->fs.cfa_reg = stack_pointer_rtx;
10354 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10356 /* Track SP offset to the CFA. We continue tracking this after we've
10357 swapped the CFA register away from SP. In the case of re-alignment
10358 this is fudged; we're interested in offsets within the local frame. */
10359 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10360 m->fs.sp_valid = true;
10362 ix86_compute_frame_layout (&frame);
10364 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10366 /* We should have already generated an error for any use of
10367 ms_hook on a nested function. */
10368 gcc_checking_assert (!ix86_static_chain_on_stack);
10370 /* Check if profiling is active and we should use the profiling-before-prologue
10371 variant. If so, sorry. */
10372 if (crtl->profile && flag_fentry != 0)
10373 sorry ("ms_hook_prologue attribute isn%'t compatible "
10374 "with -mfentry for 32-bit");
10376 /* In ix86_asm_output_function_label we emitted:
10377 8b ff movl.s %edi,%edi
10378 55 push %ebp
10379 8b ec movl.s %esp,%ebp
10381 This matches the hookable function prologue in Win32 API
10382 functions in Microsoft Windows XP Service Pack 2 and newer.
10383 Wine uses this to enable Windows apps to hook the Win32 API
10384 functions provided by Wine.
10386 What that means is that we've already set up the frame pointer. */
10388 if (frame_pointer_needed
10389 && !(crtl->drap_reg && crtl->stack_realign_needed))
10391 rtx push, mov;
10393 /* We've decided to use the frame pointer already set up.
10394 Describe this to the unwinder by pretending that both
10395 push and mov insns happen right here.
10397 Putting the unwind info here at the end of the ms_hook
10398 is done so that we can make absolutely certain we get
10399 the required byte sequence at the start of the function,
10400 rather than relying on an assembler that can produce
10401 the exact encoding required.
10403 However it does mean (in the unpatched case) that we have
10404 a 1 insn window where the asynchronous unwind info is
10405 incorrect. However, if we placed the unwind info at
10406 its correct location we would have incorrect unwind info
10407 in the patched case. Which is probably all moot since
10408 I don't expect Wine generates dwarf2 unwind info for the
10409 system libraries that use this feature. */
10411 insn = emit_insn (gen_blockage ());
10413 push = gen_push (hard_frame_pointer_rtx);
10414 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10415 stack_pointer_rtx);
10416 RTX_FRAME_RELATED_P (push) = 1;
10417 RTX_FRAME_RELATED_P (mov) = 1;
10419 RTX_FRAME_RELATED_P (insn) = 1;
10420 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10421 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10423 /* Note that gen_push incremented m->fs.cfa_offset, even
10424 though we didn't emit the push insn here. */
10425 m->fs.cfa_reg = hard_frame_pointer_rtx;
10426 m->fs.fp_offset = m->fs.cfa_offset;
10427 m->fs.fp_valid = true;
10429 else
10431 /* The frame pointer is not needed so pop %ebp again.
10432 This leaves us with a pristine state. */
10433 emit_insn (gen_pop (hard_frame_pointer_rtx));
10437 /* The first insn of a function that accepts its static chain on the
10438 stack is to push the register that would be filled in by a direct
10439 call. This insn will be skipped by the trampoline. */
10440 else if (ix86_static_chain_on_stack)
10442 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10443 emit_insn (gen_blockage ());
10445 /* We don't want to interpret this push insn as a register save,
10446 only as a stack adjustment. The real copy of the register as
10447 a save will be done later, if needed. */
10448 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10449 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10450 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10451 RTX_FRAME_RELATED_P (insn) = 1;
10454 /* Emit prologue code to adjust the stack alignment and set up DRAP, in case
10455 DRAP is needed and stack realignment is really needed after reload. */
10456 if (stack_realign_drap)
10458 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10460 /* Only need to push parameter pointer reg if it is caller saved. */
10461 if (!call_used_regs[REGNO (crtl->drap_reg)])
10463 /* Push arg pointer reg */
10464 insn = emit_insn (gen_push (crtl->drap_reg));
10465 RTX_FRAME_RELATED_P (insn) = 1;
10468 /* Grab the argument pointer. */
10469 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10470 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10471 RTX_FRAME_RELATED_P (insn) = 1;
10472 m->fs.cfa_reg = crtl->drap_reg;
10473 m->fs.cfa_offset = 0;
10475 /* Align the stack. */
10476 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10477 stack_pointer_rtx,
10478 GEN_INT (-align_bytes)));
10479 RTX_FRAME_RELATED_P (insn) = 1;
10481 /* Replicate the return address on the stack so that return
10482 address can be reached via (argp - 1) slot. This is needed
10483 to implement macro RETURN_ADDR_RTX and intrinsic function
10484 expand_builtin_return_addr etc. */
10485 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10486 t = gen_frame_mem (word_mode, t);
10487 insn = emit_insn (gen_push (t));
10488 RTX_FRAME_RELATED_P (insn) = 1;
10490 /* For the purposes of frame and register save area addressing,
10491 we've started over with a new frame. */
10492 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10493 m->fs.realigned = true;
10496 int_registers_saved = (frame.nregs == 0);
10497 sse_registers_saved = (frame.nsseregs == 0);
10499 if (frame_pointer_needed && !m->fs.fp_valid)
10501 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10502 slower on all targets. Also sdb doesn't like it. */
10503 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10504 RTX_FRAME_RELATED_P (insn) = 1;
10506 /* Push registers now, before setting the frame pointer
10507 on SEH target. */
10508 if (!int_registers_saved
10509 && TARGET_SEH
10510 && !frame.save_regs_using_mov)
10512 ix86_emit_save_regs ();
10513 int_registers_saved = true;
10514 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10517 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10519 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10520 RTX_FRAME_RELATED_P (insn) = 1;
10522 if (m->fs.cfa_reg == stack_pointer_rtx)
10523 m->fs.cfa_reg = hard_frame_pointer_rtx;
10524 m->fs.fp_offset = m->fs.sp_offset;
10525 m->fs.fp_valid = true;
10529 if (!int_registers_saved)
10531 /* If saving registers via PUSH, do so now. */
10532 if (!frame.save_regs_using_mov)
10534 ix86_emit_save_regs ();
10535 int_registers_saved = true;
10536 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10539 /* When using red zone we may start register saving before allocating
10540 the stack frame saving one cycle of the prologue. However, avoid
10541 doing this if we have to probe the stack; at least on x86_64 the
10542 stack probe can turn into a call that clobbers a red zone location. */
10543 else if (ix86_using_red_zone ()
10544 && (! TARGET_STACK_PROBE
10545 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10547 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10548 int_registers_saved = true;
10552 if (stack_realign_fp)
10554 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10555 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10557 /* The computation of the size of the re-aligned stack frame means
10558 that we must allocate the size of the register save area before
10559 performing the actual alignment. Otherwise we cannot guarantee
10560 that there's enough storage above the realignment point. */
10561 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10562 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10563 GEN_INT (m->fs.sp_offset
10564 - frame.sse_reg_save_offset),
10565 -1, false);
10567 /* Align the stack. */
10568 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10569 stack_pointer_rtx,
10570 GEN_INT (-align_bytes)));
10572 /* For the purposes of register save area addressing, the stack
10573 pointer is no longer valid. As for the value of sp_offset,
10574 see ix86_compute_frame_layout, which we need to match in order
10575 to pass verification of stack_pointer_offset at the end. */
10576 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10577 m->fs.sp_valid = false;
10580 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10582 if (flag_stack_usage_info)
10584 /* We start to count from ARG_POINTER. */
10585 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10587 /* If it was realigned, take into account the fake frame. */
10588 if (stack_realign_drap)
10590 if (ix86_static_chain_on_stack)
10591 stack_size += UNITS_PER_WORD;
10593 if (!call_used_regs[REGNO (crtl->drap_reg)])
10594 stack_size += UNITS_PER_WORD;
10596 /* This over-estimates by 1 minimal-stack-alignment-unit but
10597 mitigates that by counting in the new return address slot. */
10598 current_function_dynamic_stack_size
10599 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10602 current_function_static_stack_size = stack_size;
10605 /* On SEH target with very large frame size, allocate an area to save
10606 SSE registers (as the very large allocation won't be described). */
10607 if (TARGET_SEH
10608 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10609 && !sse_registers_saved)
10611 HOST_WIDE_INT sse_size =
10612 frame.sse_reg_save_offset - frame.reg_save_offset;
10614 gcc_assert (int_registers_saved);
10616 /* No need to do stack checking as the area will be immediately
10617 written. */
10618 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10619 GEN_INT (-sse_size), -1,
10620 m->fs.cfa_reg == stack_pointer_rtx);
10621 allocate -= sse_size;
10622 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10623 sse_registers_saved = true;
10626 /* The stack has already been decremented by the instruction calling us
10627 so probe if the size is non-negative to preserve the protection area. */
10628 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10630 /* We expect the registers to be saved when probes are used. */
10631 gcc_assert (int_registers_saved);
10633 if (STACK_CHECK_MOVING_SP)
10635 ix86_adjust_stack_and_probe (allocate);
10636 allocate = 0;
10638 else
10640 HOST_WIDE_INT size = allocate;
10642 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10643 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10645 if (TARGET_STACK_PROBE)
10646 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10647 else
10648 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
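/* A brief note on the clamp above (an inference, not stated in the code):
   with SIZE capped at 0x80000000 - STACK_CHECK_PROTECT - 1, the largest
   probed offset is SIZE + STACK_CHECK_PROTECT = 0x7fffffff, so every probe
   still fits in a signed 32-bit displacement even for multi-gigabyte
   frames. */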
10652 if (allocate == 0)
10654 else if (!ix86_target_stack_probe ()
10655 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10657 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10658 GEN_INT (-allocate), -1,
10659 m->fs.cfa_reg == stack_pointer_rtx);
10661 else
10663 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10664 rtx r10 = NULL;
10665 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10666 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10667 bool eax_live = false;
10668 bool r10_live = false;
10670 if (TARGET_64BIT)
10671 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10672 if (!TARGET_64BIT_MS_ABI)
10673 eax_live = ix86_eax_live_at_start_p ();
10675 /* Note that SEH directives need to continue tracking the stack
10676 pointer even after the frame pointer has been set up. */
10677 if (eax_live)
10679 insn = emit_insn (gen_push (eax));
10680 allocate -= UNITS_PER_WORD;
10681 if (sp_is_cfa_reg || TARGET_SEH)
10683 if (sp_is_cfa_reg)
10684 m->fs.cfa_offset += UNITS_PER_WORD;
10685 RTX_FRAME_RELATED_P (insn) = 1;
10689 if (r10_live)
10691 r10 = gen_rtx_REG (Pmode, R10_REG);
10692 insn = emit_insn (gen_push (r10));
10693 allocate -= UNITS_PER_WORD;
10694 if (sp_is_cfa_reg || TARGET_SEH)
10696 if (sp_is_cfa_reg)
10697 m->fs.cfa_offset += UNITS_PER_WORD;
10698 RTX_FRAME_RELATED_P (insn) = 1;
10702 emit_move_insn (eax, GEN_INT (allocate));
10703 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10705 /* Use the fact that AX still contains ALLOCATE. */
10706 adjust_stack_insn = (Pmode == DImode
10707 ? gen_pro_epilogue_adjust_stack_di_sub
10708 : gen_pro_epilogue_adjust_stack_si_sub);
10710 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10711 stack_pointer_rtx, eax));
10713 if (sp_is_cfa_reg || TARGET_SEH)
10715 if (sp_is_cfa_reg)
10716 m->fs.cfa_offset += allocate;
10717 RTX_FRAME_RELATED_P (insn) = 1;
10718 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10719 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10720 plus_constant (Pmode, stack_pointer_rtx,
10721 -allocate)));
10723 m->fs.sp_offset += allocate;
10725 if (r10_live && eax_live)
10727 t = choose_baseaddr (m->fs.sp_offset - allocate);
10728 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10729 gen_frame_mem (word_mode, t));
10730 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10731 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10732 gen_frame_mem (word_mode, t));
10734 else if (eax_live || r10_live)
10736 t = choose_baseaddr (m->fs.sp_offset - allocate);
10737 emit_move_insn (gen_rtx_REG (word_mode,
10738 (eax_live ? AX_REG : R10_REG)),
10739 gen_frame_mem (word_mode, t));
10742 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10744 /* If we haven't already set up the frame pointer, do so now. */
10745 if (frame_pointer_needed && !m->fs.fp_valid)
10747 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10748 GEN_INT (frame.stack_pointer_offset
10749 - frame.hard_frame_pointer_offset));
10750 insn = emit_insn (insn);
10751 RTX_FRAME_RELATED_P (insn) = 1;
10752 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10754 if (m->fs.cfa_reg == stack_pointer_rtx)
10755 m->fs.cfa_reg = hard_frame_pointer_rtx;
10756 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10757 m->fs.fp_valid = true;
10760 if (!int_registers_saved)
10761 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10762 if (!sse_registers_saved)
10763 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10765 pic_reg_used = false;
10766 /* We don't use the PIC register for pe-coff targets. */
10767 if (pic_offset_table_rtx
10768 && !TARGET_PECOFF
10769 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10770 || crtl->profile))
10772 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10774 if (alt_pic_reg_used != INVALID_REGNUM)
10775 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10777 pic_reg_used = true;
10780 if (pic_reg_used)
10782 if (TARGET_64BIT)
10784 if (ix86_cmodel == CM_LARGE_PIC)
10786 rtx label, tmp_reg;
10788 gcc_assert (Pmode == DImode);
10789 label = gen_label_rtx ();
10790 emit_label (label);
10791 LABEL_PRESERVE_P (label) = 1;
10792 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10793 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10794 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10795 label));
10796 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10797 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10798 pic_offset_table_rtx, tmp_reg));
10800 else
10801 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10803 else
10805 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10806 RTX_FRAME_RELATED_P (insn) = 1;
10807 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10811 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10812 when mcount needs it. The blockage to avoid call movement across the mcount
10813 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10814 note. */
10815 if (crtl->profile && !flag_fentry && pic_reg_used)
10816 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10818 if (crtl->drap_reg && !crtl->stack_realign_needed)
10820 /* vDRAP is set up, but after reload it turns out that stack realignment
10821 isn't necessary; here we emit the prologue to set up DRAP
10822 without the stack realignment adjustment. */
10823 t = choose_baseaddr (0);
10824 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10827 /* Prevent instructions from being scheduled into the register save push
10828 sequence when access to the redzone area is done through the frame pointer.
10829 The offset between the frame pointer and the stack pointer is calculated
10830 relative to the value of the stack pointer at the end of the function
10831 prologue, and moving instructions that access the redzone area via the frame
10832 pointer inside the push sequence violates this assumption. */
10833 if (frame_pointer_needed && frame.red_zone_size)
10834 emit_insn (gen_memory_blockage ());
10836 /* Emit cld instruction if stringops are used in the function. */
10837 if (TARGET_CLD && ix86_current_function_needs_cld)
10838 emit_insn (gen_cld ());
10840 /* SEH requires that the prologue end within 256 bytes of the start of
10841 the function. Prevent instruction schedules that would extend that.
10842 Further, prevent alloca modifications to the stack pointer from being
10843 combined with prologue modifications. */
10844 if (TARGET_SEH)
10845 emit_insn (gen_prologue_use (stack_pointer_rtx));
10848 /* Emit code to restore REG using a POP insn. */
10850 static void
10851 ix86_emit_restore_reg_using_pop (rtx reg)
10853 struct machine_function *m = cfun->machine;
10854 rtx insn = emit_insn (gen_pop (reg));
10856 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10857 m->fs.sp_offset -= UNITS_PER_WORD;
10859 if (m->fs.cfa_reg == crtl->drap_reg
10860 && REGNO (reg) == REGNO (crtl->drap_reg))
10862 /* Previously we'd represented the CFA as an expression
10863 like *(%ebp - 8). We've just popped that value from
10864 the stack, which means we need to reset the CFA to
10865 the drap register. This will remain until we restore
10866 the stack pointer. */
10867 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10868 RTX_FRAME_RELATED_P (insn) = 1;
10870 /* This means that the DRAP register is valid for addressing too. */
10871 m->fs.drap_valid = true;
10872 return;
10875 if (m->fs.cfa_reg == stack_pointer_rtx)
10877 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10878 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10879 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10880 RTX_FRAME_RELATED_P (insn) = 1;
10882 m->fs.cfa_offset -= UNITS_PER_WORD;
10885 /* When the frame pointer is the CFA, and we pop it, we are
10886 swapping back to the stack pointer as the CFA. This happens
10887 for stack frames that don't allocate other data, so we assume
10888 the stack pointer is now pointing at the return address, i.e.
10889 the function entry state, which makes the offset be 1 word. */
10890 if (reg == hard_frame_pointer_rtx)
10892 m->fs.fp_valid = false;
10893 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10895 m->fs.cfa_reg = stack_pointer_rtx;
10896 m->fs.cfa_offset -= UNITS_PER_WORD;
10898 add_reg_note (insn, REG_CFA_DEF_CFA,
10899 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10900 GEN_INT (m->fs.cfa_offset)));
10901 RTX_FRAME_RELATED_P (insn) = 1;
10906 /* Emit code to restore saved registers using POP insns. */
10908 static void
10909 ix86_emit_restore_regs_using_pop (void)
10911 unsigned int regno;
10913 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10914 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10915 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10918 /* Emit code and notes for the LEAVE instruction. */
10920 static void
10921 ix86_emit_leave (void)
10923 struct machine_function *m = cfun->machine;
10924 rtx insn = emit_insn (ix86_gen_leave ());
10926 ix86_add_queued_cfa_restore_notes (insn);
10928 gcc_assert (m->fs.fp_valid);
10929 m->fs.sp_valid = true;
10930 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10931 m->fs.fp_valid = false;
10933 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10935 m->fs.cfa_reg = stack_pointer_rtx;
10936 m->fs.cfa_offset = m->fs.sp_offset;
10938 add_reg_note (insn, REG_CFA_DEF_CFA,
10939 plus_constant (Pmode, stack_pointer_rtx,
10940 m->fs.sp_offset));
10941 RTX_FRAME_RELATED_P (insn) = 1;
10943 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10944 m->fs.fp_offset);
10947 /* Emit code to restore saved registers using MOV insns.
10948 First register is restored from CFA - CFA_OFFSET. */
10949 static void
10950 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10951 bool maybe_eh_return)
10953 struct machine_function *m = cfun->machine;
10954 unsigned int regno;
10956 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10957 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10959 rtx reg = gen_rtx_REG (word_mode, regno);
10960 rtx insn, mem;
10962 mem = choose_baseaddr (cfa_offset);
10963 mem = gen_frame_mem (word_mode, mem);
10964 insn = emit_move_insn (reg, mem);
10966 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10968 /* Previously we'd represented the CFA as an expression
10969 like *(%ebp - 8). We've just popped that value from
10970 the stack, which means we need to reset the CFA to
10971 the drap register. This will remain until we restore
10972 the stack pointer. */
10973 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10974 RTX_FRAME_RELATED_P (insn) = 1;
10976 /* This means that the DRAP register is valid for addressing. */
10977 m->fs.drap_valid = true;
10979 else
10980 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10982 cfa_offset -= UNITS_PER_WORD;
10986 /* Emit code to restore saved SSE registers using MOV insns.
10987 First register is restored from CFA - CFA_OFFSET. */
10988 static void
10989 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10990 bool maybe_eh_return)
10992 unsigned int regno;
10994 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10995 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10997 rtx reg = gen_rtx_REG (V4SFmode, regno);
10998 rtx mem;
11000 mem = choose_baseaddr (cfa_offset);
11001 mem = gen_rtx_MEM (V4SFmode, mem);
11002 set_mem_align (mem, 128);
11003 emit_move_insn (reg, mem);
11005 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11007 cfa_offset -= 16;
11011 /* Restore function stack, frame, and registers. */
11013 void
11014 ix86_expand_epilogue (int style)
11016 struct machine_function *m = cfun->machine;
11017 struct machine_frame_state frame_state_save = m->fs;
11018 struct ix86_frame frame;
11019 bool restore_regs_via_mov;
11020 bool using_drap;
11022 ix86_finalize_stack_realign_flags ();
11023 ix86_compute_frame_layout (&frame);
11025 m->fs.sp_valid = (!frame_pointer_needed
11026 || (crtl->sp_is_unchanging
11027 && !stack_realign_fp));
11028 gcc_assert (!m->fs.sp_valid
11029 || m->fs.sp_offset == frame.stack_pointer_offset);
11031 /* The FP must be valid if the frame pointer is present. */
11032 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11033 gcc_assert (!m->fs.fp_valid
11034 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11036 /* We must have *some* valid pointer to the stack frame. */
11037 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11039 /* The DRAP is never valid at this point. */
11040 gcc_assert (!m->fs.drap_valid);
11042 /* See the comment about red zone and frame
11043 pointer usage in ix86_expand_prologue. */
11044 if (frame_pointer_needed && frame.red_zone_size)
11045 emit_insn (gen_memory_blockage ());
11047 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11048 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11050 /* Determine the CFA offset of the end of the red-zone. */
11051 m->fs.red_zone_offset = 0;
11052 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11054 /* The red-zone begins below the return address. */
11055 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11057 /* When the register save area is in the aligned portion of
11058 the stack, determine the maximum runtime displacement that
11059 matches up with the aligned frame. */
11060 if (stack_realign_drap)
11061 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11062 + UNITS_PER_WORD);
11065 /* Special care must be taken for the normal return case of a function
11066 using eh_return: the eax and edx registers are marked as saved, but
11067 not restored along this path. Adjust the save location to match. */
11068 if (crtl->calls_eh_return && style != 2)
11069 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11071 /* EH_RETURN requires the use of moves to function properly. */
11072 if (crtl->calls_eh_return)
11073 restore_regs_via_mov = true;
11074 /* SEH requires the use of pops to identify the epilogue. */
11075 else if (TARGET_SEH)
11076 restore_regs_via_mov = false;
11077 /* If we're only restoring one register and sp is not valid, then
11078 use a move instruction to restore the register, since it's
11079 less work than reloading sp and popping the register. */
11080 else if (!m->fs.sp_valid && frame.nregs <= 1)
11081 restore_regs_via_mov = true;
11082 else if (TARGET_EPILOGUE_USING_MOVE
11083 && cfun->machine->use_fast_prologue_epilogue
11084 && (frame.nregs > 1
11085 || m->fs.sp_offset != frame.reg_save_offset))
11086 restore_regs_via_mov = true;
11087 else if (frame_pointer_needed
11088 && !frame.nregs
11089 && m->fs.sp_offset != frame.reg_save_offset)
11090 restore_regs_via_mov = true;
11091 else if (frame_pointer_needed
11092 && TARGET_USE_LEAVE
11093 && cfun->machine->use_fast_prologue_epilogue
11094 && frame.nregs == 1)
11095 restore_regs_via_mov = true;
11096 else
11097 restore_regs_via_mov = false;
11099 if (restore_regs_via_mov || frame.nsseregs)
11101 /* Ensure that the entire register save area is addressable via
11102 the stack pointer, if we will restore via sp. */
11103 if (TARGET_64BIT
11104 && m->fs.sp_offset > 0x7fffffff
11105 && !(m->fs.fp_valid || m->fs.drap_valid)
11106 && (frame.nsseregs + frame.nregs) != 0)
11108 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11109 GEN_INT (m->fs.sp_offset
11110 - frame.sse_reg_save_offset),
11111 style,
11112 m->fs.cfa_reg == stack_pointer_rtx);
11116 /* If there are any SSE registers to restore, then we have to do it
11117 via moves, since there's obviously no pop for SSE regs. */
11118 if (frame.nsseregs)
11119 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11120 style == 2);
11122 if (restore_regs_via_mov)
11124 rtx t;
11126 if (frame.nregs)
11127 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11129 /* eh_return epilogues need %ecx added to the stack pointer. */
11130 if (style == 2)
11132 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11134 /* Stack align doesn't work with eh_return. */
11135 gcc_assert (!stack_realign_drap);
11136 /* Neither do regparm nested functions. */
11137 gcc_assert (!ix86_static_chain_on_stack);
11139 if (frame_pointer_needed)
11141 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11142 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11143 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11145 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11146 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11148 /* Note that we use SA as a temporary CFA, as the return
11149 address is at the proper place relative to it. We
11150 pretend this happens at the FP restore insn because
11151 prior to this insn the FP would be stored at the wrong
11152 offset relative to SA, and after this insn we have no
11153 other reasonable register to use for the CFA. We don't
11154 bother resetting the CFA to the SP for the duration of
11155 the return insn. */
11156 add_reg_note (insn, REG_CFA_DEF_CFA,
11157 plus_constant (Pmode, sa, UNITS_PER_WORD));
11158 ix86_add_queued_cfa_restore_notes (insn);
11159 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11160 RTX_FRAME_RELATED_P (insn) = 1;
11162 m->fs.cfa_reg = sa;
11163 m->fs.cfa_offset = UNITS_PER_WORD;
11164 m->fs.fp_valid = false;
11166 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11167 const0_rtx, style, false);
11169 else
11171 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11172 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11173 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11174 ix86_add_queued_cfa_restore_notes (insn);
11176 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11177 if (m->fs.cfa_offset != UNITS_PER_WORD)
11179 m->fs.cfa_offset = UNITS_PER_WORD;
11180 add_reg_note (insn, REG_CFA_DEF_CFA,
11181 plus_constant (Pmode, stack_pointer_rtx,
11182 UNITS_PER_WORD));
11183 RTX_FRAME_RELATED_P (insn) = 1;
11186 m->fs.sp_offset = UNITS_PER_WORD;
11187 m->fs.sp_valid = true;
11190 else
11192 /* SEH requires that the function end with (1) a stack adjustment
11193 if necessary, (2) a sequence of pops, and (3) a return or
11194 jump instruction. Prevent insns from the function body from
11195 being scheduled into this sequence. */
11196 if (TARGET_SEH)
11198 /* Prevent a catch region from being adjacent to the standard
11199 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11200 several other flags that would be interesting to test are
11201 set up yet. */
11202 if (flag_non_call_exceptions)
11203 emit_insn (gen_nops (const1_rtx));
11204 else
11205 emit_insn (gen_blockage ());
11208 /* The first step is to deallocate the stack frame so that we can
11209 pop the registers. Also do it on SEH targets for very large
11210 frames, as the emitted instructions aren't allowed by the ABI in
11211 epilogues. */
11212 if (!m->fs.sp_valid
11213 || (TARGET_SEH
11214 && (m->fs.sp_offset - frame.reg_save_offset
11215 >= SEH_MAX_FRAME_SIZE)))
11217 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11218 GEN_INT (m->fs.fp_offset
11219 - frame.reg_save_offset),
11220 style, false);
11222 else if (m->fs.sp_offset != frame.reg_save_offset)
11224 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11225 GEN_INT (m->fs.sp_offset
11226 - frame.reg_save_offset),
11227 style,
11228 m->fs.cfa_reg == stack_pointer_rtx);
11231 ix86_emit_restore_regs_using_pop ();
11234 /* If we used a frame pointer and haven't already got rid of it,
11235 then do so now. */
11236 if (m->fs.fp_valid)
11238 /* If the stack pointer is valid and pointing at the frame
11239 pointer store address, then we only need a pop. */
11240 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11241 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11242 /* Leave results in shorter dependency chains on CPUs that are
11243 able to grok it fast. */
11244 else if (TARGET_USE_LEAVE
11245 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11246 || !cfun->machine->use_fast_prologue_epilogue)
11247 ix86_emit_leave ();
11248 else
11250 pro_epilogue_adjust_stack (stack_pointer_rtx,
11251 hard_frame_pointer_rtx,
11252 const0_rtx, style, !using_drap);
11253 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11257 if (using_drap)
11259 int param_ptr_offset = UNITS_PER_WORD;
11260 rtx insn;
11262 gcc_assert (stack_realign_drap);
11264 if (ix86_static_chain_on_stack)
11265 param_ptr_offset += UNITS_PER_WORD;
11266 if (!call_used_regs[REGNO (crtl->drap_reg)])
11267 param_ptr_offset += UNITS_PER_WORD;
11269 insn = emit_insn (gen_rtx_SET
11270 (VOIDmode, stack_pointer_rtx,
11271 gen_rtx_PLUS (Pmode,
11272 crtl->drap_reg,
11273 GEN_INT (-param_ptr_offset))));
11274 m->fs.cfa_reg = stack_pointer_rtx;
11275 m->fs.cfa_offset = param_ptr_offset;
11276 m->fs.sp_offset = param_ptr_offset;
11277 m->fs.realigned = false;
11279 add_reg_note (insn, REG_CFA_DEF_CFA,
11280 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11281 GEN_INT (param_ptr_offset)));
11282 RTX_FRAME_RELATED_P (insn) = 1;
11284 if (!call_used_regs[REGNO (crtl->drap_reg)])
11285 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11288 /* At this point the stack pointer must be valid, and we must have
11289 restored all of the registers. We may not have deallocated the
11290 entire stack frame. We've delayed this until now because it may
11291 be possible to merge the local stack deallocation with the
11292 deallocation forced by ix86_static_chain_on_stack. */
11293 gcc_assert (m->fs.sp_valid);
11294 gcc_assert (!m->fs.fp_valid);
11295 gcc_assert (!m->fs.realigned);
11296 if (m->fs.sp_offset != UNITS_PER_WORD)
11298 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11299 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11300 style, true);
11302 else
11303 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11305 /* Sibcall epilogues don't want a return instruction. */
11306 if (style == 0)
11308 m->fs = frame_state_save;
11309 return;
11312 if (crtl->args.pops_args && crtl->args.size)
11314 rtx popc = GEN_INT (crtl->args.pops_args);
11316 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11317 address, do an explicit add, and jump indirectly to the caller. */
11319 if (crtl->args.pops_args >= 65536)
11321 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11322 rtx insn;
11324 /* There is no "pascal" calling convention in any 64bit ABI. */
11325 gcc_assert (!TARGET_64BIT);
11327 insn = emit_insn (gen_pop (ecx));
11328 m->fs.cfa_offset -= UNITS_PER_WORD;
11329 m->fs.sp_offset -= UNITS_PER_WORD;
11331 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11332 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11333 add_reg_note (insn, REG_CFA_REGISTER,
11334 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11335 RTX_FRAME_RELATED_P (insn) = 1;
11337 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11338 popc, -1, true);
11339 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
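/* For illustration, the branch above boils down to a sequence along these
   lines (a sketch; "ret N" only takes a 16-bit immediate, hence the 64K
   limit mentioned above):

       popl  %ecx            # fetch the return address
       addl  $POPC, %esp     # pop the incoming argument bytes
       jmp   *%ecx           # return indirectly to the caller       */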
11341 else
11342 emit_jump_insn (gen_simple_return_pop_internal (popc));
11344 else
11345 emit_jump_insn (gen_simple_return_internal ());
11347 /* Restore the state back to the state from the prologue,
11348 so that it's correct for the next epilogue. */
11349 m->fs = frame_state_save;
11352 /* Reset from the function's potential modifications. */
11354 static void
11355 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11356 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11358 if (pic_offset_table_rtx)
11359 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11360 #if TARGET_MACHO
11361 /* Mach-O doesn't support labels at the end of objects, so if
11362 it looks like we might want one, insert a NOP. */
11364 rtx insn = get_last_insn ();
11365 rtx deleted_debug_label = NULL_RTX;
11366 while (insn
11367 && NOTE_P (insn)
11368 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11370 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11371 notes only, instead set their CODE_LABEL_NUMBER to -1,
11372 otherwise there would be code generation differences
11373 in between -g and -g0. */
11374 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11375 deleted_debug_label = insn;
11376 insn = PREV_INSN (insn);
11378 if (insn
11379 && (LABEL_P (insn)
11380 || (NOTE_P (insn)
11381 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11382 fputs ("\tnop\n", file);
11383 else if (deleted_debug_label)
11384 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11385 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11386 CODE_LABEL_NUMBER (insn) = -1;
11388 #endif
11392 /* Return a scratch register to use in the split stack prologue. The
11393 split stack prologue is used for -fsplit-stack. It consists of the first
11394 instructions in the function, even before the regular prologue.
11395 The scratch register can be any caller-saved register which is not
11396 used for parameters or for the static chain. */
11398 static unsigned int
11399 split_stack_prologue_scratch_regno (void)
11401 if (TARGET_64BIT)
11402 return R11_REG;
11403 else
11405 bool is_fastcall, is_thiscall;
11406 int regparm;
11408 is_fastcall = (lookup_attribute ("fastcall",
11409 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11410 != NULL);
11411 is_thiscall = (lookup_attribute ("thiscall",
11412 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11413 != NULL);
11414 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11416 if (is_fastcall)
11418 if (DECL_STATIC_CHAIN (cfun->decl))
11420 sorry ("-fsplit-stack does not support fastcall with "
11421 "nested function");
11422 return INVALID_REGNUM;
11424 return AX_REG;
11426 else if (is_thiscall)
11428 if (!DECL_STATIC_CHAIN (cfun->decl))
11429 return DX_REG;
11430 return AX_REG;
11432 else if (regparm < 3)
11434 if (!DECL_STATIC_CHAIN (cfun->decl))
11435 return CX_REG;
11436 else
11438 if (regparm >= 2)
11440 sorry ("-fsplit-stack does not support 2 register "
11441 " parameters for a nested function");
11442 return INVALID_REGNUM;
11444 return DX_REG;
11447 else
11449 /* FIXME: We could make this work by pushing a register
11450 around the addition and comparison. */
11451 sorry ("-fsplit-stack does not support 3 register parameters");
11452 return INVALID_REGNUM;
11457 /* A SYMBOL_REF for the function which allocates new stack space for
11458 -fsplit-stack. */
11460 static GTY(()) rtx split_stack_fn;
11462 /* A SYMBOL_REF for the more-stack function to use with the large
11463 model. */
11465 static GTY(()) rtx split_stack_fn_large;
11467 /* Handle -fsplit-stack. These are the first instructions in the
11468 function, even before the regular prologue. */
11470 void
11471 ix86_expand_split_stack_prologue (void)
11473 struct ix86_frame frame;
11474 HOST_WIDE_INT allocate;
11475 unsigned HOST_WIDE_INT args_size;
11476 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11477 rtx scratch_reg = NULL_RTX;
11478 rtx varargs_label = NULL_RTX;
11479 rtx fn;
11481 gcc_assert (flag_split_stack && reload_completed);
11483 ix86_finalize_stack_realign_flags ();
11484 ix86_compute_frame_layout (&frame);
11485 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11487 /* This is the label we will branch to if we have enough stack
11488 space. We expect the basic block reordering pass to reverse this
11489 branch if optimizing, so that we branch in the unlikely case. */
11490 label = gen_label_rtx ();
11492 /* We need to compare the stack pointer minus the frame size with
11493 the stack boundary in the TCB. The stack boundary always gives
11494 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11495 can compare directly. Otherwise we need to do an addition. */
11497 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11498 UNSPEC_STACK_CHECK);
11499 limit = gen_rtx_CONST (Pmode, limit);
11500 limit = gen_rtx_MEM (Pmode, limit);
11501 if (allocate < SPLIT_STACK_AVAILABLE)
11502 current = stack_pointer_rtx;
11503 else
11505 unsigned int scratch_regno;
11506 rtx offset;
11508 /* We need a scratch register to hold the stack pointer minus
11509 the required frame size. Since this is the very start of the
11510 function, the scratch register can be any caller-saved
11511 register which is not used for parameters. */
11512 offset = GEN_INT (- allocate);
11513 scratch_regno = split_stack_prologue_scratch_regno ();
11514 if (scratch_regno == INVALID_REGNUM)
11515 return;
11516 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11517 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11519 /* We don't use ix86_gen_add3 in this case because it will
11520 want to split to lea, but when not optimizing the insn
11521 will not be split after this point. */
11522 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11523 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11524 offset)));
11526 else
11528 emit_move_insn (scratch_reg, offset);
11529 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11530 stack_pointer_rtx));
11532 current = scratch_reg;
11535 ix86_expand_branch (GEU, current, limit, label);
11536 jump_insn = get_last_insn ();
11537 JUMP_LABEL (jump_insn) = label;
11539 /* Mark the jump as very likely to be taken. */
11540 add_int_reg_note (jump_insn, REG_BR_PROB,
11541 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
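/* As a worked example (assuming the usual REG_BR_PROB_BASE of 10000), the
   note above attaches a probability of 10000 - 10000/100 = 9900, i.e. the
   branch to LABEL is predicted taken roughly 99% of the time. */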
11543 if (split_stack_fn == NULL_RTX)
11544 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11545 fn = split_stack_fn;
11547 /* Get more stack space. We pass in the desired stack space and the
11548 size of the arguments to copy to the new stack. In 32-bit mode
11549 we push the parameters; __morestack will return on a new stack
11550 anyhow. In 64-bit mode we pass the parameters in r10 and
11551 r11. */
11552 allocate_rtx = GEN_INT (allocate);
11553 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11554 call_fusage = NULL_RTX;
11555 if (TARGET_64BIT)
11557 rtx reg10, reg11;
11559 reg10 = gen_rtx_REG (Pmode, R10_REG);
11560 reg11 = gen_rtx_REG (Pmode, R11_REG);
11562 /* If this function uses a static chain, it will be in %r10.
11563 Preserve it across the call to __morestack. */
11564 if (DECL_STATIC_CHAIN (cfun->decl))
11566 rtx rax;
11568 rax = gen_rtx_REG (word_mode, AX_REG);
11569 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11570 use_reg (&call_fusage, rax);
11573 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11574 && !TARGET_PECOFF)
11576 HOST_WIDE_INT argval;
11578 gcc_assert (Pmode == DImode);
11579 /* When using the large model we need to load the address
11580 into a register, and we've run out of registers. So we
11581 switch to a different calling convention, and we call a
11582 different function: __morestack_large. We pass the
11583 argument size in the upper 32 bits of r10 and pass the
11584 frame size in the lower 32 bits. */
11585 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11586 gcc_assert ((args_size & 0xffffffff) == args_size);
11588 if (split_stack_fn_large == NULL_RTX)
11589 split_stack_fn_large =
11590 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11592 if (ix86_cmodel == CM_LARGE_PIC)
11594 rtx label, x;
11596 label = gen_label_rtx ();
11597 emit_label (label);
11598 LABEL_PRESERVE_P (label) = 1;
11599 emit_insn (gen_set_rip_rex64 (reg10, label));
11600 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11601 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11602 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11603 UNSPEC_GOT);
11604 x = gen_rtx_CONST (Pmode, x);
11605 emit_move_insn (reg11, x);
11606 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11607 x = gen_const_mem (Pmode, x);
11608 emit_move_insn (reg11, x);
11610 else
11611 emit_move_insn (reg11, split_stack_fn_large);
11613 fn = reg11;
11615 argval = ((args_size << 16) << 16) + allocate;
11616 emit_move_insn (reg10, GEN_INT (argval));
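/* A worked example of the packing above, with illustrative values only:
   for args_size = 0x18 and allocate = 0x2000,
   argval = ((0x18 << 16) << 16) + 0x2000 = 0x0000001800002000,
   so %r10 carries the argument size in its upper 32 bits and the frame
   size in its lower 32 bits, as the large-model helper expects. */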
11618 else
11620 emit_move_insn (reg10, allocate_rtx);
11621 emit_move_insn (reg11, GEN_INT (args_size));
11622 use_reg (&call_fusage, reg11);
11625 use_reg (&call_fusage, reg10);
11627 else
11629 emit_insn (gen_push (GEN_INT (args_size)));
11630 emit_insn (gen_push (allocate_rtx));
11632 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11633 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11634 NULL_RTX, false);
11635 add_function_usage_to (call_insn, call_fusage);
11637 /* In order to make call/return prediction work right, we now need
11638 to execute a return instruction. See
11639 libgcc/config/i386/morestack.S for the details on how this works.
11641 For flow purposes gcc must not see this as a return
11642 instruction--we need control flow to continue at the subsequent
11643 label. Therefore, we use an unspec. */
11644 gcc_assert (crtl->args.pops_args < 65536);
11645 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11647 /* If we are in 64-bit mode and this function uses a static chain,
11648 we saved %r10 in %rax before calling __morestack. */
11649 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11650 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11651 gen_rtx_REG (word_mode, AX_REG));
11653 /* If this function calls va_start, we need to store a pointer to
11654 the arguments on the old stack, because they may not have been
11655 all copied to the new stack. At this point the old stack can be
11656 found at the frame pointer value used by __morestack, because
11657 __morestack has set that up before calling back to us. Here we
11658 store that pointer in a scratch register, and in
11659 ix86_expand_prologue we store the scratch register in a stack
11660 slot. */
11661 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11663 unsigned int scratch_regno;
11664 rtx frame_reg;
11665 int words;
11667 scratch_regno = split_stack_prologue_scratch_regno ();
11668 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11669 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11671 /* 64-bit:
11672 fp -> old fp value
11673 return address within this function
11674 return address of caller of this function
11675 stack arguments
11676 So we add three words to get to the stack arguments.
11678 32-bit:
11679 fp -> old fp value
11680 return address within this function
11681 first argument to __morestack
11682 second argument to __morestack
11683 return address of caller of this function
11684 stack arguments
11685 So we add five words to get to the stack arguments.
11687 words = TARGET_64BIT ? 3 : 5;
11688 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11689 gen_rtx_PLUS (Pmode, frame_reg,
11690 GEN_INT (words * UNITS_PER_WORD))));
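/* An illustrative instance of the layout described above: on a 64-bit
   target WORDS is 3 and UNITS_PER_WORD is 8, so the insn just emitted
   computes scratch = fp + 24, stepping over the saved frame pointer and
   the two return addresses to reach the caller's stack arguments. */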
11692 varargs_label = gen_label_rtx ();
11693 emit_jump_insn (gen_jump (varargs_label));
11694 JUMP_LABEL (get_last_insn ()) = varargs_label;
11696 emit_barrier ();
11699 emit_label (label);
11700 LABEL_NUSES (label) = 1;
11702 /* If this function calls va_start, we now have to set the scratch
11703 register for the case where we do not call __morestack. In this
11704 case we need to set it based on the stack pointer. */
11705 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11707 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11708 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11709 GEN_INT (UNITS_PER_WORD))));
11711 emit_label (varargs_label);
11712 LABEL_NUSES (varargs_label) = 1;
11716 /* We may have to tell the dataflow pass that the split stack prologue
11717 is initializing a scratch register. */
11719 static void
11720 ix86_live_on_entry (bitmap regs)
11722 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11724 gcc_assert (flag_split_stack);
11725 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11729 /* Determine if OP is a suitable SUBREG RTX for an address. */
11731 static bool
11732 ix86_address_subreg_operand (rtx op)
11734 enum machine_mode mode;
11736 if (!REG_P (op))
11737 return false;
11739 mode = GET_MODE (op);
11741 if (GET_MODE_CLASS (mode) != MODE_INT)
11742 return false;
11744 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11745 failures when the register is one word out of a two word structure. */
11746 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11747 return false;
11749 /* Allow only SUBREGs of non-eliminable hard registers. */
11750 return register_no_elim_operand (op, mode);
11753 /* Extract the parts of an RTL expression that is a valid memory address
11754 for an instruction. Return 0 if the structure of the address is
11755 grossly off. Return -1 if the address contains ASHIFT, so it is not
11756 strictly valid, but is still used for computing the length of a lea instruction. */
11759 ix86_decompose_address (rtx addr, struct ix86_address *out)
11761 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11762 rtx base_reg, index_reg;
11763 HOST_WIDE_INT scale = 1;
11764 rtx scale_rtx = NULL_RTX;
11765 rtx tmp;
11766 int retval = 1;
11767 enum ix86_address_seg seg = SEG_DEFAULT;
11769 /* Allow zero-extended SImode addresses;
11770 they will be emitted with the addr32 prefix. */
11771 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11773 if (GET_CODE (addr) == ZERO_EXTEND
11774 && GET_MODE (XEXP (addr, 0)) == SImode)
11776 addr = XEXP (addr, 0);
11777 if (CONST_INT_P (addr))
11778 return 0;
11780 else if (GET_CODE (addr) == AND
11781 && const_32bit_mask (XEXP (addr, 1), DImode))
11783 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11784 if (addr == NULL_RTX)
11785 return 0;
11787 if (CONST_INT_P (addr))
11788 return 0;
11792 /* Allow SImode subregs of DImode addresses;
11793 they will be emitted with the addr32 prefix. */
11794 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11796 if (GET_CODE (addr) == SUBREG
11797 && GET_MODE (SUBREG_REG (addr)) == DImode)
11799 addr = SUBREG_REG (addr);
11800 if (CONST_INT_P (addr))
11801 return 0;
11805 if (REG_P (addr))
11806 base = addr;
11807 else if (GET_CODE (addr) == SUBREG)
11809 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11810 base = addr;
11811 else
11812 return 0;
11814 else if (GET_CODE (addr) == PLUS)
11816 rtx addends[4], op;
11817 int n = 0, i;
11819 op = addr;
11822 if (n >= 4)
11823 return 0;
11824 addends[n++] = XEXP (op, 1);
11825 op = XEXP (op, 0);
11827 while (GET_CODE (op) == PLUS);
11828 if (n >= 4)
11829 return 0;
11830 addends[n] = op;
11832 for (i = n; i >= 0; --i)
11834 op = addends[i];
11835 switch (GET_CODE (op))
11837 case MULT:
11838 if (index)
11839 return 0;
11840 index = XEXP (op, 0);
11841 scale_rtx = XEXP (op, 1);
11842 break;
11844 case ASHIFT:
11845 if (index)
11846 return 0;
11847 index = XEXP (op, 0);
11848 tmp = XEXP (op, 1);
11849 if (!CONST_INT_P (tmp))
11850 return 0;
11851 scale = INTVAL (tmp);
11852 if ((unsigned HOST_WIDE_INT) scale > 3)
11853 return 0;
11854 scale = 1 << scale;
11855 break;
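/* For example, an addend such as (ashift (reg) (const_int 3)) lands here
   with INTVAL (tmp) == 3, so SCALE becomes 1 << 3 == 8, i.e. the index*8
   form of a scaled-index address. */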
11857 case ZERO_EXTEND:
11858 op = XEXP (op, 0);
11859 if (GET_CODE (op) != UNSPEC)
11860 return 0;
11861 /* FALLTHRU */
11863 case UNSPEC:
11864 if (XINT (op, 1) == UNSPEC_TP
11865 && TARGET_TLS_DIRECT_SEG_REFS
11866 && seg == SEG_DEFAULT)
11867 seg = DEFAULT_TLS_SEG_REG;
11868 else
11869 return 0;
11870 break;
11872 case SUBREG:
11873 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11874 return 0;
11875 /* FALLTHRU */
11877 case REG:
11878 if (!base)
11879 base = op;
11880 else if (!index)
11881 index = op;
11882 else
11883 return 0;
11884 break;
11886 case CONST:
11887 case CONST_INT:
11888 case SYMBOL_REF:
11889 case LABEL_REF:
11890 if (disp)
11891 return 0;
11892 disp = op;
11893 break;
11895 default:
11896 return 0;
11900 else if (GET_CODE (addr) == MULT)
11902 index = XEXP (addr, 0); /* index*scale */
11903 scale_rtx = XEXP (addr, 1);
11905 else if (GET_CODE (addr) == ASHIFT)
11907 /* We're called for lea too, which implements ashift on occasion. */
11908 index = XEXP (addr, 0);
11909 tmp = XEXP (addr, 1);
11910 if (!CONST_INT_P (tmp))
11911 return 0;
11912 scale = INTVAL (tmp);
11913 if ((unsigned HOST_WIDE_INT) scale > 3)
11914 return 0;
11915 scale = 1 << scale;
11916 retval = -1;
11918 else if (CONST_INT_P (addr))
11920 if (!x86_64_immediate_operand (addr, VOIDmode))
11921 return 0;
11923 /* Constant addresses are sign extended to 64bit, so we have to
11924 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
11925 if (TARGET_X32
11926 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11927 return 0;
11929 disp = addr;
11931 else
11932 disp = addr; /* displacement */
11934 if (index)
11936 if (REG_P (index))
11938 else if (GET_CODE (index) == SUBREG
11939 && ix86_address_subreg_operand (SUBREG_REG (index)))
11941 else
11942 return 0;
11945 /* Address override works only on the (%reg) part of %fs:(%reg). */
11946 if (seg != SEG_DEFAULT
11947 && ((base && GET_MODE (base) != word_mode)
11948 || (index && GET_MODE (index) != word_mode)))
11949 return 0;
11951 /* Extract the integral value of scale. */
11952 if (scale_rtx)
11954 if (!CONST_INT_P (scale_rtx))
11955 return 0;
11956 scale = INTVAL (scale_rtx);
11959 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11960 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11962 /* Avoid useless 0 displacement. */
11963 if (disp == const0_rtx && (base || index))
11964 disp = NULL_RTX;
11966 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
11967 if (base_reg && index_reg && scale == 1
11968 && (index_reg == arg_pointer_rtx
11969 || index_reg == frame_pointer_rtx
11970 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11972 rtx tmp;
11973 tmp = base, base = index, index = tmp;
11974 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11977 /* Special case: %ebp cannot be encoded as a base without a displacement.
11978 Similarly %r13. */
11979 if (!disp
11980 && base_reg
11981 && (base_reg == hard_frame_pointer_rtx
11982 || base_reg == frame_pointer_rtx
11983 || base_reg == arg_pointer_rtx
11984 || (REG_P (base_reg)
11985 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11986 || REGNO (base_reg) == R13_REG))))
11987 disp = const0_rtx;
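/* For example, a bare (%ebp) or (%r13) base would need the ModRM mod=00,
   r/m=101 encoding, which the ISA reserves for a displacement-only
   (or, in 64-bit mode, RIP-relative) operand; forcing a zero displacement
   here lets the address be emitted as 0(%ebp) or 0(%r13) instead. */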
11989 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11990 Avoid this by transforming it to [%esi+0].
11991 Reload calls address legitimization without cfun defined, so we need
11992 to test cfun for being non-NULL. */
11993 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11994 && base_reg && !index_reg && !disp
11995 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11996 disp = const0_rtx;
11998 /* Special case: encode reg+reg instead of reg*2. */
11999 if (!base && index && scale == 2)
12000 base = index, base_reg = index_reg, scale = 1;
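/* For example, (mult (reg) (const_int 2)) with no base and no displacement
   becomes base == index with scale 1, so the address is emitted as
   (%reg,%reg) rather than a scaled index, which in the SIB encoding would
   also require a 32-bit displacement. */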
12002 /* Special case: scaling cannot be encoded without base or displacement. */
12003 if (!base && !disp && index && scale != 1)
12004 disp = const0_rtx;
12006 out->base = base;
12007 out->index = index;
12008 out->disp = disp;
12009 out->scale = scale;
12010 out->seg = seg;
12012 return retval;
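/* A sketch of typical results, with illustrative operands: for an address
   like (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16))
   the parts come back as base = B, index = A, scale = 4, disp = 16,
   seg = SEG_DEFAULT and the return value is 1, while a top-level
   (ashift (reg A) (const_int 2)) yields scale 4 with a return value of -1,
   usable only for computing the length of a lea. */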
12015 /* Return the cost of the memory address X.
12016 For i386, it is better to use a complex address than let gcc copy
12017 the address into a reg and make a new pseudo. But not if the address
12018 requires two regs - that would mean more pseudos with longer
12019 lifetimes. */
12020 static int
12021 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12022 addr_space_t as ATTRIBUTE_UNUSED,
12023 bool speed ATTRIBUTE_UNUSED)
12025 struct ix86_address parts;
12026 int cost = 1;
12027 int ok = ix86_decompose_address (x, &parts);
12029 gcc_assert (ok);
12031 if (parts.base && GET_CODE (parts.base) == SUBREG)
12032 parts.base = SUBREG_REG (parts.base);
12033 if (parts.index && GET_CODE (parts.index) == SUBREG)
12034 parts.index = SUBREG_REG (parts.index);
12036 /* Attempt to minimize number of registers in the address. */
12037 if ((parts.base
12038 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12039 || (parts.index
12040 && (!REG_P (parts.index)
12041 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12042 cost++;
12044 if (parts.base
12045 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12046 && parts.index
12047 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12048 && parts.base != parts.index)
12049 cost++;
12051 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12052 since its predecode logic can't detect the length of instructions
12053 and decoding degenerates to vector decoding. Increase the cost of such
12054 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12055 to split such addresses or even refuse such addresses at all.
12057 The following addressing modes are affected:
12058 [base+scale*index]
12059 [scale*index+disp]
12060 [base+index]
12062 The first and last cases may be avoidable by explicitly coding a zero
12063 displacement in the memory address, but I don't have an AMD-K6 machine
12064 handy to check this theory. */
12066 if (TARGET_K6
12067 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12068 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12069 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12070 cost += 10;
12072 return cost;
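/* A worked example of the cost computed above, using illustrative pseudo
   registers: for (plus (mult (reg 101) (const_int 4)) (reg 100)) before
   register allocation, the cost starts at 1, gains 1 because the address
   uses not-yet-hard registers and 1 more because base and index are
   distinct pseudos, giving 3; built from hard registers the same address
   costs 1, plus the extra 10 only on K6 for the affected forms. */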
12075 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
12076 this is used to form addresses to local data when -fPIC is in
12077 use. */
12079 static bool
12080 darwin_local_data_pic (rtx disp)
12082 return (GET_CODE (disp) == UNSPEC
12083 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12086 /* Determine if a given RTX is a valid constant. We already know this
12087 satisfies CONSTANT_P. */
12089 static bool
12090 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12092 switch (GET_CODE (x))
12094 case CONST:
12095 x = XEXP (x, 0);
12097 if (GET_CODE (x) == PLUS)
12099 if (!CONST_INT_P (XEXP (x, 1)))
12100 return false;
12101 x = XEXP (x, 0);
12104 if (TARGET_MACHO && darwin_local_data_pic (x))
12105 return true;
12107 /* Only some unspecs are valid as "constants". */
12108 if (GET_CODE (x) == UNSPEC)
12109 switch (XINT (x, 1))
12111 case UNSPEC_GOT:
12112 case UNSPEC_GOTOFF:
12113 case UNSPEC_PLTOFF:
12114 return TARGET_64BIT;
12115 case UNSPEC_TPOFF:
12116 case UNSPEC_NTPOFF:
12117 x = XVECEXP (x, 0, 0);
12118 return (GET_CODE (x) == SYMBOL_REF
12119 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12120 case UNSPEC_DTPOFF:
12121 x = XVECEXP (x, 0, 0);
12122 return (GET_CODE (x) == SYMBOL_REF
12123 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12124 default:
12125 return false;
12128 /* We must have drilled down to a symbol. */
12129 if (GET_CODE (x) == LABEL_REF)
12130 return true;
12131 if (GET_CODE (x) != SYMBOL_REF)
12132 return false;
12133 /* FALLTHRU */
12135 case SYMBOL_REF:
12136 /* TLS symbols are never valid. */
12137 if (SYMBOL_REF_TLS_MODEL (x))
12138 return false;
12140 /* DLLIMPORT symbols are never valid. */
12141 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12142 && SYMBOL_REF_DLLIMPORT_P (x))
12143 return false;
12145 #if TARGET_MACHO
12146 /* mdynamic-no-pic */
12147 if (MACHO_DYNAMIC_NO_PIC_P)
12148 return machopic_symbol_defined_p (x);
12149 #endif
12150 break;
12152 case CONST_DOUBLE:
12153 if (GET_MODE (x) == TImode
12154 && x != CONST0_RTX (TImode)
12155 && !TARGET_64BIT)
12156 return false;
12157 break;
12159 case CONST_VECTOR:
12160 if (!standard_sse_constant_p (x))
12161 return false;
12163 default:
12164 break;
12167 /* Otherwise we handle everything else in the move patterns. */
12168 return true;
12171 /* Determine if it's legal to put X into the constant pool. This
12172 is not possible for the address of thread-local symbols, which
12173 is checked above. */
12175 static bool
12176 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12178 /* We can always put integral constants and vectors in memory. */
12179 switch (GET_CODE (x))
12181 case CONST_INT:
12182 case CONST_DOUBLE:
12183 case CONST_VECTOR:
12184 return false;
12186 default:
12187 break;
12189 return !ix86_legitimate_constant_p (mode, x);
12192 /* Nonzero if the symbol is marked as dllimport or as a stub variable,
12193 otherwise zero. */
12195 static bool
12196 is_imported_p (rtx x)
12198 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12199 || GET_CODE (x) != SYMBOL_REF)
12200 return false;
12202 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12206 /* Nonzero if the constant value X is a legitimate general operand
12207 when generating PIC code. It is given that flag_pic is on and
12208 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12210 bool
12211 legitimate_pic_operand_p (rtx x)
12213 rtx inner;
12215 switch (GET_CODE (x))
12217 case CONST:
12218 inner = XEXP (x, 0);
12219 if (GET_CODE (inner) == PLUS
12220 && CONST_INT_P (XEXP (inner, 1)))
12221 inner = XEXP (inner, 0);
12223 /* Only some unspecs are valid as "constants". */
12224 if (GET_CODE (inner) == UNSPEC)
12225 switch (XINT (inner, 1))
12227 case UNSPEC_GOT:
12228 case UNSPEC_GOTOFF:
12229 case UNSPEC_PLTOFF:
12230 return TARGET_64BIT;
12231 case UNSPEC_TPOFF:
12232 x = XVECEXP (inner, 0, 0);
12233 return (GET_CODE (x) == SYMBOL_REF
12234 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12235 case UNSPEC_MACHOPIC_OFFSET:
12236 return legitimate_pic_address_disp_p (x);
12237 default:
12238 return false;
12240 /* FALLTHRU */
12242 case SYMBOL_REF:
12243 case LABEL_REF:
12244 return legitimate_pic_address_disp_p (x);
12246 default:
12247 return true;
12251 /* Determine if a given CONST RTX is a valid memory displacement
12252 in PIC mode. */
12254 bool
12255 legitimate_pic_address_disp_p (rtx disp)
12257 bool saw_plus;
12259 /* In 64bit mode we can allow direct addresses of symbols and labels
12260 when they are not dynamic symbols. */
12261 if (TARGET_64BIT)
12263 rtx op0 = disp, op1;
12265 switch (GET_CODE (disp))
12267 case LABEL_REF:
12268 return true;
12270 case CONST:
12271 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12272 break;
12273 op0 = XEXP (XEXP (disp, 0), 0);
12274 op1 = XEXP (XEXP (disp, 0), 1);
12275 if (!CONST_INT_P (op1)
12276 || INTVAL (op1) >= 16*1024*1024
12277 || INTVAL (op1) < -16*1024*1024)
12278 break;
12279 if (GET_CODE (op0) == LABEL_REF)
12280 return true;
12281 if (GET_CODE (op0) == CONST
12282 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12283 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12284 return true;
12285 if (GET_CODE (op0) == UNSPEC
12286 && XINT (op0, 1) == UNSPEC_PCREL)
12287 return true;
12288 if (GET_CODE (op0) != SYMBOL_REF)
12289 break;
12290 /* FALLTHRU */
12292 case SYMBOL_REF:
12293 /* TLS references should always be enclosed in an UNSPEC.
12294 A dllimported symbol always needs to be resolved. */
12295 if (SYMBOL_REF_TLS_MODEL (op0)
12296 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12297 return false;
12299 if (TARGET_PECOFF)
12301 if (is_imported_p (op0))
12302 return true;
12304 if (SYMBOL_REF_FAR_ADDR_P (op0)
12305 || !SYMBOL_REF_LOCAL_P (op0))
12306 break;
12308 /* Function symbols need to be resolved only for
12309 the large model.
12310 For the small model we don't need to resolve anything
12311 here. */
12312 if ((ix86_cmodel != CM_LARGE_PIC
12313 && SYMBOL_REF_FUNCTION_P (op0))
12314 || ix86_cmodel == CM_SMALL_PIC)
12315 return true;
12316 /* Non-external symbols don't need to be resolved for
12317 the large and medium models. */
12318 if ((ix86_cmodel == CM_LARGE_PIC
12319 || ix86_cmodel == CM_MEDIUM_PIC)
12320 && !SYMBOL_REF_EXTERNAL_P (op0))
12321 return true;
12323 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12324 && SYMBOL_REF_LOCAL_P (op0)
12325 && ix86_cmodel != CM_LARGE_PIC)
12326 return true;
12327 break;
12329 default:
12330 break;
12333 if (GET_CODE (disp) != CONST)
12334 return false;
12335 disp = XEXP (disp, 0);
12337 if (TARGET_64BIT)
12339 /* It is unsafe to allow PLUS expressions; this would limit the allowed
12340 distance of GOT tables. We should not need these anyway. */
12341 if (GET_CODE (disp) != UNSPEC
12342 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12343 && XINT (disp, 1) != UNSPEC_GOTOFF
12344 && XINT (disp, 1) != UNSPEC_PCREL
12345 && XINT (disp, 1) != UNSPEC_PLTOFF))
12346 return false;
12348 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12349 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12350 return false;
12351 return true;
12354 saw_plus = false;
12355 if (GET_CODE (disp) == PLUS)
12357 if (!CONST_INT_P (XEXP (disp, 1)))
12358 return false;
12359 disp = XEXP (disp, 0);
12360 saw_plus = true;
12363 if (TARGET_MACHO && darwin_local_data_pic (disp))
12364 return true;
12366 if (GET_CODE (disp) != UNSPEC)
12367 return false;
12369 switch (XINT (disp, 1))
12371 case UNSPEC_GOT:
12372 if (saw_plus)
12373 return false;
12374 /* We need to check for both symbols and labels because VxWorks loads
12375 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12376 details. */
12377 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12378 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12379 case UNSPEC_GOTOFF:
12380 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12381 While the ABI also specifies a 32bit relocation, we don't produce it in
12382 the small PIC model at all. */
12383 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12384 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12385 && !TARGET_64BIT)
12386 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12387 return false;
12388 case UNSPEC_GOTTPOFF:
12389 case UNSPEC_GOTNTPOFF:
12390 case UNSPEC_INDNTPOFF:
12391 if (saw_plus)
12392 return false;
12393 disp = XVECEXP (disp, 0, 0);
12394 return (GET_CODE (disp) == SYMBOL_REF
12395 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12396 case UNSPEC_NTPOFF:
12397 disp = XVECEXP (disp, 0, 0);
12398 return (GET_CODE (disp) == SYMBOL_REF
12399 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12400 case UNSPEC_DTPOFF:
12401 disp = XVECEXP (disp, 0, 0);
12402 return (GET_CODE (disp) == SYMBOL_REF
12403 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12406 return false;
12409 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12410 replace the input X, or the original X if no replacement is called for.
12411 The output parameter *WIN is 1 if the calling macro should goto WIN,
12412 0 if it should not. */
12414 bool
12415 ix86_legitimize_reload_address (rtx x,
12416 enum machine_mode mode ATTRIBUTE_UNUSED,
12417 int opnum, int type,
12418 int ind_levels ATTRIBUTE_UNUSED)
12420 /* Reload can generate:
12422 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12423 (reg:DI 97))
12424 (reg:DI 2 cx))
12426 This RTX is rejected by ix86_legitimate_address_p due to
12427 the non-strictness of base register 97. Following this rejection,
12428 reload pushes all three components into separate registers,
12429 creating an invalid memory address RTX.
12431 The following code reloads only the invalid part of the
12432 memory address RTX. */
12434 if (GET_CODE (x) == PLUS
12435 && REG_P (XEXP (x, 1))
12436 && GET_CODE (XEXP (x, 0)) == PLUS
12437 && REG_P (XEXP (XEXP (x, 0), 1)))
12439 rtx base, index;
12440 bool something_reloaded = false;
12442 base = XEXP (XEXP (x, 0), 1);
12443 if (!REG_OK_FOR_BASE_STRICT_P (base))
12445 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12446 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12447 opnum, (enum reload_type) type);
12448 something_reloaded = true;
12451 index = XEXP (x, 1);
12452 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12454 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12455 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12456 opnum, (enum reload_type) type);
12457 something_reloaded = true;
12460 gcc_assert (something_reloaded);
12461 return true;
12464 return false;
12467 /* Recognizes RTL expressions that are valid memory addresses for an
12468 instruction. The MODE argument is the machine mode for the MEM
12469 expression that wants to use this address.
12471 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12472 convert common non-canonical forms to canonical form so that they will
12473 be recognized. */
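/* A quick sketch of what the checks below accept: an x86 effective
   address has the shape BASE + INDEX*SCALE + DISP, where BASE and
   INDEX are optional SImode/DImode registers, SCALE is 1, 2, 4 or 8
   (and requires an INDEX), and DISP is a constant or symbolic
   displacement subject to the PIC/TLS rules checked further down.  */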
12475 static bool
12476 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12477 rtx addr, bool strict)
12479 struct ix86_address parts;
12480 rtx base, index, disp;
12481 HOST_WIDE_INT scale;
12483 if (ix86_decompose_address (addr, &parts) <= 0)
12484 /* Decomposition failed. */
12485 return false;
12487 base = parts.base;
12488 index = parts.index;
12489 disp = parts.disp;
12490 scale = parts.scale;
12492 /* Validate base register. */
12493 if (base)
12495 rtx reg;
12497 if (REG_P (base))
12498 reg = base;
12499 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12500 reg = SUBREG_REG (base);
12501 else
12502 /* Base is not a register. */
12503 return false;
12505 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12506 return false;
12508 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12509 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12510 /* Base is not valid. */
12511 return false;
12514 /* Validate index register. */
12515 if (index)
12517 rtx reg;
12519 if (REG_P (index))
12520 reg = index;
12521 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12522 reg = SUBREG_REG (index);
12523 else
12524 /* Index is not a register. */
12525 return false;
12527 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12528 return false;
12530 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12531 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12532 /* Index is not valid. */
12533 return false;
12536 /* Index and base should have the same mode. */
12537 if (base && index
12538 && GET_MODE (base) != GET_MODE (index))
12539 return false;
12541 /* Validate scale factor. */
12542 if (scale != 1)
12544 if (!index)
12545 /* Scale without index. */
12546 return false;
12548 if (scale != 2 && scale != 4 && scale != 8)
12549 /* Scale is not a valid multiplier. */
12550 return false;
12553 /* Validate displacement. */
12554 if (disp)
12556 if (GET_CODE (disp) == CONST
12557 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12558 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12559 switch (XINT (XEXP (disp, 0), 1))
12561 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12562 used. While the ABI also specifies 32bit relocations, we don't
12563 produce them at all and use IP-relative addressing instead. */
12564 case UNSPEC_GOT:
12565 case UNSPEC_GOTOFF:
12566 gcc_assert (flag_pic);
12567 if (!TARGET_64BIT)
12568 goto is_legitimate_pic;
12570 /* 64bit address unspec. */
12571 return false;
12573 case UNSPEC_GOTPCREL:
12574 case UNSPEC_PCREL:
12575 gcc_assert (flag_pic);
12576 goto is_legitimate_pic;
12578 case UNSPEC_GOTTPOFF:
12579 case UNSPEC_GOTNTPOFF:
12580 case UNSPEC_INDNTPOFF:
12581 case UNSPEC_NTPOFF:
12582 case UNSPEC_DTPOFF:
12583 break;
12585 case UNSPEC_STACK_CHECK:
12586 gcc_assert (flag_split_stack);
12587 break;
12589 default:
12590 /* Invalid address unspec. */
12591 return false;
12594 else if (SYMBOLIC_CONST (disp)
12595 && (flag_pic
12596 || (TARGET_MACHO
12597 #if TARGET_MACHO
12598 && MACHOPIC_INDIRECT
12599 && !machopic_operand_p (disp)
12600 #endif
12604 is_legitimate_pic:
12605 if (TARGET_64BIT && (index || base))
12607 /* foo@dtpoff(%rX) is ok. */
12608 if (GET_CODE (disp) != CONST
12609 || GET_CODE (XEXP (disp, 0)) != PLUS
12610 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12611 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12612 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12613 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12614 /* Non-constant pic memory reference. */
12615 return false;
12617 else if ((!TARGET_MACHO || flag_pic)
12618 && ! legitimate_pic_address_disp_p (disp))
12619 /* Displacement is an invalid pic construct. */
12620 return false;
12621 #if TARGET_MACHO
12622 else if (MACHO_DYNAMIC_NO_PIC_P
12623 && !ix86_legitimate_constant_p (Pmode, disp))
12624 /* Displacement must be referenced via a non_lazy_pointer.  */
12625 return false;
12626 #endif
12628 /* This code used to verify that a symbolic pic displacement
12629 includes the pic_offset_table_rtx register.
12631 While this is a good idea, unfortunately these constructs may
12632 be created by the "adds using lea" optimization for incorrect
12633 code like:
12635 int a;
12636 int foo(int i)
12638 return *(&a+i);
12641 This code is nonsensical, but results in addressing the
12642 GOT table with pic_offset_table_rtx as the base. We can't
12643 just refuse it easily, since it gets matched by the
12644 "addsi3" pattern, which later gets split to an lea when the
12645 output register differs from the input. While this
12646 could be handled by a separate addsi pattern for this case
12647 that never results in an lea, disabling this test seems to
12648 be the easier and correct fix for the crash. */
12650 else if (GET_CODE (disp) != LABEL_REF
12651 && !CONST_INT_P (disp)
12652 && (GET_CODE (disp) != CONST
12653 || !ix86_legitimate_constant_p (Pmode, disp))
12654 && (GET_CODE (disp) != SYMBOL_REF
12655 || !ix86_legitimate_constant_p (Pmode, disp)))
12656 /* Displacement is not constant. */
12657 return false;
12658 else if (TARGET_64BIT
12659 && !x86_64_immediate_operand (disp, VOIDmode))
12660 /* Displacement is out of range. */
12661 return false;
12664 /* Everything looks valid. */
12665 return true;
12668 /* Determine if a given RTX is a valid constant address. */
12670 bool
12671 constant_address_p (rtx x)
12673 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12676 /* Return a unique alias set for the GOT. */
12678 static alias_set_type
12679 ix86_GOT_alias_set (void)
12681 static alias_set_type set = -1;
12682 if (set == -1)
12683 set = new_alias_set ();
12684 return set;
12687 /* Return a legitimate reference for ORIG (an address) using the
12688 register REG. If REG is 0, a new pseudo is generated.
12690 There are two types of references that must be handled:
12692 1. Global data references must load the address from the GOT, via
12693 the PIC reg. An insn is emitted to do this load, and the reg is
12694 returned.
12696 2. Static data references, constant pool addresses, and code labels
12697 compute the address as an offset from the GOT, whose base is in
12698 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12699 differentiate them from global data objects. The returned
12700 address is the PIC reg + an unspec constant.
12702 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12703 reg also appears in the address. */
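/* For example (a sketch, assuming 32-bit ELF PIC): a global symbol foo
   becomes (mem (plus pic_offset_table_rtx (const (unspec [foo] UNSPEC_GOT)))),
   i.e. a load from foo@GOT(%ebx), while a local symbol becomes
   pic_offset_table_rtx + (const (unspec [foo] UNSPEC_GOTOFF)),
   i.e. the address foo@GOTOFF(%ebx).  */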
12705 static rtx
12706 legitimize_pic_address (rtx orig, rtx reg)
12708 rtx addr = orig;
12709 rtx new_rtx = orig;
12711 #if TARGET_MACHO
12712 if (TARGET_MACHO && !TARGET_64BIT)
12714 if (reg == 0)
12715 reg = gen_reg_rtx (Pmode);
12716 /* Use the generic Mach-O PIC machinery. */
12717 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12719 #endif
12721 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12723 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12724 if (tmp)
12725 return tmp;
12728 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12729 new_rtx = addr;
12730 else if (TARGET_64BIT && !TARGET_PECOFF
12731 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12733 rtx tmpreg;
12734 /* This symbol may be referenced via a displacement from the PIC
12735 base address (@GOTOFF). */
12737 if (reload_in_progress)
12738 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12739 if (GET_CODE (addr) == CONST)
12740 addr = XEXP (addr, 0);
12741 if (GET_CODE (addr) == PLUS)
12743 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12744 UNSPEC_GOTOFF);
12745 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12747 else
12748 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12749 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12750 if (!reg)
12751 tmpreg = gen_reg_rtx (Pmode);
12752 else
12753 tmpreg = reg;
12754 emit_move_insn (tmpreg, new_rtx);
12756 if (reg != 0)
12758 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12759 tmpreg, 1, OPTAB_DIRECT);
12760 new_rtx = reg;
12762 else
12763 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12765 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12767 /* This symbol may be referenced via a displacement from the PIC
12768 base address (@GOTOFF). */
12770 if (reload_in_progress)
12771 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12772 if (GET_CODE (addr) == CONST)
12773 addr = XEXP (addr, 0);
12774 if (GET_CODE (addr) == PLUS)
12776 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12777 UNSPEC_GOTOFF);
12778 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12780 else
12781 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12782 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12783 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12785 if (reg != 0)
12787 emit_move_insn (reg, new_rtx);
12788 new_rtx = reg;
12791 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12792 /* We can't use @GOTOFF for text labels on VxWorks;
12793 see gotoff_operand. */
12794 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12796 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12797 if (tmp)
12798 return tmp;
12800 /* For x64 PE-COFF there is no GOT table, so we use the
12801 address directly. */
12802 if (TARGET_64BIT && TARGET_PECOFF)
12804 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12805 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12807 if (reg == 0)
12808 reg = gen_reg_rtx (Pmode);
12809 emit_move_insn (reg, new_rtx);
12810 new_rtx = reg;
12812 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12814 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12815 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12816 new_rtx = gen_const_mem (Pmode, new_rtx);
12817 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12819 if (reg == 0)
12820 reg = gen_reg_rtx (Pmode);
12821 /* Use gen_movsi directly, otherwise the address is loaded
12822 into a register for CSE. We don't want to CSE these addresses;
12823 instead we CSE the addresses loaded from the GOT table, so skip this. */
12824 emit_insn (gen_movsi (reg, new_rtx));
12825 new_rtx = reg;
12827 else
12829 /* This symbol must be referenced via a load from the
12830 Global Offset Table (@GOT). */
12832 if (reload_in_progress)
12833 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12834 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12835 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12836 if (TARGET_64BIT)
12837 new_rtx = force_reg (Pmode, new_rtx);
12838 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12839 new_rtx = gen_const_mem (Pmode, new_rtx);
12840 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12842 if (reg == 0)
12843 reg = gen_reg_rtx (Pmode);
12844 emit_move_insn (reg, new_rtx);
12845 new_rtx = reg;
12848 else
12850 if (CONST_INT_P (addr)
12851 && !x86_64_immediate_operand (addr, VOIDmode))
12853 if (reg)
12855 emit_move_insn (reg, addr);
12856 new_rtx = reg;
12858 else
12859 new_rtx = force_reg (Pmode, addr);
12861 else if (GET_CODE (addr) == CONST)
12863 addr = XEXP (addr, 0);
12865 /* We must match stuff we generated before. Assume the only
12866 unspecs that can get here are ours. Not that we could do
12867 anything with them anyway.... */
12868 if (GET_CODE (addr) == UNSPEC
12869 || (GET_CODE (addr) == PLUS
12870 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12871 return orig;
12872 gcc_assert (GET_CODE (addr) == PLUS);
12874 if (GET_CODE (addr) == PLUS)
12876 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12878 /* Check first to see if this is a constant offset from a @GOTOFF
12879 symbol reference. */
12880 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12881 && CONST_INT_P (op1))
12883 if (!TARGET_64BIT)
12885 if (reload_in_progress)
12886 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12887 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12888 UNSPEC_GOTOFF);
12889 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12890 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12891 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12893 if (reg != 0)
12895 emit_move_insn (reg, new_rtx);
12896 new_rtx = reg;
12899 else
12901 if (INTVAL (op1) < -16*1024*1024
12902 || INTVAL (op1) >= 16*1024*1024)
12904 if (!x86_64_immediate_operand (op1, Pmode))
12905 op1 = force_reg (Pmode, op1);
12906 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12910 else
12912 rtx base = legitimize_pic_address (op0, reg);
12913 enum machine_mode mode = GET_MODE (base);
12914 new_rtx
12915 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12917 if (CONST_INT_P (new_rtx))
12919 if (INTVAL (new_rtx) < -16*1024*1024
12920 || INTVAL (new_rtx) >= 16*1024*1024)
12922 if (!x86_64_immediate_operand (new_rtx, mode))
12923 new_rtx = force_reg (mode, new_rtx);
12924 new_rtx
12925 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12927 else
12928 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12930 else
12932 if (GET_CODE (new_rtx) == PLUS
12933 && CONSTANT_P (XEXP (new_rtx, 1)))
12935 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12936 new_rtx = XEXP (new_rtx, 1);
12938 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12943 return new_rtx;
12946 /* Load the thread pointer. If TO_REG is true, force it into a register. */
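/* On GNU/Linux targets the thread pointer is the %gs (32-bit) or %fs
   (64-bit) segment base; UNSPEC_TP is the RTL stand-in for that
   segment-relative zero offset.  */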
12948 static rtx
12949 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12951 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12953 if (GET_MODE (tp) != tp_mode)
12955 gcc_assert (GET_MODE (tp) == SImode);
12956 gcc_assert (tp_mode == DImode);
12958 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12961 if (to_reg)
12962 tp = copy_to_mode_reg (tp_mode, tp);
12964 return tp;
12967 /* Construct the SYMBOL_REF for the tls_get_addr function. */
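/* Note: with GNU TLS on 32-bit targets the entry point is the
   ___tls_get_addr variant (three leading underscores); elsewhere the
   standard __tls_get_addr is used, and for large-model PIC the
   reference is made @PLTOFF relative to the PIC register.  */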
12969 static GTY(()) rtx ix86_tls_symbol;
12971 static rtx
12972 ix86_tls_get_addr (void)
12974 if (!ix86_tls_symbol)
12976 const char *sym
12977 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12978 ? "___tls_get_addr" : "__tls_get_addr");
12980 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12983 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
12985 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
12986 UNSPEC_PLTOFF);
12987 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
12988 gen_rtx_CONST (Pmode, unspec));
12991 return ix86_tls_symbol;
12994 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12996 static GTY(()) rtx ix86_tls_module_base_symbol;
12999 ix86_tls_module_base (void)
13001 if (!ix86_tls_module_base_symbol)
13003 ix86_tls_module_base_symbol
13004 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13006 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13007 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13010 return ix86_tls_module_base_symbol;
13013 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13014 false if we expect this to be used for a memory address and true if
13015 we expect to load the address into a register. */
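/* Roughly, the four TLS models handled below are:
   - global-dynamic: call __tls_get_addr for an arbitrary module/symbol;
   - local-dynamic: call __tls_get_addr once for the module base, then
     add a @dtpoff offset per symbol;
   - initial-exec: load the @gottpoff/@indntpoff offset from the GOT and
     add it to the thread pointer;
   - local-exec: add a link-time @tpoff/@ntpoff constant directly to the
     thread pointer.  */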
13017 static rtx
13018 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13020 rtx dest, base, off;
13021 rtx pic = NULL_RTX, tp = NULL_RTX;
13022 enum machine_mode tp_mode = Pmode;
13023 int type;
13025 switch (model)
13027 case TLS_MODEL_GLOBAL_DYNAMIC:
13028 dest = gen_reg_rtx (Pmode);
13030 if (!TARGET_64BIT)
13032 if (flag_pic && !TARGET_PECOFF)
13033 pic = pic_offset_table_rtx;
13034 else
13036 pic = gen_reg_rtx (Pmode);
13037 emit_insn (gen_set_got (pic));
13041 if (TARGET_GNU2_TLS)
13043 if (TARGET_64BIT)
13044 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13045 else
13046 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13048 tp = get_thread_pointer (Pmode, true);
13049 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13051 if (GET_MODE (x) != Pmode)
13052 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13054 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13056 else
13058 rtx caddr = ix86_tls_get_addr ();
13060 if (TARGET_64BIT)
13062 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13063 rtx insns;
13065 start_sequence ();
13066 emit_call_insn
13067 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13068 insns = get_insns ();
13069 end_sequence ();
13071 if (GET_MODE (x) != Pmode)
13072 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13074 RTL_CONST_CALL_P (insns) = 1;
13075 emit_libcall_block (insns, dest, rax, x);
13077 else
13078 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13080 break;
13082 case TLS_MODEL_LOCAL_DYNAMIC:
13083 base = gen_reg_rtx (Pmode);
13085 if (!TARGET_64BIT)
13087 if (flag_pic)
13088 pic = pic_offset_table_rtx;
13089 else
13091 pic = gen_reg_rtx (Pmode);
13092 emit_insn (gen_set_got (pic));
13096 if (TARGET_GNU2_TLS)
13098 rtx tmp = ix86_tls_module_base ();
13100 if (TARGET_64BIT)
13101 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13102 else
13103 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13105 tp = get_thread_pointer (Pmode, true);
13106 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13107 gen_rtx_MINUS (Pmode, tmp, tp));
13109 else
13111 rtx caddr = ix86_tls_get_addr ();
13113 if (TARGET_64BIT)
13115 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13116 rtx insns, eqv;
13118 start_sequence ();
13119 emit_call_insn
13120 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13121 insns = get_insns ();
13122 end_sequence ();
13124 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13125 share the LD_BASE result with other LD model accesses. */
13126 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13127 UNSPEC_TLS_LD_BASE);
13129 RTL_CONST_CALL_P (insns) = 1;
13130 emit_libcall_block (insns, base, rax, eqv);
13132 else
13133 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13136 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13137 off = gen_rtx_CONST (Pmode, off);
13139 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13141 if (TARGET_GNU2_TLS)
13143 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13145 if (GET_MODE (x) != Pmode)
13146 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13148 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13150 break;
13152 case TLS_MODEL_INITIAL_EXEC:
13153 if (TARGET_64BIT)
13155 if (TARGET_SUN_TLS && !TARGET_X32)
13157 /* The Sun linker took the AMD64 TLS spec literally
13158 and can only handle %rax as the destination of the
13159 initial-exec code sequence. */
13161 dest = gen_reg_rtx (DImode);
13162 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13163 return dest;
13166 /* Generate DImode references to avoid %fs:(%reg32)
13167 problems and the linker IE->LE relaxation bug. */
13168 tp_mode = DImode;
13169 pic = NULL;
13170 type = UNSPEC_GOTNTPOFF;
13172 else if (flag_pic)
13174 if (reload_in_progress)
13175 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13176 pic = pic_offset_table_rtx;
13177 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13179 else if (!TARGET_ANY_GNU_TLS)
13181 pic = gen_reg_rtx (Pmode);
13182 emit_insn (gen_set_got (pic));
13183 type = UNSPEC_GOTTPOFF;
13185 else
13187 pic = NULL;
13188 type = UNSPEC_INDNTPOFF;
13191 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13192 off = gen_rtx_CONST (tp_mode, off);
13193 if (pic)
13194 off = gen_rtx_PLUS (tp_mode, pic, off);
13195 off = gen_const_mem (tp_mode, off);
13196 set_mem_alias_set (off, ix86_GOT_alias_set ());
13198 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13200 base = get_thread_pointer (tp_mode,
13201 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13202 off = force_reg (tp_mode, off);
13203 return gen_rtx_PLUS (tp_mode, base, off);
13205 else
13207 base = get_thread_pointer (Pmode, true);
13208 dest = gen_reg_rtx (Pmode);
13209 emit_insn (ix86_gen_sub3 (dest, base, off));
13211 break;
13213 case TLS_MODEL_LOCAL_EXEC:
13214 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13215 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13216 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13217 off = gen_rtx_CONST (Pmode, off);
13219 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13221 base = get_thread_pointer (Pmode,
13222 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13223 return gen_rtx_PLUS (Pmode, base, off);
13225 else
13227 base = get_thread_pointer (Pmode, true);
13228 dest = gen_reg_rtx (Pmode);
13229 emit_insn (ix86_gen_sub3 (dest, base, off));
13231 break;
13233 default:
13234 gcc_unreachable ();
13237 return dest;
13240 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13241 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13242 unique refptr-DECL symbol corresponding to symbol DECL. */
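/* Roughly: a dllimport-ed DECL is accessed through a pointer named
   __imp_DECL that the PE/COFF loader fills in at load time, so the decl
   built here is external and read-only and its DECL_RTL is a (const)
   memory load of that pointer; the refptr variant plays an analogous
   role for far-away externals under the medium/large code models.  */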
13244 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13245 htab_t dllimport_map;
13247 static tree
13248 get_dllimport_decl (tree decl, bool beimport)
13250 struct tree_map *h, in;
13251 void **loc;
13252 const char *name;
13253 const char *prefix;
13254 size_t namelen, prefixlen;
13255 char *imp_name;
13256 tree to;
13257 rtx rtl;
13259 if (!dllimport_map)
13260 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13262 in.hash = htab_hash_pointer (decl);
13263 in.base.from = decl;
13264 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13265 h = (struct tree_map *) *loc;
13266 if (h)
13267 return h->to;
13269 *loc = h = ggc_alloc_tree_map ();
13270 h->hash = in.hash;
13271 h->base.from = decl;
13272 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13273 VAR_DECL, NULL, ptr_type_node);
13274 DECL_ARTIFICIAL (to) = 1;
13275 DECL_IGNORED_P (to) = 1;
13276 DECL_EXTERNAL (to) = 1;
13277 TREE_READONLY (to) = 1;
13279 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13280 name = targetm.strip_name_encoding (name);
13281 if (beimport)
13282 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13283 ? "*__imp_" : "*__imp__";
13284 else
13285 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13286 namelen = strlen (name);
13287 prefixlen = strlen (prefix);
13288 imp_name = (char *) alloca (namelen + prefixlen + 1);
13289 memcpy (imp_name, prefix, prefixlen);
13290 memcpy (imp_name + prefixlen, name, namelen + 1);
13292 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13293 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13294 SET_SYMBOL_REF_DECL (rtl, to);
13295 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13296 if (!beimport)
13298 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13299 #ifdef SUB_TARGET_RECORD_STUB
13300 SUB_TARGET_RECORD_STUB (name);
13301 #endif
13304 rtl = gen_const_mem (Pmode, rtl);
13305 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13307 SET_DECL_RTL (to, rtl);
13308 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13310 return to;
13313 /* Expand SYMBOL into its corresponding far-address symbol.
13314 WANT_REG is true if we require the result be a register. */
13316 static rtx
13317 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13319 tree imp_decl;
13320 rtx x;
13322 gcc_assert (SYMBOL_REF_DECL (symbol));
13323 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13325 x = DECL_RTL (imp_decl);
13326 if (want_reg)
13327 x = force_reg (Pmode, x);
13328 return x;
13331 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13332 true if we require the result be a register. */
13334 static rtx
13335 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13337 tree imp_decl;
13338 rtx x;
13340 gcc_assert (SYMBOL_REF_DECL (symbol));
13341 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13343 x = DECL_RTL (imp_decl);
13344 if (want_reg)
13345 x = force_reg (Pmode, x);
13346 return x;
13349 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
13350 is true if we require the result to be a register. */
13352 static rtx
13353 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13355 if (!TARGET_PECOFF)
13356 return NULL_RTX;
13358 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13360 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13361 return legitimize_dllimport_symbol (addr, inreg);
13362 if (GET_CODE (addr) == CONST
13363 && GET_CODE (XEXP (addr, 0)) == PLUS
13364 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13365 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13367 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13368 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13372 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13373 return NULL_RTX;
13374 if (GET_CODE (addr) == SYMBOL_REF
13375 && !is_imported_p (addr)
13376 && SYMBOL_REF_EXTERNAL_P (addr)
13377 && SYMBOL_REF_DECL (addr))
13378 return legitimize_pe_coff_extern_decl (addr, inreg);
13380 if (GET_CODE (addr) == CONST
13381 && GET_CODE (XEXP (addr, 0)) == PLUS
13382 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13383 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13384 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13385 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13387 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13388 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13390 return NULL_RTX;
13393 /* Try machine-dependent ways of modifying an illegitimate address
13394 to be legitimate. If we find one, return the new, valid address.
13395 This macro is used in only one place: `memory_address' in explow.c.
13397 OLDX is the address as it was before break_out_memory_refs was called.
13398 In some cases it is useful to look at this to decide what needs to be done.
13400 It is always safe for this macro to do nothing. It exists to recognize
13401 opportunities to optimize the output.
13403 For the 80386, we handle X+REG by loading X into a register R and
13404 using R+REG. R will go in a general reg and indexing will be used.
13405 However, if REG is a broken-out memory address or multiplication,
13406 nothing needs to be done because REG can certainly go in a general reg.
13408 When -fpic is used, special handling is needed for symbolic references.
13409 See comments by legitimize_pic_address in i386.c for details. */
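/* For instance, (plus (ashift reg 2) reg2) is rewritten below as
   (plus (mult reg 4) reg2) so that it matches the canonical
   base + index*scale addressing form.  */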
13411 static rtx
13412 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13413 enum machine_mode mode)
13415 int changed = 0;
13416 unsigned log;
13418 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13419 if (log)
13420 return legitimize_tls_address (x, (enum tls_model) log, false);
13421 if (GET_CODE (x) == CONST
13422 && GET_CODE (XEXP (x, 0)) == PLUS
13423 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13424 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13426 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13427 (enum tls_model) log, false);
13428 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13431 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13433 rtx tmp = legitimize_pe_coff_symbol (x, true);
13434 if (tmp)
13435 return tmp;
13438 if (flag_pic && SYMBOLIC_CONST (x))
13439 return legitimize_pic_address (x, 0);
13441 #if TARGET_MACHO
13442 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13443 return machopic_indirect_data_reference (x, 0);
13444 #endif
13446 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13447 if (GET_CODE (x) == ASHIFT
13448 && CONST_INT_P (XEXP (x, 1))
13449 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13451 changed = 1;
13452 log = INTVAL (XEXP (x, 1));
13453 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13454 GEN_INT (1 << log));
13457 if (GET_CODE (x) == PLUS)
13459 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13461 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13462 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13463 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13465 changed = 1;
13466 log = INTVAL (XEXP (XEXP (x, 0), 1));
13467 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13468 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13469 GEN_INT (1 << log));
13472 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13473 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13474 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13476 changed = 1;
13477 log = INTVAL (XEXP (XEXP (x, 1), 1));
13478 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13479 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13480 GEN_INT (1 << log));
13483 /* Put multiply first if it isn't already. */
13484 if (GET_CODE (XEXP (x, 1)) == MULT)
13486 rtx tmp = XEXP (x, 0);
13487 XEXP (x, 0) = XEXP (x, 1);
13488 XEXP (x, 1) = tmp;
13489 changed = 1;
13492 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13493 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13494 created by virtual register instantiation, register elimination, and
13495 similar optimizations. */
13496 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13498 changed = 1;
13499 x = gen_rtx_PLUS (Pmode,
13500 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13501 XEXP (XEXP (x, 1), 0)),
13502 XEXP (XEXP (x, 1), 1));
13505 /* Canonicalize
13506 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13507 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13508 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13509 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13510 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13511 && CONSTANT_P (XEXP (x, 1)))
13513 rtx constant;
13514 rtx other = NULL_RTX;
13516 if (CONST_INT_P (XEXP (x, 1)))
13518 constant = XEXP (x, 1);
13519 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13521 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13523 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13524 other = XEXP (x, 1);
13526 else
13527 constant = 0;
13529 if (constant)
13531 changed = 1;
13532 x = gen_rtx_PLUS (Pmode,
13533 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13534 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13535 plus_constant (Pmode, other,
13536 INTVAL (constant)));
13540 if (changed && ix86_legitimate_address_p (mode, x, false))
13541 return x;
13543 if (GET_CODE (XEXP (x, 0)) == MULT)
13545 changed = 1;
13546 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13549 if (GET_CODE (XEXP (x, 1)) == MULT)
13551 changed = 1;
13552 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13555 if (changed
13556 && REG_P (XEXP (x, 1))
13557 && REG_P (XEXP (x, 0)))
13558 return x;
13560 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13562 changed = 1;
13563 x = legitimize_pic_address (x, 0);
13566 if (changed && ix86_legitimate_address_p (mode, x, false))
13567 return x;
13569 if (REG_P (XEXP (x, 0)))
13571 rtx temp = gen_reg_rtx (Pmode);
13572 rtx val = force_operand (XEXP (x, 1), temp);
13573 if (val != temp)
13575 val = convert_to_mode (Pmode, val, 1);
13576 emit_move_insn (temp, val);
13579 XEXP (x, 1) = temp;
13580 return x;
13583 else if (REG_P (XEXP (x, 1)))
13585 rtx temp = gen_reg_rtx (Pmode);
13586 rtx val = force_operand (XEXP (x, 0), temp);
13587 if (val != temp)
13589 val = convert_to_mode (Pmode, val, 1);
13590 emit_move_insn (temp, val);
13593 XEXP (x, 0) = temp;
13594 return x;
13598 return x;
13601 /* Print an integer constant expression in assembler syntax. Addition
13602 and subtraction are the only arithmetic that may appear in these
13603 expressions. FILE is the stdio stream to write to, X is the rtx, and
13604 CODE is the operand print code from the output string. */
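/* Typical outputs (AT&T syntax): "foo@GOT", "foo@GOTOFF",
   "foo@GOTPCREL(%rip)", "foo@PLTOFF", and the TLS forms "foo@tpoff",
   "foo@ntpoff", "foo@dtpoff", "foo@gottpoff" and "foo@indntpoff",
   matching the UNSPEC cases handled at the end of this function.  */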
13606 static void
13607 output_pic_addr_const (FILE *file, rtx x, int code)
13609 char buf[256];
13611 switch (GET_CODE (x))
13613 case PC:
13614 gcc_assert (flag_pic);
13615 putc ('.', file);
13616 break;
13618 case SYMBOL_REF:
13619 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13620 output_addr_const (file, x);
13621 else
13623 const char *name = XSTR (x, 0);
13625 /* Mark the decl as referenced so that cgraph will
13626 output the function. */
13627 if (SYMBOL_REF_DECL (x))
13628 mark_decl_referenced (SYMBOL_REF_DECL (x));
13630 #if TARGET_MACHO
13631 if (MACHOPIC_INDIRECT
13632 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13633 name = machopic_indirection_name (x, /*stub_p=*/true);
13634 #endif
13635 assemble_name (file, name);
13637 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13638 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13639 fputs ("@PLT", file);
13640 break;
13642 case LABEL_REF:
13643 x = XEXP (x, 0);
13644 /* FALLTHRU */
13645 case CODE_LABEL:
13646 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13647 assemble_name (asm_out_file, buf);
13648 break;
13650 case CONST_INT:
13651 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13652 break;
13654 case CONST:
13655 /* This used to output parentheses around the expression,
13656 but that does not work on the 386 (either ATT or BSD assembler). */
13657 output_pic_addr_const (file, XEXP (x, 0), code);
13658 break;
13660 case CONST_DOUBLE:
13661 if (GET_MODE (x) == VOIDmode)
13663 /* We can use %d if the number is <32 bits and positive. */
13664 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13665 fprintf (file, "0x%lx%08lx",
13666 (unsigned long) CONST_DOUBLE_HIGH (x),
13667 (unsigned long) CONST_DOUBLE_LOW (x));
13668 else
13669 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13671 else
13672 /* We can't handle floating point constants;
13673 TARGET_PRINT_OPERAND must handle them. */
13674 output_operand_lossage ("floating constant misused");
13675 break;
13677 case PLUS:
13678 /* Some assemblers need integer constants to appear first. */
13679 if (CONST_INT_P (XEXP (x, 0)))
13681 output_pic_addr_const (file, XEXP (x, 0), code);
13682 putc ('+', file);
13683 output_pic_addr_const (file, XEXP (x, 1), code);
13685 else
13687 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13688 output_pic_addr_const (file, XEXP (x, 1), code);
13689 putc ('+', file);
13690 output_pic_addr_const (file, XEXP (x, 0), code);
13692 break;
13694 case MINUS:
13695 if (!TARGET_MACHO)
13696 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13697 output_pic_addr_const (file, XEXP (x, 0), code);
13698 putc ('-', file);
13699 output_pic_addr_const (file, XEXP (x, 1), code);
13700 if (!TARGET_MACHO)
13701 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13702 break;
13704 case UNSPEC:
13705 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13707 bool f = i386_asm_output_addr_const_extra (file, x);
13708 gcc_assert (f);
13709 break;
13712 gcc_assert (XVECLEN (x, 0) == 1);
13713 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13714 switch (XINT (x, 1))
13716 case UNSPEC_GOT:
13717 fputs ("@GOT", file);
13718 break;
13719 case UNSPEC_GOTOFF:
13720 fputs ("@GOTOFF", file);
13721 break;
13722 case UNSPEC_PLTOFF:
13723 fputs ("@PLTOFF", file);
13724 break;
13725 case UNSPEC_PCREL:
13726 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13727 "(%rip)" : "[rip]", file);
13728 break;
13729 case UNSPEC_GOTPCREL:
13730 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13731 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13732 break;
13733 case UNSPEC_GOTTPOFF:
13734 /* FIXME: This might be @TPOFF in Sun ld too. */
13735 fputs ("@gottpoff", file);
13736 break;
13737 case UNSPEC_TPOFF:
13738 fputs ("@tpoff", file);
13739 break;
13740 case UNSPEC_NTPOFF:
13741 if (TARGET_64BIT)
13742 fputs ("@tpoff", file);
13743 else
13744 fputs ("@ntpoff", file);
13745 break;
13746 case UNSPEC_DTPOFF:
13747 fputs ("@dtpoff", file);
13748 break;
13749 case UNSPEC_GOTNTPOFF:
13750 if (TARGET_64BIT)
13751 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13752 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13753 else
13754 fputs ("@gotntpoff", file);
13755 break;
13756 case UNSPEC_INDNTPOFF:
13757 fputs ("@indntpoff", file);
13758 break;
13759 #if TARGET_MACHO
13760 case UNSPEC_MACHOPIC_OFFSET:
13761 putc ('-', file);
13762 machopic_output_function_base_name (file);
13763 break;
13764 #endif
13765 default:
13766 output_operand_lossage ("invalid UNSPEC as operand");
13767 break;
13769 break;
13771 default:
13772 output_operand_lossage ("invalid expression as operand");
13776 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13777 We need to emit DTP-relative relocations. */
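/* For example, a 4-byte entry comes out as ".long foo@dtpoff" and an
   8-byte one as ".long foo@dtpoff, 0" (assuming ASM_LONG expands to
   ".long").  */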
13779 static void ATTRIBUTE_UNUSED
13780 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13782 fputs (ASM_LONG, file);
13783 output_addr_const (file, x);
13784 fputs ("@dtpoff", file);
13785 switch (size)
13787 case 4:
13788 break;
13789 case 8:
13790 fputs (", 0", file);
13791 break;
13792 default:
13793 gcc_unreachable ();
13797 /* Return true if X is a representation of the PIC register. This copes
13798 with calls from ix86_find_base_term, where the register might have
13799 been replaced by a cselib value. */
13801 static bool
13802 ix86_pic_register_p (rtx x)
13804 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13805 return (pic_offset_table_rtx
13806 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13807 else
13808 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13811 /* Helper function for ix86_delegitimize_address.
13812 Attempt to delegitimize TLS local-exec accesses. */
13814 static rtx
13815 ix86_delegitimize_tls_address (rtx orig_x)
13817 rtx x = orig_x, unspec;
13818 struct ix86_address addr;
13820 if (!TARGET_TLS_DIRECT_SEG_REFS)
13821 return orig_x;
13822 if (MEM_P (x))
13823 x = XEXP (x, 0);
13824 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13825 return orig_x;
13826 if (ix86_decompose_address (x, &addr) == 0
13827 || addr.seg != DEFAULT_TLS_SEG_REG
13828 || addr.disp == NULL_RTX
13829 || GET_CODE (addr.disp) != CONST)
13830 return orig_x;
13831 unspec = XEXP (addr.disp, 0);
13832 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13833 unspec = XEXP (unspec, 0);
13834 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13835 return orig_x;
13836 x = XVECEXP (unspec, 0, 0);
13837 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13838 if (unspec != XEXP (addr.disp, 0))
13839 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13840 if (addr.index)
13842 rtx idx = addr.index;
13843 if (addr.scale != 1)
13844 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13845 x = gen_rtx_PLUS (Pmode, idx, x);
13847 if (addr.base)
13848 x = gen_rtx_PLUS (Pmode, addr.base, x);
13849 if (MEM_P (orig_x))
13850 x = replace_equiv_address_nv (orig_x, x);
13851 return x;
13854 /* In the name of slightly smaller debug output, and to cater to
13855 general assembler lossage, recognize PIC+GOTOFF and turn it back
13856 into a direct symbol reference.
13858 On Darwin, this is necessary to avoid a crash, because Darwin
13859 has a different PIC label for each routine but the DWARF debugging
13860 information is not associated with any particular routine, so it's
13861 necessary to remove references to the PIC label from RTL stored by
13862 the DWARF output code. */
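/* For example, (plus %ebx (const (unspec [foo] UNSPEC_GOTOFF))) is
   turned back into plain foo, and on 64-bit a
   (mem (const (unspec [foo] UNSPEC_GOTPCREL))) load is turned back
   into foo itself.  */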
13864 static rtx
13865 ix86_delegitimize_address (rtx x)
13867 rtx orig_x = delegitimize_mem_from_attrs (x);
13868 /* addend is NULL or some rtx if x is something+GOTOFF where
13869 something doesn't include the PIC register. */
13870 rtx addend = NULL_RTX;
13871 /* reg_addend is NULL or a multiple of some register. */
13872 rtx reg_addend = NULL_RTX;
13873 /* const_addend is NULL or a const_int. */
13874 rtx const_addend = NULL_RTX;
13875 /* This is the result, or NULL. */
13876 rtx result = NULL_RTX;
13878 x = orig_x;
13880 if (MEM_P (x))
13881 x = XEXP (x, 0);
13883 if (TARGET_64BIT)
13885 if (GET_CODE (x) == CONST
13886 && GET_CODE (XEXP (x, 0)) == PLUS
13887 && GET_MODE (XEXP (x, 0)) == Pmode
13888 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13889 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13890 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13892 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13893 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13894 if (MEM_P (orig_x))
13895 x = replace_equiv_address_nv (orig_x, x);
13896 return x;
13899 if (GET_CODE (x) == CONST
13900 && GET_CODE (XEXP (x, 0)) == UNSPEC
13901 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13902 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13903 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13905 x = XVECEXP (XEXP (x, 0), 0, 0);
13906 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13908 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13909 GET_MODE (x), 0);
13910 if (x == NULL_RTX)
13911 return orig_x;
13913 return x;
13916 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13917 return ix86_delegitimize_tls_address (orig_x);
13919 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13920 and -mcmodel=medium -fpic. */
13923 if (GET_CODE (x) != PLUS
13924 || GET_CODE (XEXP (x, 1)) != CONST)
13925 return ix86_delegitimize_tls_address (orig_x);
13927 if (ix86_pic_register_p (XEXP (x, 0)))
13928 /* %ebx + GOT/GOTOFF */
13930 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13932 /* %ebx + %reg * scale + GOT/GOTOFF */
13933 reg_addend = XEXP (x, 0);
13934 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13935 reg_addend = XEXP (reg_addend, 1);
13936 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13937 reg_addend = XEXP (reg_addend, 0);
13938 else
13940 reg_addend = NULL_RTX;
13941 addend = XEXP (x, 0);
13944 else
13945 addend = XEXP (x, 0);
13947 x = XEXP (XEXP (x, 1), 0);
13948 if (GET_CODE (x) == PLUS
13949 && CONST_INT_P (XEXP (x, 1)))
13951 const_addend = XEXP (x, 1);
13952 x = XEXP (x, 0);
13955 if (GET_CODE (x) == UNSPEC
13956 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13957 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13958 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13959 && !MEM_P (orig_x) && !addend)))
13960 result = XVECEXP (x, 0, 0);
13962 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13963 && !MEM_P (orig_x))
13964 result = XVECEXP (x, 0, 0);
13966 if (! result)
13967 return ix86_delegitimize_tls_address (orig_x);
13969 if (const_addend)
13970 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13971 if (reg_addend)
13972 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13973 if (addend)
13975 /* If the rest of original X doesn't involve the PIC register, add
13976 addend and subtract pic_offset_table_rtx. This can happen e.g.
13977 for code like:
13978 leal (%ebx, %ecx, 4), %ecx
13980 movl foo@GOTOFF(%ecx), %edx
13981 in which case we return (%ecx - %ebx) + foo. */
13982 if (pic_offset_table_rtx)
13983 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13984 pic_offset_table_rtx),
13985 result);
13986 else
13987 return orig_x;
13989 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13991 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13992 if (result == NULL_RTX)
13993 return orig_x;
13995 return result;
13998 /* If X is a machine specific address (i.e. a symbol or label being
13999 referenced as a displacement from the GOT implemented using an
14000 UNSPEC), then return the base term. Otherwise return X. */
14003 ix86_find_base_term (rtx x)
14005 rtx term;
14007 if (TARGET_64BIT)
14009 if (GET_CODE (x) != CONST)
14010 return x;
14011 term = XEXP (x, 0);
14012 if (GET_CODE (term) == PLUS
14013 && (CONST_INT_P (XEXP (term, 1))
14014 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14015 term = XEXP (term, 0);
14016 if (GET_CODE (term) != UNSPEC
14017 || (XINT (term, 1) != UNSPEC_GOTPCREL
14018 && XINT (term, 1) != UNSPEC_PCREL))
14019 return x;
14021 return XVECEXP (term, 0, 0);
14024 return ix86_delegitimize_address (x);
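/* Output to FILE the condition-code suffix ("e", "ne", "l", "a", ...)
   for comparison CODE in mode MODE.  REVERSE inverts the condition;
   FP selects the spellings preferred for fcmov where they differ from
   the setcc/cmov ones (e.g. "nbe" instead of "a").  */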
14027 static void
14028 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14029 bool fp, FILE *file)
14031 const char *suffix;
14033 if (mode == CCFPmode || mode == CCFPUmode)
14035 code = ix86_fp_compare_code_to_integer (code);
14036 mode = CCmode;
14038 if (reverse)
14039 code = reverse_condition (code);
14041 switch (code)
14043 case EQ:
14044 switch (mode)
14046 case CCAmode:
14047 suffix = "a";
14048 break;
14050 case CCCmode:
14051 suffix = "c";
14052 break;
14054 case CCOmode:
14055 suffix = "o";
14056 break;
14058 case CCSmode:
14059 suffix = "s";
14060 break;
14062 default:
14063 suffix = "e";
14065 break;
14066 case NE:
14067 switch (mode)
14069 case CCAmode:
14070 suffix = "na";
14071 break;
14073 case CCCmode:
14074 suffix = "nc";
14075 break;
14077 case CCOmode:
14078 suffix = "no";
14079 break;
14081 case CCSmode:
14082 suffix = "ns";
14083 break;
14085 default:
14086 suffix = "ne";
14088 break;
14089 case GT:
14090 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14091 suffix = "g";
14092 break;
14093 case GTU:
14094 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14095 Those same assemblers have the same but opposite lossage on cmov. */
14096 if (mode == CCmode)
14097 suffix = fp ? "nbe" : "a";
14098 else if (mode == CCCmode)
14099 suffix = "b";
14100 else
14101 gcc_unreachable ();
14102 break;
14103 case LT:
14104 switch (mode)
14106 case CCNOmode:
14107 case CCGOCmode:
14108 suffix = "s";
14109 break;
14111 case CCmode:
14112 case CCGCmode:
14113 suffix = "l";
14114 break;
14116 default:
14117 gcc_unreachable ();
14119 break;
14120 case LTU:
14121 gcc_assert (mode == CCmode || mode == CCCmode);
14122 suffix = "b";
14123 break;
14124 case GE:
14125 switch (mode)
14127 case CCNOmode:
14128 case CCGOCmode:
14129 suffix = "ns";
14130 break;
14132 case CCmode:
14133 case CCGCmode:
14134 suffix = "ge";
14135 break;
14137 default:
14138 gcc_unreachable ();
14140 break;
14141 case GEU:
14142 /* ??? As above. */
14143 gcc_assert (mode == CCmode || mode == CCCmode);
14144 suffix = fp ? "nb" : "ae";
14145 break;
14146 case LE:
14147 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14148 suffix = "le";
14149 break;
14150 case LEU:
14151 /* ??? As above. */
14152 if (mode == CCmode)
14153 suffix = "be";
14154 else if (mode == CCCmode)
14155 suffix = fp ? "nb" : "ae";
14156 else
14157 gcc_unreachable ();
14158 break;
14159 case UNORDERED:
14160 suffix = fp ? "u" : "p";
14161 break;
14162 case ORDERED:
14163 suffix = fp ? "nu" : "np";
14164 break;
14165 default:
14166 gcc_unreachable ();
14168 fputs (suffix, file);
14171 /* Print the name of register X to FILE based on its machine mode and number.
14172 If CODE is 'w', pretend the mode is HImode.
14173 If CODE is 'b', pretend the mode is QImode.
14174 If CODE is 'k', pretend the mode is SImode.
14175 If CODE is 'q', pretend the mode is DImode.
14176 If CODE is 'x', pretend the mode is V4SFmode.
14177 If CODE is 't', pretend the mode is V8SFmode.
14178 If CODE is 'g', pretend the mode is V16SFmode.
14179 If CODE is 'h', pretend the reg is the 'high' byte register.
14180 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14181 If CODE is 'd', duplicate the operand for AVX instruction.
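   For example, with CODE 'k' the ax register prints as "eax", with 'q'
   as "rax", with 'b' as "al" and with 'h' as "ah"; the AMD extended
   registers print as r8b/r8w/r8d/r8 and so on.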
14184 void
14185 print_reg (rtx x, int code, FILE *file)
14187 const char *reg;
14188 unsigned int regno;
14189 bool duplicated = code == 'd' && TARGET_AVX;
14191 if (ASSEMBLER_DIALECT == ASM_ATT)
14192 putc ('%', file);
14194 if (x == pc_rtx)
14196 gcc_assert (TARGET_64BIT);
14197 fputs ("rip", file);
14198 return;
14201 regno = true_regnum (x);
14202 gcc_assert (regno != ARG_POINTER_REGNUM
14203 && regno != FRAME_POINTER_REGNUM
14204 && regno != FLAGS_REG
14205 && regno != FPSR_REG
14206 && regno != FPCR_REG);
14208 if (code == 'w' || MMX_REG_P (x))
14209 code = 2;
14210 else if (code == 'b')
14211 code = 1;
14212 else if (code == 'k')
14213 code = 4;
14214 else if (code == 'q')
14215 code = 8;
14216 else if (code == 'y')
14217 code = 3;
14218 else if (code == 'h')
14219 code = 0;
14220 else if (code == 'x')
14221 code = 16;
14222 else if (code == 't')
14223 code = 32;
14224 else if (code == 'g')
14225 code = 64;
14226 else
14227 code = GET_MODE_SIZE (GET_MODE (x));
14229 /* Irritatingly, the AMD extended registers use a different naming
14230 convention from the normal registers: "r%d[bwd]". */
14231 if (REX_INT_REGNO_P (regno))
14233 gcc_assert (TARGET_64BIT);
14234 putc ('r', file);
14235 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14236 switch (code)
14238 case 0:
14239 error ("extended registers have no high halves");
14240 break;
14241 case 1:
14242 putc ('b', file);
14243 break;
14244 case 2:
14245 putc ('w', file);
14246 break;
14247 case 4:
14248 putc ('d', file);
14249 break;
14250 case 8:
14251 /* no suffix */
14252 break;
14253 default:
14254 error ("unsupported operand size for extended register");
14255 break;
14257 return;
14260 reg = NULL;
14261 switch (code)
14263 case 3:
14264 if (STACK_TOP_P (x))
14266 reg = "st(0)";
14267 break;
14269 /* FALLTHRU */
14270 case 8:
14271 case 4:
14272 case 12:
14273 if (! ANY_FP_REG_P (x))
14274 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14275 /* FALLTHRU */
14276 case 16:
14277 case 2:
14278 normal:
14279 reg = hi_reg_name[regno];
14280 break;
14281 case 1:
14282 if (regno >= ARRAY_SIZE (qi_reg_name))
14283 goto normal;
14284 reg = qi_reg_name[regno];
14285 break;
14286 case 0:
14287 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14288 goto normal;
14289 reg = qi_high_reg_name[regno];
14290 break;
14291 case 32:
14292 if (SSE_REG_P (x))
14294 gcc_assert (!duplicated);
14295 putc ('y', file);
14296 fputs (hi_reg_name[regno] + 1, file);
14297 return;
14299 case 64:
14300 if (SSE_REG_P (x))
14302 gcc_assert (!duplicated);
14303 putc ('z', file);
14304 fputs (hi_reg_name[REGNO (x)] + 1, file);
14305 return;
14307 break;
14308 default:
14309 gcc_unreachable ();
14312 fputs (reg, file);
14313 if (duplicated)
14315 if (ASSEMBLER_DIALECT == ASM_ATT)
14316 fprintf (file, ", %%%s", reg);
14317 else
14318 fprintf (file, ", %s", reg);
14322 /* Locate some local-dynamic symbol still in use by this function
14323 so that we can print its name in some tls_local_dynamic_base
14324 pattern. */
14326 static int
14327 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14329 rtx x = *px;
14331 if (GET_CODE (x) == SYMBOL_REF
14332 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14334 cfun->machine->some_ld_name = XSTR (x, 0);
14335 return 1;
14338 return 0;
14341 static const char *
14342 get_some_local_dynamic_name (void)
14344 rtx insn;
14346 if (cfun->machine->some_ld_name)
14347 return cfun->machine->some_ld_name;
14349 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14350 if (NONDEBUG_INSN_P (insn)
14351 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14352 return cfun->machine->some_ld_name;
14354 return NULL;
14357 /* Meaning of CODE:
14358 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14359 C -- print opcode suffix for set/cmov insn.
14360 c -- like C, but print reversed condition
14361 F,f -- likewise, but for floating-point.
14362 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14363 otherwise nothing
14364 R -- print the prefix for register names.
14365 z -- print the opcode suffix for the size of the current operand.
14366 Z -- likewise, with special suffixes for x87 instructions.
14367 * -- print a star (in certain assembler syntax)
14368 A -- print an absolute memory reference.
14369 E -- print address with DImode register names if TARGET_64BIT.
14370 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14371 s -- print a shift double count, followed by the assembler's argument
14372 delimiter.
14373 b -- print the QImode name of the register for the indicated operand.
14374 %b0 would print %al if operands[0] is reg 0.
14375 w -- likewise, print the HImode name of the register.
14376 k -- likewise, print the SImode name of the register.
14377 q -- likewise, print the DImode name of the register.
14378 x -- likewise, print the V4SFmode name of the register.
14379 t -- likewise, print the V8SFmode name of the register.
14380 g -- likewise, print the V16SFmode name of the register.
14381 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14382 y -- print "st(0)" instead of "st" as a register.
14383 d -- print duplicated register operand for AVX instruction.
14384 D -- print condition for SSE cmp instruction.
14385 P -- if PIC, print an @PLT suffix.
14386 p -- print raw symbol name.
14387 X -- don't print any sort of PIC '@' suffix for a symbol.
14388 & -- print some in-use local-dynamic symbol name.
14389 H -- print a memory address offset by 8; used for sse high-parts
14390 Y -- print condition for XOP pcom* instruction.
14391 + -- print a branch hint as 'cs' or 'ds' prefix
14392 ; -- print a semicolon (after prefixes, due to a bug in older gas).
14393 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14394 @ -- print a segment register of thread base pointer load
14395 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
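   For example, "add%z0" expands to "addl" for an SImode operand in
   AT&T syntax (the 'z' suffix is omitted entirely for Intel syntax),
   and "%b0"/"%w0"/"%k0"/"%q0" select the al/ax/eax/rax views of
   operand 0.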
14398 void
14399 ix86_print_operand (FILE *file, rtx x, int code)
14401 if (code)
14403 switch (code)
14405 case 'A':
14406 switch (ASSEMBLER_DIALECT)
14408 case ASM_ATT:
14409 putc ('*', file);
14410 break;
14412 case ASM_INTEL:
14413 /* Intel syntax. For absolute addresses, registers should not
14414 be surrounded by brackets. */
14415 if (!REG_P (x))
14417 putc ('[', file);
14418 ix86_print_operand (file, x, 0);
14419 putc (']', file);
14420 return;
14422 break;
14424 default:
14425 gcc_unreachable ();
14428 ix86_print_operand (file, x, 0);
14429 return;
14431 case 'E':
14432 /* Wrap address in an UNSPEC to declare special handling. */
14433 if (TARGET_64BIT)
14434 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14436 output_address (x);
14437 return;
14439 case 'L':
14440 if (ASSEMBLER_DIALECT == ASM_ATT)
14441 putc ('l', file);
14442 return;
14444 case 'W':
14445 if (ASSEMBLER_DIALECT == ASM_ATT)
14446 putc ('w', file);
14447 return;
14449 case 'B':
14450 if (ASSEMBLER_DIALECT == ASM_ATT)
14451 putc ('b', file);
14452 return;
14454 case 'Q':
14455 if (ASSEMBLER_DIALECT == ASM_ATT)
14456 putc ('l', file);
14457 return;
14459 case 'S':
14460 if (ASSEMBLER_DIALECT == ASM_ATT)
14461 putc ('s', file);
14462 return;
14464 case 'T':
14465 if (ASSEMBLER_DIALECT == ASM_ATT)
14466 putc ('t', file);
14467 return;
14469 case 'O':
14470 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14471 if (ASSEMBLER_DIALECT != ASM_ATT)
14472 return;
14474 switch (GET_MODE_SIZE (GET_MODE (x)))
14476 case 2:
14477 putc ('w', file);
14478 break;
14480 case 4:
14481 putc ('l', file);
14482 break;
14484 case 8:
14485 putc ('q', file);
14486 break;
14488 default:
14489 output_operand_lossage
14490 ("invalid operand size for operand code 'O'");
14491 return;
14494 putc ('.', file);
14495 #endif
14496 return;
14498 case 'z':
14499 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14501 /* Opcodes don't get size suffixes if using Intel opcodes. */
14502 if (ASSEMBLER_DIALECT == ASM_INTEL)
14503 return;
14505 switch (GET_MODE_SIZE (GET_MODE (x)))
14507 case 1:
14508 putc ('b', file);
14509 return;
14511 case 2:
14512 putc ('w', file);
14513 return;
14515 case 4:
14516 putc ('l', file);
14517 return;
14519 case 8:
14520 putc ('q', file);
14521 return;
14523 default:
14524 output_operand_lossage
14525 ("invalid operand size for operand code 'z'");
14526 return;
14530 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14531 warning
14532 (0, "non-integer operand used with operand code 'z'");
14533 /* FALLTHRU */
14535 case 'Z':
14536 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14537 if (ASSEMBLER_DIALECT == ASM_INTEL)
14538 return;
14540 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14542 switch (GET_MODE_SIZE (GET_MODE (x)))
14544 case 2:
14545 #ifdef HAVE_AS_IX86_FILDS
14546 putc ('s', file);
14547 #endif
14548 return;
14550 case 4:
14551 putc ('l', file);
14552 return;
14554 case 8:
14555 #ifdef HAVE_AS_IX86_FILDQ
14556 putc ('q', file);
14557 #else
14558 fputs ("ll", file);
14559 #endif
14560 return;
14562 default:
14563 break;
14566 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14568 /* 387 opcodes don't get size suffixes
14569 if the operands are registers. */
14570 if (STACK_REG_P (x))
14571 return;
14573 switch (GET_MODE_SIZE (GET_MODE (x)))
14575 case 4:
14576 putc ('s', file);
14577 return;
14579 case 8:
14580 putc ('l', file);
14581 return;
14583 case 12:
14584 case 16:
14585 putc ('t', file);
14586 return;
14588 default:
14589 break;
14592 else
14594 output_operand_lossage
14595 ("invalid operand type used with operand code 'Z'");
14596 return;
14599 output_operand_lossage
14600 ("invalid operand size for operand code 'Z'");
14601 return;
14603 case 'd':
14604 case 'b':
14605 case 'w':
14606 case 'k':
14607 case 'q':
14608 case 'h':
14609 case 't':
14610 case 'g':
14611 case 'y':
14612 case 'x':
14613 case 'X':
14614 case 'P':
14615 case 'p':
14616 break;
14618 case 's':
14619 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14621 ix86_print_operand (file, x, 0);
14622 fputs (", ", file);
14624 return;
14626 case 'Y':
14627 switch (GET_CODE (x))
14629 case NE:
14630 fputs ("neq", file);
14631 break;
14632 case EQ:
14633 fputs ("eq", file);
14634 break;
14635 case GE:
14636 case GEU:
14637 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14638 break;
14639 case GT:
14640 case GTU:
14641 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14642 break;
14643 case LE:
14644 case LEU:
14645 fputs ("le", file);
14646 break;
14647 case LT:
14648 case LTU:
14649 fputs ("lt", file);
14650 break;
14651 case UNORDERED:
14652 fputs ("unord", file);
14653 break;
14654 case ORDERED:
14655 fputs ("ord", file);
14656 break;
14657 case UNEQ:
14658 fputs ("ueq", file);
14659 break;
14660 case UNGE:
14661 fputs ("nlt", file);
14662 break;
14663 case UNGT:
14664 fputs ("nle", file);
14665 break;
14666 case UNLE:
14667 fputs ("ule", file);
14668 break;
14669 case UNLT:
14670 fputs ("ult", file);
14671 break;
14672 case LTGT:
14673 fputs ("une", file);
14674 break;
14675 default:
14676 output_operand_lossage ("operand is not a condition code, "
14677 "invalid operand code 'Y'");
14678 return;
14680 return;
14682 case 'D':
14683 /* A little bit of braindamage here. The SSE compare instructions
14684 use completely different names for the comparisons than the
14685 fp conditional moves do. */
14686 switch (GET_CODE (x))
14688 case UNEQ:
14689 if (TARGET_AVX)
14691 fputs ("eq_us", file);
14692 break;
14694 case EQ:
14695 fputs ("eq", file);
14696 break;
14697 case UNLT:
14698 if (TARGET_AVX)
14700 fputs ("nge", file);
14701 break;
14703 case LT:
14704 fputs ("lt", file);
14705 break;
14706 case UNLE:
14707 if (TARGET_AVX)
14709 fputs ("ngt", file);
14710 break;
14712 case LE:
14713 fputs ("le", file);
14714 break;
14715 case UNORDERED:
14716 fputs ("unord", file);
14717 break;
14718 case LTGT:
14719 if (TARGET_AVX)
14721 fputs ("neq_oq", file);
14722 break;
14724 case NE:
14725 fputs ("neq", file);
14726 break;
14727 case GE:
14728 if (TARGET_AVX)
14730 fputs ("ge", file);
14731 break;
14733 case UNGE:
14734 fputs ("nlt", file);
14735 break;
14736 case GT:
14737 if (TARGET_AVX)
14739 fputs ("gt", file);
14740 break;
14742 case UNGT:
14743 fputs ("nle", file);
14744 break;
14745 case ORDERED:
14746 fputs ("ord", file);
14747 break;
14748 default:
14749 output_operand_lossage ("operand is not a condition code, "
14750 "invalid operand code 'D'");
14751 return;
14753 return;
14755 case 'F':
14756 case 'f':
14757 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14758 if (ASSEMBLER_DIALECT == ASM_ATT)
14759 putc ('.', file);
14760 #endif
14762 case 'C':
14763 case 'c':
14764 if (!COMPARISON_P (x))
14766 output_operand_lossage ("operand is not a condition code, "
14767 "invalid operand code '%c'", code);
14768 return;
14770 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14771 code == 'c' || code == 'f',
14772 code == 'F' || code == 'f',
14773 file);
14774 return;
14776 case 'H':
14777 if (!offsettable_memref_p (x))
14779 output_operand_lossage ("operand is not an offsettable memory "
14780 "reference, invalid operand code 'H'");
14781 return;
14783 /* It doesn't actually matter what mode we use here, as we're
14784 only going to use this for printing. */
14785 x = adjust_address_nv (x, DImode, 8);
14786 /* Output 'qword ptr' for intel assembler dialect. */
14787 if (ASSEMBLER_DIALECT == ASM_INTEL)
14788 code = 'q';
14789 break;
14791 case 'K':
14792 gcc_assert (CONST_INT_P (x));
14794 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14795 #ifdef HAVE_AS_IX86_HLE
14796 fputs ("xacquire ", file);
14797 #else
14798 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14799 #endif
14800 else if (INTVAL (x) & IX86_HLE_RELEASE)
14801 #ifdef HAVE_AS_IX86_HLE
14802 fputs ("xrelease ", file);
14803 #else
14804 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14805 #endif
14806 /* We do not want to print the value of the operand. */
14807 return;
14809 case '*':
14810 if (ASSEMBLER_DIALECT == ASM_ATT)
14811 putc ('*', file);
14812 return;
14814 case '&':
14816 const char *name = get_some_local_dynamic_name ();
14817 if (name == NULL)
14818 output_operand_lossage ("'%%&' used without any "
14819 "local dynamic TLS references");
14820 else
14821 assemble_name (file, name);
14822 return;
14825 case '+':
14827 rtx x;
14829 if (!optimize
14830 || optimize_function_for_size_p (cfun)
14831 || !TARGET_BRANCH_PREDICTION_HINTS)
14832 return;
14834 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14835 if (x)
14837 int pred_val = XINT (x, 0);
14839 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14840 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14842 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14843 bool cputaken
14844 = final_forward_branch_p (current_output_insn) == 0;
14846 /* Emit hints only when the default branch prediction heuristics
14847 would fail (a standalone sketch of this window follows the function). */
14848 if (taken != cputaken)
14850 /* We use 3e (DS) prefix for taken branches and
14851 2e (CS) prefix for not taken branches. */
14852 if (taken)
14853 fputs ("ds ; ", file);
14854 else
14855 fputs ("cs ; ", file);
14859 return;
14862 case ';':
14863 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14864 putc (';', file);
14865 #endif
14866 return;
14868 case '@':
14869 if (ASSEMBLER_DIALECT == ASM_ATT)
14870 putc ('%', file);
14872 /* The kernel uses a different segment register for performance
14873 reasons; a system call would not have to trash the userspace
14874 segment register, which would be expensive. */
14875 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14876 fputs ("fs", file);
14877 else
14878 fputs ("gs", file);
14879 return;
14881 case '~':
14882 putc (TARGET_AVX2 ? 'i' : 'f', file);
14883 return;
14885 case '^':
14886 if (TARGET_64BIT && Pmode != word_mode)
14887 fputs ("addr32 ", file);
14888 return;
14890 default:
14891 output_operand_lossage ("invalid operand code '%c'", code);
14895 if (REG_P (x))
14896 print_reg (x, code, file);
14898 else if (MEM_P (x))
14900 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14901 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14902 && GET_MODE (x) != BLKmode)
14904 const char * size;
14905 switch (GET_MODE_SIZE (GET_MODE (x)))
14907 case 1: size = "BYTE"; break;
14908 case 2: size = "WORD"; break;
14909 case 4: size = "DWORD"; break;
14910 case 8: size = "QWORD"; break;
14911 case 12: size = "TBYTE"; break;
14912 case 16:
14913 if (GET_MODE (x) == XFmode)
14914 size = "TBYTE";
14915 else
14916 size = "XMMWORD";
14917 break;
14918 case 32: size = "YMMWORD"; break;
14919 case 64: size = "ZMMWORD"; break;
14920 default:
14921 gcc_unreachable ();
14924 /* Check for explicit size override (codes 'b', 'w', 'k',
14925 'q' and 'x') */
14926 if (code == 'b')
14927 size = "BYTE";
14928 else if (code == 'w')
14929 size = "WORD";
14930 else if (code == 'k')
14931 size = "DWORD";
14932 else if (code == 'q')
14933 size = "QWORD";
14934 else if (code == 'x')
14935 size = "XMMWORD";
14937 fputs (size, file);
14938 fputs (" PTR ", file);
14941 x = XEXP (x, 0);
14942 /* Avoid (%rip) for call operands. */
14943 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14944 && !CONST_INT_P (x))
14945 output_addr_const (file, x);
14946 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14947 output_operand_lossage ("invalid constraints for operand");
14948 else
14949 output_address (x);
14952 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14954 REAL_VALUE_TYPE r;
14955 long l;
14957 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14958 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14960 if (ASSEMBLER_DIALECT == ASM_ATT)
14961 putc ('$', file);
14962 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14963 if (code == 'q')
14964 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14965 (unsigned long long) (int) l);
14966 else
14967 fprintf (file, "0x%08x", (unsigned int) l);
14970 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14972 REAL_VALUE_TYPE r;
14973 long l[2];
14975 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14976 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14978 if (ASSEMBLER_DIALECT == ASM_ATT)
14979 putc ('$', file);
14980 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14983 /* These float cases don't actually occur as immediate operands. */
14984 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14986 char dstr[30];
14988 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14989 fputs (dstr, file);
14992 else
14994 /* We have patterns that allow zero sets of memory, for instance.
14995 In 64-bit mode, we should probably support all 8-byte vectors,
14996 since we can in fact encode that into an immediate. */
14997 if (GET_CODE (x) == CONST_VECTOR)
14999 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15000 x = const0_rtx;
15003 if (code != 'P' && code != 'p')
15005 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15007 if (ASSEMBLER_DIALECT == ASM_ATT)
15008 putc ('$', file);
15010 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15011 || GET_CODE (x) == LABEL_REF)
15013 if (ASSEMBLER_DIALECT == ASM_ATT)
15014 putc ('$', file);
15015 else
15016 fputs ("OFFSET FLAT:", file);
15019 if (CONST_INT_P (x))
15020 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15021 else if (flag_pic || MACHOPIC_INDIRECT)
15022 output_pic_addr_const (file, x, code);
15023 else
15024 output_addr_const (file, x);
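
/* Illustrative sketch, not part of GCC: the 45%/55% window used by the
   '%+' branch hint case above, written as a standalone helper.  The
   function name and the 10000 scale (REG_BR_PROB_BASE) are assumptions
   made for illustration only.  */

static int
example_branch_hint_wanted (int pred_val)
{
  const int base = 10000;	/* REG_BR_PROB_BASE */

  /* A hint is considered only when the predicted probability leaves
     the 45%-55% "no opinion" window; inside it, the static prediction
     is too close to a coin flip to be worth a prefix byte.  */
  return pred_val < base * 45 / 100 || pred_val > base * 55 / 100;
}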
15028 static bool
15029 ix86_print_operand_punct_valid_p (unsigned char code)
15031 return (code == '@' || code == '*' || code == '+' || code == '&'
15032 || code == ';' || code == '~' || code == '^');
15035 /* Print a memory operand whose address is ADDR. */
15037 static void
15038 ix86_print_operand_address (FILE *file, rtx addr)
15040 struct ix86_address parts;
15041 rtx base, index, disp;
15042 int scale;
15043 int ok;
15044 bool vsib = false;
15045 int code = 0;
15047 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15049 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15050 gcc_assert (parts.index == NULL_RTX);
15051 parts.index = XVECEXP (addr, 0, 1);
15052 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15053 addr = XVECEXP (addr, 0, 0);
15054 vsib = true;
15056 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15058 gcc_assert (TARGET_64BIT);
15059 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15060 code = 'q';
15062 else
15063 ok = ix86_decompose_address (addr, &parts);
15065 gcc_assert (ok);
15067 base = parts.base;
15068 index = parts.index;
15069 disp = parts.disp;
15070 scale = parts.scale;
15072 switch (parts.seg)
15074 case SEG_DEFAULT:
15075 break;
15076 case SEG_FS:
15077 case SEG_GS:
15078 if (ASSEMBLER_DIALECT == ASM_ATT)
15079 putc ('%', file);
15080 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15081 break;
15082 default:
15083 gcc_unreachable ();
15086 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15087 if (TARGET_64BIT && !base && !index)
15089 rtx symbol = disp;
15091 if (GET_CODE (disp) == CONST
15092 && GET_CODE (XEXP (disp, 0)) == PLUS
15093 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15094 symbol = XEXP (XEXP (disp, 0), 0);
15096 if (GET_CODE (symbol) == LABEL_REF
15097 || (GET_CODE (symbol) == SYMBOL_REF
15098 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15099 base = pc_rtx;
15101 if (!base && !index)
15103 /* A displacement-only address requires special attention. */
15105 if (CONST_INT_P (disp))
15107 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15108 fputs ("ds:", file);
15109 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15111 else if (flag_pic)
15112 output_pic_addr_const (file, disp, 0);
15113 else
15114 output_addr_const (file, disp);
15116 else
15118 /* Print SImode register names to force addr32 prefix. */
15119 if (SImode_address_operand (addr, VOIDmode))
15121 #ifdef ENABLE_CHECKING
15122 gcc_assert (TARGET_64BIT);
15123 switch (GET_CODE (addr))
15125 case SUBREG:
15126 gcc_assert (GET_MODE (addr) == SImode);
15127 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15128 break;
15129 case ZERO_EXTEND:
15130 case AND:
15131 gcc_assert (GET_MODE (addr) == DImode);
15132 break;
15133 default:
15134 gcc_unreachable ();
15136 #endif
15137 gcc_assert (!code);
15138 code = 'k';
15140 else if (code == 0
15141 && TARGET_X32
15142 && disp
15143 && CONST_INT_P (disp)
15144 && INTVAL (disp) < -16*1024*1024)
15146 /* X32 runs in 64-bit mode, where the displacement DISP in the
15147 address DISP(%r64) is encoded as a 32-bit immediate that is
15148 sign-extended to 64 bits. For -0x40000300(%r64), the
15149 address is %r64 + 0xffffffffbffffd00. When %r64 <
15150 0x40000300, e.g. 0x37ffe064, the address is 0xfffffffff7ffdd64,
15151 which is invalid for x32. The correct address is %r64
15152 - 0x40000300 == 0xf7ffdd64. To properly encode
15153 -0x40000300(%r64) for x32, we zero-extend the negative
15154 displacement by forcing the addr32 prefix, which truncates
15155 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15156 zero-extend all negative displacements, including -1(%rsp).
15157 However, for small negative displacements, sign-extension
15158 won't cause overflow. We only zero-extend negative
15159 displacements if they are less than -16*1024*1024, the bound also
15160 used for legitimate PIC displacements (see the sketch after this function). */
15161 code = 'k';
15164 if (ASSEMBLER_DIALECT == ASM_ATT)
15166 if (disp)
15168 if (flag_pic)
15169 output_pic_addr_const (file, disp, 0);
15170 else if (GET_CODE (disp) == LABEL_REF)
15171 output_asm_label (disp);
15172 else
15173 output_addr_const (file, disp);
15176 putc ('(', file);
15177 if (base)
15178 print_reg (base, code, file);
15179 if (index)
15181 putc (',', file);
15182 print_reg (index, vsib ? 0 : code, file);
15183 if (scale != 1 || vsib)
15184 fprintf (file, ",%d", scale);
15186 putc (')', file);
15188 else
15190 rtx offset = NULL_RTX;
15192 if (disp)
15194 /* Pull out the offset of a symbol; print any symbol itself. */
15195 if (GET_CODE (disp) == CONST
15196 && GET_CODE (XEXP (disp, 0)) == PLUS
15197 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15199 offset = XEXP (XEXP (disp, 0), 1);
15200 disp = gen_rtx_CONST (VOIDmode,
15201 XEXP (XEXP (disp, 0), 0));
15204 if (flag_pic)
15205 output_pic_addr_const (file, disp, 0);
15206 else if (GET_CODE (disp) == LABEL_REF)
15207 output_asm_label (disp);
15208 else if (CONST_INT_P (disp))
15209 offset = disp;
15210 else
15211 output_addr_const (file, disp);
15214 putc ('[', file);
15215 if (base)
15217 print_reg (base, code, file);
15218 if (offset)
15220 if (INTVAL (offset) >= 0)
15221 putc ('+', file);
15222 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15225 else if (offset)
15226 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15227 else
15228 putc ('0', file);
15230 if (index)
15232 putc ('+', file);
15233 print_reg (index, vsib ? 0 : code, file);
15234 if (scale != 1 || vsib)
15235 fprintf (file, "*%d", scale);
15237 putc (']', file);
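
/* Illustrative sketch, not part of GCC: the sign- versus zero-extension
   of a negative 32-bit displacement discussed in the x32 comment above.
   Plain C with hypothetical helper names; with r64 = 0x37ffe064 and
   disp = -0x40000300 these reproduce the values quoted there.  */

static unsigned long long
example_sign_extended_address (unsigned long long r64, int disp)
{
  /* Default encoding: the 32-bit displacement is sign-extended and the
     sum wraps around in 64 bits.  */
  return r64 + (long long) disp;
}

static unsigned long long
example_addr32_address (unsigned long long r64, int disp)
{
  /* With the addr32 prefix the effective address is truncated to
     32 bits, which is what the 'k' override above achieves.  */
  return (unsigned int) (r64 + (long long) disp);
}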
15242 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15244 static bool
15245 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15247 rtx op;
15249 if (GET_CODE (x) != UNSPEC)
15250 return false;
15252 op = XVECEXP (x, 0, 0);
15253 switch (XINT (x, 1))
15255 case UNSPEC_GOTTPOFF:
15256 output_addr_const (file, op);
15257 /* FIXME: This might be @TPOFF in Sun ld. */
15258 fputs ("@gottpoff", file);
15259 break;
15260 case UNSPEC_TPOFF:
15261 output_addr_const (file, op);
15262 fputs ("@tpoff", file);
15263 break;
15264 case UNSPEC_NTPOFF:
15265 output_addr_const (file, op);
15266 if (TARGET_64BIT)
15267 fputs ("@tpoff", file);
15268 else
15269 fputs ("@ntpoff", file);
15270 break;
15271 case UNSPEC_DTPOFF:
15272 output_addr_const (file, op);
15273 fputs ("@dtpoff", file);
15274 break;
15275 case UNSPEC_GOTNTPOFF:
15276 output_addr_const (file, op);
15277 if (TARGET_64BIT)
15278 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15279 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15280 else
15281 fputs ("@gotntpoff", file);
15282 break;
15283 case UNSPEC_INDNTPOFF:
15284 output_addr_const (file, op);
15285 fputs ("@indntpoff", file);
15286 break;
15287 #if TARGET_MACHO
15288 case UNSPEC_MACHOPIC_OFFSET:
15289 output_addr_const (file, op);
15290 putc ('-', file);
15291 machopic_output_function_base_name (file);
15292 break;
15293 #endif
15295 case UNSPEC_STACK_CHECK:
15297 int offset;
15299 gcc_assert (flag_split_stack);
15301 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15302 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15303 #else
15304 gcc_unreachable ();
15305 #endif
15307 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15309 break;
15311 default:
15312 return false;
15315 return true;
15318 /* Split one or more double-mode RTL references into pairs of half-mode
15319 references. The RTL can be REG, offsettable MEM, integer constant, or
15320 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15321 split and "num" is its length. lo_half and hi_half are output arrays
15322 that parallel "operands". */
15324 void
15325 split_double_mode (enum machine_mode mode, rtx operands[],
15326 int num, rtx lo_half[], rtx hi_half[])
15328 enum machine_mode half_mode;
15329 unsigned int byte;
15331 switch (mode)
15333 case TImode:
15334 half_mode = DImode;
15335 break;
15336 case DImode:
15337 half_mode = SImode;
15338 break;
15339 default:
15340 gcc_unreachable ();
15343 byte = GET_MODE_SIZE (half_mode);
15345 while (num--)
15347 rtx op = operands[num];
15349 /* simplify_subreg refuses to split volatile memory addresses,
15350 but we still have to handle them. */
15351 if (MEM_P (op))
15353 lo_half[num] = adjust_address (op, half_mode, 0);
15354 hi_half[num] = adjust_address (op, half_mode, byte);
15356 else
15358 lo_half[num] = simplify_gen_subreg (half_mode, op,
15359 GET_MODE (op) == VOIDmode
15360 ? mode : GET_MODE (op), 0);
15361 hi_half[num] = simplify_gen_subreg (half_mode, op,
15362 GET_MODE (op) == VOIDmode
15363 ? mode : GET_MODE (op), byte);
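
/* Illustrative sketch, not part of GCC: the scalar analogue of
   split_double_mode for a DImode value on a little-endian 32-bit
   target -- the low half is the low 32 bits and the high half sits at
   byte offset GET_MODE_SIZE (SImode) == 4.  The helper name is an
   assumption for illustration only.  */

static void
example_split_di (unsigned long long value,
		  unsigned int *lo_half, unsigned int *hi_half)
{
  *lo_half = (unsigned int) value;
  *hi_half = (unsigned int) (value >> 32);
}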
15368 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15369 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15370 is the expression of the binary operation. The output may either be
15371 emitted here, or returned to the caller, like all output_* functions.
15373 There is no guarantee that the operands are the same mode, as they
15374 might be within FLOAT or FLOAT_EXTEND expressions. */
15376 #ifndef SYSV386_COMPAT
15377 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15378 wants to fix the assemblers because that causes incompatibility
15379 with gcc. No-one wants to fix gcc because that causes
15380 incompatibility with assemblers... You can use the option of
15381 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15382 #define SYSV386_COMPAT 1
15383 #endif
15385 const char *
15386 output_387_binary_op (rtx insn, rtx *operands)
15388 static char buf[40];
15389 const char *p;
15390 const char *ssep;
15391 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15393 #ifdef ENABLE_CHECKING
15394 /* Even if we do not want to check the inputs, this documents the input
15395 constraints, which helps in understanding the following code. */
15396 if (STACK_REG_P (operands[0])
15397 && ((REG_P (operands[1])
15398 && REGNO (operands[0]) == REGNO (operands[1])
15399 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15400 || (REG_P (operands[2])
15401 && REGNO (operands[0]) == REGNO (operands[2])
15402 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15403 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15404 ; /* ok */
15405 else
15406 gcc_assert (is_sse);
15407 #endif
15409 switch (GET_CODE (operands[3]))
15411 case PLUS:
15412 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15413 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15414 p = "fiadd";
15415 else
15416 p = "fadd";
15417 ssep = "vadd";
15418 break;
15420 case MINUS:
15421 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15422 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15423 p = "fisub";
15424 else
15425 p = "fsub";
15426 ssep = "vsub";
15427 break;
15429 case MULT:
15430 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15431 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15432 p = "fimul";
15433 else
15434 p = "fmul";
15435 ssep = "vmul";
15436 break;
15438 case DIV:
15439 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15440 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15441 p = "fidiv";
15442 else
15443 p = "fdiv";
15444 ssep = "vdiv";
15445 break;
15447 default:
15448 gcc_unreachable ();
15451 if (is_sse)
15453 if (TARGET_AVX)
15455 strcpy (buf, ssep);
15456 if (GET_MODE (operands[0]) == SFmode)
15457 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15458 else
15459 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15461 else
15463 strcpy (buf, ssep + 1);
15464 if (GET_MODE (operands[0]) == SFmode)
15465 strcat (buf, "ss\t{%2, %0|%0, %2}");
15466 else
15467 strcat (buf, "sd\t{%2, %0|%0, %2}");
15469 return buf;
15471 strcpy (buf, p);
15473 switch (GET_CODE (operands[3]))
15475 case MULT:
15476 case PLUS:
15477 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15479 rtx temp = operands[2];
15480 operands[2] = operands[1];
15481 operands[1] = temp;
15484 /* We now know operands[0] == operands[1]. */
15486 if (MEM_P (operands[2]))
15488 p = "%Z2\t%2";
15489 break;
15492 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15494 if (STACK_TOP_P (operands[0]))
15495 /* How is it that we are storing to a dead operand[2]?
15496 Well, presumably operands[1] is dead too. We can't
15497 store the result to st(0) as st(0) gets popped on this
15498 instruction. Instead store to operands[2] (which I
15499 think has to be st(1)). st(1) will be popped later.
15500 gcc <= 2.8.1 didn't have this check and generated
15501 assembly code that the Unixware assembler rejected. */
15502 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15503 else
15504 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15505 break;
15508 if (STACK_TOP_P (operands[0]))
15509 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15510 else
15511 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15512 break;
15514 case MINUS:
15515 case DIV:
15516 if (MEM_P (operands[1]))
15518 p = "r%Z1\t%1";
15519 break;
15522 if (MEM_P (operands[2]))
15524 p = "%Z2\t%2";
15525 break;
15528 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15530 #if SYSV386_COMPAT
15531 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15532 derived assemblers, confusingly reverse the direction of
15533 the operation for fsub{r} and fdiv{r} when the
15534 destination register is not st(0). The Intel assembler
15535 doesn't have this brain damage. Read !SYSV386_COMPAT to
15536 figure out what the hardware really does. */
15537 if (STACK_TOP_P (operands[0]))
15538 p = "{p\t%0, %2|rp\t%2, %0}";
15539 else
15540 p = "{rp\t%2, %0|p\t%0, %2}";
15541 #else
15542 if (STACK_TOP_P (operands[0]))
15543 /* As above for fmul/fadd, we can't store to st(0). */
15544 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15545 else
15546 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15547 #endif
15548 break;
15551 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15553 #if SYSV386_COMPAT
15554 if (STACK_TOP_P (operands[0]))
15555 p = "{rp\t%0, %1|p\t%1, %0}";
15556 else
15557 p = "{p\t%1, %0|rp\t%0, %1}";
15558 #else
15559 if (STACK_TOP_P (operands[0]))
15560 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15561 else
15562 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15563 #endif
15564 break;
15567 if (STACK_TOP_P (operands[0]))
15569 if (STACK_TOP_P (operands[1]))
15570 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15571 else
15572 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15573 break;
15575 else if (STACK_TOP_P (operands[1]))
15577 #if SYSV386_COMPAT
15578 p = "{\t%1, %0|r\t%0, %1}";
15579 #else
15580 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15581 #endif
15583 else
15585 #if SYSV386_COMPAT
15586 p = "{r\t%2, %0|\t%0, %2}";
15587 #else
15588 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15589 #endif
15591 break;
15593 default:
15594 gcc_unreachable ();
15597 strcat (buf, p);
15598 return buf;
15601 /* Check if a 256bit AVX register is referenced inside of EXP. */
15603 static int
15604 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15606 rtx exp = *pexp;
15608 if (GET_CODE (exp) == SUBREG)
15609 exp = SUBREG_REG (exp);
15611 if (REG_P (exp)
15612 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15613 return 1;
15615 return 0;
15618 /* Return needed mode for entity in optimize_mode_switching pass. */
15620 static int
15621 ix86_avx_u128_mode_needed (rtx insn)
15623 if (CALL_P (insn))
15625 rtx link;
15627 /* Needed mode is set to AVX_U128_CLEAN if there are
15628 no 256bit modes used in function arguments. */
15629 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15630 link;
15631 link = XEXP (link, 1))
15633 if (GET_CODE (XEXP (link, 0)) == USE)
15635 rtx arg = XEXP (XEXP (link, 0), 0);
15637 if (ix86_check_avx256_register (&arg, NULL))
15638 return AVX_U128_ANY;
15642 return AVX_U128_CLEAN;
15645 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
15646 changes state only when a 256bit register is written to, but we need
15647 to prevent the compiler from moving the optimal insertion point above
15648 an eventual read from a 256bit register. */
15649 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15650 return AVX_U128_DIRTY;
15652 return AVX_U128_ANY;
15655 /* Return mode that i387 must be switched into
15656 prior to the execution of insn. */
15658 static int
15659 ix86_i387_mode_needed (int entity, rtx insn)
15661 enum attr_i387_cw mode;
15663 /* The mode UNINITIALIZED is used to store the control word after a
15664 function call or ASM pattern. The mode ANY specifies that the function
15665 has no requirements on the control word and makes no changes in the
15666 bits we are interested in. */
15668 if (CALL_P (insn)
15669 || (NONJUMP_INSN_P (insn)
15670 && (asm_noperands (PATTERN (insn)) >= 0
15671 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15672 return I387_CW_UNINITIALIZED;
15674 if (recog_memoized (insn) < 0)
15675 return I387_CW_ANY;
15677 mode = get_attr_i387_cw (insn);
15679 switch (entity)
15681 case I387_TRUNC:
15682 if (mode == I387_CW_TRUNC)
15683 return mode;
15684 break;
15686 case I387_FLOOR:
15687 if (mode == I387_CW_FLOOR)
15688 return mode;
15689 break;
15691 case I387_CEIL:
15692 if (mode == I387_CW_CEIL)
15693 return mode;
15694 break;
15696 case I387_MASK_PM:
15697 if (mode == I387_CW_MASK_PM)
15698 return mode;
15699 break;
15701 default:
15702 gcc_unreachable ();
15705 return I387_CW_ANY;
15708 /* Return mode that entity must be switched into
15709 prior to the execution of insn. */
15712 ix86_mode_needed (int entity, rtx insn)
15714 switch (entity)
15716 case AVX_U128:
15717 return ix86_avx_u128_mode_needed (insn);
15718 case I387_TRUNC:
15719 case I387_FLOOR:
15720 case I387_CEIL:
15721 case I387_MASK_PM:
15722 return ix86_i387_mode_needed (entity, insn);
15723 default:
15724 gcc_unreachable ();
15726 return 0;
15729 /* Check if a 256bit AVX register is referenced in stores. */
15731 static void
15732 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15734 if (ix86_check_avx256_register (&dest, NULL))
15736 bool *used = (bool *) data;
15737 *used = true;
15741 /* Calculate mode of upper 128bit AVX registers after the insn. */
15743 static int
15744 ix86_avx_u128_mode_after (int mode, rtx insn)
15746 rtx pat = PATTERN (insn);
15748 if (vzeroupper_operation (pat, VOIDmode)
15749 || vzeroall_operation (pat, VOIDmode))
15750 return AVX_U128_CLEAN;
15752 /* We know that the state is clean after a CALL insn if no
15753 256bit register is used for the function return value. */
15754 if (CALL_P (insn))
15756 bool avx_reg256_found = false;
15757 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15758 if (!avx_reg256_found)
15759 return AVX_U128_CLEAN;
15762 /* Otherwise, return the current mode. Remember that if the insn
15763 references AVX 256bit registers, the mode was already changed
15764 to DIRTY by MODE_NEEDED. */
15765 return mode;
15768 /* Return the mode that an insn results in. */
15771 ix86_mode_after (int entity, int mode, rtx insn)
15773 switch (entity)
15775 case AVX_U128:
15776 return ix86_avx_u128_mode_after (mode, insn);
15777 case I387_TRUNC:
15778 case I387_FLOOR:
15779 case I387_CEIL:
15780 case I387_MASK_PM:
15781 return mode;
15782 default:
15783 gcc_unreachable ();
15787 static int
15788 ix86_avx_u128_mode_entry (void)
15790 tree arg;
15792 /* Entry mode is set to AVX_U128_DIRTY if there are
15793 256bit modes used in function arguments. */
15794 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15795 arg = TREE_CHAIN (arg))
15797 rtx incoming = DECL_INCOMING_RTL (arg);
15799 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15800 return AVX_U128_DIRTY;
15803 return AVX_U128_CLEAN;
15806 /* Return a mode that ENTITY is assumed to be
15807 switched to at function entry. */
15810 ix86_mode_entry (int entity)
15812 switch (entity)
15814 case AVX_U128:
15815 return ix86_avx_u128_mode_entry ();
15816 case I387_TRUNC:
15817 case I387_FLOOR:
15818 case I387_CEIL:
15819 case I387_MASK_PM:
15820 return I387_CW_ANY;
15821 default:
15822 gcc_unreachable ();
15826 static int
15827 ix86_avx_u128_mode_exit (void)
15829 rtx reg = crtl->return_rtx;
15831 /* Exit mode is set to AVX_U128_DIRTY if there are
15832 256bit modes used in the function return register. */
15833 if (reg && ix86_check_avx256_register (&reg, NULL))
15834 return AVX_U128_DIRTY;
15836 return AVX_U128_CLEAN;
15839 /* Return a mode that ENTITY is assumed to be
15840 switched to at function exit. */
15843 ix86_mode_exit (int entity)
15845 switch (entity)
15847 case AVX_U128:
15848 return ix86_avx_u128_mode_exit ();
15849 case I387_TRUNC:
15850 case I387_FLOOR:
15851 case I387_CEIL:
15852 case I387_MASK_PM:
15853 return I387_CW_ANY;
15854 default:
15855 gcc_unreachable ();
15859 /* Output code to initialize control word copies used by trunc?f?i and
15860 rounding patterns. CURRENT_MODE is set to current control word,
15861 while NEW_MODE is set to new control word. */
15863 static void
15864 emit_i387_cw_initialization (int mode)
15866 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15867 rtx new_mode;
15869 enum ix86_stack_slot slot;
15871 rtx reg = gen_reg_rtx (HImode);
15873 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15874 emit_move_insn (reg, copy_rtx (stored_mode));
15876 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15877 || optimize_insn_for_size_p ())
15879 switch (mode)
15881 case I387_CW_TRUNC:
15882 /* round toward zero (truncate) */
15883 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15884 slot = SLOT_CW_TRUNC;
15885 break;
15887 case I387_CW_FLOOR:
15888 /* round down toward -oo */
15889 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15890 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15891 slot = SLOT_CW_FLOOR;
15892 break;
15894 case I387_CW_CEIL:
15895 /* round up toward +oo */
15896 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15897 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15898 slot = SLOT_CW_CEIL;
15899 break;
15901 case I387_CW_MASK_PM:
15902 /* mask precision exception for nearbyint() */
15903 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15904 slot = SLOT_CW_MASK_PM;
15905 break;
15907 default:
15908 gcc_unreachable ();
15911 else
15913 switch (mode)
15915 case I387_CW_TRUNC:
15916 /* round toward zero (truncate) */
15917 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15918 slot = SLOT_CW_TRUNC;
15919 break;
15921 case I387_CW_FLOOR:
15922 /* round down toward -oo */
15923 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15924 slot = SLOT_CW_FLOOR;
15925 break;
15927 case I387_CW_CEIL:
15928 /* round up toward +oo */
15929 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15930 slot = SLOT_CW_CEIL;
15931 break;
15933 case I387_CW_MASK_PM:
15934 /* mask precision exception for nearbyint() */
15935 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15936 slot = SLOT_CW_MASK_PM;
15937 break;
15939 default:
15940 gcc_unreachable ();
15944 gcc_assert (slot < MAX_386_STACK_LOCALS);
15946 new_mode = assign_386_stack_local (HImode, slot);
15947 emit_move_insn (new_mode, reg);
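
/* Illustrative sketch, not part of GCC: the control word rewrites
   performed above, expressed on a plain 16-bit value.  Bits 10-11 are
   the x87 rounding-control field and bit 5 is the precision exception
   mask; the masks mirror the GEN_INT constants used in the insns.  The
   helper and its mode numbering are assumptions for illustration.  */

static unsigned short
example_i387_cw (unsigned short cw, int mode)
{
  switch (mode)
    {
    case 0:				/* trunc: round toward zero.  */
      return cw | 0x0c00;
    case 1:				/* floor: round toward -inf.  */
      return (cw & ~0x0c00) | 0x0400;
    case 2:				/* ceil: round toward +inf.  */
      return (cw & ~0x0c00) | 0x0800;
    default:				/* nearbyint: mask precision exception.  */
      return cw | 0x0020;
    }
}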
15950 /* Emit vzeroupper. */
15952 void
15953 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15955 int i;
15957 /* Cancel automatic vzeroupper insertion if there are
15958 live call-saved SSE registers at the insertion point. */
15960 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15961 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15962 return;
15964 if (TARGET_64BIT)
15965 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15966 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15967 return;
15969 emit_insn (gen_avx_vzeroupper ());
15972 /* Generate one or more insns to set ENTITY to MODE. */
15974 void
15975 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15977 switch (entity)
15979 case AVX_U128:
15980 if (mode == AVX_U128_CLEAN)
15981 ix86_avx_emit_vzeroupper (regs_live);
15982 break;
15983 case I387_TRUNC:
15984 case I387_FLOOR:
15985 case I387_CEIL:
15986 case I387_MASK_PM:
15987 if (mode != I387_CW_ANY
15988 && mode != I387_CW_UNINITIALIZED)
15989 emit_i387_cw_initialization (mode);
15990 break;
15991 default:
15992 gcc_unreachable ();
15996 /* Output code for INSN to convert a float to a signed int. OPERANDS
15997 are the insn operands. The output may be [HSD]Imode and the input
15998 operand may be [SDX]Fmode. */
16000 const char *
16001 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16003 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16004 int dimode_p = GET_MODE (operands[0]) == DImode;
16005 int round_mode = get_attr_i387_cw (insn);
16007 /* Jump through a hoop or two for DImode, since the hardware has no
16008 non-popping instruction. We used to do this a different way, but
16009 that was somewhat fragile and broke with post-reload splitters. */
16010 if ((dimode_p || fisttp) && !stack_top_dies)
16011 output_asm_insn ("fld\t%y1", operands);
16013 gcc_assert (STACK_TOP_P (operands[1]));
16014 gcc_assert (MEM_P (operands[0]));
16015 gcc_assert (GET_MODE (operands[1]) != TFmode);
16017 if (fisttp)
16018 output_asm_insn ("fisttp%Z0\t%0", operands);
16019 else
16021 if (round_mode != I387_CW_ANY)
16022 output_asm_insn ("fldcw\t%3", operands);
16023 if (stack_top_dies || dimode_p)
16024 output_asm_insn ("fistp%Z0\t%0", operands);
16025 else
16026 output_asm_insn ("fist%Z0\t%0", operands);
16027 if (round_mode != I387_CW_ANY)
16028 output_asm_insn ("fldcw\t%2", operands);
16031 return "";
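
/* Illustrative sketch, not part of GCC: what the fldcw round-trip above
   achieves, expressed with the C99 <fenv.h> interface.  Round-to-zero
   is forced only around the conversion and the caller's rounding mode
   is then restored, just as %3 and %2 bracket the fistp.  The includes
   and the helper name are for this standalone sketch only.  */

#include <fenv.h>
#include <math.h>

static long
example_fix_trunc (double x)
{
  int saved = fegetround ();
  long r;

  fesetround (FE_TOWARDZERO);
  r = lrint (x);		/* lrint honours the current rounding mode.  */
  fesetround (saved);
  return r;
}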
16034 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16035 have the values zero or one, indicates the ffreep insn's operand
16036 from the OPERANDS array. */
16038 static const char *
16039 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16041 if (TARGET_USE_FFREEP)
16042 #ifdef HAVE_AS_IX86_FFREEP
16043 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16044 #else
16046 static char retval[32];
16047 int regno = REGNO (operands[opno]);
16049 gcc_assert (STACK_REGNO_P (regno));
16051 regno -= FIRST_STACK_REG;
16053 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16054 return retval;
16056 #endif
16058 return opno ? "fstp\t%y1" : "fstp\t%y0";
16062 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16063 should be used. UNORDERED_P is true when fucom should be used. */
16065 const char *
16066 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16068 int stack_top_dies;
16069 rtx cmp_op0, cmp_op1;
16070 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16072 if (eflags_p)
16074 cmp_op0 = operands[0];
16075 cmp_op1 = operands[1];
16077 else
16079 cmp_op0 = operands[1];
16080 cmp_op1 = operands[2];
16083 if (is_sse)
16085 if (GET_MODE (operands[0]) == SFmode)
16086 if (unordered_p)
16087 return "%vucomiss\t{%1, %0|%0, %1}";
16088 else
16089 return "%vcomiss\t{%1, %0|%0, %1}";
16090 else
16091 if (unordered_p)
16092 return "%vucomisd\t{%1, %0|%0, %1}";
16093 else
16094 return "%vcomisd\t{%1, %0|%0, %1}";
16097 gcc_assert (STACK_TOP_P (cmp_op0));
16099 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16101 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16103 if (stack_top_dies)
16105 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16106 return output_387_ffreep (operands, 1);
16108 else
16109 return "ftst\n\tfnstsw\t%0";
16112 if (STACK_REG_P (cmp_op1)
16113 && stack_top_dies
16114 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16115 && REGNO (cmp_op1) != FIRST_STACK_REG)
16117 /* If the top of the 387 stack dies, and the other operand
16118 is also a stack register that dies, then this must be a
16119 `fcompp' float compare. */
16121 if (eflags_p)
16123 /* There is no double-popping fcomi variant. Fortunately,
16124 eflags is immune to the fstp's cc clobbering. */
16125 if (unordered_p)
16126 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16127 else
16128 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16129 return output_387_ffreep (operands, 0);
16131 else
16133 if (unordered_p)
16134 return "fucompp\n\tfnstsw\t%0";
16135 else
16136 return "fcompp\n\tfnstsw\t%0";
16139 else
16141 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16143 static const char * const alt[16] =
16145 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16146 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16147 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16148 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16150 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16151 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16152 NULL,
16153 NULL,
16155 "fcomi\t{%y1, %0|%0, %y1}",
16156 "fcomip\t{%y1, %0|%0, %y1}",
16157 "fucomi\t{%y1, %0|%0, %y1}",
16158 "fucomip\t{%y1, %0|%0, %y1}",
16160 NULL,
16161 NULL,
16162 NULL,
16163 NULL
16166 int mask;
16167 const char *ret;
16169 mask = eflags_p << 3;
16170 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16171 mask |= unordered_p << 1;
16172 mask |= stack_top_dies;
16174 gcc_assert (mask < 16);
16175 ret = alt[mask];
16176 gcc_assert (ret);
16178 return ret;
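
/* Illustrative sketch, not part of GCC: how the four predicates are
   packed into the 4-bit index selecting an entry of the alt[] table
   above.  The helper name is an assumption for illustration only.  */

static int
example_fp_compare_index (int eflags_p, int intmode_p,
			  int unordered_p, int stack_top_dies)
{
  return (eflags_p << 3) | (intmode_p << 2)
	 | (unordered_p << 1) | stack_top_dies;
}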
16182 void
16183 ix86_output_addr_vec_elt (FILE *file, int value)
16185 const char *directive = ASM_LONG;
16187 #ifdef ASM_QUAD
16188 if (TARGET_LP64)
16189 directive = ASM_QUAD;
16190 #else
16191 gcc_assert (!TARGET_64BIT);
16192 #endif
16194 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16197 void
16198 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16200 const char *directive = ASM_LONG;
16202 #ifdef ASM_QUAD
16203 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16204 directive = ASM_QUAD;
16205 #else
16206 gcc_assert (!TARGET_64BIT);
16207 #endif
16208 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16209 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16210 fprintf (file, "%s%s%d-%s%d\n",
16211 directive, LPREFIX, value, LPREFIX, rel);
16212 else if (HAVE_AS_GOTOFF_IN_DATA)
16213 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16214 #if TARGET_MACHO
16215 else if (TARGET_MACHO)
16217 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16218 machopic_output_function_base_name (file);
16219 putc ('\n', file);
16221 #endif
16222 else
16223 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16224 GOT_SYMBOL_NAME, LPREFIX, value);
16227 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16228 for the target. */
16230 void
16231 ix86_expand_clear (rtx dest)
16233 rtx tmp;
16235 /* We play register width games, which are only valid after reload. */
16236 gcc_assert (reload_completed);
16238 /* Avoid HImode and its attendant prefix byte. */
16239 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16240 dest = gen_rtx_REG (SImode, REGNO (dest));
16241 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16243 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16244 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16246 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16247 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16250 emit_insn (tmp);
16253 /* X is an unchanging MEM. If it is a constant pool reference, return
16254 the constant pool rtx, else NULL. */
16257 maybe_get_pool_constant (rtx x)
16259 x = ix86_delegitimize_address (XEXP (x, 0));
16261 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16262 return get_pool_constant (x);
16264 return NULL_RTX;
16267 void
16268 ix86_expand_move (enum machine_mode mode, rtx operands[])
16270 rtx op0, op1;
16271 enum tls_model model;
16273 op0 = operands[0];
16274 op1 = operands[1];
16276 if (GET_CODE (op1) == SYMBOL_REF)
16278 rtx tmp;
16280 model = SYMBOL_REF_TLS_MODEL (op1);
16281 if (model)
16283 op1 = legitimize_tls_address (op1, model, true);
16284 op1 = force_operand (op1, op0);
16285 if (op1 == op0)
16286 return;
16287 op1 = convert_to_mode (mode, op1, 1);
16289 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16290 op1 = tmp;
16292 else if (GET_CODE (op1) == CONST
16293 && GET_CODE (XEXP (op1, 0)) == PLUS
16294 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16296 rtx addend = XEXP (XEXP (op1, 0), 1);
16297 rtx symbol = XEXP (XEXP (op1, 0), 0);
16298 rtx tmp;
16300 model = SYMBOL_REF_TLS_MODEL (symbol);
16301 if (model)
16302 tmp = legitimize_tls_address (symbol, model, true);
16303 else
16304 tmp = legitimize_pe_coff_symbol (symbol, true);
16306 if (tmp)
16308 tmp = force_operand (tmp, NULL);
16309 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16310 op0, 1, OPTAB_DIRECT);
16311 if (tmp == op0)
16312 return;
16313 op1 = convert_to_mode (mode, tmp, 1);
16317 if ((flag_pic || MACHOPIC_INDIRECT)
16318 && symbolic_operand (op1, mode))
16320 if (TARGET_MACHO && !TARGET_64BIT)
16322 #if TARGET_MACHO
16323 /* dynamic-no-pic */
16324 if (MACHOPIC_INDIRECT)
16326 rtx temp = ((reload_in_progress
16327 || ((op0 && REG_P (op0))
16328 && mode == Pmode))
16329 ? op0 : gen_reg_rtx (Pmode));
16330 op1 = machopic_indirect_data_reference (op1, temp);
16331 if (MACHOPIC_PURE)
16332 op1 = machopic_legitimize_pic_address (op1, mode,
16333 temp == op1 ? 0 : temp);
16335 if (op0 != op1 && GET_CODE (op0) != MEM)
16337 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16338 emit_insn (insn);
16339 return;
16341 if (GET_CODE (op0) == MEM)
16342 op1 = force_reg (Pmode, op1);
16343 else
16345 rtx temp = op0;
16346 if (GET_CODE (temp) != REG)
16347 temp = gen_reg_rtx (Pmode);
16348 temp = legitimize_pic_address (op1, temp);
16349 if (temp == op0)
16350 return;
16351 op1 = temp;
16353 /* dynamic-no-pic */
16354 #endif
16356 else
16358 if (MEM_P (op0))
16359 op1 = force_reg (mode, op1);
16360 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16362 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16363 op1 = legitimize_pic_address (op1, reg);
16364 if (op0 == op1)
16365 return;
16366 op1 = convert_to_mode (mode, op1, 1);
16370 else
16372 if (MEM_P (op0)
16373 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16374 || !push_operand (op0, mode))
16375 && MEM_P (op1))
16376 op1 = force_reg (mode, op1);
16378 if (push_operand (op0, mode)
16379 && ! general_no_elim_operand (op1, mode))
16380 op1 = copy_to_mode_reg (mode, op1);
16382 /* Force large constants in 64bit compilation into register
16383 to get them CSEed. */
16384 if (can_create_pseudo_p ()
16385 && (mode == DImode) && TARGET_64BIT
16386 && immediate_operand (op1, mode)
16387 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16388 && !register_operand (op0, mode)
16389 && optimize)
16390 op1 = copy_to_mode_reg (mode, op1);
16392 if (can_create_pseudo_p ()
16393 && FLOAT_MODE_P (mode)
16394 && GET_CODE (op1) == CONST_DOUBLE)
16396 /* If we are loading a floating point constant to a register,
16397 force the value to memory now, since we'll get better code
16398 out of the back end. */
16400 op1 = validize_mem (force_const_mem (mode, op1));
16401 if (!register_operand (op0, mode))
16403 rtx temp = gen_reg_rtx (mode);
16404 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16405 emit_move_insn (op0, temp);
16406 return;
16411 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16414 void
16415 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16417 rtx op0 = operands[0], op1 = operands[1];
16418 unsigned int align = GET_MODE_ALIGNMENT (mode);
16420 /* Force constants other than zero into memory. We do not know how
16421 the instructions used to build constants modify the upper 64 bits
16422 of the register; once we have that information we may be able
16423 to handle some of them more efficiently. */
16424 if (can_create_pseudo_p ()
16425 && register_operand (op0, mode)
16426 && (CONSTANT_P (op1)
16427 || (GET_CODE (op1) == SUBREG
16428 && CONSTANT_P (SUBREG_REG (op1))))
16429 && !standard_sse_constant_p (op1))
16430 op1 = validize_mem (force_const_mem (mode, op1));
16432 /* We need to check memory alignment for SSE modes since attributes
16433 can make operands unaligned. */
16434 if (can_create_pseudo_p ()
16435 && SSE_REG_MODE_P (mode)
16436 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16437 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16439 rtx tmp[2];
16441 /* ix86_expand_vector_move_misalign() does not like constants ... */
16442 if (CONSTANT_P (op1)
16443 || (GET_CODE (op1) == SUBREG
16444 && CONSTANT_P (SUBREG_REG (op1))))
16445 op1 = validize_mem (force_const_mem (mode, op1));
16447 /* ... nor both arguments in memory. */
16448 if (!register_operand (op0, mode)
16449 && !register_operand (op1, mode))
16450 op1 = force_reg (mode, op1);
16452 tmp[0] = op0; tmp[1] = op1;
16453 ix86_expand_vector_move_misalign (mode, tmp);
16454 return;
16457 /* Make operand1 a register if it isn't already. */
16458 if (can_create_pseudo_p ()
16459 && !register_operand (op0, mode)
16460 && !register_operand (op1, mode))
16462 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16463 return;
16466 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16469 /* Split 32-byte AVX unaligned load and store if needed. */
16471 static void
16472 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16474 rtx m;
16475 rtx (*extract) (rtx, rtx, rtx);
16476 rtx (*load_unaligned) (rtx, rtx);
16477 rtx (*store_unaligned) (rtx, rtx);
16478 enum machine_mode mode;
16480 switch (GET_MODE (op0))
16482 default:
16483 gcc_unreachable ();
16484 case V32QImode:
16485 extract = gen_avx_vextractf128v32qi;
16486 load_unaligned = gen_avx_loaddquv32qi;
16487 store_unaligned = gen_avx_storedquv32qi;
16488 mode = V16QImode;
16489 break;
16490 case V8SFmode:
16491 extract = gen_avx_vextractf128v8sf;
16492 load_unaligned = gen_avx_loadups256;
16493 store_unaligned = gen_avx_storeups256;
16494 mode = V4SFmode;
16495 break;
16496 case V4DFmode:
16497 extract = gen_avx_vextractf128v4df;
16498 load_unaligned = gen_avx_loadupd256;
16499 store_unaligned = gen_avx_storeupd256;
16500 mode = V2DFmode;
16501 break;
16504 if (MEM_P (op1))
16506 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16508 rtx r = gen_reg_rtx (mode);
16509 m = adjust_address (op1, mode, 0);
16510 emit_move_insn (r, m);
16511 m = adjust_address (op1, mode, 16);
16512 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16513 emit_move_insn (op0, r);
16515 else
16516 emit_insn (load_unaligned (op0, op1));
16518 else if (MEM_P (op0))
16520 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16522 m = adjust_address (op0, mode, 0);
16523 emit_insn (extract (m, op1, const0_rtx));
16524 m = adjust_address (op0, mode, 16);
16525 emit_insn (extract (m, op1, const1_rtx));
16527 else
16528 emit_insn (store_unaligned (op0, op1));
16530 else
16531 gcc_unreachable ();
16534 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16535 straight to ix86_expand_vector_move. */
16536 /* Code generation for scalar reg-reg moves of single and double precision data:
16537 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16538 movaps reg, reg
16539 else
16540 movss reg, reg
16541 if (x86_sse_partial_reg_dependency == true)
16542 movapd reg, reg
16543 else
16544 movsd reg, reg
16546 Code generation for scalar loads of double precision data:
16547 if (x86_sse_split_regs == true)
16548 movlpd mem, reg (gas syntax)
16549 else
16550 movsd mem, reg
16552 Code generation for unaligned packed loads of single precision data
16553 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16554 if (x86_sse_unaligned_move_optimal)
16555 movups mem, reg
16557 if (x86_sse_partial_reg_dependency == true)
16559 xorps reg, reg
16560 movlps mem, reg
16561 movhps mem+8, reg
16563 else
16565 movlps mem, reg
16566 movhps mem+8, reg
16569 Code generation for unaligned packed loads of double precision data
16570 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16571 if (x86_sse_unaligned_move_optimal)
16572 movupd mem, reg
16574 if (x86_sse_split_regs == true)
16576 movlpd mem, reg
16577 movhpd mem+8, reg
16579 else
16581 movsd mem, reg
16582 movhpd mem+8, reg
16586 void
16587 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16589 rtx op0, op1, m;
16590 rtx (*load_unaligned) (rtx, rtx);
16591 rtx (*store_unaligned) (rtx, rtx);
16593 op0 = operands[0];
16594 op1 = operands[1];
16596 if (GET_MODE_SIZE (mode) == 64)
16598 switch (GET_MODE_CLASS (mode))
16600 case MODE_VECTOR_INT:
16601 case MODE_INT:
16602 op0 = gen_lowpart (V16SImode, op0);
16603 op1 = gen_lowpart (V16SImode, op1);
16604 /* FALLTHRU */
16606 case MODE_VECTOR_FLOAT:
16607 switch (GET_MODE (op0))
16609 default:
16610 gcc_unreachable ();
16611 case V16SImode:
16612 load_unaligned = gen_avx512f_loaddquv16si;
16613 store_unaligned = gen_avx512f_storedquv16si;
16614 break;
16615 case V16SFmode:
16616 load_unaligned = gen_avx512f_loadups512;
16617 store_unaligned = gen_avx512f_storeups512;
16618 break;
16619 case V8DFmode:
16620 load_unaligned = gen_avx512f_loadupd512;
16621 store_unaligned = gen_avx512f_storeupd512;
16622 break;
16625 if (MEM_P (op1))
16626 emit_insn (load_unaligned (op0, op1));
16627 else if (MEM_P (op0))
16628 emit_insn (store_unaligned (op0, op1));
16629 else
16630 gcc_unreachable ();
16631 break;
16633 default:
16634 gcc_unreachable ();
16637 return;
16640 if (TARGET_AVX
16641 && GET_MODE_SIZE (mode) == 32)
16643 switch (GET_MODE_CLASS (mode))
16645 case MODE_VECTOR_INT:
16646 case MODE_INT:
16647 op0 = gen_lowpart (V32QImode, op0);
16648 op1 = gen_lowpart (V32QImode, op1);
16649 /* FALLTHRU */
16651 case MODE_VECTOR_FLOAT:
16652 ix86_avx256_split_vector_move_misalign (op0, op1);
16653 break;
16655 default:
16656 gcc_unreachable ();
16659 return;
16662 if (MEM_P (op1))
16664 /* ??? If we have typed data, then it would appear that using
16665 movdqu is the only way to get unaligned data loaded with
16666 integer type. */
16667 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16669 op0 = gen_lowpart (V16QImode, op0);
16670 op1 = gen_lowpart (V16QImode, op1);
16671 /* We will eventually emit movups based on insn attributes. */
16672 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16674 else if (TARGET_SSE2 && mode == V2DFmode)
16676 rtx zero;
16678 if (TARGET_AVX
16679 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16680 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16681 || optimize_insn_for_size_p ())
16683 /* We will eventually emit movups based on insn attributes. */
16684 emit_insn (gen_sse2_loadupd (op0, op1));
16685 return;
16688 /* When SSE registers are split into halves, we can avoid
16689 writing to the top half twice. */
16690 if (TARGET_SSE_SPLIT_REGS)
16692 emit_clobber (op0);
16693 zero = op0;
16695 else
16697 /* ??? Not sure about the best option for the Intel chips.
16698 The following would seem to satisfy; the register is
16699 entirely cleared, breaking the dependency chain. We
16700 then store to the upper half, with a dependency depth
16701 of one. A rumor has it that Intel recommends two movsd
16702 followed by an unpacklpd, but this is unconfirmed. And
16703 given that the dependency depth of the unpacklpd would
16704 still be one, I'm not sure why this would be better. */
16705 zero = CONST0_RTX (V2DFmode);
16708 m = adjust_address (op1, DFmode, 0);
16709 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16710 m = adjust_address (op1, DFmode, 8);
16711 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16713 else
16715 if (TARGET_AVX
16716 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16717 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16718 || optimize_insn_for_size_p ())
16720 op0 = gen_lowpart (V4SFmode, op0);
16721 op1 = gen_lowpart (V4SFmode, op1);
16722 emit_insn (gen_sse_loadups (op0, op1));
16723 return;
16726 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16727 emit_move_insn (op0, CONST0_RTX (mode));
16728 else
16729 emit_clobber (op0);
16731 if (mode != V4SFmode)
16732 op0 = gen_lowpart (V4SFmode, op0);
16734 m = adjust_address (op1, V2SFmode, 0);
16735 emit_insn (gen_sse_loadlps (op0, op0, m));
16736 m = adjust_address (op1, V2SFmode, 8);
16737 emit_insn (gen_sse_loadhps (op0, op0, m));
16740 else if (MEM_P (op0))
16742 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16744 op0 = gen_lowpart (V16QImode, op0);
16745 op1 = gen_lowpart (V16QImode, op1);
16746 /* We will eventually emit movups based on insn attributes. */
16747 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16749 else if (TARGET_SSE2 && mode == V2DFmode)
16751 if (TARGET_AVX
16752 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16753 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16754 || optimize_insn_for_size_p ())
16755 /* We will eventually emit movups based on insn attributes. */
16756 emit_insn (gen_sse2_storeupd (op0, op1));
16757 else
16759 m = adjust_address (op0, DFmode, 0);
16760 emit_insn (gen_sse2_storelpd (m, op1));
16761 m = adjust_address (op0, DFmode, 8);
16762 emit_insn (gen_sse2_storehpd (m, op1));
16765 else
16767 if (mode != V4SFmode)
16768 op1 = gen_lowpart (V4SFmode, op1);
16770 if (TARGET_AVX
16771 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16772 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16773 || optimize_insn_for_size_p ())
16775 op0 = gen_lowpart (V4SFmode, op0);
16776 emit_insn (gen_sse_storeups (op0, op1));
16778 else
16780 m = adjust_address (op0, V2SFmode, 0);
16781 emit_insn (gen_sse_storelps (m, op1));
16782 m = adjust_address (op0, V2SFmode, 8);
16783 emit_insn (gen_sse_storehps (m, op1));
16787 else
16788 gcc_unreachable ();
16791 /* Expand a push in MODE. This is some mode for which we do not support
16792 proper push instructions, at least from the registers that we expect
16793 the value to live in. */
16795 void
16796 ix86_expand_push (enum machine_mode mode, rtx x)
16798 rtx tmp;
16800 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16801 GEN_INT (-GET_MODE_SIZE (mode)),
16802 stack_pointer_rtx, 1, OPTAB_DIRECT);
16803 if (tmp != stack_pointer_rtx)
16804 emit_move_insn (stack_pointer_rtx, tmp);
16806 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16808 /* When we push an operand onto the stack, it has to be aligned at least
16809 at the function argument boundary. However, since we don't have
16810 the argument type, we can't determine the actual argument
16811 boundary. */
16812 emit_move_insn (tmp, x);
16815 /* Helper function of ix86_fixup_binary_operands to canonicalize
16816 operand order. Returns true if the operands should be swapped. */
16818 static bool
16819 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16820 rtx operands[])
16822 rtx dst = operands[0];
16823 rtx src1 = operands[1];
16824 rtx src2 = operands[2];
16826 /* If the operation is not commutative, we can't do anything. */
16827 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16828 return false;
16830 /* Highest priority is that src1 should match dst. */
16831 if (rtx_equal_p (dst, src1))
16832 return false;
16833 if (rtx_equal_p (dst, src2))
16834 return true;
16836 /* Next highest priority is that immediate constants come second. */
16837 if (immediate_operand (src2, mode))
16838 return false;
16839 if (immediate_operand (src1, mode))
16840 return true;
16842 /* Lowest priority is that memory references should come second. */
16843 if (MEM_P (src2))
16844 return false;
16845 if (MEM_P (src1))
16846 return true;
16848 return false;
16852 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16853 destination to use for the operation. If different from the true
16854 destination in operands[0], a copy operation will be required. */
16857 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16858 rtx operands[])
16860 rtx dst = operands[0];
16861 rtx src1 = operands[1];
16862 rtx src2 = operands[2];
16864 /* Canonicalize operand order. */
16865 if (ix86_swap_binary_operands_p (code, mode, operands))
16867 rtx temp;
16869 /* It is invalid to swap operands of different modes. */
16870 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16872 temp = src1;
16873 src1 = src2;
16874 src2 = temp;
16877 /* Both source operands cannot be in memory. */
16878 if (MEM_P (src1) && MEM_P (src2))
16880 /* Optimization: Only read from memory once. */
16881 if (rtx_equal_p (src1, src2))
16883 src2 = force_reg (mode, src2);
16884 src1 = src2;
16886 else if (rtx_equal_p (dst, src1))
16887 src2 = force_reg (mode, src2);
16888 else
16889 src1 = force_reg (mode, src1);
16892 /* If the destination is memory, and we do not have matching source
16893 operands, do things in registers. */
16894 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16895 dst = gen_reg_rtx (mode);
16897 /* Source 1 cannot be a constant. */
16898 if (CONSTANT_P (src1))
16899 src1 = force_reg (mode, src1);
16901 /* Source 1 cannot be a non-matching memory. */
16902 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16903 src1 = force_reg (mode, src1);
16905 /* Improve address combine. */
16906 if (code == PLUS
16907 && GET_MODE_CLASS (mode) == MODE_INT
16908 && MEM_P (src2))
16909 src2 = force_reg (mode, src2);
16911 operands[1] = src1;
16912 operands[2] = src2;
16913 return dst;
16916 /* Similarly, but assume that the destination has already been
16917 set up properly. */
16919 void
16920 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16921 enum machine_mode mode, rtx operands[])
16923 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16924 gcc_assert (dst == operands[0]);
16927 /* Attempt to expand a binary operator. Make the expansion closer to the
16928 actual machine than just general_operand, which will allow 3 separate
16929 memory references (one output, two input) in a single insn. */
16931 void
16932 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16933 rtx operands[])
16935 rtx src1, src2, dst, op, clob;
16937 dst = ix86_fixup_binary_operands (code, mode, operands);
16938 src1 = operands[1];
16939 src2 = operands[2];
16941 /* Emit the instruction. */
16943 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16944 if (reload_in_progress)
16946 /* Reload doesn't know about the flags register, and doesn't know that
16947 it doesn't want to clobber it. We can only do this with PLUS. */
16948 gcc_assert (code == PLUS);
16949 emit_insn (op);
16951 else if (reload_completed
16952 && code == PLUS
16953 && !rtx_equal_p (dst, src1))
16955 /* This is going to be an LEA; avoid splitting it later. */
16956 emit_insn (op);
16958 else
16960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16964 /* Fix up the destination if needed. */
16965 if (dst != operands[0])
16966 emit_move_insn (operands[0], dst);
16969 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16970 the given OPERANDS. */
16972 void
16973 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16974 rtx operands[])
16976 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16977 if (GET_CODE (operands[1]) == SUBREG)
16979 op1 = operands[1];
16980 op2 = operands[2];
16982 else if (GET_CODE (operands[2]) == SUBREG)
16984 op1 = operands[2];
16985 op2 = operands[1];
16987 /* Optimize (__m128i) d | (__m128i) e and similar code
16988 when d and e are float vectors into a float vector logical
16989 insn. In C/C++, without using intrinsics, there is no other way
16990 to express a vector logical operation on float vectors than
16991 to cast them temporarily to integer vectors. */
16992 if (op1
16993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16994 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16997 && SUBREG_BYTE (op1) == 0
16998 && (GET_CODE (op2) == CONST_VECTOR
16999 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17000 && SUBREG_BYTE (op2) == 0))
17001 && can_create_pseudo_p ())
17003 rtx dst;
17004 switch (GET_MODE (SUBREG_REG (op1)))
17006 case V4SFmode:
17007 case V8SFmode:
17008 case V2DFmode:
17009 case V4DFmode:
17010 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17011 if (GET_CODE (op2) == CONST_VECTOR)
17013 op2 = gen_lowpart (GET_MODE (dst), op2);
17014 op2 = force_reg (GET_MODE (dst), op2);
17016 else
17018 op1 = operands[1];
17019 op2 = SUBREG_REG (operands[2]);
17020 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17021 op2 = force_reg (GET_MODE (dst), op2);
17023 op1 = SUBREG_REG (op1);
17024 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17025 op1 = force_reg (GET_MODE (dst), op1);
17026 emit_insn (gen_rtx_SET (VOIDmode, dst,
17027 gen_rtx_fmt_ee (code, GET_MODE (dst),
17028 op1, op2)));
17029 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17030 return;
17031 default:
17032 break;
17035 if (!nonimmediate_operand (operands[1], mode))
17036 operands[1] = force_reg (mode, operands[1]);
17037 if (!nonimmediate_operand (operands[2], mode))
17038 operands[2] = force_reg (mode, operands[2]);
17039 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17040 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17041 gen_rtx_fmt_ee (code, mode, operands[1],
17042 operands[2])));
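/* Illustrative example of the transformation above (the source-level
   expression and instruction names are hypothetical, chosen only to show
   the intent): for "(__m128) ((__m128i) a | (__m128i) b)" with a and b of
   float vector type, the SUBREGs around the V4SF values are peeled off and
   the IOR is emitted directly in V4SFmode, so a single float-domain insn
   such as orps can be used instead of por.  */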
17045 /* Return TRUE or FALSE depending on whether the binary operator meets the
17046 appropriate constraints. */
17048 bool
17049 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17050 rtx operands[3])
17052 rtx dst = operands[0];
17053 rtx src1 = operands[1];
17054 rtx src2 = operands[2];
17056 /* Both source operands cannot be in memory. */
17057 if (MEM_P (src1) && MEM_P (src2))
17058 return false;
17060 /* Canonicalize operand order for commutative operators. */
17061 if (ix86_swap_binary_operands_p (code, mode, operands))
17063 rtx temp = src1;
17064 src1 = src2;
17065 src2 = temp;
17068 /* If the destination is memory, we must have a matching source operand. */
17069 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17070 return false;
17072 /* Source 1 cannot be a constant. */
17073 if (CONSTANT_P (src1))
17074 return false;
17076 /* Source 1 cannot be a non-matching memory. */
17077 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17078 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17079 return (code == AND
17080 && (mode == HImode
17081 || mode == SImode
17082 || (TARGET_64BIT && mode == DImode))
17083 && satisfies_constraint_L (src2));
17085 return true;
17088 /* Attempt to expand a unary operator. Make the expansion closer to the
17089 actual machine than just general_operand, which will allow 2 separate
17090 memory references (one output, one input) in a single insn. */
17092 void
17093 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17094 rtx operands[])
17096 int matching_memory;
17097 rtx src, dst, op, clob;
17099 dst = operands[0];
17100 src = operands[1];
17102 /* If the destination is memory, and we do not have matching source
17103 operands, do things in registers. */
17104 matching_memory = 0;
17105 if (MEM_P (dst))
17107 if (rtx_equal_p (dst, src))
17108 matching_memory = 1;
17109 else
17110 dst = gen_reg_rtx (mode);
17113 /* When source operand is memory, destination must match. */
17114 if (MEM_P (src) && !matching_memory)
17115 src = force_reg (mode, src);
17117 /* Emit the instruction. */
17119 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17120 if (reload_in_progress || code == NOT)
17122 /* Reload doesn't know about the flags register, and doesn't know that
17123 it doesn't want to clobber it. */
17124 gcc_assert (code == NOT);
17125 emit_insn (op);
17127 else
17129 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17130 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17133 /* Fix up the destination if needed. */
17134 if (dst != operands[0])
17135 emit_move_insn (operands[0], dst);
17138 /* Split a 32-bit/64-bit divmod using an 8-bit unsigned divmod if the
17139 dividend and divisor are within the range [0-255]. */
17141 void
17142 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17143 bool signed_p)
17145 rtx end_label, qimode_label;
17146 rtx insn, div, mod;
17147 rtx scratch, tmp0, tmp1, tmp2;
17148 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17149 rtx (*gen_zero_extend) (rtx, rtx);
17150 rtx (*gen_test_ccno_1) (rtx, rtx);
17152 switch (mode)
17154 case SImode:
17155 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17156 gen_test_ccno_1 = gen_testsi_ccno_1;
17157 gen_zero_extend = gen_zero_extendqisi2;
17158 break;
17159 case DImode:
17160 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17161 gen_test_ccno_1 = gen_testdi_ccno_1;
17162 gen_zero_extend = gen_zero_extendqidi2;
17163 break;
17164 default:
17165 gcc_unreachable ();
17168 end_label = gen_label_rtx ();
17169 qimode_label = gen_label_rtx ();
17171 scratch = gen_reg_rtx (mode);
17173 /* Use an 8-bit unsigned divmod if the dividend and divisor are within
17174 the range [0-255]. */
17175 emit_move_insn (scratch, operands[2]);
17176 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17177 scratch, 1, OPTAB_DIRECT);
17178 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17179 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17180 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17181 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17182 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17183 pc_rtx);
17184 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17185 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17186 JUMP_LABEL (insn) = qimode_label;
17188 /* Generate the original signed/unsigned divmod. */
17189 div = gen_divmod4_1 (operands[0], operands[1],
17190 operands[2], operands[3]);
17191 emit_insn (div);
17193 /* Branch to the end. */
17194 emit_jump_insn (gen_jump (end_label));
17195 emit_barrier ();
17197 /* Generate 8bit unsigned divide. */
17198 emit_label (qimode_label);
17199 /* Don't use operands[0] for the result of the 8-bit divide since not all
17200 registers support QImode ZERO_EXTRACT. */
17201 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17202 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17203 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17204 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17206 if (signed_p)
17208 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17209 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17211 else
17213 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17214 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17217 /* Extract remainder from AH. */
17218 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17219 if (REG_P (operands[1]))
17220 insn = emit_move_insn (operands[1], tmp1);
17221 else
17223 /* Need a new scratch register since the old one holds the result
17224 of the 8-bit divide. */
17225 scratch = gen_reg_rtx (mode);
17226 emit_move_insn (scratch, tmp1);
17227 insn = emit_move_insn (operands[1], scratch);
17229 set_unique_reg_note (insn, REG_EQUAL, mod);
17231 /* Zero extend quotient from AL. */
17232 tmp1 = gen_lowpart (QImode, tmp0);
17233 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17234 set_unique_reg_note (insn, REG_EQUAL, div);
17236 emit_label (end_label);
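/* A rough sketch of the control flow emitted above (register names and
   exact instructions are illustrative, not a literal dump):

       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch       ; do both values fit in 8 bits?
       je     .Lqimode
       <full-width signed/unsigned divmod>
       jmp    .Lend
   .Lqimode:
       divb   divisor                ; AL = quotient, AH = remainder
       <zero-extend AL into the quotient, extract AH into the remainder>
   .Lend:                                                                */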
17239 #define LEA_MAX_STALL (3)
17240 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17242 /* Increase the given DISTANCE in half-cycles according to
17243 dependencies between the PREV and NEXT instructions.
17244 Add 1 half-cycle if there is no dependency and
17245 go to the next cycle if there is some dependency. */
17247 static unsigned int
17248 increase_distance (rtx prev, rtx next, unsigned int distance)
17250 df_ref *use_rec;
17251 df_ref *def_rec;
17253 if (!prev || !next)
17254 return distance + (distance & 1) + 2;
17256 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17257 return distance + 1;
17259 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17260 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17261 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17262 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17263 return distance + (distance & 1) + 2;
17265 return distance + 1;
17268 /* Function checks if instruction INSN defines register number
17269 REGNO1 or REGNO2. */
17271 static bool
17272 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17273 rtx insn)
17275 df_ref *def_rec;
17277 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17278 if (DF_REF_REG_DEF_P (*def_rec)
17279 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17280 && (regno1 == DF_REF_REGNO (*def_rec)
17281 || regno2 == DF_REF_REGNO (*def_rec)))
17283 return true;
17286 return false;
17289 /* Function checks if instruction INSN uses register number
17290 REGNO as a part of address expression. */
17292 static bool
17293 insn_uses_reg_mem (unsigned int regno, rtx insn)
17295 df_ref *use_rec;
17297 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17298 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17299 return true;
17301 return false;
17304 /* Search backward for a non-agu definition of register number REGNO1
17305 or register number REGNO2 in the basic block, starting from instruction
17306 START, up to the head of the basic block or instruction INSN.
17308 Set *FOUND to true if a definition was found
17309 and to false otherwise.
17311 The distance in half-cycles between START and the found instruction or
17312 the head of the BB is added to DISTANCE and returned. */
17314 static int
17315 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17316 rtx insn, int distance,
17317 rtx start, bool *found)
17319 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17320 rtx prev = start;
17321 rtx next = NULL;
17323 *found = false;
17325 while (prev
17326 && prev != insn
17327 && distance < LEA_SEARCH_THRESHOLD)
17329 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17331 distance = increase_distance (prev, next, distance);
17332 if (insn_defines_reg (regno1, regno2, prev))
17334 if (recog_memoized (prev) < 0
17335 || get_attr_type (prev) != TYPE_LEA)
17337 *found = true;
17338 return distance;
17342 next = prev;
17344 if (prev == BB_HEAD (bb))
17345 break;
17347 prev = PREV_INSN (prev);
17350 return distance;
17353 /* Search backward for a non-agu definition of register number REGNO1
17354 or register number REGNO2 in INSN's basic block until we
17355 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17356 2. Reach a neighbouring BB boundary, or
17357 3. Reach an agu definition.
17358 Returns the distance between the non-agu definition point and INSN.
17359 If there is no definition point, returns -1. */
17361 static int
17362 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17363 rtx insn)
17365 basic_block bb = BLOCK_FOR_INSN (insn);
17366 int distance = 0;
17367 bool found = false;
17369 if (insn != BB_HEAD (bb))
17370 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17371 distance, PREV_INSN (insn),
17372 &found);
17374 if (!found && distance < LEA_SEARCH_THRESHOLD)
17376 edge e;
17377 edge_iterator ei;
17378 bool simple_loop = false;
17380 FOR_EACH_EDGE (e, ei, bb->preds)
17381 if (e->src == bb)
17383 simple_loop = true;
17384 break;
17387 if (simple_loop)
17388 distance = distance_non_agu_define_in_bb (regno1, regno2,
17389 insn, distance,
17390 BB_END (bb), &found);
17391 else
17393 int shortest_dist = -1;
17394 bool found_in_bb = false;
17396 FOR_EACH_EDGE (e, ei, bb->preds)
17398 int bb_dist
17399 = distance_non_agu_define_in_bb (regno1, regno2,
17400 insn, distance,
17401 BB_END (e->src),
17402 &found_in_bb);
17403 if (found_in_bb)
17405 if (shortest_dist < 0)
17406 shortest_dist = bb_dist;
17407 else if (bb_dist > 0)
17408 shortest_dist = MIN (bb_dist, shortest_dist);
17410 found = true;
17414 distance = shortest_dist;
17418 /* get_attr_type may modify recog data. We want to make sure
17419 that recog data is valid for instruction INSN, on which
17420 distance_non_agu_define is called. INSN is unchanged here. */
17421 extract_insn_cached (insn);
17423 if (!found)
17424 return -1;
17426 return distance >> 1;
17429 /* Return the distance in half-cycles between INSN and the next
17430 insn that uses register number REGNO in a memory address, added
17431 to DISTANCE. Return -1 if REGNO is set.
17433 Set *FOUND to true if a register use was found and
17434 to false otherwise.
17435 Set *REDEFINED to true if a register redefinition was
17436 found and to false otherwise. */
17438 static int
17439 distance_agu_use_in_bb (unsigned int regno,
17440 rtx insn, int distance, rtx start,
17441 bool *found, bool *redefined)
17443 basic_block bb = NULL;
17444 rtx next = start;
17445 rtx prev = NULL;
17447 *found = false;
17448 *redefined = false;
17450 if (start != NULL_RTX)
17452 bb = BLOCK_FOR_INSN (start);
17453 if (start != BB_HEAD (bb))
17454 /* If insn and start belong to the same bb, set prev to insn,
17455 so the call to increase_distance will increase the distance
17456 between insns by 1. */
17457 prev = insn;
17460 while (next
17461 && next != insn
17462 && distance < LEA_SEARCH_THRESHOLD)
17464 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17466 distance = increase_distance(prev, next, distance);
17467 if (insn_uses_reg_mem (regno, next))
17469 /* Return DISTANCE if REGNO is used in a memory
17470 address in NEXT. */
17471 *found = true;
17472 return distance;
17475 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17477 /* Return -1 if REGNO is set in NEXT. */
17478 *redefined = true;
17479 return -1;
17482 prev = next;
17485 if (next == BB_END (bb))
17486 break;
17488 next = NEXT_INSN (next);
17491 return distance;
17494 /* Return the distance between INSN and the next insn that uses
17495 register number REGNO0 in a memory address. Return -1 if no such
17496 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17498 static int
17499 distance_agu_use (unsigned int regno0, rtx insn)
17501 basic_block bb = BLOCK_FOR_INSN (insn);
17502 int distance = 0;
17503 bool found = false;
17504 bool redefined = false;
17506 if (insn != BB_END (bb))
17507 distance = distance_agu_use_in_bb (regno0, insn, distance,
17508 NEXT_INSN (insn),
17509 &found, &redefined);
17511 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17513 edge e;
17514 edge_iterator ei;
17515 bool simple_loop = false;
17517 FOR_EACH_EDGE (e, ei, bb->succs)
17518 if (e->dest == bb)
17520 simple_loop = true;
17521 break;
17524 if (simple_loop)
17525 distance = distance_agu_use_in_bb (regno0, insn,
17526 distance, BB_HEAD (bb),
17527 &found, &redefined);
17528 else
17530 int shortest_dist = -1;
17531 bool found_in_bb = false;
17532 bool redefined_in_bb = false;
17534 FOR_EACH_EDGE (e, ei, bb->succs)
17536 int bb_dist
17537 = distance_agu_use_in_bb (regno0, insn,
17538 distance, BB_HEAD (e->dest),
17539 &found_in_bb, &redefined_in_bb);
17540 if (found_in_bb)
17542 if (shortest_dist < 0)
17543 shortest_dist = bb_dist;
17544 else if (bb_dist > 0)
17545 shortest_dist = MIN (bb_dist, shortest_dist);
17547 found = true;
17551 distance = shortest_dist;
17555 if (!found || redefined)
17556 return -1;
17558 return distance >> 1;
17561 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17562 there is a choice between LEA and ADD.
17563 Negative value: ADD is preferred over LEA
17564 Zero: Neutral
17565 Positive value: LEA is preferred over ADD. */
17566 #define IX86_LEA_PRIORITY 0
17568 /* Return true if using the lea INSN has a performance advantage
17569 over a sequence of instructions. The instruction sequence has
17570 SPLIT_COST cycles higher latency than the lea latency. */
17572 static bool
17573 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17574 unsigned int regno2, int split_cost, bool has_scale)
17576 int dist_define, dist_use;
17578 /* For Silvermont, if a 2-source or 3-source LEA is used for a
17579 non-destructive destination, or because the ability to
17580 use SCALE is wanted, the use of LEA is justified. */
17581 if (ix86_tune == PROCESSOR_SLM)
17583 if (has_scale)
17584 return true;
17585 if (split_cost < 1)
17586 return false;
17587 if (regno0 == regno1 || regno0 == regno2)
17588 return false;
17589 return true;
17592 dist_define = distance_non_agu_define (regno1, regno2, insn);
17593 dist_use = distance_agu_use (regno0, insn);
17595 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17597 /* If there is no non-AGU operand definition, no AGU
17598 operand use, and the split cost is 0, then both the lea
17599 and non-lea variants have the same priority. Currently
17600 we prefer lea for 64-bit code and non-lea for 32-bit
17601 code. */
17602 if (dist_use < 0 && split_cost == 0)
17603 return TARGET_64BIT || IX86_LEA_PRIORITY;
17604 else
17605 return true;
17608 /* With a longer definition distance, lea is preferable.
17609 Here we adjust the distance to take the splitting cost and
17610 lea priority into account. */
17611 dist_define += split_cost + IX86_LEA_PRIORITY;
17613 /* If there is no use in a memory address then we just check
17614 that the split cost exceeds the AGU stall. */
17615 if (dist_use < 0)
17616 return dist_define > LEA_MAX_STALL;
17618 /* If this insn has both a backward non-agu dependence and a forward
17619 agu dependence, the one with the shorter distance takes effect. */
17620 return dist_define >= dist_use;
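/* Illustrative reading of the heuristic above (the numbers are
   hypothetical half-cycle distances, chosen only as an example): with
   dist_define = 1, split_cost = 1 and IX86_LEA_PRIORITY = 0, the
   adjusted dist_define becomes 2; if the address is next used at
   dist_use = 3, then 2 >= 3 is false, the lea does not outperform the
   split sequence, and the caller would split it.  */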
17623 /* Return true if it is legal to clobber flags by INSN and
17624 false otherwise. */
17626 static bool
17627 ix86_ok_to_clobber_flags (rtx insn)
17629 basic_block bb = BLOCK_FOR_INSN (insn);
17630 df_ref *use;
17631 bitmap live;
17633 while (insn)
17635 if (NONDEBUG_INSN_P (insn))
17637 for (use = DF_INSN_USES (insn); *use; use++)
17638 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17639 return false;
17641 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17642 return true;
17645 if (insn == BB_END (bb))
17646 break;
17648 insn = NEXT_INSN (insn);
17651 live = df_get_live_out(bb);
17652 return !REGNO_REG_SET_P (live, FLAGS_REG);
17655 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17656 move and add to avoid AGU stalls. */
17658 bool
17659 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17661 unsigned int regno0, regno1, regno2;
17663 /* Check if we need to optimize. */
17664 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17665 return false;
17667 /* Check it is correct to split here. */
17668 if (!ix86_ok_to_clobber_flags(insn))
17669 return false;
17671 regno0 = true_regnum (operands[0]);
17672 regno1 = true_regnum (operands[1]);
17673 regno2 = true_regnum (operands[2]);
17675 /* We only need to split adds with a non-destructive
17676 destination operand. */
17677 if (regno0 == regno1 || regno0 == regno2)
17678 return false;
17679 else
17680 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17683 /* Return true if we should emit lea instruction instead of mov
17684 instruction. */
17686 bool
17687 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17689 unsigned int regno0, regno1;
17691 /* Check if we need to optimize. */
17692 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17693 return false;
17695 /* Use lea for reg to reg moves only. */
17696 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17697 return false;
17699 regno0 = true_regnum (operands[0]);
17700 regno1 = true_regnum (operands[1]);
17702 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17705 /* Return true if we need to split lea into a sequence of
17706 instructions to avoid AGU stalls. */
17708 bool
17709 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17711 unsigned int regno0, regno1, regno2;
17712 int split_cost;
17713 struct ix86_address parts;
17714 int ok;
17716 /* Check we need to optimize. */
17717 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17718 return false;
17720 /* Check it is correct to split here. */
17721 if (!ix86_ok_to_clobber_flags(insn))
17722 return false;
17724 ok = ix86_decompose_address (operands[1], &parts);
17725 gcc_assert (ok);
17727 /* There should be at least two components in the address. */
17728 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17729 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17730 return false;
17732 /* We should not split into add if a non-legitimate PIC
17733 operand is used as the displacement. */
17734 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17735 return false;
17737 regno0 = true_regnum (operands[0]);
17738 regno1 = INVALID_REGNUM;
17739 regno2 = INVALID_REGNUM;
17741 if (parts.base)
17742 regno1 = true_regnum (parts.base);
17743 if (parts.index)
17744 regno2 = true_regnum (parts.index);
17746 split_cost = 0;
17748 /* Compute how many cycles we will add to the execution time
17749 if we split the lea into a sequence of instructions. */
17750 if (parts.base || parts.index)
17752 /* Have to use a mov instruction if the non-destructive
17753 destination form is used. */
17754 if (regno1 != regno0 && regno2 != regno0)
17755 split_cost += 1;
17757 /* Have to add index to base if both exist. */
17758 if (parts.base && parts.index)
17759 split_cost += 1;
17761 /* Have to use shift and adds if scale is 2 or greater. */
17762 if (parts.scale > 1)
17764 if (regno0 != regno1)
17765 split_cost += 1;
17766 else if (regno2 == regno0)
17767 split_cost += 4;
17768 else
17769 split_cost += parts.scale;
17772 /* Have to use an add instruction with an immediate if
17773 disp is nonzero. */
17774 if (parts.disp && parts.disp != const0_rtx)
17775 split_cost += 1;
17777 /* Subtract the price of lea. */
17778 split_cost -= 1;
17781 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17782 parts.scale > 1);
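/* Worked example for the cost computation above (the address and
   registers are hypothetical): for "lea 0x4(%ebx,%ecx,2), %eax" there is
   a base, an index, scale 2 and a nonzero displacement, with a
   non-destructive destination.  The split sequence would be
   mov+shl+add+add, so split_cost = 1 (mov) + 1 (add base) + 1 (shift for
   the scale) + 1 (add disp) - 1 (the lea itself) = 3, which is then fed
   to ix86_lea_outperforms.  */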
17785 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
17786 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
17788 static void
17789 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17790 rtx dst, rtx src)
17792 rtx op, clob;
17794 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17795 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17797 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17800 /* Return true if the definition of REGNO1 is nearest to INSN. */
17802 static bool
17803 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17805 rtx prev = insn;
17806 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17808 if (insn == start)
17809 return false;
17810 while (prev && prev != start)
17812 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17814 prev = PREV_INSN (prev);
17815 continue;
17817 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17818 return true;
17819 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17820 return false;
17821 prev = PREV_INSN (prev);
17824 /* None of the regs is defined in the bb. */
17825 return false;
17828 /* Split lea instructions into a sequence of instructions
17829 which are executed on the ALU to avoid AGU stalls.
17830 It is assumed that clobbering the flags register is allowed
17831 at the lea position. */
17833 void
17834 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17836 unsigned int regno0, regno1, regno2;
17837 struct ix86_address parts;
17838 rtx target, tmp;
17839 int ok, adds;
17841 ok = ix86_decompose_address (operands[1], &parts);
17842 gcc_assert (ok);
17844 target = gen_lowpart (mode, operands[0]);
17846 regno0 = true_regnum (target);
17847 regno1 = INVALID_REGNUM;
17848 regno2 = INVALID_REGNUM;
17850 if (parts.base)
17852 parts.base = gen_lowpart (mode, parts.base);
17853 regno1 = true_regnum (parts.base);
17856 if (parts.index)
17858 parts.index = gen_lowpart (mode, parts.index);
17859 regno2 = true_regnum (parts.index);
17862 if (parts.disp)
17863 parts.disp = gen_lowpart (mode, parts.disp);
17865 if (parts.scale > 1)
17867 /* Case r1 = r1 + ... */
17868 if (regno1 == regno0)
17870 /* If we have the case r1 = r1 + C * r1 then we
17871 would have to use multiplication, which is very
17872 expensive. Assume the cost model is wrong if we
17873 have such a case here. */
17874 gcc_assert (regno2 != regno0);
17876 for (adds = parts.scale; adds > 0; adds--)
17877 ix86_emit_binop (PLUS, mode, target, parts.index);
17879 else
17881 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17882 if (regno0 != regno2)
17883 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17885 /* Use shift for scaling. */
17886 ix86_emit_binop (ASHIFT, mode, target,
17887 GEN_INT (exact_log2 (parts.scale)));
17889 if (parts.base)
17890 ix86_emit_binop (PLUS, mode, target, parts.base);
17892 if (parts.disp && parts.disp != const0_rtx)
17893 ix86_emit_binop (PLUS, mode, target, parts.disp);
17896 else if (!parts.base && !parts.index)
17898 gcc_assert(parts.disp);
17899 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17901 else
17903 if (!parts.base)
17905 if (regno0 != regno2)
17906 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17908 else if (!parts.index)
17910 if (regno0 != regno1)
17911 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17913 else
17915 if (regno0 == regno1)
17916 tmp = parts.index;
17917 else if (regno0 == regno2)
17918 tmp = parts.base;
17919 else
17921 rtx tmp1;
17923 /* Find the better operand for the SET instruction, depending
17924 on which definition is farther from the insn. */
17925 if (find_nearest_reg_def (insn, regno1, regno2))
17926 tmp = parts.index, tmp1 = parts.base;
17927 else
17928 tmp = parts.base, tmp1 = parts.index;
17930 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17932 if (parts.disp && parts.disp != const0_rtx)
17933 ix86_emit_binop (PLUS, mode, target, parts.disp);
17935 ix86_emit_binop (PLUS, mode, target, tmp1);
17936 return;
17939 ix86_emit_binop (PLUS, mode, target, tmp);
17942 if (parts.disp && parts.disp != const0_rtx)
17943 ix86_emit_binop (PLUS, mode, target, parts.disp);
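/* A rough sketch of the split produced above for the non-destructive
   case (the address and registers are hypothetical):
   "lea 0x4(%ebx,%ecx,2), %eax" becomes
       mov  %ecx, %eax
       shl  $1, %eax
       add  %ebx, %eax
       add  $4, %eax
   while the destructive case r1 = r1 + C * r2 uses C repeated adds of
   the index instead of the shift.  */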
17947 /* Return true if it is ok to optimize an ADD operation to an LEA
17948 operation to avoid flag register consumption. For most processors,
17949 ADD is faster than LEA. For processors like ATOM, if the
17950 destination register of the LEA holds an actual address which will be
17951 used soon, LEA is better; otherwise ADD is better. */
17953 bool
17954 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17956 unsigned int regno0 = true_regnum (operands[0]);
17957 unsigned int regno1 = true_regnum (operands[1]);
17958 unsigned int regno2 = true_regnum (operands[2]);
17960 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
17961 if (regno0 != regno1 && regno0 != regno2)
17962 return true;
17964 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17965 return false;
17967 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
17970 /* Return true if destination reg of SET_BODY is shift count of
17971 USE_BODY. */
17973 static bool
17974 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17976 rtx set_dest;
17977 rtx shift_rtx;
17978 int i;
17980 /* Retrieve destination of SET_BODY. */
17981 switch (GET_CODE (set_body))
17983 case SET:
17984 set_dest = SET_DEST (set_body);
17985 if (!set_dest || !REG_P (set_dest))
17986 return false;
17987 break;
17988 case PARALLEL:
17989 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17990 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17991 use_body))
17992 return true;
17993 default:
17994 return false;
17995 break;
17998 /* Retrieve shift count of USE_BODY. */
17999 switch (GET_CODE (use_body))
18001 case SET:
18002 shift_rtx = XEXP (use_body, 1);
18003 break;
18004 case PARALLEL:
18005 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18006 if (ix86_dep_by_shift_count_body (set_body,
18007 XVECEXP (use_body, 0, i)))
18008 return true;
18009 default:
18010 return false;
18011 break;
18014 if (shift_rtx
18015 && (GET_CODE (shift_rtx) == ASHIFT
18016 || GET_CODE (shift_rtx) == LSHIFTRT
18017 || GET_CODE (shift_rtx) == ASHIFTRT
18018 || GET_CODE (shift_rtx) == ROTATE
18019 || GET_CODE (shift_rtx) == ROTATERT))
18021 rtx shift_count = XEXP (shift_rtx, 1);
18023 /* Return true if shift count is dest of SET_BODY. */
18024 if (REG_P (shift_count))
18026 /* Add a check since this can be invoked before register
18027 allocation in the pre-reload scheduler. */
18028 if (reload_completed
18029 && true_regnum (set_dest) == true_regnum (shift_count))
18030 return true;
18031 else if (REGNO(set_dest) == REGNO(shift_count))
18032 return true;
18036 return false;
18039 /* Return true if destination reg of SET_INSN is shift count of
18040 USE_INSN. */
18042 bool
18043 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18045 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18046 PATTERN (use_insn));
18049 /* Return TRUE or FALSE depending on whether the unary operator meets the
18050 appropriate constraints. */
18052 bool
18053 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18054 enum machine_mode mode ATTRIBUTE_UNUSED,
18055 rtx operands[2])
18057 /* If one of operands is memory, source and destination must match. */
18058 if ((MEM_P (operands[0])
18059 || MEM_P (operands[1]))
18060 && ! rtx_equal_p (operands[0], operands[1]))
18061 return false;
18062 return true;
18065 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18066 are ok, keeping in mind the possible movddup alternative. */
18068 bool
18069 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18071 if (MEM_P (operands[0]))
18072 return rtx_equal_p (operands[0], operands[1 + high]);
18073 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18074 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18075 return true;
18078 /* Post-reload splitter for converting an SF or DFmode value in an
18079 SSE register into an unsigned SImode. */
18081 void
18082 ix86_split_convert_uns_si_sse (rtx operands[])
18084 enum machine_mode vecmode;
18085 rtx value, large, zero_or_two31, input, two31, x;
18087 large = operands[1];
18088 zero_or_two31 = operands[2];
18089 input = operands[3];
18090 two31 = operands[4];
18091 vecmode = GET_MODE (large);
18092 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18094 /* Load up the value into the low element. We must ensure that the other
18095 elements are valid floats -- zero is the easiest such value. */
18096 if (MEM_P (input))
18098 if (vecmode == V4SFmode)
18099 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18100 else
18101 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18103 else
18105 input = gen_rtx_REG (vecmode, REGNO (input));
18106 emit_move_insn (value, CONST0_RTX (vecmode));
18107 if (vecmode == V4SFmode)
18108 emit_insn (gen_sse_movss (value, value, input));
18109 else
18110 emit_insn (gen_sse2_movsd (value, value, input));
18113 emit_move_insn (large, two31);
18114 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18116 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18117 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18119 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18120 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18122 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18123 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18125 large = gen_rtx_REG (V4SImode, REGNO (large));
18126 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18128 x = gen_rtx_REG (V4SImode, REGNO (value));
18129 if (vecmode == V4SFmode)
18130 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18131 else
18132 emit_insn (gen_sse2_cvttpd2dq (x, value));
18133 value = x;
18135 emit_insn (gen_xorv4si3 (value, value, large));
18138 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18139 Expects the 64-bit DImode to be supplied in a pair of integral
18140 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18141 -mfpmath=sse, !optimize_size only. */
18143 void
18144 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18146 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18147 rtx int_xmm, fp_xmm;
18148 rtx biases, exponents;
18149 rtx x;
18151 int_xmm = gen_reg_rtx (V4SImode);
18152 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18153 emit_insn (gen_movdi_to_sse (int_xmm, input));
18154 else if (TARGET_SSE_SPLIT_REGS)
18156 emit_clobber (int_xmm);
18157 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18159 else
18161 x = gen_reg_rtx (V2DImode);
18162 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18163 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18166 x = gen_rtx_CONST_VECTOR (V4SImode,
18167 gen_rtvec (4, GEN_INT (0x43300000UL),
18168 GEN_INT (0x45300000UL),
18169 const0_rtx, const0_rtx));
18170 exponents = validize_mem (force_const_mem (V4SImode, x));
18172 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18173 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18175 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18176 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18177 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18178 (0x1.0p84 + double(fp_value_hi_xmm)).
18179 Note these exponents differ by 32. */
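   /* Worked example (the input value is a hypothetical illustration):
      for the DImode value 0x0000000500000003 (= 5 * 2^32 + 3), the
      interleave builds the two doubles 0x1.0p52 + 3 and
      0x1.0p84 + 5 * 2^32.  Subtracting the 0x1.0p52 and 0x1.0p84 biases
      below leaves 3.0 and 21474836480.0, and their sum 21474836483.0 is
      exactly the unsigned input.  */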
18181 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18183 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18184 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18185 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18186 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18187 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18188 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18189 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18190 biases = validize_mem (force_const_mem (V2DFmode, biases));
18191 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18193 /* Add the upper and lower DFmode values together. */
18194 if (TARGET_SSE3)
18195 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18196 else
18198 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18199 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18200 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18203 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18206 /* Not used, but eases macroization of patterns. */
18207 void
18208 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18209 rtx input ATTRIBUTE_UNUSED)
18211 gcc_unreachable ();
18214 /* Convert an unsigned SImode value into a DFmode. Only currently used
18215 for SSE, but applicable anywhere. */
18217 void
18218 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18220 REAL_VALUE_TYPE TWO31r;
18221 rtx x, fp;
18223 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18224 NULL, 1, OPTAB_DIRECT);
18226 fp = gen_reg_rtx (DFmode);
18227 emit_insn (gen_floatsidf2 (fp, x));
18229 real_ldexp (&TWO31r, &dconst1, 31);
18230 x = const_double_from_real_value (TWO31r, DFmode);
18232 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18233 if (x != target)
18234 emit_move_insn (target, x);
18237 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18238 32-bit mode; otherwise we have a direct convert instruction. */
18240 void
18241 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18243 REAL_VALUE_TYPE TWO32r;
18244 rtx fp_lo, fp_hi, x;
18246 fp_lo = gen_reg_rtx (DFmode);
18247 fp_hi = gen_reg_rtx (DFmode);
18249 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18251 real_ldexp (&TWO32r, &dconst1, 32);
18252 x = const_double_from_real_value (TWO32r, DFmode);
18253 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18255 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18257 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18258 0, OPTAB_DIRECT);
18259 if (x != target)
18260 emit_move_insn (target, x);
18263 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18264 For x86_32, -mfpmath=sse, !optimize_size only. */
18265 void
18266 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18268 REAL_VALUE_TYPE ONE16r;
18269 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18271 real_ldexp (&ONE16r, &dconst1, 16);
18272 x = const_double_from_real_value (ONE16r, SFmode);
18273 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18274 NULL, 0, OPTAB_DIRECT);
18275 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18276 NULL, 0, OPTAB_DIRECT);
18277 fp_hi = gen_reg_rtx (SFmode);
18278 fp_lo = gen_reg_rtx (SFmode);
18279 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18280 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18281 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18282 0, OPTAB_DIRECT);
18283 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18284 0, OPTAB_DIRECT);
18285 if (!rtx_equal_p (target, fp_hi))
18286 emit_move_insn (target, fp_hi);
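/* Worked example (the input value is a hypothetical illustration): for
   the input 0xffffffff, int_lo = 0xffff and int_hi = 0xffff; the signed
   conversions give 65535.0 for both halves, the high half is scaled by
   0x1p16 to 4294901760.0, and the final add yields 4294967295.0, the
   correct unsigned value.  */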
18289 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18290 a vector of unsigned ints VAL to vector of floats TARGET. */
18292 void
18293 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18295 rtx tmp[8];
18296 REAL_VALUE_TYPE TWO16r;
18297 enum machine_mode intmode = GET_MODE (val);
18298 enum machine_mode fltmode = GET_MODE (target);
18299 rtx (*cvt) (rtx, rtx);
18301 if (intmode == V4SImode)
18302 cvt = gen_floatv4siv4sf2;
18303 else
18304 cvt = gen_floatv8siv8sf2;
18305 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18306 tmp[0] = force_reg (intmode, tmp[0]);
18307 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18308 OPTAB_DIRECT);
18309 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18310 NULL_RTX, 1, OPTAB_DIRECT);
18311 tmp[3] = gen_reg_rtx (fltmode);
18312 emit_insn (cvt (tmp[3], tmp[1]));
18313 tmp[4] = gen_reg_rtx (fltmode);
18314 emit_insn (cvt (tmp[4], tmp[2]));
18315 real_ldexp (&TWO16r, &dconst1, 16);
18316 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18317 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18318 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18319 OPTAB_DIRECT);
18320 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18321 OPTAB_DIRECT);
18322 if (tmp[7] != target)
18323 emit_move_insn (target, tmp[7]);
18326 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18327 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18328 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18329 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18332 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18334 REAL_VALUE_TYPE TWO31r;
18335 rtx two31r, tmp[4];
18336 enum machine_mode mode = GET_MODE (val);
18337 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18338 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18339 rtx (*cmp) (rtx, rtx, rtx, rtx);
18340 int i;
18342 for (i = 0; i < 3; i++)
18343 tmp[i] = gen_reg_rtx (mode);
18344 real_ldexp (&TWO31r, &dconst1, 31);
18345 two31r = const_double_from_real_value (TWO31r, scalarmode);
18346 two31r = ix86_build_const_vector (mode, 1, two31r);
18347 two31r = force_reg (mode, two31r);
18348 switch (mode)
18350 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18351 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18352 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18353 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18354 default: gcc_unreachable ();
18356 tmp[3] = gen_rtx_LE (mode, two31r, val);
18357 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18358 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18359 0, OPTAB_DIRECT);
18360 if (intmode == V4SImode || TARGET_AVX2)
18361 *xorp = expand_simple_binop (intmode, ASHIFT,
18362 gen_lowpart (intmode, tmp[0]),
18363 GEN_INT (31), NULL_RTX, 0,
18364 OPTAB_DIRECT);
18365 else
18367 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18368 two31 = ix86_build_const_vector (intmode, 1, two31);
18369 *xorp = expand_simple_binop (intmode, AND,
18370 gen_lowpart (intmode, tmp[0]),
18371 two31, NULL_RTX, 0,
18372 OPTAB_DIRECT);
18374 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18375 0, OPTAB_DIRECT);
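/* Worked example (the lane value is a hypothetical illustration): for a
   lane holding 3000000000.0 (>= 0x1p31), the compare selects the
   0x1.0p31 element, so the lane becomes 3000000000.0 - 2147483648.0 =
   852516352.0, which the signed fix_trunc converts exactly; XORing the
   integer result with the 0x80000000 produced in *XORP restores
   3000000000.  A lane below 0x1p31 is left unchanged and gets a zero
   xor mask.  */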
18378 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18379 then replicate the value for all elements of the vector
18380 register. */
18383 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18385 int i, n_elt;
18386 rtvec v;
18387 enum machine_mode scalar_mode;
18389 switch (mode)
18391 case V32QImode:
18392 case V16QImode:
18393 case V16HImode:
18394 case V8HImode:
18395 case V8SImode:
18396 case V4SImode:
18397 case V4DImode:
18398 case V2DImode:
18399 gcc_assert (vect);
18400 case V8SFmode:
18401 case V4SFmode:
18402 case V4DFmode:
18403 case V2DFmode:
18404 n_elt = GET_MODE_NUNITS (mode);
18405 v = rtvec_alloc (n_elt);
18406 scalar_mode = GET_MODE_INNER (mode);
18408 RTVEC_ELT (v, 0) = value;
18410 for (i = 1; i < n_elt; ++i)
18411 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18413 return gen_rtx_CONST_VECTOR (mode, v);
18415 default:
18416 gcc_unreachable ();
18420 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18421 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18422 for an SSE register. If VECT is true, then replicate the mask for
18423 all elements of the vector register. If INVERT is true, then create
18424 a mask excluding the sign bit. */
18427 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18429 enum machine_mode vec_mode, imode;
18430 HOST_WIDE_INT hi, lo;
18431 int shift = 63;
18432 rtx v;
18433 rtx mask;
18435 /* Find the sign bit, sign extended to 2*HWI. */
18436 switch (mode)
18438 case V8SImode:
18439 case V4SImode:
18440 case V8SFmode:
18441 case V4SFmode:
18442 vec_mode = mode;
18443 mode = GET_MODE_INNER (mode);
18444 imode = SImode;
18445 lo = 0x80000000, hi = lo < 0;
18446 break;
18448 case V4DImode:
18449 case V2DImode:
18450 case V4DFmode:
18451 case V2DFmode:
18452 vec_mode = mode;
18453 mode = GET_MODE_INNER (mode);
18454 imode = DImode;
18455 if (HOST_BITS_PER_WIDE_INT >= 64)
18456 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18457 else
18458 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18459 break;
18461 case TImode:
18462 case TFmode:
18463 vec_mode = VOIDmode;
18464 if (HOST_BITS_PER_WIDE_INT >= 64)
18466 imode = TImode;
18467 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18469 else
18471 rtvec vec;
18473 imode = DImode;
18474 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18476 if (invert)
18478 lo = ~lo, hi = ~hi;
18479 v = constm1_rtx;
18481 else
18482 v = const0_rtx;
18484 mask = immed_double_const (lo, hi, imode);
18486 vec = gen_rtvec (2, v, mask);
18487 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18488 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18490 return v;
18492 break;
18494 default:
18495 gcc_unreachable ();
18498 if (invert)
18499 lo = ~lo, hi = ~hi;
18501 /* Force this value into the low part of a fp vector constant. */
18502 mask = immed_double_const (lo, hi, imode);
18503 mask = gen_lowpart (mode, mask);
18505 if (vec_mode == VOIDmode)
18506 return force_reg (mode, mask);
18508 v = ix86_build_const_vector (vec_mode, vect, mask);
18509 return force_reg (vec_mode, v);
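/* Example of the masks built above (illustrative): for V4SFmode with
   VECT set and INVERT clear, the constant is { 0x80000000, 0x80000000,
   0x80000000, 0x80000000 }, i.e. only the sign bits; with INVERT set it
   is { 0x7fffffff, ... }, i.e. everything except the sign bits.  These
   masks are used by the abs/neg and copysign expansions below.  */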
18512 /* Generate code for floating point ABS or NEG. */
18514 void
18515 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18516 rtx operands[])
18518 rtx mask, set, dst, src;
18519 bool use_sse = false;
18520 bool vector_mode = VECTOR_MODE_P (mode);
18521 enum machine_mode vmode = mode;
18523 if (vector_mode)
18524 use_sse = true;
18525 else if (mode == TFmode)
18526 use_sse = true;
18527 else if (TARGET_SSE_MATH)
18529 use_sse = SSE_FLOAT_MODE_P (mode);
18530 if (mode == SFmode)
18531 vmode = V4SFmode;
18532 else if (mode == DFmode)
18533 vmode = V2DFmode;
18536 /* NEG and ABS performed with SSE use bitwise mask operations.
18537 Create the appropriate mask now. */
18538 if (use_sse)
18539 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18540 else
18541 mask = NULL_RTX;
18543 dst = operands[0];
18544 src = operands[1];
18546 set = gen_rtx_fmt_e (code, mode, src);
18547 set = gen_rtx_SET (VOIDmode, dst, set);
18549 if (mask)
18551 rtx use, clob;
18552 rtvec par;
18554 use = gen_rtx_USE (VOIDmode, mask);
18555 if (vector_mode)
18556 par = gen_rtvec (2, set, use);
18557 else
18559 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18560 par = gen_rtvec (3, set, use, clob);
18562 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18564 else
18565 emit_insn (set);
18568 /* Expand a copysign operation. Special case operand 0 being a constant. */
18570 void
18571 ix86_expand_copysign (rtx operands[])
18573 enum machine_mode mode, vmode;
18574 rtx dest, op0, op1, mask, nmask;
18576 dest = operands[0];
18577 op0 = operands[1];
18578 op1 = operands[2];
18580 mode = GET_MODE (dest);
18582 if (mode == SFmode)
18583 vmode = V4SFmode;
18584 else if (mode == DFmode)
18585 vmode = V2DFmode;
18586 else
18587 vmode = mode;
18589 if (GET_CODE (op0) == CONST_DOUBLE)
18591 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18593 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18594 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18596 if (mode == SFmode || mode == DFmode)
18598 if (op0 == CONST0_RTX (mode))
18599 op0 = CONST0_RTX (vmode);
18600 else
18602 rtx v = ix86_build_const_vector (vmode, false, op0);
18604 op0 = force_reg (vmode, v);
18607 else if (op0 != CONST0_RTX (mode))
18608 op0 = force_reg (mode, op0);
18610 mask = ix86_build_signbit_mask (vmode, 0, 0);
18612 if (mode == SFmode)
18613 copysign_insn = gen_copysignsf3_const;
18614 else if (mode == DFmode)
18615 copysign_insn = gen_copysigndf3_const;
18616 else
18617 copysign_insn = gen_copysigntf3_const;
18619 emit_insn (copysign_insn (dest, op0, op1, mask));
18621 else
18623 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18625 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18626 mask = ix86_build_signbit_mask (vmode, 0, 0);
18628 if (mode == SFmode)
18629 copysign_insn = gen_copysignsf3_var;
18630 else if (mode == DFmode)
18631 copysign_insn = gen_copysigndf3_var;
18632 else
18633 copysign_insn = gen_copysigntf3_var;
18635 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18639 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18640 be a constant, and so has already been expanded into a vector constant. */
18642 void
18643 ix86_split_copysign_const (rtx operands[])
18645 enum machine_mode mode, vmode;
18646 rtx dest, op0, mask, x;
18648 dest = operands[0];
18649 op0 = operands[1];
18650 mask = operands[3];
18652 mode = GET_MODE (dest);
18653 vmode = GET_MODE (mask);
18655 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18656 x = gen_rtx_AND (vmode, dest, mask);
18657 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18659 if (op0 != CONST0_RTX (vmode))
18661 x = gen_rtx_IOR (vmode, dest, op0);
18662 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18666 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18667 so we have to do two masks. */
18669 void
18670 ix86_split_copysign_var (rtx operands[])
18672 enum machine_mode mode, vmode;
18673 rtx dest, scratch, op0, op1, mask, nmask, x;
18675 dest = operands[0];
18676 scratch = operands[1];
18677 op0 = operands[2];
18678 op1 = operands[3];
18679 nmask = operands[4];
18680 mask = operands[5];
18682 mode = GET_MODE (dest);
18683 vmode = GET_MODE (mask);
18685 if (rtx_equal_p (op0, op1))
18687 /* Shouldn't happen often (it's useless, obviously), but when it does
18688 we'd generate incorrect code if we continue below. */
18689 emit_move_insn (dest, op0);
18690 return;
18693 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18695 gcc_assert (REGNO (op1) == REGNO (scratch));
18697 x = gen_rtx_AND (vmode, scratch, mask);
18698 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18700 dest = mask;
18701 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18702 x = gen_rtx_NOT (vmode, dest);
18703 x = gen_rtx_AND (vmode, x, op0);
18704 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18706 else
18708 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18710 x = gen_rtx_AND (vmode, scratch, mask);
18712 else /* alternative 2,4 */
18714 gcc_assert (REGNO (mask) == REGNO (scratch));
18715 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18716 x = gen_rtx_AND (vmode, scratch, op1);
18718 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18720 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18722 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18723 x = gen_rtx_AND (vmode, dest, nmask);
18725 else /* alternative 3,4 */
18727 gcc_assert (REGNO (nmask) == REGNO (dest));
18728 dest = nmask;
18729 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18730 x = gen_rtx_AND (vmode, dest, op0);
18732 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18735 x = gen_rtx_IOR (vmode, dest, scratch);
18736 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18739 /* Return TRUE or FALSE depending on whether the first SET in INSN
18740 has source and destination with matching CC modes, and that the
18741 CC mode is at least as constrained as REQ_MODE. */
18743 bool
18744 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18746 rtx set;
18747 enum machine_mode set_mode;
18749 set = PATTERN (insn);
18750 if (GET_CODE (set) == PARALLEL)
18751 set = XVECEXP (set, 0, 0);
18752 gcc_assert (GET_CODE (set) == SET);
18753 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18755 set_mode = GET_MODE (SET_DEST (set));
18756 switch (set_mode)
18758 case CCNOmode:
18759 if (req_mode != CCNOmode
18760 && (req_mode != CCmode
18761 || XEXP (SET_SRC (set), 1) != const0_rtx))
18762 return false;
18763 break;
18764 case CCmode:
18765 if (req_mode == CCGCmode)
18766 return false;
18767 /* FALLTHRU */
18768 case CCGCmode:
18769 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18770 return false;
18771 /* FALLTHRU */
18772 case CCGOCmode:
18773 if (req_mode == CCZmode)
18774 return false;
18775 /* FALLTHRU */
18776 case CCZmode:
18777 break;
18779 case CCAmode:
18780 case CCCmode:
18781 case CCOmode:
18782 case CCSmode:
18783 if (set_mode != req_mode)
18784 return false;
18785 break;
18787 default:
18788 gcc_unreachable ();
18791 return GET_MODE (SET_SRC (set)) == set_mode;
18794 /* Generate insn patterns to do an integer compare of OPERANDS. */
18796 static rtx
18797 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18799 enum machine_mode cmpmode;
18800 rtx tmp, flags;
18802 cmpmode = SELECT_CC_MODE (code, op0, op1);
18803 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18805 /* This is very simple, but making the interface the same as in the
18806 FP case makes the rest of the code easier. */
18807 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18808 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18810 /* Return the test that should be put into the flags user, i.e.
18811 the bcc, scc, or cmov instruction. */
18812 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18815 /* Figure out whether to use ordered or unordered fp comparisons.
18816 Return the appropriate mode to use. */
18818 enum machine_mode
18819 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18821 /* ??? In order to make all comparisons reversible, we do all comparisons
18822 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18823 all forms of trapping and nontrapping comparisons, we can make inequality
18824 comparisons trapping again, since it results in better code when using
18825 FCOM based compares. */
18826 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18829 enum machine_mode
18830 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18832 enum machine_mode mode = GET_MODE (op0);
18834 if (SCALAR_FLOAT_MODE_P (mode))
18836 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18837 return ix86_fp_compare_mode (code);
18840 switch (code)
18842 /* Only zero flag is needed. */
18843 case EQ: /* ZF=0 */
18844 case NE: /* ZF!=0 */
18845 return CCZmode;
18846 /* Codes needing carry flag. */
18847 case GEU: /* CF=0 */
18848 case LTU: /* CF=1 */
18849 /* Detect overflow checks. They need just the carry flag. */
18850 if (GET_CODE (op0) == PLUS
18851 && rtx_equal_p (op1, XEXP (op0, 0)))
18852 return CCCmode;
18853 else
18854 return CCmode;
18855 case GTU: /* CF=0 & ZF=0 */
18856 case LEU: /* CF=1 | ZF=1 */
18857 /* Detect overflow checks. They need just the carry flag. */
18858 if (GET_CODE (op0) == MINUS
18859 && rtx_equal_p (op1, XEXP (op0, 0)))
18860 return CCCmode;
18861 else
18862 return CCmode;
18863 /* Codes possibly doable only with the sign flag when
18864 comparing against zero. */
18865 case GE: /* SF=OF or SF=0 */
18866 case LT: /* SF<>OF or SF=1 */
18867 if (op1 == const0_rtx)
18868 return CCGOCmode;
18869 else
18870 /* For the other cases the carry flag is not required. */
18871 return CCGCmode;
18872 /* Codes doable only with the sign flag when comparing
18873 against zero, but there is no jump instruction for that,
18874 so we need to use relational tests against overflow,
18875 which therefore must be zero. */
18876 case GT: /* ZF=0 & SF=OF */
18877 case LE: /* ZF=1 | SF<>OF */
18878 if (op1 == const0_rtx)
18879 return CCNOmode;
18880 else
18881 return CCGCmode;
18882 /* The strcmp pattern does (use flags), and combine may ask us for
18883 the proper mode. */
18884 case USE:
18885 return CCmode;
18886 default:
18887 gcc_unreachable ();
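/* As a rough illustration (values and register choices are a sketch, not
   taken from the compiler's output): for an overflow check written as
   `if (a + b < a)' the combiner hands this function op0 = (plus a b) and
   op1 = a, the LTU case above selects CCCmode, and the branch can then
   test the carry flag alone:

   * addl %esi, %edi
   * jc   overflow  */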
18891 /* Return the fixed registers used for condition codes. */
18893 static bool
18894 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18896 *p1 = FLAGS_REG;
18897 *p2 = FPSR_REG;
18898 return true;
18901 /* If two condition code modes are compatible, return a condition code
18902 mode which is compatible with both. Otherwise, return
18903 VOIDmode. */
18905 static enum machine_mode
18906 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18908 if (m1 == m2)
18909 return m1;
18911 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18912 return VOIDmode;
18914 if ((m1 == CCGCmode && m2 == CCGOCmode)
18915 || (m1 == CCGOCmode && m2 == CCGCmode))
18916 return CCGCmode;
18918 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18919 return m2;
18920 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18921 return m1;
18923 switch (m1)
18925 default:
18926 gcc_unreachable ();
18928 case CCmode:
18929 case CCGCmode:
18930 case CCGOCmode:
18931 case CCNOmode:
18932 case CCAmode:
18933 case CCCmode:
18934 case CCOmode:
18935 case CCSmode:
18936 case CCZmode:
18937 switch (m2)
18939 default:
18940 return VOIDmode;
18942 case CCmode:
18943 case CCGCmode:
18944 case CCGOCmode:
18945 case CCNOmode:
18946 case CCAmode:
18947 case CCCmode:
18948 case CCOmode:
18949 case CCSmode:
18950 case CCZmode:
18951 return CCmode;
18954 case CCFPmode:
18955 case CCFPUmode:
18956 /* These are only compatible with themselves, which we already
18957 checked above. */
18958 return VOIDmode;
18963 /* Return a comparison we can do that is equivalent to
18964 swap_condition (code), except possibly for orderedness.
18965 Never change orderedness if TARGET_IEEE_FP, returning
18966 UNKNOWN in that case if necessary. */
18968 static enum rtx_code
18969 ix86_fp_swap_condition (enum rtx_code code)
18971 switch (code)
18973 case GT: /* GTU - CF=0 & ZF=0 */
18974 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18975 case GE: /* GEU - CF=0 */
18976 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18977 case UNLT: /* LTU - CF=1 */
18978 return TARGET_IEEE_FP ? UNKNOWN : GT;
18979 case UNLE: /* LEU - CF=1 | ZF=1 */
18980 return TARGET_IEEE_FP ? UNKNOWN : GE;
18981 default:
18982 return swap_condition (code);
18986 /* Return the cost of comparison CODE using the best strategy for performance.
18987 All of the following functions use the number of instructions as the cost metric.
18988 In the future this should be tweaked to compute bytes for optimize_size and
18989 to take into account the performance of various instructions on various CPUs. */
18991 static int
18992 ix86_fp_comparison_cost (enum rtx_code code)
18994 int arith_cost;
18996 /* The cost of code using bit-twiddling on %ah. */
18997 switch (code)
18999 case UNLE:
19000 case UNLT:
19001 case LTGT:
19002 case GT:
19003 case GE:
19004 case UNORDERED:
19005 case ORDERED:
19006 case UNEQ:
19007 arith_cost = 4;
19008 break;
19009 case LT:
19010 case NE:
19011 case EQ:
19012 case UNGE:
19013 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19014 break;
19015 case LE:
19016 case UNGT:
19017 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19018 break;
19019 default:
19020 gcc_unreachable ();
19023 switch (ix86_fp_comparison_strategy (code))
19025 case IX86_FPCMP_COMI:
19026 return arith_cost > 4 ? 3 : 2;
19027 case IX86_FPCMP_SAHF:
19028 return arith_cost > 4 ? 4 : 3;
19029 default:
19030 return arith_cost;
19034 /* Return the strategy to use for floating-point comparisons. We assume that
19035 fcomi is always preferable where available, since that is also true when
19036 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19038 enum ix86_fpcmp_strategy
19039 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19041 /* Do fcomi/sahf based test when profitable. */
19043 if (TARGET_CMOVE)
19044 return IX86_FPCMP_COMI;
19046 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19047 return IX86_FPCMP_SAHF;
19049 return IX86_FPCMP_ARITH;
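/* For illustration only, the three strategies correspond roughly to the
   following instruction shapes (operand details are a sketch):

   * IX86_FPCMP_COMI:   fcomip ...                                ; jcc ...
   * IX86_FPCMP_SAHF:   fcomp ... ; fnstsw %ax ; sahf             ; jcc ...
   * IX86_FPCMP_ARITH:  fcomp ... ; fnstsw %ax ; test/and on %ah  ; jcc ...

   which is why fcomi-style compares are preferred whenever TARGET_CMOVE
   says they exist.  */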
19052 /* Swap, force into registers, or otherwise massage the two operands
19053 to a fp comparison. The operands are updated in place; the new
19054 comparison code is returned. */
19056 static enum rtx_code
19057 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19059 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19060 rtx op0 = *pop0, op1 = *pop1;
19061 enum machine_mode op_mode = GET_MODE (op0);
19062 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19064 /* All of the unordered compare instructions only work on registers.
19065 The same is true of the fcomi compare instructions. The XFmode
19066 compare instructions require registers except when comparing
19067 against zero or when converting operand 1 from fixed point to
19068 floating point. */
19070 if (!is_sse
19071 && (fpcmp_mode == CCFPUmode
19072 || (op_mode == XFmode
19073 && ! (standard_80387_constant_p (op0) == 1
19074 || standard_80387_constant_p (op1) == 1)
19075 && GET_CODE (op1) != FLOAT)
19076 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19078 op0 = force_reg (op_mode, op0);
19079 op1 = force_reg (op_mode, op1);
19081 else
19083 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19084 things around if they appear profitable, otherwise force op0
19085 into a register. */
19087 if (standard_80387_constant_p (op0) == 0
19088 || (MEM_P (op0)
19089 && ! (standard_80387_constant_p (op1) == 0
19090 || MEM_P (op1))))
19092 enum rtx_code new_code = ix86_fp_swap_condition (code);
19093 if (new_code != UNKNOWN)
19095 rtx tmp;
19096 tmp = op0, op0 = op1, op1 = tmp;
19097 code = new_code;
19101 if (!REG_P (op0))
19102 op0 = force_reg (op_mode, op0);
19104 if (CONSTANT_P (op1))
19106 int tmp = standard_80387_constant_p (op1);
19107 if (tmp == 0)
19108 op1 = validize_mem (force_const_mem (op_mode, op1));
19109 else if (tmp == 1)
19111 if (TARGET_CMOVE)
19112 op1 = force_reg (op_mode, op1);
19114 else
19115 op1 = force_reg (op_mode, op1);
19119 /* Try to rearrange the comparison to make it cheaper. */
19120 if (ix86_fp_comparison_cost (code)
19121 > ix86_fp_comparison_cost (swap_condition (code))
19122 && (REG_P (op1) || can_create_pseudo_p ()))
19124 rtx tmp;
19125 tmp = op0, op0 = op1, op1 = tmp;
19126 code = swap_condition (code);
19127 if (!REG_P (op0))
19128 op0 = force_reg (op_mode, op0);
19131 *pop0 = op0;
19132 *pop1 = op1;
19133 return code;
19136 /* Convert the comparison codes we use to represent FP comparisons to the
19137 integer code that will result in the proper branch. Return UNKNOWN if no
19138 such code is available. */
19140 enum rtx_code
19141 ix86_fp_compare_code_to_integer (enum rtx_code code)
19143 switch (code)
19145 case GT:
19146 return GTU;
19147 case GE:
19148 return GEU;
19149 case ORDERED:
19150 case UNORDERED:
19151 return code;
19152 break;
19153 case UNEQ:
19154 return EQ;
19155 break;
19156 case UNLT:
19157 return LTU;
19158 break;
19159 case UNLE:
19160 return LEU;
19161 break;
19162 case LTGT:
19163 return NE;
19164 break;
19165 default:
19166 return UNKNOWN;
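/* The mapping above reflects the fact that fcomi/fucomi (and sahf after
   fnstsw) deposit the x87 condition bits into ZF/PF/CF only, never into
   SF/OF, so an FP "greater than" has to be tested like an unsigned
   integer compare: GT becomes GTU (ja), GE becomes GEU (jae), etc.  */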
19170 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19172 static rtx
19173 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19175 enum machine_mode fpcmp_mode, intcmp_mode;
19176 rtx tmp, tmp2;
19178 fpcmp_mode = ix86_fp_compare_mode (code);
19179 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19181 /* Do fcomi/sahf based test when profitable. */
19182 switch (ix86_fp_comparison_strategy (code))
19184 case IX86_FPCMP_COMI:
19185 intcmp_mode = fpcmp_mode;
19186 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19187 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19188 tmp);
19189 emit_insn (tmp);
19190 break;
19192 case IX86_FPCMP_SAHF:
19193 intcmp_mode = fpcmp_mode;
19194 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19195 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19196 tmp);
19198 if (!scratch)
19199 scratch = gen_reg_rtx (HImode);
19200 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19201 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19202 break;
19204 case IX86_FPCMP_ARITH:
19205 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19206 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19207 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19208 if (!scratch)
19209 scratch = gen_reg_rtx (HImode);
19210 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19212 /* In the unordered case, we have to check C2 for NaNs, which
19213 doesn't happen to work out to anything nice combination-wise.
19214 So do some bit twiddling on the value we've got in AH to come
19215 up with an appropriate set of condition codes. */
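/* For reference, the status-word layout assumed by the masks below:
   after fnstsw %ax, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and
   C3 in bit 6 (0x40), so 0x45 tests C0|C2|C3 and 0x05 tests C0|C2.  */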
19217 intcmp_mode = CCNOmode;
19218 switch (code)
19220 case GT:
19221 case UNGT:
19222 if (code == GT || !TARGET_IEEE_FP)
19224 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19225 code = EQ;
19227 else
19229 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19230 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19231 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19232 intcmp_mode = CCmode;
19233 code = GEU;
19235 break;
19236 case LT:
19237 case UNLT:
19238 if (code == LT && TARGET_IEEE_FP)
19240 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19241 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19242 intcmp_mode = CCmode;
19243 code = EQ;
19245 else
19247 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19248 code = NE;
19250 break;
19251 case GE:
19252 case UNGE:
19253 if (code == GE || !TARGET_IEEE_FP)
19255 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19256 code = EQ;
19258 else
19260 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19261 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19262 code = NE;
19264 break;
19265 case LE:
19266 case UNLE:
19267 if (code == LE && TARGET_IEEE_FP)
19269 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19270 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19271 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19272 intcmp_mode = CCmode;
19273 code = LTU;
19275 else
19277 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19278 code = NE;
19280 break;
19281 case EQ:
19282 case UNEQ:
19283 if (code == EQ && TARGET_IEEE_FP)
19285 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19286 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19287 intcmp_mode = CCmode;
19288 code = EQ;
19290 else
19292 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19293 code = NE;
19295 break;
19296 case NE:
19297 case LTGT:
19298 if (code == NE && TARGET_IEEE_FP)
19300 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19301 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19302 GEN_INT (0x40)));
19303 code = NE;
19305 else
19307 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19308 code = EQ;
19310 break;
19312 case UNORDERED:
19313 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19314 code = NE;
19315 break;
19316 case ORDERED:
19317 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19318 code = EQ;
19319 break;
19321 default:
19322 gcc_unreachable ();
19324 break;
19326 default:
19327 gcc_unreachable();
19330 /* Return the test that should be put into the flags user, i.e.
19331 the bcc, scc, or cmov instruction. */
19332 return gen_rtx_fmt_ee (code, VOIDmode,
19333 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19334 const0_rtx);
19337 static rtx
19338 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19340 rtx ret;
19342 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19343 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19345 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19347 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19348 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19350 else
19351 ret = ix86_expand_int_compare (code, op0, op1);
19353 return ret;
19356 void
19357 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19359 enum machine_mode mode = GET_MODE (op0);
19360 rtx tmp;
19362 switch (mode)
19364 case SFmode:
19365 case DFmode:
19366 case XFmode:
19367 case QImode:
19368 case HImode:
19369 case SImode:
19370 simple:
19371 tmp = ix86_expand_compare (code, op0, op1);
19372 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19373 gen_rtx_LABEL_REF (VOIDmode, label),
19374 pc_rtx);
19375 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19376 return;
19378 case DImode:
19379 if (TARGET_64BIT)
19380 goto simple;
19381 case TImode:
19382 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
19384 rtx lo[2], hi[2], label2;
19385 enum rtx_code code1, code2, code3;
19386 enum machine_mode submode;
19388 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19390 tmp = op0, op0 = op1, op1 = tmp;
19391 code = swap_condition (code);
19394 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19395 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19397 submode = mode == DImode ? SImode : DImode;
19399 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19400 avoid two branches. This costs one extra insn, so disable when
19401 optimizing for size. */
19403 if ((code == EQ || code == NE)
19404 && (!optimize_insn_for_size_p ()
19405 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19407 rtx xor0, xor1;
19409 xor1 = hi[0];
19410 if (hi[1] != const0_rtx)
19411 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19412 NULL_RTX, 0, OPTAB_WIDEN);
19414 xor0 = lo[0];
19415 if (lo[1] != const0_rtx)
19416 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19417 NULL_RTX, 0, OPTAB_WIDEN);
19419 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19420 NULL_RTX, 0, OPTAB_WIDEN);
19422 ix86_expand_branch (code, tmp, const0_rtx, label);
19423 return;
19426 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19427 op1 is a constant and the low word is zero, then we can just
19428 examine the high word. Similarly for low word -1 and
19429 less-or-equal-than or greater-than. */
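/* For example (the constant is purely illustrative): an ia32 DImode test
   such as `x < 0x500000000ULL' has a zero low word in the constant, so
   the LTU case below collapses the whole branch into a single compare of
   the high word against 5.  */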
19431 if (CONST_INT_P (hi[1]))
19432 switch (code)
19434 case LT: case LTU: case GE: case GEU:
19435 if (lo[1] == const0_rtx)
19437 ix86_expand_branch (code, hi[0], hi[1], label);
19438 return;
19440 break;
19441 case LE: case LEU: case GT: case GTU:
19442 if (lo[1] == constm1_rtx)
19444 ix86_expand_branch (code, hi[0], hi[1], label);
19445 return;
19447 break;
19448 default:
19449 break;
19452 /* Otherwise, we need two or three jumps. */
19454 label2 = gen_label_rtx ();
19456 code1 = code;
19457 code2 = swap_condition (code);
19458 code3 = unsigned_condition (code);
19460 switch (code)
19462 case LT: case GT: case LTU: case GTU:
19463 break;
19465 case LE: code1 = LT; code2 = GT; break;
19466 case GE: code1 = GT; code2 = LT; break;
19467 case LEU: code1 = LTU; code2 = GTU; break;
19468 case GEU: code1 = GTU; code2 = LTU; break;
19470 case EQ: code1 = UNKNOWN; code2 = NE; break;
19471 case NE: code2 = UNKNOWN; break;
19473 default:
19474 gcc_unreachable ();
19478 * a < b =>
19479 * if (hi(a) < hi(b)) goto true;
19480 * if (hi(a) > hi(b)) goto false;
19481 * if (lo(a) < lo(b)) goto true;
19482 * false:
19485 if (code1 != UNKNOWN)
19486 ix86_expand_branch (code1, hi[0], hi[1], label);
19487 if (code2 != UNKNOWN)
19488 ix86_expand_branch (code2, hi[0], hi[1], label2);
19490 ix86_expand_branch (code3, lo[0], lo[1], label);
19492 if (code2 != UNKNOWN)
19493 emit_label (label2);
19494 return;
19497 default:
19498 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19499 goto simple;
19503 /* Split branch based on floating point condition. */
19504 void
19505 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19506 rtx target1, rtx target2, rtx tmp, rtx pushed)
19508 rtx condition;
19509 rtx i;
19511 if (target2 != pc_rtx)
19513 rtx tmp = target2;
19514 code = reverse_condition_maybe_unordered (code);
19515 target2 = target1;
19516 target1 = tmp;
19519 condition = ix86_expand_fp_compare (code, op1, op2,
19520 tmp);
19522 /* Remove pushed operand from stack. */
19523 if (pushed)
19524 ix86_free_from_memory (GET_MODE (pushed));
19526 i = emit_jump_insn (gen_rtx_SET
19527 (VOIDmode, pc_rtx,
19528 gen_rtx_IF_THEN_ELSE (VOIDmode,
19529 condition, target1, target2)));
19530 if (split_branch_probability >= 0)
19531 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19534 void
19535 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19537 rtx ret;
19539 gcc_assert (GET_MODE (dest) == QImode);
19541 ret = ix86_expand_compare (code, op0, op1);
19542 PUT_MODE (ret, QImode);
19543 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19546 /* Expand comparison setting or clearing carry flag. Return true when
19547 successful and set pop for the operation. */
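/* The point of a carry-flag compare is that a single cmp leaves the whole
   result in CF, where sbb/adc can consume it without a setcc.  A minimal
   sketch of the kind of sequence this enables:

   * cmpl %esi, %edi
   * sbbl %eax, %eax		; %eax = (%edi <u %esi) ? -1 : 0  */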
19548 static bool
19549 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19551 enum machine_mode mode =
19552 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19554 /* Do not handle double-mode compares that go through a special path. */
19555 if (mode == (TARGET_64BIT ? TImode : DImode))
19556 return false;
19558 if (SCALAR_FLOAT_MODE_P (mode))
19560 rtx compare_op, compare_seq;
19562 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19564 /* Shortcut: the following common codes never translate
19565 into carry flag compares. */
19566 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19567 || code == ORDERED || code == UNORDERED)
19568 return false;
19570 /* These comparisons require the zero flag; swap the operands so they won't. */
19571 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19572 && !TARGET_IEEE_FP)
19574 rtx tmp = op0;
19575 op0 = op1;
19576 op1 = tmp;
19577 code = swap_condition (code);
19580 /* Try to expand the comparison and verify that we end up with a
19581 carry flag based comparison. This fails only when we decide to
19582 expand the comparison using arithmetic, which is not a very
19583 common scenario. */
19584 start_sequence ();
19585 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19586 compare_seq = get_insns ();
19587 end_sequence ();
19589 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19590 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19591 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19592 else
19593 code = GET_CODE (compare_op);
19595 if (code != LTU && code != GEU)
19596 return false;
19598 emit_insn (compare_seq);
19599 *pop = compare_op;
19600 return true;
19603 if (!INTEGRAL_MODE_P (mode))
19604 return false;
19606 switch (code)
19608 case LTU:
19609 case GEU:
19610 break;
19612 /* Convert a==0 into (unsigned)a<1. */
19613 case EQ:
19614 case NE:
19615 if (op1 != const0_rtx)
19616 return false;
19617 op1 = const1_rtx;
19618 code = (code == EQ ? LTU : GEU);
19619 break;
19621 /* Convert a>b into b<a or a>=b+1. */
19622 case GTU:
19623 case LEU:
19624 if (CONST_INT_P (op1))
19626 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19627 /* Bail out on overflow. We can still swap the operands, but that
19628 would force loading the constant into a register. */
19629 if (op1 == const0_rtx
19630 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19631 return false;
19632 code = (code == GTU ? GEU : LTU);
19634 else
19636 rtx tmp = op1;
19637 op1 = op0;
19638 op0 = tmp;
19639 code = (code == GTU ? LTU : GEU);
19641 break;
19643 /* Convert a>=0 into (unsigned)a<0x80000000. */
19644 case LT:
19645 case GE:
19646 if (mode == DImode || op1 != const0_rtx)
19647 return false;
19648 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19649 code = (code == LT ? GEU : LTU);
19650 break;
19651 case LE:
19652 case GT:
19653 if (mode == DImode || op1 != constm1_rtx)
19654 return false;
19655 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19656 code = (code == LE ? GEU : LTU);
19657 break;
19659 default:
19660 return false;
19662 /* Swapping operands may cause a constant to appear as the first operand. */
19663 if (!nonimmediate_operand (op0, VOIDmode))
19665 if (!can_create_pseudo_p ())
19666 return false;
19667 op0 = force_reg (mode, op0);
19669 *pop = ix86_expand_compare (code, op0, op1);
19670 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19671 return true;
19674 bool
19675 ix86_expand_int_movcc (rtx operands[])
19677 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19678 rtx compare_seq, compare_op;
19679 enum machine_mode mode = GET_MODE (operands[0]);
19680 bool sign_bit_compare_p = false;
19681 rtx op0 = XEXP (operands[1], 0);
19682 rtx op1 = XEXP (operands[1], 1);
19684 if (GET_MODE (op0) == TImode
19685 || (GET_MODE (op0) == DImode
19686 && !TARGET_64BIT))
19687 return false;
19689 start_sequence ();
19690 compare_op = ix86_expand_compare (code, op0, op1);
19691 compare_seq = get_insns ();
19692 end_sequence ();
19694 compare_code = GET_CODE (compare_op);
19696 if ((op1 == const0_rtx && (code == GE || code == LT))
19697 || (op1 == constm1_rtx && (code == GT || code == LE)))
19698 sign_bit_compare_p = true;
19700 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19701 HImode insns, we'd be swallowed in word prefix ops. */
19703 if ((mode != HImode || TARGET_FAST_PREFIX)
19704 && (mode != (TARGET_64BIT ? TImode : DImode))
19705 && CONST_INT_P (operands[2])
19706 && CONST_INT_P (operands[3]))
19708 rtx out = operands[0];
19709 HOST_WIDE_INT ct = INTVAL (operands[2]);
19710 HOST_WIDE_INT cf = INTVAL (operands[3]);
19711 HOST_WIDE_INT diff;
19713 diff = ct - cf;
19714 /* Sign-bit compares are better done using shifts than by
19715 using sbb. */
19716 if (sign_bit_compare_p
19717 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19719 /* Detect overlap between destination and compare sources. */
19720 rtx tmp = out;
19722 if (!sign_bit_compare_p)
19724 rtx flags;
19725 bool fpcmp = false;
19727 compare_code = GET_CODE (compare_op);
19729 flags = XEXP (compare_op, 0);
19731 if (GET_MODE (flags) == CCFPmode
19732 || GET_MODE (flags) == CCFPUmode)
19734 fpcmp = true;
19735 compare_code
19736 = ix86_fp_compare_code_to_integer (compare_code);
19739 /* To simplify the rest of the code, restrict to the GEU case. */
19740 if (compare_code == LTU)
19742 HOST_WIDE_INT tmp = ct;
19743 ct = cf;
19744 cf = tmp;
19745 compare_code = reverse_condition (compare_code);
19746 code = reverse_condition (code);
19748 else
19750 if (fpcmp)
19751 PUT_CODE (compare_op,
19752 reverse_condition_maybe_unordered
19753 (GET_CODE (compare_op)));
19754 else
19755 PUT_CODE (compare_op,
19756 reverse_condition (GET_CODE (compare_op)));
19758 diff = ct - cf;
19760 if (reg_overlap_mentioned_p (out, op0)
19761 || reg_overlap_mentioned_p (out, op1))
19762 tmp = gen_reg_rtx (mode);
19764 if (mode == DImode)
19765 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19766 else
19767 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19768 flags, compare_op));
19770 else
19772 if (code == GT || code == GE)
19773 code = reverse_condition (code);
19774 else
19776 HOST_WIDE_INT tmp = ct;
19777 ct = cf;
19778 cf = tmp;
19779 diff = ct - cf;
19781 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19784 if (diff == 1)
19787 * cmpl op0,op1
19788 * sbbl dest,dest
19789 * [addl dest, ct]
19791 * Size 5 - 8.
19793 if (ct)
19794 tmp = expand_simple_binop (mode, PLUS,
19795 tmp, GEN_INT (ct),
19796 copy_rtx (tmp), 1, OPTAB_DIRECT);
19798 else if (cf == -1)
19801 * cmpl op0,op1
19802 * sbbl dest,dest
19803 * orl $ct, dest
19805 * Size 8.
19807 tmp = expand_simple_binop (mode, IOR,
19808 tmp, GEN_INT (ct),
19809 copy_rtx (tmp), 1, OPTAB_DIRECT);
19811 else if (diff == -1 && ct)
19814 * cmpl op0,op1
19815 * sbbl dest,dest
19816 * notl dest
19817 * [addl dest, cf]
19819 * Size 8 - 11.
19821 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19822 if (cf)
19823 tmp = expand_simple_binop (mode, PLUS,
19824 copy_rtx (tmp), GEN_INT (cf),
19825 copy_rtx (tmp), 1, OPTAB_DIRECT);
19827 else
19830 * cmpl op0,op1
19831 * sbbl dest,dest
19832 * [notl dest]
19833 * andl cf - ct, dest
19834 * [addl dest, ct]
19836 * Size 8 - 11.
19839 if (cf == 0)
19841 cf = ct;
19842 ct = 0;
19843 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19846 tmp = expand_simple_binop (mode, AND,
19847 copy_rtx (tmp),
19848 gen_int_mode (cf - ct, mode),
19849 copy_rtx (tmp), 1, OPTAB_DIRECT);
19850 if (ct)
19851 tmp = expand_simple_binop (mode, PLUS,
19852 copy_rtx (tmp), GEN_INT (ct),
19853 copy_rtx (tmp), 1, OPTAB_DIRECT);
19856 if (!rtx_equal_p (tmp, out))
19857 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19859 return true;
19862 if (diff < 0)
19864 enum machine_mode cmp_mode = GET_MODE (op0);
19866 HOST_WIDE_INT tmp;
19867 tmp = ct, ct = cf, cf = tmp;
19868 diff = -diff;
19870 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19872 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19874 /* We may be reversing an unordered compare to a normal compare, which
19875 is not valid in general (we may convert a non-trapping condition
19876 to a trapping one); however, on i386 we currently emit all
19877 comparisons unordered. */
19878 compare_code = reverse_condition_maybe_unordered (compare_code);
19879 code = reverse_condition_maybe_unordered (code);
19881 else
19883 compare_code = reverse_condition (compare_code);
19884 code = reverse_condition (code);
19888 compare_code = UNKNOWN;
19889 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19890 && CONST_INT_P (op1))
19892 if (op1 == const0_rtx
19893 && (code == LT || code == GE))
19894 compare_code = code;
19895 else if (op1 == constm1_rtx)
19897 if (code == LE)
19898 compare_code = LT;
19899 else if (code == GT)
19900 compare_code = GE;
19904 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19905 if (compare_code != UNKNOWN
19906 && GET_MODE (op0) == GET_MODE (out)
19907 && (cf == -1 || ct == -1))
19909 /* If the lea code below could be used, only optimize
19910 if it results in a 2-insn sequence. */
19912 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19913 || diff == 3 || diff == 5 || diff == 9)
19914 || (compare_code == LT && ct == -1)
19915 || (compare_code == GE && cf == -1))
19918 * notl op1 (if necessary)
19919 * sarl $31, op1
19920 * orl cf, op1
19922 if (ct != -1)
19924 cf = ct;
19925 ct = -1;
19926 code = reverse_condition (code);
19929 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19931 out = expand_simple_binop (mode, IOR,
19932 out, GEN_INT (cf),
19933 out, 1, OPTAB_DIRECT);
19934 if (out != operands[0])
19935 emit_move_insn (operands[0], out);
19937 return true;
19942 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19943 || diff == 3 || diff == 5 || diff == 9)
19944 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19945 && (mode != DImode
19946 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19949 * xorl dest,dest
19950 * cmpl op1,op2
19951 * setcc dest
19952 * lea cf(dest*(ct-cf)),dest
19954 * Size 14.
19956 * This also catches the degenerate setcc-only case.
19959 rtx tmp;
19960 int nops;
19962 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19964 nops = 0;
19965 /* On x86_64 the lea instruction operates on Pmode, so we need
19966 to get the arithmetic done in the proper mode to match. */
19967 if (diff == 1)
19968 tmp = copy_rtx (out);
19969 else
19971 rtx out1;
19972 out1 = copy_rtx (out);
19973 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19974 nops++;
19975 if (diff & 1)
19977 tmp = gen_rtx_PLUS (mode, tmp, out1);
19978 nops++;
19981 if (cf != 0)
19983 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19984 nops++;
19986 if (!rtx_equal_p (tmp, out))
19988 if (nops == 1)
19989 out = force_operand (tmp, copy_rtx (out));
19990 else
19991 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19993 if (!rtx_equal_p (out, operands[0]))
19994 emit_move_insn (operands[0], copy_rtx (out));
19996 return true;
20000 * General case: Jumpful:
20001 * xorl dest,dest cmpl op1, op2
20002 * cmpl op1, op2 movl ct, dest
20003 * setcc dest jcc 1f
20004 * decl dest movl cf, dest
20005 * andl (cf-ct),dest 1:
20006 * addl ct,dest
20008 * Size 20. Size 14.
20010 * This is reasonably steep, but branch mispredict costs are
20011 * high on modern cpus, so consider failing only if optimizing
20012 * for space.
20015 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20016 && BRANCH_COST (optimize_insn_for_speed_p (),
20017 false) >= 2)
20019 if (cf == 0)
20021 enum machine_mode cmp_mode = GET_MODE (op0);
20023 cf = ct;
20024 ct = 0;
20026 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20028 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20030 /* We may be reversing an unordered compare to a normal compare,
20031 which is not valid in general (we may convert a non-trapping
20032 condition to a trapping one); however, on i386 we currently
20033 emit all comparisons unordered. */
20034 code = reverse_condition_maybe_unordered (code);
20036 else
20038 code = reverse_condition (code);
20039 if (compare_code != UNKNOWN)
20040 compare_code = reverse_condition (compare_code);
20044 if (compare_code != UNKNOWN)
20046 /* notl op1 (if needed)
20047 sarl $31, op1
20048 andl (cf-ct), op1
20049 addl ct, op1
20051 For x < 0 (resp. x <= -1) there will be no notl,
20052 so if possible swap the constants to get rid of the
20053 complement.
20054 True/false will be -1/0 while code below (store flag
20055 followed by decrement) is 0/-1, so the constants need
20056 to be exchanged once more. */
20058 if (compare_code == GE || !cf)
20060 code = reverse_condition (code);
20061 compare_code = LT;
20063 else
20065 HOST_WIDE_INT tmp = cf;
20066 cf = ct;
20067 ct = tmp;
20070 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20072 else
20074 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20076 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20077 constm1_rtx,
20078 copy_rtx (out), 1, OPTAB_DIRECT);
20081 out = expand_simple_binop (mode, AND, copy_rtx (out),
20082 gen_int_mode (cf - ct, mode),
20083 copy_rtx (out), 1, OPTAB_DIRECT);
20084 if (ct)
20085 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20086 copy_rtx (out), 1, OPTAB_DIRECT);
20087 if (!rtx_equal_p (out, operands[0]))
20088 emit_move_insn (operands[0], copy_rtx (out));
20090 return true;
20094 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20096 /* Try a few more things with specific constants and a variable. */
20098 optab op;
20099 rtx var, orig_out, out, tmp;
20101 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20102 return false;
20104 /* If one of the two operands is an interesting constant, load a
20105 constant with the above and mask it in with a logical operation. */
20107 if (CONST_INT_P (operands[2]))
20109 var = operands[3];
20110 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20111 operands[3] = constm1_rtx, op = and_optab;
20112 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20113 operands[3] = const0_rtx, op = ior_optab;
20114 else
20115 return false;
20117 else if (CONST_INT_P (operands[3]))
20119 var = operands[2];
20120 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20121 operands[2] = constm1_rtx, op = and_optab;
20122 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20123 operands[2] = const0_rtx, op = ior_optab;
20124 else
20125 return false;
20127 else
20128 return false;
20130 orig_out = operands[0];
20131 tmp = gen_reg_rtx (mode);
20132 operands[0] = tmp;
20134 /* Recurse to get the constant loaded. */
20135 if (ix86_expand_int_movcc (operands) == 0)
20136 return false;
20138 /* Mask in the interesting variable. */
20139 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20140 OPTAB_WIDEN);
20141 if (!rtx_equal_p (out, orig_out))
20142 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20144 return true;
20148 * For comparison with above,
20150 * movl cf,dest
20151 * movl ct,tmp
20152 * cmpl op1,op2
20153 * cmovcc tmp,dest
20155 * Size 15.
20158 if (! nonimmediate_operand (operands[2], mode))
20159 operands[2] = force_reg (mode, operands[2]);
20160 if (! nonimmediate_operand (operands[3], mode))
20161 operands[3] = force_reg (mode, operands[3]);
20163 if (! register_operand (operands[2], VOIDmode)
20164 && (mode == QImode
20165 || ! register_operand (operands[3], VOIDmode)))
20166 operands[2] = force_reg (mode, operands[2]);
20168 if (mode == QImode
20169 && ! register_operand (operands[3], VOIDmode))
20170 operands[3] = force_reg (mode, operands[3]);
20172 emit_insn (compare_seq);
20173 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20174 gen_rtx_IF_THEN_ELSE (mode,
20175 compare_op, operands[2],
20176 operands[3])));
20177 return true;
20180 /* Swap, force into registers, or otherwise massage the two operands
20181 to an sse comparison with a mask result. Thus we differ a bit from
20182 ix86_prepare_fp_compare_args which expects to produce a flags result.
20184 The DEST operand exists to help determine whether to commute commutative
20185 operators. The POP0/POP1 operands are updated in place. The new
20186 comparison code is returned, or UNKNOWN if not implementable. */
20188 static enum rtx_code
20189 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20190 rtx *pop0, rtx *pop1)
20192 rtx tmp;
20194 switch (code)
20196 case LTGT:
20197 case UNEQ:
20198 /* AVX supports all the needed comparisons. */
20199 if (TARGET_AVX)
20200 break;
20201 /* We have no LTGT as an operator. We could implement it with
20202 NE & ORDERED, but this requires an extra temporary. It's
20203 not clear that it's worth it. */
20204 return UNKNOWN;
20206 case LT:
20207 case LE:
20208 case UNGT:
20209 case UNGE:
20210 /* These are supported directly. */
20211 break;
20213 case EQ:
20214 case NE:
20215 case UNORDERED:
20216 case ORDERED:
20217 /* AVX has 3 operand comparisons, no need to swap anything. */
20218 if (TARGET_AVX)
20219 break;
20220 /* For commutative operators, try to canonicalize the destination
20221 operand to be first in the comparison - this helps reload to
20222 avoid extra moves. */
20223 if (!dest || !rtx_equal_p (dest, *pop1))
20224 break;
20225 /* FALLTHRU */
20227 case GE:
20228 case GT:
20229 case UNLE:
20230 case UNLT:
20231 /* These are not supported directly before AVX, and furthermore
20232 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20233 comparison operands to transform into something that is
20234 supported. */
20235 tmp = *pop0;
20236 *pop0 = *pop1;
20237 *pop1 = tmp;
20238 code = swap_condition (code);
20239 break;
20241 default:
20242 gcc_unreachable ();
20245 return code;
20248 /* Detect conditional moves that exactly match min/max operational
20249 semantics. Note that this is IEEE safe, as long as we don't
20250 interchange the operands.
20252 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20253 and TRUE if the operation is successful and instructions are emitted. */
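/* A sketch of why the operand order matters: minss/minpd compute
   (op1 < op2) ? op1 : op2 and return the second operand when the
   comparison is unordered or both values are zero, so `a < b ? a : b'
   maps onto the instruction only with the operands in that order;
   swapping them would change NaN and signed-zero behaviour.  */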
20255 static bool
20256 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20257 rtx cmp_op1, rtx if_true, rtx if_false)
20259 enum machine_mode mode;
20260 bool is_min;
20261 rtx tmp;
20263 if (code == LT)
20265 else if (code == UNGE)
20267 tmp = if_true;
20268 if_true = if_false;
20269 if_false = tmp;
20271 else
20272 return false;
20274 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20275 is_min = true;
20276 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20277 is_min = false;
20278 else
20279 return false;
20281 mode = GET_MODE (dest);
20283 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20284 but MODE may be a vector mode and thus not appropriate. */
20285 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20287 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20288 rtvec v;
20290 if_true = force_reg (mode, if_true);
20291 v = gen_rtvec (2, if_true, if_false);
20292 tmp = gen_rtx_UNSPEC (mode, v, u);
20294 else
20296 code = is_min ? SMIN : SMAX;
20297 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20300 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20301 return true;
20304 /* Expand an sse vector comparison. Return the register with the result. */
20306 static rtx
20307 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20308 rtx op_true, rtx op_false)
20310 enum machine_mode mode = GET_MODE (dest);
20311 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20312 rtx x;
20314 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20315 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20316 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20318 if (optimize
20319 || reg_overlap_mentioned_p (dest, op_true)
20320 || reg_overlap_mentioned_p (dest, op_false))
20321 dest = gen_reg_rtx (mode);
20323 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20324 if (cmp_mode != mode)
20326 x = force_reg (cmp_mode, x);
20327 convert_move (dest, x, false);
20329 else
20330 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20332 return dest;
20335 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20336 operations. This is used for both scalar and vector conditional moves. */
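/* When no blend instruction applies, the fallback below is the classic
   mask trick, dest = (cmp & op_true) | (~cmp & op_false), i.e. roughly
   (a V4SFmode sketch; the exact operand order depends on the two-address
   forms actually emitted):

   * andps  cmp, op_true
   * andnps cmp, op_false
   * orps   op_false, op_true  */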
20338 static void
20339 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20341 enum machine_mode mode = GET_MODE (dest);
20342 rtx t2, t3, x;
20344 if (vector_all_ones_operand (op_true, mode)
20345 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20347 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20349 else if (op_false == CONST0_RTX (mode))
20351 op_true = force_reg (mode, op_true);
20352 x = gen_rtx_AND (mode, cmp, op_true);
20353 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20355 else if (op_true == CONST0_RTX (mode))
20357 op_false = force_reg (mode, op_false);
20358 x = gen_rtx_NOT (mode, cmp);
20359 x = gen_rtx_AND (mode, x, op_false);
20360 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20362 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20364 op_false = force_reg (mode, op_false);
20365 x = gen_rtx_IOR (mode, cmp, op_false);
20366 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20368 else if (TARGET_XOP)
20370 op_true = force_reg (mode, op_true);
20372 if (!nonimmediate_operand (op_false, mode))
20373 op_false = force_reg (mode, op_false);
20375 emit_insn (gen_rtx_SET (mode, dest,
20376 gen_rtx_IF_THEN_ELSE (mode, cmp,
20377 op_true,
20378 op_false)));
20380 else
20382 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20384 if (!nonimmediate_operand (op_true, mode))
20385 op_true = force_reg (mode, op_true);
20387 op_false = force_reg (mode, op_false);
20389 switch (mode)
20391 case V4SFmode:
20392 if (TARGET_SSE4_1)
20393 gen = gen_sse4_1_blendvps;
20394 break;
20395 case V2DFmode:
20396 if (TARGET_SSE4_1)
20397 gen = gen_sse4_1_blendvpd;
20398 break;
20399 case V16QImode:
20400 case V8HImode:
20401 case V4SImode:
20402 case V2DImode:
20403 if (TARGET_SSE4_1)
20405 gen = gen_sse4_1_pblendvb;
20406 dest = gen_lowpart (V16QImode, dest);
20407 op_false = gen_lowpart (V16QImode, op_false);
20408 op_true = gen_lowpart (V16QImode, op_true);
20409 cmp = gen_lowpart (V16QImode, cmp);
20411 break;
20412 case V8SFmode:
20413 if (TARGET_AVX)
20414 gen = gen_avx_blendvps256;
20415 break;
20416 case V4DFmode:
20417 if (TARGET_AVX)
20418 gen = gen_avx_blendvpd256;
20419 break;
20420 case V32QImode:
20421 case V16HImode:
20422 case V8SImode:
20423 case V4DImode:
20424 if (TARGET_AVX2)
20426 gen = gen_avx2_pblendvb;
20427 dest = gen_lowpart (V32QImode, dest);
20428 op_false = gen_lowpart (V32QImode, op_false);
20429 op_true = gen_lowpart (V32QImode, op_true);
20430 cmp = gen_lowpart (V32QImode, cmp);
20432 break;
20433 default:
20434 break;
20437 if (gen != NULL)
20438 emit_insn (gen (dest, op_false, op_true, cmp));
20439 else
20441 op_true = force_reg (mode, op_true);
20443 t2 = gen_reg_rtx (mode);
20444 if (optimize)
20445 t3 = gen_reg_rtx (mode);
20446 else
20447 t3 = dest;
20449 x = gen_rtx_AND (mode, op_true, cmp);
20450 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20452 x = gen_rtx_NOT (mode, cmp);
20453 x = gen_rtx_AND (mode, x, op_false);
20454 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20456 x = gen_rtx_IOR (mode, t3, t2);
20457 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20462 /* Expand a floating-point conditional move. Return true if successful. */
20464 bool
20465 ix86_expand_fp_movcc (rtx operands[])
20467 enum machine_mode mode = GET_MODE (operands[0]);
20468 enum rtx_code code = GET_CODE (operands[1]);
20469 rtx tmp, compare_op;
20470 rtx op0 = XEXP (operands[1], 0);
20471 rtx op1 = XEXP (operands[1], 1);
20473 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20475 enum machine_mode cmode;
20477 /* Since we have no cmove for SSE registers, don't force bad register
20478 allocation just to gain access to it. Deny movcc when the
20479 comparison mode doesn't match the move mode. */
20480 cmode = GET_MODE (op0);
20481 if (cmode == VOIDmode)
20482 cmode = GET_MODE (op1);
20483 if (cmode != mode)
20484 return false;
20486 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20487 if (code == UNKNOWN)
20488 return false;
20490 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20491 operands[2], operands[3]))
20492 return true;
20494 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20495 operands[2], operands[3]);
20496 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20497 return true;
20500 if (GET_MODE (op0) == TImode
20501 || (GET_MODE (op0) == DImode
20502 && !TARGET_64BIT))
20503 return false;
20505 /* The floating point conditional move instructions don't directly
20506 support conditions resulting from a signed integer comparison. */
20508 compare_op = ix86_expand_compare (code, op0, op1);
20509 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20511 tmp = gen_reg_rtx (QImode);
20512 ix86_expand_setcc (tmp, code, op0, op1);
20514 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20517 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20518 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20519 operands[2], operands[3])));
20521 return true;
20524 /* Expand a floating-point vector conditional move; a vcond operation
20525 rather than a movcc operation. */
20527 bool
20528 ix86_expand_fp_vcond (rtx operands[])
20530 enum rtx_code code = GET_CODE (operands[3]);
20531 rtx cmp;
20533 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20534 &operands[4], &operands[5]);
20535 if (code == UNKNOWN)
20537 rtx temp;
20538 switch (GET_CODE (operands[3]))
20540 case LTGT:
20541 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20542 operands[5], operands[0], operands[0]);
20543 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20544 operands[5], operands[1], operands[2]);
20545 code = AND;
20546 break;
20547 case UNEQ:
20548 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20549 operands[5], operands[0], operands[0]);
20550 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20551 operands[5], operands[1], operands[2]);
20552 code = IOR;
20553 break;
20554 default:
20555 gcc_unreachable ();
20557 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20558 OPTAB_DIRECT);
20559 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20560 return true;
20563 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20564 operands[5], operands[1], operands[2]))
20565 return true;
20567 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20568 operands[1], operands[2]);
20569 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20570 return true;
20573 /* Expand a signed/unsigned integral vector conditional move. */
20575 bool
20576 ix86_expand_int_vcond (rtx operands[])
20578 enum machine_mode data_mode = GET_MODE (operands[0]);
20579 enum machine_mode mode = GET_MODE (operands[4]);
20580 enum rtx_code code = GET_CODE (operands[3]);
20581 bool negate = false;
20582 rtx x, cop0, cop1;
20584 cop0 = operands[4];
20585 cop1 = operands[5];
20587 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20588 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20589 if ((code == LT || code == GE)
20590 && data_mode == mode
20591 && cop1 == CONST0_RTX (mode)
20592 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20593 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20594 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20595 && (GET_MODE_SIZE (data_mode) == 16
20596 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20598 rtx negop = operands[2 - (code == LT)];
20599 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20600 if (negop == CONST1_RTX (data_mode))
20602 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20603 operands[0], 1, OPTAB_DIRECT);
20604 if (res != operands[0])
20605 emit_move_insn (operands[0], res);
20606 return true;
20608 else if (GET_MODE_INNER (data_mode) != DImode
20609 && vector_all_ones_operand (negop, data_mode))
20611 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20612 operands[0], 0, OPTAB_DIRECT);
20613 if (res != operands[0])
20614 emit_move_insn (operands[0], res);
20615 return true;
20619 if (!nonimmediate_operand (cop1, mode))
20620 cop1 = force_reg (mode, cop1);
20621 if (!general_operand (operands[1], data_mode))
20622 operands[1] = force_reg (data_mode, operands[1]);
20623 if (!general_operand (operands[2], data_mode))
20624 operands[2] = force_reg (data_mode, operands[2]);
20626 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20627 if (TARGET_XOP
20628 && (mode == V16QImode || mode == V8HImode
20629 || mode == V4SImode || mode == V2DImode))
20631 else
20633 /* Canonicalize the comparison to EQ, GT, GTU. */
20634 switch (code)
20636 case EQ:
20637 case GT:
20638 case GTU:
20639 break;
20641 case NE:
20642 case LE:
20643 case LEU:
20644 code = reverse_condition (code);
20645 negate = true;
20646 break;
20648 case GE:
20649 case GEU:
20650 code = reverse_condition (code);
20651 negate = true;
20652 /* FALLTHRU */
20654 case LT:
20655 case LTU:
20656 code = swap_condition (code);
20657 x = cop0, cop0 = cop1, cop1 = x;
20658 break;
20660 default:
20661 gcc_unreachable ();
20664 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20665 if (mode == V2DImode)
20667 switch (code)
20669 case EQ:
20670 /* SSE4.1 supports EQ. */
20671 if (!TARGET_SSE4_1)
20672 return false;
20673 break;
20675 case GT:
20676 case GTU:
20677 /* SSE4.2 supports GT/GTU. */
20678 if (!TARGET_SSE4_2)
20679 return false;
20680 break;
20682 default:
20683 gcc_unreachable ();
20687 /* Unsigned parallel compare is not supported by the hardware.
20688 Play some tricks to turn this into a signed comparison
20689 against 0. */
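/* The full-width integer cases below rely on the identity
   (a >u b) == ((a - 0x80000000) >s (b - 0x80000000)), i.e. flipping the
   sign bit of both operands turns the unsigned comparison into the
   signed one the hardware does provide (the constant shown is for
   32-bit elements; ix86_build_signbit_mask builds the per-element
   equivalent).  */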
20690 if (code == GTU)
20692 cop0 = force_reg (mode, cop0);
20694 switch (mode)
20696 case V8SImode:
20697 case V4DImode:
20698 case V4SImode:
20699 case V2DImode:
20701 rtx t1, t2, mask;
20702 rtx (*gen_sub3) (rtx, rtx, rtx);
20704 switch (mode)
20706 case V8SImode: gen_sub3 = gen_subv8si3; break;
20707 case V4DImode: gen_sub3 = gen_subv4di3; break;
20708 case V4SImode: gen_sub3 = gen_subv4si3; break;
20709 case V2DImode: gen_sub3 = gen_subv2di3; break;
20710 default:
20711 gcc_unreachable ();
20713 /* Subtract (-(INT MAX) - 1), i.e. the sign bit, from both operands
20714 so the unsigned comparison becomes an equivalent signed one. */
20715 mask = ix86_build_signbit_mask (mode, true, false);
20716 t1 = gen_reg_rtx (mode);
20717 emit_insn (gen_sub3 (t1, cop0, mask));
20719 t2 = gen_reg_rtx (mode);
20720 emit_insn (gen_sub3 (t2, cop1, mask));
20722 cop0 = t1;
20723 cop1 = t2;
20724 code = GT;
20726 break;
20728 case V32QImode:
20729 case V16HImode:
20730 case V16QImode:
20731 case V8HImode:
20732 /* Perform a parallel unsigned saturating subtraction. */
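/* This works because an unsigned saturating subtract yields
   (a -us b) == 0 exactly when a <=u b, so GTU is the negation of the
   EQ-against-zero compare set up below (hence negate is flipped).  */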
20733 x = gen_reg_rtx (mode);
20734 emit_insn (gen_rtx_SET (VOIDmode, x,
20735 gen_rtx_US_MINUS (mode, cop0, cop1)));
20737 cop0 = x;
20738 cop1 = CONST0_RTX (mode);
20739 code = EQ;
20740 negate = !negate;
20741 break;
20743 default:
20744 gcc_unreachable ();
20749 /* Allow the comparison to be done in one mode, but the movcc to
20750 happen in another mode. */
20751 if (data_mode == mode)
20753 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20754 operands[1+negate], operands[2-negate]);
20756 else
20758 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20759 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20760 code, cop0, cop1,
20761 operands[1+negate], operands[2-negate]);
20762 x = gen_lowpart (data_mode, x);
20765 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20766 operands[2-negate]);
20767 return true;
20770 /* Expand a variable vector permutation. */
20772 void
20773 ix86_expand_vec_perm (rtx operands[])
20775 rtx target = operands[0];
20776 rtx op0 = operands[1];
20777 rtx op1 = operands[2];
20778 rtx mask = operands[3];
20779 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20780 enum machine_mode mode = GET_MODE (op0);
20781 enum machine_mode maskmode = GET_MODE (mask);
20782 int w, e, i;
20783 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20785 /* Number of elements in the vector. */
20786 w = GET_MODE_NUNITS (mode);
20787 e = GET_MODE_UNIT_SIZE (mode);
20788 gcc_assert (w <= 32);
20790 if (TARGET_AVX2)
20792 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20794 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20795 a constant shuffle operand. With a tiny bit of effort we can
20796 use VPERMD instead. A re-interpretation stall for V4DFmode is
20797 unfortunate but there's no avoiding it.
20798 Similarly, for V16HImode we don't have instructions for variable
20799 shuffling, while for V32QImode, after preparing suitable masks,
20800 we can use vpshufb; vpshufb; vpermq; vpor. */
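/* As a worked example (the index values are purely illustrative), a
   V4DImode index vector { 3 0 2 1 } becomes, after the replicate,
   double and add-one steps below, the V8SImode vector
   { 6 7 0 1 4 5 2 3 }, which selects the same qwords one dword at a
   time for VPERMD.  */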
20802 if (mode == V16HImode)
20804 maskmode = mode = V32QImode;
20805 w = 32;
20806 e = 1;
20808 else
20810 maskmode = mode = V8SImode;
20811 w = 8;
20812 e = 4;
20814 t1 = gen_reg_rtx (maskmode);
20816 /* Replicate the low bits of the V4DImode mask into V8SImode:
20817 mask = { A B C D }
20818 t1 = { A A B B C C D D }. */
20819 for (i = 0; i < w / 2; ++i)
20820 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20821 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20822 vt = force_reg (maskmode, vt);
20823 mask = gen_lowpart (maskmode, mask);
20824 if (maskmode == V8SImode)
20825 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20826 else
20827 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20829 /* Multiply the shuffle indices by two. */
20830 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20831 OPTAB_DIRECT);
20833 /* Add one to the odd shuffle indices:
20834 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20835 for (i = 0; i < w / 2; ++i)
20837 vec[i * 2] = const0_rtx;
20838 vec[i * 2 + 1] = const1_rtx;
20840 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20841 vt = validize_mem (force_const_mem (maskmode, vt));
20842 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20843 OPTAB_DIRECT);
20845 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20846 operands[3] = mask = t1;
20847 target = gen_lowpart (mode, target);
20848 op0 = gen_lowpart (mode, op0);
20849 op1 = gen_lowpart (mode, op1);
20852 switch (mode)
20854 case V8SImode:
20855 /* The VPERMD and VPERMPS instructions already properly ignore
20856 the high bits of the shuffle elements. No need for us to
20857 perform an AND ourselves. */
20858 if (one_operand_shuffle)
20859 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20860 else
20862 t1 = gen_reg_rtx (V8SImode);
20863 t2 = gen_reg_rtx (V8SImode);
20864 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20865 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20866 goto merge_two;
20868 return;
20870 case V8SFmode:
20871 mask = gen_lowpart (V8SFmode, mask);
20872 if (one_operand_shuffle)
20873 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20874 else
20876 t1 = gen_reg_rtx (V8SFmode);
20877 t2 = gen_reg_rtx (V8SFmode);
20878 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20879 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20880 goto merge_two;
20882 return;
20884 case V4SImode:
20885 /* By combining the two 128-bit input vectors into one 256-bit
20886 input vector, we can use VPERMD and VPERMPS for the full
20887 two-operand shuffle. */
20888 t1 = gen_reg_rtx (V8SImode);
20889 t2 = gen_reg_rtx (V8SImode);
20890 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20891 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20892 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20893 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20894 return;
20896 case V4SFmode:
20897 t1 = gen_reg_rtx (V8SFmode);
20898 t2 = gen_reg_rtx (V8SImode);
20899 mask = gen_lowpart (V4SImode, mask);
20900 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20901 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20902 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20903 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20904 return;
20906 case V32QImode:
20907 t1 = gen_reg_rtx (V32QImode);
20908 t2 = gen_reg_rtx (V32QImode);
20909 t3 = gen_reg_rtx (V32QImode);
20910 vt2 = GEN_INT (128);
20911 for (i = 0; i < 32; i++)
20912 vec[i] = vt2;
20913 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20914 vt = force_reg (V32QImode, vt);
20915 for (i = 0; i < 32; i++)
20916 vec[i] = i < 16 ? vt2 : const0_rtx;
20917 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20918 vt2 = force_reg (V32QImode, vt2);
20919 /* From mask create two adjusted masks, which contain the same
20920 bits as mask in the low 7 bits of each vector element.
20921 The first mask will have the most significant bit clear
20922 if it requests element from the same 128-bit lane
20923 and MSB set if it requests element from the other 128-bit lane.
20924 The second mask will have the opposite values of the MSB,
20925 and additionally will have its 128-bit lanes swapped.
20926 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20927 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20928 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20929 stands for the other 12 bytes. */
20930 /* The bit telling whether an element is from the same lane or the
20931 other lane is bit 4, so shift it up by 3 to the MSB position. */
20932 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20933 gen_lowpart (V4DImode, mask),
20934 GEN_INT (3)));
20935 /* Clear MSB bits from the mask just in case it had them set. */
20936 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20937 /* After this t1 will have MSB set for elements from other lane. */
20938 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20939 /* Clear bits other than MSB. */
20940 emit_insn (gen_andv32qi3 (t1, t1, vt));
20941 /* Or in the lower bits from mask into t3. */
20942 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20943 /* And invert MSB bits in t1, so MSB is set for elements from the same
20944 lane. */
20945 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20946 /* Swap 128-bit lanes in t3. */
20947 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20948 gen_lowpart (V4DImode, t3),
20949 const2_rtx, GEN_INT (3),
20950 const0_rtx, const1_rtx));
20951 /* And or in the lower bits from mask into t1. */
20952 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20953 if (one_operand_shuffle)
20955 /* Each of these shuffles will put 0s in places where an
20956 element from the other 128-bit lane is needed; otherwise
20957 it will shuffle in the requested value. */
20958 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20959 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20960 /* For t3 the 128-bit lanes are swapped again. */
20961 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20962 gen_lowpart (V4DImode, t3),
20963 const2_rtx, GEN_INT (3),
20964 const0_rtx, const1_rtx));
20965 /* ORing both together yields the result. */
20966 emit_insn (gen_iorv32qi3 (target, t1, t3));
20967 return;
20970 t4 = gen_reg_rtx (V32QImode);
20971 /* Similar to the one_operand_shuffle code above, just repeated
20972 twice, once for each operand.  The merge_two: code below then
20973 merges the two results together. */
20974 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20975 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20976 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20977 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20978 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20979 gen_lowpart (V4DImode, t4),
20980 const2_rtx, GEN_INT (3),
20981 const0_rtx, const1_rtx));
20982 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20983 gen_lowpart (V4DImode, t3),
20984 const2_rtx, GEN_INT (3),
20985 const0_rtx, const1_rtx));
20986 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20987 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20988 t1 = t4;
20989 t2 = t3;
20990 goto merge_two;
20992 default:
20993 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20994 break;
20998 if (TARGET_XOP)
21000 /* The XOP VPPERM insn supports three inputs. By ignoring the
21001 one_operand_shuffle special case, we avoid creating another
21002 set of constant vectors in memory. */
21003 one_operand_shuffle = false;
21005 /* mask = mask & {2*w-1, ...} */
21006 vt = GEN_INT (2*w - 1);
21008 else
21010 /* mask = mask & {w-1, ...} */
21011 vt = GEN_INT (w - 1);
21014 for (i = 0; i < w; i++)
21015 vec[i] = vt;
21016 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21017 mask = expand_simple_binop (maskmode, AND, mask, vt,
21018 NULL_RTX, 0, OPTAB_DIRECT);
21020 /* For non-QImode operations, convert the word permutation control
21021 into a byte permutation control. */
21022 if (mode != V16QImode)
21024 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21025 GEN_INT (exact_log2 (e)),
21026 NULL_RTX, 0, OPTAB_DIRECT);
21028 /* Convert mask to vector of chars. */
21029 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21031 /* Replicate each of the input bytes into byte positions:
21032 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21033 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21034 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21035 for (i = 0; i < 16; ++i)
21036 vec[i] = GEN_INT (i/e * e);
21037 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21038 vt = validize_mem (force_const_mem (V16QImode, vt));
21039 if (TARGET_XOP)
21040 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21041 else
21042 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21044 /* Convert it into the byte positions by doing
21045 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
21046 for (i = 0; i < 16; ++i)
21047 vec[i] = GEN_INT (i % e);
21048 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21049 vt = validize_mem (force_const_mem (V16QImode, vt));
21050 emit_insn (gen_addv16qi3 (mask, mask, vt));
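/* Worked illustration (not from the original source): for a V4SImode
   shuffle e == 4, so a word index K becomes the byte indices
   4*K, 4*K+1, 4*K+2, 4*K+3.  E.g. the word control {2, 0, 3, 1}
   expands into the byte control
   {8,9,10,11, 0,1,2,3, 12,13,14,15, 4,5,6,7}.  */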
21053 /* The actual shuffle operations all operate on V16QImode. */
21054 op0 = gen_lowpart (V16QImode, op0);
21055 op1 = gen_lowpart (V16QImode, op1);
21056 target = gen_lowpart (V16QImode, target);
21058 if (TARGET_XOP)
21060 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21062 else if (one_operand_shuffle)
21064 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21066 else
21068 rtx xops[6];
21069 bool ok;
21071 /* Shuffle the two input vectors independently. */
21072 t1 = gen_reg_rtx (V16QImode);
21073 t2 = gen_reg_rtx (V16QImode);
21074 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21075 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21077 merge_two:
21078 /* Then merge them together. The key is whether any given control
21079 element contained a bit set that indicates the second word. */
21080 mask = operands[3];
21081 vt = GEN_INT (w);
21082 if (maskmode == V2DImode && !TARGET_SSE4_1)
21084 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21085 more shuffle to convert the V2DI input mask into a V4SI
21086 input mask. At that point the masking that expand_int_vcond
21087 performs will work as desired. */
21088 rtx t3 = gen_reg_rtx (V4SImode);
21089 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21090 const0_rtx, const0_rtx,
21091 const2_rtx, const2_rtx));
21092 mask = t3;
21093 maskmode = V4SImode;
21094 e = w = 4;
21097 for (i = 0; i < w; i++)
21098 vec[i] = vt;
21099 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21100 vt = force_reg (maskmode, vt);
21101 mask = expand_simple_binop (maskmode, AND, mask, vt,
21102 NULL_RTX, 0, OPTAB_DIRECT);
21104 xops[0] = gen_lowpart (mode, operands[0]);
21105 xops[1] = gen_lowpart (mode, t2);
21106 xops[2] = gen_lowpart (mode, t1);
21107 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21108 xops[4] = mask;
21109 xops[5] = vt;
21110 ok = ix86_expand_int_vcond (xops);
21111 gcc_assert (ok);
21115 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
21116 true if we should do zero extension, else sign extension. HIGH_P is
21117 true if we want the N/2 high elements, else the low elements. */
21119 void
21120 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21122 enum machine_mode imode = GET_MODE (src);
21123 rtx tmp;
21125 if (TARGET_SSE4_1)
21127 rtx (*unpack)(rtx, rtx);
21128 rtx (*extract)(rtx, rtx) = NULL;
21129 enum machine_mode halfmode = BLKmode;
21131 switch (imode)
21133 case V32QImode:
21134 if (unsigned_p)
21135 unpack = gen_avx2_zero_extendv16qiv16hi2;
21136 else
21137 unpack = gen_avx2_sign_extendv16qiv16hi2;
21138 halfmode = V16QImode;
21139 extract
21140 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21141 break;
21142 case V16HImode:
21143 if (unsigned_p)
21144 unpack = gen_avx2_zero_extendv8hiv8si2;
21145 else
21146 unpack = gen_avx2_sign_extendv8hiv8si2;
21147 halfmode = V8HImode;
21148 extract
21149 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21150 break;
21151 case V8SImode:
21152 if (unsigned_p)
21153 unpack = gen_avx2_zero_extendv4siv4di2;
21154 else
21155 unpack = gen_avx2_sign_extendv4siv4di2;
21156 halfmode = V4SImode;
21157 extract
21158 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21159 break;
21160 case V16QImode:
21161 if (unsigned_p)
21162 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21163 else
21164 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21165 break;
21166 case V8HImode:
21167 if (unsigned_p)
21168 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21169 else
21170 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21171 break;
21172 case V4SImode:
21173 if (unsigned_p)
21174 unpack = gen_sse4_1_zero_extendv2siv2di2;
21175 else
21176 unpack = gen_sse4_1_sign_extendv2siv2di2;
21177 break;
21178 default:
21179 gcc_unreachable ();
21182 if (GET_MODE_SIZE (imode) == 32)
21184 tmp = gen_reg_rtx (halfmode);
21185 emit_insn (extract (tmp, src));
21187 else if (high_p)
21189 /* Shift higher 8 bytes to lower 8 bytes. */
21190 tmp = gen_reg_rtx (imode);
21191 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
21192 gen_lowpart (V1TImode, src),
21193 GEN_INT (64)));
21195 else
21196 tmp = src;
21198 emit_insn (unpack (dest, tmp));
21200 else
21202 rtx (*unpack)(rtx, rtx, rtx);
21204 switch (imode)
21206 case V16QImode:
21207 if (high_p)
21208 unpack = gen_vec_interleave_highv16qi;
21209 else
21210 unpack = gen_vec_interleave_lowv16qi;
21211 break;
21212 case V8HImode:
21213 if (high_p)
21214 unpack = gen_vec_interleave_highv8hi;
21215 else
21216 unpack = gen_vec_interleave_lowv8hi;
21217 break;
21218 case V4SImode:
21219 if (high_p)
21220 unpack = gen_vec_interleave_highv4si;
21221 else
21222 unpack = gen_vec_interleave_lowv4si;
21223 break;
21224 default:
21225 gcc_unreachable ();
21228 if (unsigned_p)
21229 tmp = force_reg (imode, CONST0_RTX (imode));
21230 else
21231 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21232 src, pc_rtx, pc_rtx);
21234 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
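/* Minimal C model of the expander above (an illustrative sketch, not part
   of the compiler; the array-based helper below is hypothetical).  For a
   V8HI source, HIGH_P selects elements 4..7 and each one is widened to
   32 bits, with zero or sign extension chosen by UNSIGNED_P.  */

static void
sse_unpack_model (const short src[8], int dest[4], int unsigned_p, int high_p)
{
  int i;
  for (i = 0; i < 4; i++)
    {
      short e = src[(high_p ? 4 : 0) + i];
      dest[i] = unsigned_p ? (int) (unsigned short) e : (int) e;
    }
}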
21238 /* Expand conditional increment or decrement using adc/sbb instructions.
21239 The default case using setcc followed by the conditional move can be
21240 done by generic code. */
21241 bool
21242 ix86_expand_int_addcc (rtx operands[])
21244 enum rtx_code code = GET_CODE (operands[1]);
21245 rtx flags;
21246 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21247 rtx compare_op;
21248 rtx val = const0_rtx;
21249 bool fpcmp = false;
21250 enum machine_mode mode;
21251 rtx op0 = XEXP (operands[1], 0);
21252 rtx op1 = XEXP (operands[1], 1);
21254 if (operands[3] != const1_rtx
21255 && operands[3] != constm1_rtx)
21256 return false;
21257 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21258 return false;
21259 code = GET_CODE (compare_op);
21261 flags = XEXP (compare_op, 0);
21263 if (GET_MODE (flags) == CCFPmode
21264 || GET_MODE (flags) == CCFPUmode)
21266 fpcmp = true;
21267 code = ix86_fp_compare_code_to_integer (code);
21270 if (code != LTU)
21272 val = constm1_rtx;
21273 if (fpcmp)
21274 PUT_CODE (compare_op,
21275 reverse_condition_maybe_unordered
21276 (GET_CODE (compare_op)));
21277 else
21278 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21281 mode = GET_MODE (operands[0]);
21283 /* Construct either adc or sbb insn. */
21284 if ((code == LTU) == (operands[3] == constm1_rtx))
21286 switch (mode)
21288 case QImode:
21289 insn = gen_subqi3_carry;
21290 break;
21291 case HImode:
21292 insn = gen_subhi3_carry;
21293 break;
21294 case SImode:
21295 insn = gen_subsi3_carry;
21296 break;
21297 case DImode:
21298 insn = gen_subdi3_carry;
21299 break;
21300 default:
21301 gcc_unreachable ();
21304 else
21306 switch (mode)
21308 case QImode:
21309 insn = gen_addqi3_carry;
21310 break;
21311 case HImode:
21312 insn = gen_addhi3_carry;
21313 break;
21314 case SImode:
21315 insn = gen_addsi3_carry;
21316 break;
21317 case DImode:
21318 insn = gen_adddi3_carry;
21319 break;
21320 default:
21321 gcc_unreachable ();
21324 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21326 return true;
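/* Illustrative sketch of the kind of source this expander targets (an
   assumption about typical use, not compiler code).  The unsigned compare
   leaves the carry flag set exactly when A < B, so the conditional
   increment folds into a single "adc x, 0" (and a decrement into
   "sbb x, 0").  */

static unsigned int
int_addcc_example (unsigned int a, unsigned int b, unsigned int x)
{
  /* Roughly: cmp a, b ; adc x, 0.  */
  return x + (a < b);
}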
21330 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21331 but works for floating point parameters and non-offsettable memories.
21332 For pushes, it returns just stack offsets; the values will be saved
21333 in the right order. At most four parts are generated. */
21335 static int
21336 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21338 int size;
21340 if (!TARGET_64BIT)
21341 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21342 else
21343 size = (GET_MODE_SIZE (mode) + 4) / 8;
21345 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21346 gcc_assert (size >= 2 && size <= 4);
21348 /* Optimize constant pool reference to immediates. This is used by fp
21349 moves, that force all constants to memory to allow combining. */
21350 if (MEM_P (operand) && MEM_READONLY_P (operand))
21352 rtx tmp = maybe_get_pool_constant (operand);
21353 if (tmp)
21354 operand = tmp;
21357 if (MEM_P (operand) && !offsettable_memref_p (operand))
21359 /* The only non-offsettable memories we handle are pushes. */
21360 int ok = push_operand (operand, VOIDmode);
21362 gcc_assert (ok);
21364 operand = copy_rtx (operand);
21365 PUT_MODE (operand, word_mode);
21366 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21367 return size;
21370 if (GET_CODE (operand) == CONST_VECTOR)
21372 enum machine_mode imode = int_mode_for_mode (mode);
21373 /* Caution: if we looked through a constant pool memory above,
21374 the operand may actually have a different mode now. That's
21375 ok, since we want to pun this all the way back to an integer. */
21376 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21377 gcc_assert (operand != NULL);
21378 mode = imode;
21381 if (!TARGET_64BIT)
21383 if (mode == DImode)
21384 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21385 else
21387 int i;
21389 if (REG_P (operand))
21391 gcc_assert (reload_completed);
21392 for (i = 0; i < size; i++)
21393 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21395 else if (offsettable_memref_p (operand))
21397 operand = adjust_address (operand, SImode, 0);
21398 parts[0] = operand;
21399 for (i = 1; i < size; i++)
21400 parts[i] = adjust_address (operand, SImode, 4 * i);
21402 else if (GET_CODE (operand) == CONST_DOUBLE)
21404 REAL_VALUE_TYPE r;
21405 long l[4];
21407 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21408 switch (mode)
21410 case TFmode:
21411 real_to_target (l, &r, mode);
21412 parts[3] = gen_int_mode (l[3], SImode);
21413 parts[2] = gen_int_mode (l[2], SImode);
21414 break;
21415 case XFmode:
21416 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21417 long double may not be 80-bit. */
21418 real_to_target (l, &r, mode);
21419 parts[2] = gen_int_mode (l[2], SImode);
21420 break;
21421 case DFmode:
21422 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21423 break;
21424 default:
21425 gcc_unreachable ();
21427 parts[1] = gen_int_mode (l[1], SImode);
21428 parts[0] = gen_int_mode (l[0], SImode);
21430 else
21431 gcc_unreachable ();
21434 else
21436 if (mode == TImode)
21437 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21438 if (mode == XFmode || mode == TFmode)
21440 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21441 if (REG_P (operand))
21443 gcc_assert (reload_completed);
21444 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21445 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21447 else if (offsettable_memref_p (operand))
21449 operand = adjust_address (operand, DImode, 0);
21450 parts[0] = operand;
21451 parts[1] = adjust_address (operand, upper_mode, 8);
21453 else if (GET_CODE (operand) == CONST_DOUBLE)
21455 REAL_VALUE_TYPE r;
21456 long l[4];
21458 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21459 real_to_target (l, &r, mode);
21461 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21462 if (HOST_BITS_PER_WIDE_INT >= 64)
21463 parts[0]
21464 = gen_int_mode
21465 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21466 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21467 DImode);
21468 else
21469 parts[0] = immed_double_const (l[0], l[1], DImode);
21471 if (upper_mode == SImode)
21472 parts[1] = gen_int_mode (l[2], SImode);
21473 else if (HOST_BITS_PER_WIDE_INT >= 64)
21474 parts[1]
21475 = gen_int_mode
21476 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21477 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21478 DImode);
21479 else
21480 parts[1] = immed_double_const (l[2], l[3], DImode);
21482 else
21483 gcc_unreachable ();
21487 return size;
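/* Minimal sketch of the splitting rule for the common case (illustration
   only, not used by the compiler): on a 32-bit target a 64-bit value is
   returned as two SImode words, low part first.  */

static void
split_di_example (unsigned long long x, unsigned int parts[2])
{
  parts[0] = (unsigned int) (x & 0xffffffffu);  /* low word  */
  parts[1] = (unsigned int) (x >> 32);          /* high word */
}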
21490 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21491 All required insns are emitted here; the caller should not emit any
21492 further moves. Operands 2-5 receive the destination parts in the
21493 correct order; operands 6-9 receive the corresponding source parts. */
21495 void
21496 ix86_split_long_move (rtx operands[])
21498 rtx part[2][4];
21499 int nparts, i, j;
21500 int push = 0;
21501 int collisions = 0;
21502 enum machine_mode mode = GET_MODE (operands[0]);
21503 bool collisionparts[4];
21505 /* The DFmode expanders may ask us to move a double.
21506 For a 64-bit target this is a single move. By hiding that fact
21507 here we simplify the i386.md splitters. */
21508 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21510 /* Optimize constant pool reference to immediates. This is used by
21511 fp moves, that force all constants to memory to allow combining. */
21513 if (MEM_P (operands[1])
21514 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21515 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21516 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21517 if (push_operand (operands[0], VOIDmode))
21519 operands[0] = copy_rtx (operands[0]);
21520 PUT_MODE (operands[0], word_mode);
21522 else
21523 operands[0] = gen_lowpart (DImode, operands[0]);
21524 operands[1] = gen_lowpart (DImode, operands[1]);
21525 emit_move_insn (operands[0], operands[1]);
21526 return;
21529 /* The only non-offsettable memory we handle is push. */
21530 if (push_operand (operands[0], VOIDmode))
21531 push = 1;
21532 else
21533 gcc_assert (!MEM_P (operands[0])
21534 || offsettable_memref_p (operands[0]));
21536 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21537 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21539 /* When emitting a push, be careful with source operands that live on the stack. */
21540 if (push && MEM_P (operands[1])
21541 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21543 rtx src_base = XEXP (part[1][nparts - 1], 0);
21545 /* Compensate for the stack decrement by 4. */
21546 if (!TARGET_64BIT && nparts == 3
21547 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21548 src_base = plus_constant (Pmode, src_base, 4);
21550 /* src_base refers to the stack pointer and is
21551 automatically decreased by emitted push. */
21552 for (i = 0; i < nparts; i++)
21553 part[1][i] = change_address (part[1][i],
21554 GET_MODE (part[1][i]), src_base);
21557 /* We need to do copy in the right order in case an address register
21558 of the source overlaps the destination. */
21559 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21561 rtx tmp;
21563 for (i = 0; i < nparts; i++)
21565 collisionparts[i]
21566 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21567 if (collisionparts[i])
21568 collisions++;
21571 /* Collision in the middle part can be handled by reordering. */
21572 if (collisions == 1 && nparts == 3 && collisionparts [1])
21574 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21575 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21577 else if (collisions == 1
21578 && nparts == 4
21579 && (collisionparts [1] || collisionparts [2]))
21581 if (collisionparts [1])
21583 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21584 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21586 else
21588 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21589 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21593 /* If there are more collisions, we can't handle it by reordering.
21594 Do an lea to the last part and use only one colliding move. */
21595 else if (collisions > 1)
21597 rtx base;
21599 collisions = 1;
21601 base = part[0][nparts - 1];
21603 /* Handle the case when the last part isn't valid for lea.
21604 Happens in 64-bit mode storing the 12-byte XFmode. */
21605 if (GET_MODE (base) != Pmode)
21606 base = gen_rtx_REG (Pmode, REGNO (base));
21608 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21609 part[1][0] = replace_equiv_address (part[1][0], base);
21610 for (i = 1; i < nparts; i++)
21612 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21613 part[1][i] = replace_equiv_address (part[1][i], tmp);
21618 if (push)
21620 if (!TARGET_64BIT)
21622 if (nparts == 3)
21624 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21625 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21626 stack_pointer_rtx, GEN_INT (-4)));
21627 emit_move_insn (part[0][2], part[1][2]);
21629 else if (nparts == 4)
21631 emit_move_insn (part[0][3], part[1][3]);
21632 emit_move_insn (part[0][2], part[1][2]);
21635 else
21637 /* In 64-bit mode we don't have a 32-bit push available. If the part is
21638 a register, that is fine - we just use the larger counterpart. We also
21639 retype the memory - this comes from an attempt to avoid the REX prefix
21640 when moving the second half of a TFmode value. */
21641 if (GET_MODE (part[1][1]) == SImode)
21643 switch (GET_CODE (part[1][1]))
21645 case MEM:
21646 part[1][1] = adjust_address (part[1][1], DImode, 0);
21647 break;
21649 case REG:
21650 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21651 break;
21653 default:
21654 gcc_unreachable ();
21657 if (GET_MODE (part[1][0]) == SImode)
21658 part[1][0] = part[1][1];
21661 emit_move_insn (part[0][1], part[1][1]);
21662 emit_move_insn (part[0][0], part[1][0]);
21663 return;
21666 /* Choose correct order to not overwrite the source before it is copied. */
21667 if ((REG_P (part[0][0])
21668 && REG_P (part[1][1])
21669 && (REGNO (part[0][0]) == REGNO (part[1][1])
21670 || (nparts == 3
21671 && REGNO (part[0][0]) == REGNO (part[1][2]))
21672 || (nparts == 4
21673 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21674 || (collisions > 0
21675 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21677 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21679 operands[2 + i] = part[0][j];
21680 operands[6 + i] = part[1][j];
21683 else
21685 for (i = 0; i < nparts; i++)
21687 operands[2 + i] = part[0][i];
21688 operands[6 + i] = part[1][i];
21692 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21693 if (optimize_insn_for_size_p ())
21695 for (j = 0; j < nparts - 1; j++)
21696 if (CONST_INT_P (operands[6 + j])
21697 && operands[6 + j] != const0_rtx
21698 && REG_P (operands[2 + j]))
21699 for (i = j; i < nparts - 1; i++)
21700 if (CONST_INT_P (operands[7 + i])
21701 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21702 operands[7 + i] = operands[2 + j];
21705 for (i = 0; i < nparts; i++)
21706 emit_move_insn (operands[2 + i], operands[6 + i]);
21708 return;
21711 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21712 left shift by a constant, either using a single shift or
21713 a sequence of add instructions. */
21715 static void
21716 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21718 rtx (*insn)(rtx, rtx, rtx);
21720 if (count == 1
21721 || (count * ix86_cost->add <= ix86_cost->shift_const
21722 && !optimize_insn_for_size_p ()))
21724 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21725 while (count-- > 0)
21726 emit_insn (insn (operand, operand, operand));
21728 else
21730 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21731 emit_insn (insn (operand, operand, GEN_INT (count)));
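/* Sketch of the choice made above (illustration only): a left shift by a
   small constant can be emitted as a chain of adds, since adding a value
   to itself doubles it, i.e. shifts it left by one.  */

static unsigned int
ashl_by_adds_example (unsigned int x, int count)
{
  while (count-- > 0)
    x = x + x;   /* same effect as x <<= 1 */
  return x;
}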
21735 void
21736 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21738 rtx (*gen_ashl3)(rtx, rtx, rtx);
21739 rtx (*gen_shld)(rtx, rtx, rtx);
21740 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21742 rtx low[2], high[2];
21743 int count;
21745 if (CONST_INT_P (operands[2]))
21747 split_double_mode (mode, operands, 2, low, high);
21748 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21750 if (count >= half_width)
21752 emit_move_insn (high[0], low[1]);
21753 emit_move_insn (low[0], const0_rtx);
21755 if (count > half_width)
21756 ix86_expand_ashl_const (high[0], count - half_width, mode);
21758 else
21760 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21762 if (!rtx_equal_p (operands[0], operands[1]))
21763 emit_move_insn (operands[0], operands[1]);
21765 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21766 ix86_expand_ashl_const (low[0], count, mode);
21768 return;
21771 split_double_mode (mode, operands, 1, low, high);
21773 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21775 if (operands[1] == const1_rtx)
21777 /* Assuming we've chosen QImode-capable registers, then 1 << N
21778 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21779 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21781 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21783 ix86_expand_clear (low[0]);
21784 ix86_expand_clear (high[0]);
21785 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21787 d = gen_lowpart (QImode, low[0]);
21788 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21789 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21790 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21792 d = gen_lowpart (QImode, high[0]);
21793 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21794 s = gen_rtx_NE (QImode, flags, const0_rtx);
21795 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21798 /* Otherwise, we can get the same results by manually performing
21799 a bit extract operation on bit 5/6, and then performing the two
21800 shifts. The two methods of getting 0/1 into low/high are exactly
21801 the same size. Avoiding the shift in the bit extract case helps
21802 pentium4 a bit; no one else seems to care much either way. */
21803 else
21805 enum machine_mode half_mode;
21806 rtx (*gen_lshr3)(rtx, rtx, rtx);
21807 rtx (*gen_and3)(rtx, rtx, rtx);
21808 rtx (*gen_xor3)(rtx, rtx, rtx);
21809 HOST_WIDE_INT bits;
21810 rtx x;
21812 if (mode == DImode)
21814 half_mode = SImode;
21815 gen_lshr3 = gen_lshrsi3;
21816 gen_and3 = gen_andsi3;
21817 gen_xor3 = gen_xorsi3;
21818 bits = 5;
21820 else
21822 half_mode = DImode;
21823 gen_lshr3 = gen_lshrdi3;
21824 gen_and3 = gen_anddi3;
21825 gen_xor3 = gen_xordi3;
21826 bits = 6;
21829 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21830 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21831 else
21832 x = gen_lowpart (half_mode, operands[2]);
21833 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21835 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21836 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21837 emit_move_insn (low[0], high[0]);
21838 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21841 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21842 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21843 return;
21846 if (operands[1] == constm1_rtx)
21848 /* For -1 << N, we can avoid the shld instruction, because we
21849 know that we're shifting 0...31/63 ones into a -1. */
21850 emit_move_insn (low[0], constm1_rtx);
21851 if (optimize_insn_for_size_p ())
21852 emit_move_insn (high[0], low[0]);
21853 else
21854 emit_move_insn (high[0], constm1_rtx);
21856 else
21858 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21860 if (!rtx_equal_p (operands[0], operands[1]))
21861 emit_move_insn (operands[0], operands[1]);
21863 split_double_mode (mode, operands, 1, low, high);
21864 emit_insn (gen_shld (high[0], low[0], operands[2]));
21867 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21869 if (TARGET_CMOVE && scratch)
21871 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21872 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21874 ix86_expand_clear (scratch);
21875 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21877 else
21879 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21880 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21882 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
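/* Minimal C model of the double-word left shift implemented above
   (an illustrative sketch assuming a DImode value split into two 32-bit
   halves; the real code also handles TImode and uses shld plus the
   shift-adjust patterns for variable counts).  */

static void
dshift_left_example (unsigned int *lo, unsigned int *hi, int count)
{
  count &= 63;
  if (count >= 32)
    {
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));  /* shld */
      *lo <<= count;
    }
}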
21886 void
21887 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21889 rtx (*gen_ashr3)(rtx, rtx, rtx)
21890 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21891 rtx (*gen_shrd)(rtx, rtx, rtx);
21892 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21894 rtx low[2], high[2];
21895 int count;
21897 if (CONST_INT_P (operands[2]))
21899 split_double_mode (mode, operands, 2, low, high);
21900 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21902 if (count == GET_MODE_BITSIZE (mode) - 1)
21904 emit_move_insn (high[0], high[1]);
21905 emit_insn (gen_ashr3 (high[0], high[0],
21906 GEN_INT (half_width - 1)));
21907 emit_move_insn (low[0], high[0]);
21910 else if (count >= half_width)
21912 emit_move_insn (low[0], high[1]);
21913 emit_move_insn (high[0], low[0]);
21914 emit_insn (gen_ashr3 (high[0], high[0],
21915 GEN_INT (half_width - 1)));
21917 if (count > half_width)
21918 emit_insn (gen_ashr3 (low[0], low[0],
21919 GEN_INT (count - half_width)));
21921 else
21923 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21925 if (!rtx_equal_p (operands[0], operands[1]))
21926 emit_move_insn (operands[0], operands[1]);
21928 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21929 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21932 else
21934 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21936 if (!rtx_equal_p (operands[0], operands[1]))
21937 emit_move_insn (operands[0], operands[1]);
21939 split_double_mode (mode, operands, 1, low, high);
21941 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21942 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21944 if (TARGET_CMOVE && scratch)
21946 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21947 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21949 emit_move_insn (scratch, high[0]);
21950 emit_insn (gen_ashr3 (scratch, scratch,
21951 GEN_INT (half_width - 1)));
21952 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21953 scratch));
21955 else
21957 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21958 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21960 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
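/* Companion sketch for the arithmetic right shift above (illustration,
   same assumptions as the left-shift model; arithmetic >> on signed int
   is assumed): the high half is shifted with sign fill and the vacated
   low bits come from the old high half.  */

static void
dshift_arith_right_example (unsigned int *lo, unsigned int *hi, int count)
{
  int shi = (int) *hi;
  count &= 63;
  if (count >= 32)
    {
      *lo = (unsigned int) (shi >> (count - 32));
      *hi = (unsigned int) (shi >> 31);   /* sign fill */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | (*hi << (32 - count));   /* shrd */
      *hi = (unsigned int) (shi >> count);
    }
}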
21965 void
21966 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21968 rtx (*gen_lshr3)(rtx, rtx, rtx)
21969 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21970 rtx (*gen_shrd)(rtx, rtx, rtx);
21971 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21973 rtx low[2], high[2];
21974 int count;
21976 if (CONST_INT_P (operands[2]))
21978 split_double_mode (mode, operands, 2, low, high);
21979 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21981 if (count >= half_width)
21983 emit_move_insn (low[0], high[1]);
21984 ix86_expand_clear (high[0]);
21986 if (count > half_width)
21987 emit_insn (gen_lshr3 (low[0], low[0],
21988 GEN_INT (count - half_width)));
21990 else
21992 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21994 if (!rtx_equal_p (operands[0], operands[1]))
21995 emit_move_insn (operands[0], operands[1]);
21997 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21998 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22001 else
22003 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22005 if (!rtx_equal_p (operands[0], operands[1]))
22006 emit_move_insn (operands[0], operands[1]);
22008 split_double_mode (mode, operands, 1, low, high);
22010 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22011 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22013 if (TARGET_CMOVE && scratch)
22015 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22016 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22018 ix86_expand_clear (scratch);
22019 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22020 scratch));
22022 else
22024 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22025 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22027 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22032 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
22033 static void
22034 predict_jump (int prob)
22036 rtx insn = get_last_insn ();
22037 gcc_assert (JUMP_P (insn));
22038 add_int_reg_note (insn, REG_BR_PROB, prob);
22041 /* Helper function for the string operations below. Test VARIABLE whether
22042 it is aligned to VALUE bytes. If so, jump to the returned label. */
22043 static rtx
22044 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22046 rtx label = gen_label_rtx ();
22047 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22048 if (GET_MODE (variable) == DImode)
22049 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22050 else
22051 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22052 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22053 1, label);
22054 if (epilogue)
22055 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22056 else
22057 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22058 return label;
22061 /* Adjust COUNTER by the VALUE. */
22062 static void
22063 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22065 rtx (*gen_add)(rtx, rtx, rtx)
22066 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22068 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22071 /* Zero-extend EXP, which may be SImode, to a Pmode register. */
22073 ix86_zero_extend_to_Pmode (rtx exp)
22075 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22078 /* Divide COUNTREG by SCALE. */
22079 static rtx
22080 scale_counter (rtx countreg, int scale)
22082 rtx sc;
22084 if (scale == 1)
22085 return countreg;
22086 if (CONST_INT_P (countreg))
22087 return GEN_INT (INTVAL (countreg) / scale);
22088 gcc_assert (REG_P (countreg));
22090 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22091 GEN_INT (exact_log2 (scale)),
22092 NULL, 1, OPTAB_DIRECT);
22093 return sc;
22096 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22097 DImode for constant loop counts. */
22099 static enum machine_mode
22100 counter_mode (rtx count_exp)
22102 if (GET_MODE (count_exp) != VOIDmode)
22103 return GET_MODE (count_exp);
22104 if (!CONST_INT_P (count_exp))
22105 return Pmode;
22106 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22107 return DImode;
22108 return SImode;
22111 /* Copy the address to a Pmode register. This is used for x32 to
22112 truncate DImode TLS address to a SImode register. */
22114 static rtx
22115 ix86_copy_addr_to_reg (rtx addr)
22117 if (GET_MODE (addr) == Pmode)
22118 return copy_addr_to_reg (addr);
22119 else
22121 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22122 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22126 /* When SRCPTR is non-NULL, output a simple loop that moves memory
22127 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
22128 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
22129 the equivalent loop that sets memory to VALUE (supposed to be in MODE).
22131 The size is rounded down to a whole number of chunks moved at once.
22132 SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
22135 static void
22136 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22137 rtx destptr, rtx srcptr, rtx value,
22138 rtx count, enum machine_mode mode, int unroll,
22139 int expected_size)
22141 rtx out_label, top_label, iter, tmp;
22142 enum machine_mode iter_mode = counter_mode (count);
22143 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22144 rtx piece_size = GEN_INT (piece_size_n);
22145 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22146 rtx size;
22147 int i;
22149 top_label = gen_label_rtx ();
22150 out_label = gen_label_rtx ();
22151 iter = gen_reg_rtx (iter_mode);
22153 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22154 NULL, 1, OPTAB_DIRECT);
22155 /* Those two should combine. */
22156 if (piece_size == const1_rtx)
22158 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22159 true, out_label);
22160 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22162 emit_move_insn (iter, const0_rtx);
22164 emit_label (top_label);
22166 tmp = convert_modes (Pmode, iter_mode, iter, true);
22168 /* This assert could be relaxed - in that case we'd need to compute
22169 the largest power of two that divides PIECE_SIZE_N and pass it to
22170 offset_address. */
22171 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22172 destmem = offset_address (destmem, tmp, piece_size_n);
22173 destmem = adjust_address (destmem, mode, 0);
22175 if (srcmem)
22177 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22178 srcmem = adjust_address (srcmem, mode, 0);
22180 /* When unrolling for chips that reorder memory reads and writes,
22181 we can save registers by using a single temporary.
22182 Using 4 temporaries is also overkill in 32-bit mode. */
22183 if (!TARGET_64BIT && 0)
22185 for (i = 0; i < unroll; i++)
22187 if (i)
22189 destmem =
22190 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22191 srcmem =
22192 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22194 emit_move_insn (destmem, srcmem);
22197 else
22199 rtx tmpreg[4];
22200 gcc_assert (unroll <= 4);
22201 for (i = 0; i < unroll; i++)
22203 tmpreg[i] = gen_reg_rtx (mode);
22204 if (i)
22206 srcmem =
22207 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22209 emit_move_insn (tmpreg[i], srcmem);
22211 for (i = 0; i < unroll; i++)
22213 if (i)
22215 destmem =
22216 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22218 emit_move_insn (destmem, tmpreg[i]);
22222 else
22223 for (i = 0; i < unroll; i++)
22225 if (i)
22226 destmem =
22227 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22228 emit_move_insn (destmem, value);
22231 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22232 true, OPTAB_LIB_WIDEN);
22233 if (tmp != iter)
22234 emit_move_insn (iter, tmp);
22236 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22237 true, top_label);
22238 if (expected_size != -1)
22240 expected_size /= GET_MODE_SIZE (mode) * unroll;
22241 if (expected_size == 0)
22242 predict_jump (0);
22243 else if (expected_size > REG_BR_PROB_BASE)
22244 predict_jump (REG_BR_PROB_BASE - 1);
22245 else
22246 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22248 else
22249 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22250 iter = ix86_zero_extend_to_Pmode (iter);
22251 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22252 true, OPTAB_LIB_WIDEN);
22253 if (tmp != destptr)
22254 emit_move_insn (destptr, tmp);
22255 if (srcptr)
22257 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22258 true, OPTAB_LIB_WIDEN);
22259 if (tmp != srcptr)
22260 emit_move_insn (srcptr, tmp);
22262 emit_label (out_label);
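/* Rough C model of the loop emitted above (an illustration; the real code
   works on RTL, also handles the memset case, and annotates the branch
   probabilities).  PIECE stands for GET_MODE_SIZE (mode) * unroll and is
   assumed to be a power of two; the tail below PIECE bytes is left to the
   epilogue code.  */

static void
copy_loop_example (unsigned char *dst, const unsigned char *src,
		   unsigned long count, unsigned long piece)
{
  unsigned long size = count & ~(piece - 1);
  unsigned long iter, i;

  for (iter = 0; iter != size; iter += piece)
    for (i = 0; i < piece; i++)
      dst[iter + i] = src[iter + i];
}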
22265 /* Output a "rep; mov" instruction.
22266 Arguments have the same meaning as for the previous function. */
22267 static void
22268 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
22269 rtx destptr, rtx srcptr,
22270 rtx count,
22271 enum machine_mode mode)
22273 rtx destexp;
22274 rtx srcexp;
22275 rtx countreg;
22276 HOST_WIDE_INT rounded_count;
22278 /* If the size is known, it is shorter to use rep movs. */
22279 if (mode == QImode && CONST_INT_P (count)
22280 && !(INTVAL (count) & 3))
22281 mode = SImode;
22283 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22284 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22285 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22286 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22287 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22288 if (mode != QImode)
22290 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22291 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22292 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22293 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22294 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22295 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22297 else
22299 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22300 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22302 if (CONST_INT_P (count))
22304 rounded_count = (INTVAL (count)
22305 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22306 destmem = shallow_copy_rtx (destmem);
22307 srcmem = shallow_copy_rtx (srcmem);
22308 set_mem_size (destmem, rounded_count);
22309 set_mem_size (srcmem, rounded_count);
22311 else
22313 if (MEM_SIZE_KNOWN_P (destmem))
22314 clear_mem_size (destmem);
22315 if (MEM_SIZE_KNOWN_P (srcmem))
22316 clear_mem_size (srcmem);
22318 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22319 destexp, srcexp));
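/* C-level model of the SImode rep;movs expansion above (illustration
   only): the byte count is scaled down by the chunk size and any
   remainder is left to the epilogue.  */

static void
rep_movs_example (unsigned int *dst, const unsigned int *src,
		  unsigned long byte_count)
{
  unsigned long n = byte_count / 4;   /* scale_counter */

  while (n--)
    *dst++ = *src++;                  /* one rep;movsd iteration */
}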
22322 /* Output a "rep; stos" instruction.
22323 Arguments have the same meaning as for the previous function. */
22324 static void
22325 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22326 rtx count, enum machine_mode mode,
22327 rtx orig_value)
22329 rtx destexp;
22330 rtx countreg;
22331 HOST_WIDE_INT rounded_count;
22333 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22334 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22335 value = force_reg (mode, gen_lowpart (mode, value));
22336 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22337 if (mode != QImode)
22339 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22340 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22341 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22343 else
22344 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22345 if (orig_value == const0_rtx && CONST_INT_P (count))
22347 rounded_count = (INTVAL (count)
22348 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22349 destmem = shallow_copy_rtx (destmem);
22350 set_mem_size (destmem, rounded_count);
22352 else if (MEM_SIZE_KNOWN_P (destmem))
22353 clear_mem_size (destmem);
22354 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22357 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22358 DESTMEM.
22359 SRCMEM is passed by pointer so that it can be updated on return.
22360 The return value is the updated DESTMEM. */
22361 static rtx
22362 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22363 HOST_WIDE_INT size_to_move)
22365 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22366 enum insn_code code;
22367 enum machine_mode move_mode;
22368 int piece_size, i;
22370 /* Find the widest mode in which we could perform moves.
22371 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
22372 it until move of such size is supported. */
22373 piece_size = 1 << floor_log2 (size_to_move);
22374 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22375 code = optab_handler (mov_optab, move_mode);
22376 while (code == CODE_FOR_nothing && piece_size > 1)
22378 piece_size >>= 1;
22379 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22380 code = optab_handler (mov_optab, move_mode);
22383 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22384 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22385 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22387 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22388 move_mode = mode_for_vector (word_mode, nunits);
22389 code = optab_handler (mov_optab, move_mode);
22390 if (code == CODE_FOR_nothing)
22392 move_mode = word_mode;
22393 piece_size = GET_MODE_SIZE (move_mode);
22394 code = optab_handler (mov_optab, move_mode);
22397 gcc_assert (code != CODE_FOR_nothing);
22399 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22400 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22402 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22403 gcc_assert (size_to_move % piece_size == 0);
22404 adjust = GEN_INT (piece_size);
22405 for (i = 0; i < size_to_move; i += piece_size)
22407 /* We move from memory to memory, so we'll need to do it via
22408 a temporary register. */
22409 tempreg = gen_reg_rtx (move_mode);
22410 emit_insn (GEN_FCN (code) (tempreg, src));
22411 emit_insn (GEN_FCN (code) (dst, tempreg));
22413 emit_move_insn (destptr,
22414 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22415 emit_move_insn (srcptr,
22416 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22418 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22419 piece_size);
22420 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22421 piece_size);
22424 /* Update DST and SRC rtx. */
22425 *srcmem = src;
22426 return dst;
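/* Sketch of the piece-size choice made above (illustration; SUPPORTED_P
   is a hypothetical stand-in for optab_handler returning something other
   than CODE_FOR_nothing): start from the largest power of two not
   exceeding SIZE_TO_MOVE and halve until a supported width is found.  */

static int
choose_piece_size_example (int size_to_move, int (*supported_p) (int))
{
  int piece = 1;

  while (piece * 2 <= size_to_move)
    piece *= 2;                       /* 1 << floor_log2 (size_to_move) */
  while (piece > 1 && !supported_p (piece))
    piece >>= 1;
  return piece;
}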
22429 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22430 static void
22431 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22432 rtx destptr, rtx srcptr, rtx count, int max_size)
22434 rtx src, dest;
22435 if (CONST_INT_P (count))
22437 HOST_WIDE_INT countval = INTVAL (count);
22438 HOST_WIDE_INT epilogue_size = countval % max_size;
22439 int i;
22441 /* For now MAX_SIZE should be a power of 2. This assert could be
22442 relaxed, but it'll require a bit more complicated epilogue
22443 expanding. */
22444 gcc_assert ((max_size & (max_size - 1)) == 0);
22445 for (i = max_size; i >= 1; i >>= 1)
22447 if (epilogue_size & i)
22448 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22450 return;
22452 if (max_size > 8)
22454 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22455 count, 1, OPTAB_DIRECT);
22456 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22457 count, QImode, 1, 4);
22458 return;
22461 /* When there are stringops, we can cheaply increase dest and src pointers.
22462 Otherwise we save code size by maintaining offset (zero is readily
22463 available from preceding rep operation) and using x86 addressing modes.
22465 if (TARGET_SINGLE_STRINGOP)
22467 if (max_size > 4)
22469 rtx label = ix86_expand_aligntest (count, 4, true);
22470 src = change_address (srcmem, SImode, srcptr);
22471 dest = change_address (destmem, SImode, destptr);
22472 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22473 emit_label (label);
22474 LABEL_NUSES (label) = 1;
22476 if (max_size > 2)
22478 rtx label = ix86_expand_aligntest (count, 2, true);
22479 src = change_address (srcmem, HImode, srcptr);
22480 dest = change_address (destmem, HImode, destptr);
22481 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22482 emit_label (label);
22483 LABEL_NUSES (label) = 1;
22485 if (max_size > 1)
22487 rtx label = ix86_expand_aligntest (count, 1, true);
22488 src = change_address (srcmem, QImode, srcptr);
22489 dest = change_address (destmem, QImode, destptr);
22490 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22491 emit_label (label);
22492 LABEL_NUSES (label) = 1;
22495 else
22497 rtx offset = force_reg (Pmode, const0_rtx);
22498 rtx tmp;
22500 if (max_size > 4)
22502 rtx label = ix86_expand_aligntest (count, 4, true);
22503 src = change_address (srcmem, SImode, srcptr);
22504 dest = change_address (destmem, SImode, destptr);
22505 emit_move_insn (dest, src);
22506 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22507 true, OPTAB_LIB_WIDEN);
22508 if (tmp != offset)
22509 emit_move_insn (offset, tmp);
22510 emit_label (label);
22511 LABEL_NUSES (label) = 1;
22513 if (max_size > 2)
22515 rtx label = ix86_expand_aligntest (count, 2, true);
22516 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22517 src = change_address (srcmem, HImode, tmp);
22518 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22519 dest = change_address (destmem, HImode, tmp);
22520 emit_move_insn (dest, src);
22521 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22522 true, OPTAB_LIB_WIDEN);
22523 if (tmp != offset)
22524 emit_move_insn (offset, tmp);
22525 emit_label (label);
22526 LABEL_NUSES (label) = 1;
22528 if (max_size > 1)
22530 rtx label = ix86_expand_aligntest (count, 1, true);
22531 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22532 src = change_address (srcmem, QImode, tmp);
22533 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22534 dest = change_address (destmem, QImode, tmp);
22535 emit_move_insn (dest, src);
22536 emit_label (label);
22537 LABEL_NUSES (label) = 1;
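/* Sketch of the constant-count strategy at the top of the function above
   (illustration only, assuming a MAX_SIZE of 16): the remainder is copied
   with one move per set bit, largest piece first, so e.g. a 7-byte tail
   becomes a 4-byte, a 2-byte and a 1-byte move.  */

static void
epilogue_copy_example (unsigned char *dst, const unsigned char *src,
		       unsigned int remainder)
{
  unsigned int piece, i;

  for (piece = 8; piece >= 1; piece >>= 1)
    if (remainder & piece)
      for (i = 0; i < piece; i++)
	*dst++ = *src++;              /* stands in for one wide move */
}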
22542 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22543 static void
22544 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22545 rtx count, int max_size)
22547 count =
22548 expand_simple_binop (counter_mode (count), AND, count,
22549 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22550 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22551 gen_lowpart (QImode, value), count, QImode,
22552 1, max_size / 2);
22555 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22556 static void
22557 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22559 rtx dest;
22561 if (CONST_INT_P (count))
22563 HOST_WIDE_INT countval = INTVAL (count);
22564 int offset = 0;
22566 if ((countval & 0x10) && max_size > 16)
22568 if (TARGET_64BIT)
22570 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22571 emit_insn (gen_strset (destptr, dest, value));
22572 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22573 emit_insn (gen_strset (destptr, dest, value));
22575 else
22576 gcc_unreachable ();
22577 offset += 16;
22579 if ((countval & 0x08) && max_size > 8)
22581 if (TARGET_64BIT)
22583 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22584 emit_insn (gen_strset (destptr, dest, value));
22586 else
22588 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22589 emit_insn (gen_strset (destptr, dest, value));
22590 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22591 emit_insn (gen_strset (destptr, dest, value));
22593 offset += 8;
22595 if ((countval & 0x04) && max_size > 4)
22597 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22598 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22599 offset += 4;
22601 if ((countval & 0x02) && max_size > 2)
22603 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22604 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22605 offset += 2;
22607 if ((countval & 0x01) && max_size > 1)
22609 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22610 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22611 offset += 1;
22613 return;
22615 if (max_size > 32)
22617 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22618 return;
22620 if (max_size > 16)
22622 rtx label = ix86_expand_aligntest (count, 16, true);
22623 if (TARGET_64BIT)
22625 dest = change_address (destmem, DImode, destptr);
22626 emit_insn (gen_strset (destptr, dest, value));
22627 emit_insn (gen_strset (destptr, dest, value));
22629 else
22631 dest = change_address (destmem, SImode, destptr);
22632 emit_insn (gen_strset (destptr, dest, value));
22633 emit_insn (gen_strset (destptr, dest, value));
22634 emit_insn (gen_strset (destptr, dest, value));
22635 emit_insn (gen_strset (destptr, dest, value));
22637 emit_label (label);
22638 LABEL_NUSES (label) = 1;
22640 if (max_size > 8)
22642 rtx label = ix86_expand_aligntest (count, 8, true);
22643 if (TARGET_64BIT)
22645 dest = change_address (destmem, DImode, destptr);
22646 emit_insn (gen_strset (destptr, dest, value));
22648 else
22650 dest = change_address (destmem, SImode, destptr);
22651 emit_insn (gen_strset (destptr, dest, value));
22652 emit_insn (gen_strset (destptr, dest, value));
22654 emit_label (label);
22655 LABEL_NUSES (label) = 1;
22657 if (max_size > 4)
22659 rtx label = ix86_expand_aligntest (count, 4, true);
22660 dest = change_address (destmem, SImode, destptr);
22661 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22662 emit_label (label);
22663 LABEL_NUSES (label) = 1;
22665 if (max_size > 2)
22667 rtx label = ix86_expand_aligntest (count, 2, true);
22668 dest = change_address (destmem, HImode, destptr);
22669 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22670 emit_label (label);
22671 LABEL_NUSES (label) = 1;
22673 if (max_size > 1)
22675 rtx label = ix86_expand_aligntest (count, 1, true);
22676 dest = change_address (destmem, QImode, destptr);
22677 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22678 emit_label (label);
22679 LABEL_NUSES (label) = 1;
22683 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22684 to DESIRED_ALIGNMENT.
22685 The return value is the updated DESTMEM. */
22686 static rtx
22687 expand_movmem_prologue (rtx destmem, rtx srcmem,
22688 rtx destptr, rtx srcptr, rtx count,
22689 int align, int desired_alignment)
22691 int i;
22692 for (i = 1; i < desired_alignment; i <<= 1)
22694 if (align <= i)
22696 rtx label = ix86_expand_aligntest (destptr, i, false);
22697 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22698 ix86_adjust_counter (count, i);
22699 emit_label (label);
22700 LABEL_NUSES (label) = 1;
22701 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22704 return destmem;
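/* Minimal model of the runtime alignment prologue above (an illustrative
   sketch; DST_ADDR stands for the numeric value of the destination
   pointer).  One test-and-copy per power of two brings the destination up
   to DESIRED_ALIGNMENT.  */

static unsigned long
align_prologue_bytes_example (unsigned long dst_addr, int desired_alignment)
{
  unsigned long copied = 0;
  int i;

  for (i = 1; i < desired_alignment; i <<= 1)
    if ((dst_addr + copied) & i)
      copied += i;                    /* copy I bytes and advance */
  return copied;
}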
22707 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22708 ALIGN_BYTES is how many bytes need to be copied.
22709 The function updates DST and SRC, namely, it sets their proper alignment.
22710 DST is returned via the return value, SRC is updated via the pointer SRCP. */
22711 static rtx
22712 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22713 int desired_align, int align_bytes)
22715 rtx src = *srcp;
22716 rtx orig_dst = dst;
22717 rtx orig_src = src;
22718 int piece_size = 1;
22719 int copied_bytes = 0;
22720 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22721 if (src_align_bytes >= 0)
22722 src_align_bytes = desired_align - src_align_bytes;
22724 for (piece_size = 1;
22725 piece_size <= desired_align && copied_bytes < align_bytes;
22726 piece_size <<= 1)
22728 if (align_bytes & piece_size)
22730 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
22731 copied_bytes += piece_size;
22735 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22736 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22737 if (src_align_bytes >= 0)
22739 unsigned int src_align;
22740 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
22742 if ((src_align_bytes & (src_align - 1))
22743 == (align_bytes & (src_align - 1)))
22744 break;
22746 if (src_align > (unsigned int) desired_align)
22747 src_align = desired_align;
22748 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22749 set_mem_align (src, src_align * BITS_PER_UNIT);
22751 if (MEM_SIZE_KNOWN_P (orig_dst))
22752 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22753 if (MEM_SIZE_KNOWN_P (orig_src))
22754 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22755 *srcp = src;
22756 return dst;
22759 /* Store enough at DEST to align DEST, known to be aligned by ALIGN, to
22760 DESIRED_ALIGNMENT. */
22761 static void
22762 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22763 int align, int desired_alignment)
22765 if (align <= 1 && desired_alignment > 1)
22767 rtx label = ix86_expand_aligntest (destptr, 1, false);
22768 destmem = change_address (destmem, QImode, destptr);
22769 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22770 ix86_adjust_counter (count, 1);
22771 emit_label (label);
22772 LABEL_NUSES (label) = 1;
22774 if (align <= 2 && desired_alignment > 2)
22776 rtx label = ix86_expand_aligntest (destptr, 2, false);
22777 destmem = change_address (destmem, HImode, destptr);
22778 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22779 ix86_adjust_counter (count, 2);
22780 emit_label (label);
22781 LABEL_NUSES (label) = 1;
22783 if (align <= 4 && desired_alignment > 4)
22785 rtx label = ix86_expand_aligntest (destptr, 4, false);
22786 destmem = change_address (destmem, SImode, destptr);
22787 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22788 ix86_adjust_counter (count, 4);
22789 emit_label (label);
22790 LABEL_NUSES (label) = 1;
22792 gcc_assert (desired_alignment <= 8);
22795 /* Store enough at DST to align DST, known to be aligned by ALIGN, to
22796 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22797 static rtx
22798 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22799 int desired_align, int align_bytes)
22801 int off = 0;
22802 rtx orig_dst = dst;
22803 if (align_bytes & 1)
22805 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22806 off = 1;
22807 emit_insn (gen_strset (destreg, dst,
22808 gen_lowpart (QImode, value)));
22810 if (align_bytes & 2)
22812 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22813 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22814 set_mem_align (dst, 2 * BITS_PER_UNIT);
22815 off = 2;
22816 emit_insn (gen_strset (destreg, dst,
22817 gen_lowpart (HImode, value)));
22819 if (align_bytes & 4)
22821 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22822 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22823 set_mem_align (dst, 4 * BITS_PER_UNIT);
22824 off = 4;
22825 emit_insn (gen_strset (destreg, dst,
22826 gen_lowpart (SImode, value)));
22828 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22829 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22830 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22831 if (MEM_SIZE_KNOWN_P (orig_dst))
22832 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22833 return dst;
22836 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
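/* Worked illustration of the table walk below (hypothetical numbers, not
   the real cost tables): with entries { {256, loop}, {-1, rep_prefix_4_byte} }
   and an EXPECTED_SIZE of 64, the first entry already covers the size, so
   the loop algorithm is chosen; for an EXPECTED_SIZE of 4096 the walk falls
   through to the unbounded rep_prefix_4_byte entry.  */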
22837 static enum stringop_alg
22838 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22839 int *dynamic_check, bool *noalign)
22841 const struct stringop_algs * algs;
22842 bool optimize_for_speed;
22843 /* Algorithms using the rep prefix want at least edi and ecx;
22844 additionally, memset wants eax and memcpy wants esi. Don't
22845 consider such algorithms if the user has appropriated those
22846 registers for their own purposes. */
22847 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22848 || (memset
22849 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22850 *noalign = false;
22852 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22853 || (alg != rep_prefix_1_byte \
22854 && alg != rep_prefix_4_byte \
22855 && alg != rep_prefix_8_byte))
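/* Editorial note (not part of the original sources): a minimal sketch of
   user code that makes rep_prefix_usable above come out false.  A global
   register variable reserves its register for the whole translation unit,
   which is reflected in fixed_regs[]; the variable name is hypothetical:

     register int my_reserved_counter asm ("ecx");

   With ecx appropriated this way, only the loop, unrolled_loop, vector_loop
   and libcall algorithms remain eligible.  */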
22856 const struct processor_costs *cost;
22858 /* Even if the string operation call is cold, we still might spend a lot
22859 of time processing large blocks. */
22860 if (optimize_function_for_size_p (cfun)
22861 || (optimize_insn_for_size_p ()
22862 && expected_size != -1 && expected_size < 256))
22863 optimize_for_speed = false;
22864 else
22865 optimize_for_speed = true;
22867 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22869 *dynamic_check = -1;
22870 if (memset)
22871 algs = &cost->memset[TARGET_64BIT != 0];
22872 else
22873 algs = &cost->memcpy[TARGET_64BIT != 0];
22874 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22875 return ix86_stringop_alg;
22876 /* rep; movq or rep; movl is the smallest variant. */
22877 else if (!optimize_for_speed)
22879 if (!count || (count & 3))
22880 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22881 else
22882 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22884 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22886 else if (expected_size != -1 && expected_size < 4)
22887 return loop_1_byte;
22888 else if (expected_size != -1)
22890 unsigned int i;
22891 enum stringop_alg alg = libcall;
22892 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22894 /* We get here if the algorithms that were not libcall-based
22895 were rep-prefix based and we are unable to use rep prefixes
22896 based on global register usage. Break out of the loop and
22897 use the heuristic below. */
22898 if (algs->size[i].max == 0)
22899 break;
22900 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22902 enum stringop_alg candidate = algs->size[i].alg;
22904 if (candidate != libcall && ALG_USABLE_P (candidate))
22905 alg = candidate;
22906 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22907 last non-libcall inline algorithm. */
22908 if (TARGET_INLINE_ALL_STRINGOPS)
22910 /* When the current size is best copied by a libcall, but we
22911 are still forced to inline, run the heuristic below
22912 that will pick code for medium-sized blocks. */
22913 if (alg != libcall)
22914 return alg;
22915 break;
22917 else if (ALG_USABLE_P (candidate))
22919 *noalign = algs->size[i].noalign;
22920 return candidate;
22924 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22926 /* When asked to inline the call anyway, try to pick a meaningful choice.
22927 We look for the maximal size of block that is faster to copy by hand and
22928 take blocks of at most that size, guessing that the average size will
22929 be roughly half of the block.
22931 If this turns out to be bad, we might simply specify the preferred
22932 choice in ix86_costs. */
22933 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22934 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22936 int max = -1;
22937 enum stringop_alg alg;
22938 int i;
22939 bool any_alg_usable_p = true;
22941 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22943 enum stringop_alg candidate = algs->size[i].alg;
22944 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22946 if (candidate != libcall && candidate
22947 && ALG_USABLE_P (candidate))
22948 max = algs->size[i].max;
22950 /* If there aren't any usable algorithms, then recursing on
22951 smaller sizes isn't going to find anything. Just return the
22952 simple byte-at-a-time copy loop. */
22953 if (!any_alg_usable_p)
22955 /* Pick something reasonable. */
22956 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22957 *dynamic_check = 128;
22958 return loop_1_byte;
22960 if (max == -1)
22961 max = 4096;
22962 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22963 gcc_assert (*dynamic_check == -1);
22964 gcc_assert (alg != libcall);
22965 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22966 *dynamic_check = max;
22967 return alg;
22969 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22970 #undef ALG_USABLE_P
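/* Editorial note: an illustrative walk through the size table above, using
   a hypothetical stringop_algs entry (the real tables live in the
   processor_costs structures):

     {libcall, {{256, loop, false}, {8192, rep_prefix_4_byte, false},
                {-1, libcall, false}}}

   With expected_size == 1000 and rep prefixes usable, the loop skips the
   first entry (256 < 1000), accepts the second (8192 >= 1000) and returns
   rep_prefix_4_byte; with expected_size == -1 the unknown_size algorithm
   (here libcall) is consulted instead, subject to ALG_USABLE_P.  */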
22973 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22974 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22975 static int
22976 decide_alignment (int align,
22977 enum stringop_alg alg,
22978 int expected_size,
22979 enum machine_mode move_mode)
22981 int desired_align = 0;
22983 gcc_assert (alg != no_stringop);
22985 if (alg == libcall)
22986 return 0;
22987 if (move_mode == VOIDmode)
22988 return 0;
22990 desired_align = GET_MODE_SIZE (move_mode);
22991 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22992 copying a whole cacheline at once. */
22993 if (TARGET_PENTIUMPRO
22994 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
22995 desired_align = 8;
22997 if (optimize_size)
22998 desired_align = 1;
22999 if (desired_align < align)
23000 desired_align = align;
23001 if (expected_size != -1 && expected_size < 4)
23002 desired_align = align;
23004 return desired_align;
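/* Editorial note: e.g. for alg == rep_prefix_4_byte the move mode is SImode,
   so the default desired alignment is 4; -mtune=pentiumpro bumps it to 8 for
   the rep_prefix_1_byte/rep_prefix_4_byte algorithms, while optimizing for
   size, or a known tiny expected size, falls back to the incoming ALIGN.  */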
23007 /* Expand string move (memcpy) operation. Use i386 string operations
23008 when profitable. expand_setmem contains similar code. The code
23009 depends upon architecture, block size and alignment, but always has
23010 the same overall structure:
23012 1) Prologue guard: Conditional that jumps up to epilogues for small
23013 blocks that can be handled by epilogue alone. This is faster
23014 but also needed for correctness, since the prologue assumes the block
23015 is larger than the desired alignment.
23017 Optional dynamic check for size and libcall for large
23018 blocks is emitted here too, with -minline-stringops-dynamically.
23020 2) Prologue: copy first few bytes in order to get destination
23021 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23022 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23023 copied. We emit either a jump tree on power of two sized
23024 blocks, or a byte loop.
23026 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23027 with specified algorithm.
23029 4) Epilogue: code copying tail of the block that is too small to be
23030 handled by main body (or up to size guarded by prologue guard). */
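/* Editorial sketch (not from the sources): in C-like pseudocode the code
   emitted below for a non-constant count behaves roughly as:

     if (count < epilogue_size_needed)          // 1) prologue guard
       goto epilogue;
     while (dst % desired_align != 0)           // 2) alignment prologue
       { copy 1 byte; count--; }
     while (count >= size_needed)               // 3) main loop
       { copy size_needed bytes; count -= size_needed; }
   epilogue:                                    // 4) epilogue
     copy (count & (epilogue_size_needed - 1)) bytes;
*/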
23032 bool
23033 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
23034 rtx expected_align_exp, rtx expected_size_exp)
23036 rtx destreg;
23037 rtx srcreg;
23038 rtx label = NULL;
23039 rtx tmp;
23040 rtx jump_around_label = NULL;
23041 HOST_WIDE_INT align = 1;
23042 unsigned HOST_WIDE_INT count = 0;
23043 HOST_WIDE_INT expected_size = -1;
23044 int size_needed = 0, epilogue_size_needed;
23045 int desired_align = 0, align_bytes = 0;
23046 enum stringop_alg alg;
23047 int dynamic_check;
23048 bool need_zero_guard = false;
23049 bool noalign;
23050 enum machine_mode move_mode = VOIDmode;
23051 int unroll_factor = 1;
23053 if (CONST_INT_P (align_exp))
23054 align = INTVAL (align_exp);
23055 /* i386 can do misaligned access at a reasonably increased cost. */
23056 if (CONST_INT_P (expected_align_exp)
23057 && INTVAL (expected_align_exp) > align)
23058 align = INTVAL (expected_align_exp);
23059 /* ALIGN is the minimum of destination and source alignment, but we care here
23060 just about destination alignment. */
23061 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23062 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23064 if (CONST_INT_P (count_exp))
23065 count = expected_size = INTVAL (count_exp);
23066 if (CONST_INT_P (expected_size_exp) && count == 0)
23067 expected_size = INTVAL (expected_size_exp);
23069 /* Make sure we don't need to care about overflow later on. */
23070 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23071 return false;
23073 /* Step 0: Decide on preferred algorithm, desired alignment and
23074 size of chunks to be copied by main loop. */
23075 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
23076 if (alg == libcall)
23077 return false;
23078 gcc_assert (alg != no_stringop);
23080 if (!count)
23081 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23082 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23083 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23085 unroll_factor = 1;
23086 move_mode = word_mode;
23087 switch (alg)
23089 case libcall:
23090 case no_stringop:
23091 case last_alg:
23092 gcc_unreachable ();
23093 case loop_1_byte:
23094 need_zero_guard = true;
23095 move_mode = QImode;
23096 break;
23097 case loop:
23098 need_zero_guard = true;
23099 break;
23100 case unrolled_loop:
23101 need_zero_guard = true;
23102 unroll_factor = (TARGET_64BIT ? 4 : 2);
23103 break;
23104 case vector_loop:
23105 need_zero_guard = true;
23106 unroll_factor = 4;
23107 /* Find the widest supported mode. */
23108 move_mode = word_mode;
23109 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23110 != CODE_FOR_nothing)
23111 move_mode = GET_MODE_WIDER_MODE (move_mode);
23113 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23114 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23115 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23117 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23118 move_mode = mode_for_vector (word_mode, nunits);
23119 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23120 move_mode = word_mode;
23122 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23123 break;
23124 case rep_prefix_8_byte:
23125 move_mode = DImode;
23126 break;
23127 case rep_prefix_4_byte:
23128 move_mode = SImode;
23129 break;
23130 case rep_prefix_1_byte:
23131 move_mode = QImode;
23132 break;
23134 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23135 epilogue_size_needed = size_needed;
23137 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23138 if (!TARGET_ALIGN_STRINGOPS || noalign)
23139 align = desired_align;
23141 /* Step 1: Prologue guard. */
23143 /* Alignment code needs count to be in register. */
23144 if (CONST_INT_P (count_exp) && desired_align > align)
23146 if (INTVAL (count_exp) > desired_align
23147 && INTVAL (count_exp) > size_needed)
23149 align_bytes
23150 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23151 if (align_bytes <= 0)
23152 align_bytes = 0;
23153 else
23154 align_bytes = desired_align - align_bytes;
23156 if (align_bytes == 0)
23157 count_exp = force_reg (counter_mode (count_exp), count_exp);
23159 gcc_assert (desired_align >= 1 && align >= 1);
23161 /* Ensure that alignment prologue won't copy past end of block. */
23162 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23164 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23165 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23166 Make sure it is a power of 2. */
23167 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23169 if (count)
23171 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23173 /* If main algorithm works on QImode, no epilogue is needed.
23174 For small sizes just don't align anything. */
23175 if (size_needed == 1)
23176 desired_align = align;
23177 else
23178 goto epilogue;
23181 else
23183 label = gen_label_rtx ();
23184 emit_cmp_and_jump_insns (count_exp,
23185 GEN_INT (epilogue_size_needed),
23186 LTU, 0, counter_mode (count_exp), 1, label);
23187 if (expected_size == -1 || expected_size < epilogue_size_needed)
23188 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23189 else
23190 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23194 /* Emit code to decide at runtime whether a library call or inline code
23195 should be used. */
23196 if (dynamic_check != -1)
23198 if (CONST_INT_P (count_exp))
23200 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23202 emit_block_move_via_libcall (dst, src, count_exp, false);
23203 count_exp = const0_rtx;
23204 goto epilogue;
23207 else
23209 rtx hot_label = gen_label_rtx ();
23210 jump_around_label = gen_label_rtx ();
23211 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23212 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23213 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23214 emit_block_move_via_libcall (dst, src, count_exp, false);
23215 emit_jump (jump_around_label);
23216 emit_label (hot_label);
23220 /* Step 2: Alignment prologue. */
23222 if (desired_align > align)
23224 if (align_bytes == 0)
23226 /* Except for the first move in the epilogue, we no longer know
23227 the constant offset in the aliasing info. It doesn't seem worth
23228 the pain to maintain it for the first move, so throw away
23229 the info early. */
23230 src = change_address (src, BLKmode, srcreg);
23231 dst = change_address (dst, BLKmode, destreg);
23232 dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
23233 desired_align);
23235 else
23237 /* If we know how many bytes need to be stored before dst is
23238 sufficiently aligned, maintain aliasing info accurately. */
23239 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
23240 desired_align, align_bytes);
23241 count_exp = plus_constant (counter_mode (count_exp),
23242 count_exp, -align_bytes);
23243 count -= align_bytes;
23245 if (need_zero_guard
23246 && (count < (unsigned HOST_WIDE_INT) size_needed
23247 || (align_bytes == 0
23248 && count < ((unsigned HOST_WIDE_INT) size_needed
23249 + desired_align - align))))
23251 /* It is possible that we copied enough so the main loop will not
23252 execute. */
23253 gcc_assert (size_needed > 1);
23254 if (label == NULL_RTX)
23255 label = gen_label_rtx ();
23256 emit_cmp_and_jump_insns (count_exp,
23257 GEN_INT (size_needed),
23258 LTU, 0, counter_mode (count_exp), 1, label);
23259 if (expected_size == -1
23260 || expected_size < (desired_align - align) / 2 + size_needed)
23261 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23262 else
23263 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23266 if (label && size_needed == 1)
23268 emit_label (label);
23269 LABEL_NUSES (label) = 1;
23270 label = NULL;
23271 epilogue_size_needed = 1;
23273 else if (label == NULL_RTX)
23274 epilogue_size_needed = size_needed;
23276 /* Step 3: Main loop. */
23278 switch (alg)
23280 case libcall:
23281 case no_stringop:
23282 case last_alg:
23283 gcc_unreachable ();
23284 case loop_1_byte:
23285 case loop:
23286 case unrolled_loop:
23287 case vector_loop:
23288 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23289 count_exp, move_mode, unroll_factor,
23290 expected_size);
23291 break;
23292 case rep_prefix_8_byte:
23293 case rep_prefix_4_byte:
23294 case rep_prefix_1_byte:
23295 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23296 move_mode);
23297 break;
23299 /* Properly adjust the offsets of src and dest memory for aliasing. */
23300 if (CONST_INT_P (count_exp))
23302 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23303 (count / size_needed) * size_needed);
23304 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23305 (count / size_needed) * size_needed);
23307 else
23309 src = change_address (src, BLKmode, srcreg);
23310 dst = change_address (dst, BLKmode, destreg);
23313 /* Step 4: Epilogue to copy the remaining bytes. */
23314 epilogue:
23315 if (label)
23317 /* When the main loop is done, COUNT_EXP might hold the original count,
23318 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23319 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23320 bytes. Compensate if needed. */
23322 if (size_needed < epilogue_size_needed)
23324 tmp =
23325 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23326 GEN_INT (size_needed - 1), count_exp, 1,
23327 OPTAB_DIRECT);
23328 if (tmp != count_exp)
23329 emit_move_insn (count_exp, tmp);
23331 emit_label (label);
23332 LABEL_NUSES (label) = 1;
23335 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23336 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23337 epilogue_size_needed);
23338 if (jump_around_label)
23339 emit_label (jump_around_label);
23340 return true;
23343 /* Helper function for memset. For QImode value 0xXY produce
23344 0xXYXYXYXY of the width specified by MODE. This is essentially
23345 a * 0x01010101, but we can do slightly better than
23346 synth_mult by unwinding the sequence by hand on CPUs with
23347 slow multiply. */
23348 static rtx
23349 promote_duplicated_reg (enum machine_mode mode, rtx val)
23351 enum machine_mode valmode = GET_MODE (val);
23352 rtx tmp;
23353 int nops = mode == DImode ? 3 : 2;
23355 gcc_assert (mode == SImode || mode == DImode);
23356 if (val == const0_rtx)
23357 return copy_to_mode_reg (mode, const0_rtx);
23358 if (CONST_INT_P (val))
23360 HOST_WIDE_INT v = INTVAL (val) & 255;
23362 v |= v << 8;
23363 v |= v << 16;
23364 if (mode == DImode)
23365 v |= (v << 16) << 16;
23366 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23369 if (valmode == VOIDmode)
23370 valmode = QImode;
23371 if (valmode != QImode)
23372 val = gen_lowpart (QImode, val);
23373 if (mode == QImode)
23374 return val;
23375 if (!TARGET_PARTIAL_REG_STALL)
23376 nops--;
23377 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23378 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23379 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23380 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23382 rtx reg = convert_modes (mode, QImode, val, true);
23383 tmp = promote_duplicated_reg (mode, const1_rtx);
23384 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23385 OPTAB_DIRECT);
23387 else
23389 rtx reg = convert_modes (mode, QImode, val, true);
23391 if (!TARGET_PARTIAL_REG_STALL)
23392 if (mode == SImode)
23393 emit_insn (gen_movsi_insv_1 (reg, reg));
23394 else
23395 emit_insn (gen_movdi_insv_1 (reg, reg));
23396 else
23398 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23399 NULL, 1, OPTAB_DIRECT);
23400 reg =
23401 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23403 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23404 NULL, 1, OPTAB_DIRECT);
23405 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23406 if (mode == SImode)
23407 return reg;
23408 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23409 NULL, 1, OPTAB_DIRECT);
23410 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23411 return reg;
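/* Editorial note: a standalone C sketch (illustrative only) of the byte
   replication implemented above for the SImode case, assuming a 32-bit
   unsigned int:

     unsigned int
     replicate_byte (unsigned char b)
     {
       unsigned int v = b;   /* 0x000000XY                      */
       v |= v << 8;          /* 0x0000XYXY                      */
       v |= v << 16;         /* 0xXYXYXYXY, i.e. b * 0x01010101U */
       return v;
     }
*/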
23415 /* Duplicate value VAL using promote_duplicated_reg to the maximal size that
23416 will be needed by the main loop copying SIZE_NEEDED chunks and by the
23417 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
23418 static rtx
23419 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23421 rtx promoted_val;
23423 if (TARGET_64BIT
23424 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23425 promoted_val = promote_duplicated_reg (DImode, val);
23426 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23427 promoted_val = promote_duplicated_reg (SImode, val);
23428 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23429 promoted_val = promote_duplicated_reg (HImode, val);
23430 else
23431 promoted_val = val;
23433 return promoted_val;
23436 /* Expand string clear operation (bzero). Use i386 string operations when
23437 profitable. See expand_movmem comment for explanation of individual
23438 steps performed. */
23439 bool
23440 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23441 rtx expected_align_exp, rtx expected_size_exp)
23443 rtx destreg;
23444 rtx label = NULL;
23445 rtx tmp;
23446 rtx jump_around_label = NULL;
23447 HOST_WIDE_INT align = 1;
23448 unsigned HOST_WIDE_INT count = 0;
23449 HOST_WIDE_INT expected_size = -1;
23450 int size_needed = 0, epilogue_size_needed;
23451 int desired_align = 0, align_bytes = 0;
23452 enum stringop_alg alg;
23453 rtx promoted_val = NULL;
23454 bool force_loopy_epilogue = false;
23455 int dynamic_check;
23456 bool need_zero_guard = false;
23457 bool noalign;
23458 enum machine_mode move_mode = VOIDmode;
23459 int unroll_factor;
23461 if (CONST_INT_P (align_exp))
23462 align = INTVAL (align_exp);
23463 /* i386 can do misaligned access at a reasonably increased cost. */
23464 if (CONST_INT_P (expected_align_exp)
23465 && INTVAL (expected_align_exp) > align)
23466 align = INTVAL (expected_align_exp);
23467 if (CONST_INT_P (count_exp))
23468 count = expected_size = INTVAL (count_exp);
23469 if (CONST_INT_P (expected_size_exp) && count == 0)
23470 expected_size = INTVAL (expected_size_exp);
23472 /* Make sure we don't need to care about overflow later on. */
23473 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23474 return false;
23476 /* Step 0: Decide on preferred algorithm, desired alignment and
23477 size of chunks to be copied by main loop. */
23479 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23480 if (alg == libcall)
23481 return false;
23482 gcc_assert (alg != no_stringop);
23484 if (!count)
23485 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23486 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23488 move_mode = word_mode;
23489 unroll_factor = 1;
23490 switch (alg)
23492 case libcall:
23493 case no_stringop:
23494 case last_alg:
23495 gcc_unreachable ();
23496 case loop:
23497 need_zero_guard = true;
23498 break;
23499 case vector_loop:
23500 case unrolled_loop:
23501 need_zero_guard = true;
23502 unroll_factor = 4;
23503 break;
23504 case rep_prefix_8_byte:
23505 move_mode = DImode;
23506 break;
23507 case rep_prefix_4_byte:
23508 move_mode = SImode;
23509 break;
23510 case rep_prefix_1_byte:
23511 move_mode = QImode;
23512 break;
23513 case loop_1_byte:
23514 need_zero_guard = true;
23515 move_mode = QImode;
23516 break;
23518 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23519 epilogue_size_needed = size_needed;
23521 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23522 if (!TARGET_ALIGN_STRINGOPS || noalign)
23523 align = desired_align;
23525 /* Step 1: Prologue guard. */
23527 /* Alignment code needs count to be in register. */
23528 if (CONST_INT_P (count_exp) && desired_align > align)
23530 if (INTVAL (count_exp) > desired_align
23531 && INTVAL (count_exp) > size_needed)
23533 align_bytes
23534 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23535 if (align_bytes <= 0)
23536 align_bytes = 0;
23537 else
23538 align_bytes = desired_align - align_bytes;
23540 if (align_bytes == 0)
23542 enum machine_mode mode = SImode;
23543 if (TARGET_64BIT && (count & ~0xffffffff))
23544 mode = DImode;
23545 count_exp = force_reg (mode, count_exp);
23548 /* Do the cheap promotion to allow better CSE across the
23549 main loop and epilogue (i.e. one load of the big constant in
23550 front of all the code). */
23551 if (CONST_INT_P (val_exp))
23552 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23553 desired_align, align);
23554 /* Ensure that alignment prologue won't copy past end of block. */
23555 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23557 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23558 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23559 Make sure it is power of 2. */
23560 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23562 /* To improve performance of small blocks, we jump around the VAL
23563 promoting code. This means that if the promoted VAL is not constant,
23564 we might not use it in the epilogue and have to use the byte
23565 loop variant. */
23566 if (epilogue_size_needed > 2 && !promoted_val)
23567 force_loopy_epilogue = true;
23568 if (count)
23570 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23572 /* If main algorithm works on QImode, no epilogue is needed.
23573 For small sizes just don't align anything. */
23574 if (size_needed == 1)
23575 desired_align = align;
23576 else
23577 goto epilogue;
23580 else
23582 label = gen_label_rtx ();
23583 emit_cmp_and_jump_insns (count_exp,
23584 GEN_INT (epilogue_size_needed),
23585 LTU, 0, counter_mode (count_exp), 1, label);
23586 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23587 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23588 else
23589 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23592 if (dynamic_check != -1)
23594 rtx hot_label = gen_label_rtx ();
23595 jump_around_label = gen_label_rtx ();
23596 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23597 LEU, 0, counter_mode (count_exp), 1, hot_label);
23598 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23599 set_storage_via_libcall (dst, count_exp, val_exp, false);
23600 emit_jump (jump_around_label);
23601 emit_label (hot_label);
23604 /* Step 2: Alignment prologue. */
23606 /* Do the expensive promotion once we have branched off the small blocks. */
23607 if (!promoted_val)
23608 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23609 desired_align, align);
23610 gcc_assert (desired_align >= 1 && align >= 1);
23612 if (desired_align > align)
23614 if (align_bytes == 0)
23616 /* Except for the first move in the epilogue, we no longer know
23617 the constant offset in the aliasing info. It doesn't seem worth
23618 the pain to maintain it for the first move, so throw away
23619 the info early. */
23620 dst = change_address (dst, BLKmode, destreg);
23621 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23622 desired_align);
23624 else
23626 /* If we know how many bytes need to be stored before dst is
23627 sufficiently aligned, maintain aliasing info accurately. */
23628 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23629 desired_align, align_bytes);
23630 count_exp = plus_constant (counter_mode (count_exp),
23631 count_exp, -align_bytes);
23632 count -= align_bytes;
23634 if (need_zero_guard
23635 && (count < (unsigned HOST_WIDE_INT) size_needed
23636 || (align_bytes == 0
23637 && count < ((unsigned HOST_WIDE_INT) size_needed
23638 + desired_align - align))))
23640 /* It is possible that we copied enough so the main loop will not
23641 execute. */
23642 gcc_assert (size_needed > 1);
23643 if (label == NULL_RTX)
23644 label = gen_label_rtx ();
23645 emit_cmp_and_jump_insns (count_exp,
23646 GEN_INT (size_needed),
23647 LTU, 0, counter_mode (count_exp), 1, label);
23648 if (expected_size == -1
23649 || expected_size < (desired_align - align) / 2 + size_needed)
23650 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23651 else
23652 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23655 if (label && size_needed == 1)
23657 emit_label (label);
23658 LABEL_NUSES (label) = 1;
23659 label = NULL;
23660 promoted_val = val_exp;
23661 epilogue_size_needed = 1;
23663 else if (label == NULL_RTX)
23664 epilogue_size_needed = size_needed;
23666 /* Step 3: Main loop. */
23668 switch (alg)
23670 case libcall:
23671 case no_stringop:
23672 case last_alg:
23673 gcc_unreachable ();
23674 case loop_1_byte:
23675 case loop:
23676 case vector_loop:
23677 case unrolled_loop:
23678 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23679 count_exp, move_mode, unroll_factor,
23680 expected_size);
23681 break;
23682 case rep_prefix_8_byte:
23683 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23684 DImode, val_exp);
23685 break;
23686 case rep_prefix_4_byte:
23687 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23688 SImode, val_exp);
23689 break;
23690 case rep_prefix_1_byte:
23691 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23692 QImode, val_exp);
23693 break;
23695 /* Properly adjust the offset of dest memory for aliasing. */
23696 if (CONST_INT_P (count_exp))
23697 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23698 (count / size_needed) * size_needed);
23699 else
23700 dst = change_address (dst, BLKmode, destreg);
23702 /* Step 4: Epilogue to copy the remaining bytes. */
23704 if (label)
23706 /* When the main loop is done, COUNT_EXP might hold the original count,
23707 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23708 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23709 bytes. Compensate if needed. */
23711 if (size_needed < epilogue_size_needed)
23713 tmp =
23714 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23715 GEN_INT (size_needed - 1), count_exp, 1,
23716 OPTAB_DIRECT);
23717 if (tmp != count_exp)
23718 emit_move_insn (count_exp, tmp);
23720 emit_label (label);
23721 LABEL_NUSES (label) = 1;
23723 epilogue:
23724 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23726 if (force_loopy_epilogue)
23727 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23728 epilogue_size_needed);
23729 else
23730 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23731 epilogue_size_needed);
23733 if (jump_around_label)
23734 emit_label (jump_around_label);
23735 return true;
23738 /* Expand the appropriate insns for doing strlen if not just doing
23739 repnz; scasb
23741 out = result, initialized with the start address
23742 align_rtx = alignment of the address.
23743 scratch = scratch register, initialized with the start address when
23744 not aligned, otherwise undefined
23746 This is just the body. It needs the initializations mentioned above and
23747 some address computing at the end. These things are done in i386.md. */
23749 static void
23750 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23752 int align;
23753 rtx tmp;
23754 rtx align_2_label = NULL_RTX;
23755 rtx align_3_label = NULL_RTX;
23756 rtx align_4_label = gen_label_rtx ();
23757 rtx end_0_label = gen_label_rtx ();
23758 rtx mem;
23759 rtx tmpreg = gen_reg_rtx (SImode);
23760 rtx scratch = gen_reg_rtx (SImode);
23761 rtx cmp;
23763 align = 0;
23764 if (CONST_INT_P (align_rtx))
23765 align = INTVAL (align_rtx);
23767 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23769 /* Is there a known alignment and is it less than 4? */
23770 if (align < 4)
23772 rtx scratch1 = gen_reg_rtx (Pmode);
23773 emit_move_insn (scratch1, out);
23774 /* Is there a known alignment and is it not 2? */
23775 if (align != 2)
23777 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23778 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23780 /* Leave just the 3 lower bits. */
23781 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23782 NULL_RTX, 0, OPTAB_WIDEN);
23784 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23785 Pmode, 1, align_4_label);
23786 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23787 Pmode, 1, align_2_label);
23788 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23789 Pmode, 1, align_3_label);
23791 else
23793 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23794 check if it is aligned to 4 bytes. */
23796 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23797 NULL_RTX, 0, OPTAB_WIDEN);
23799 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23800 Pmode, 1, align_4_label);
23803 mem = change_address (src, QImode, out);
23805 /* Now compare the bytes. */
23807 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
23808 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23809 QImode, 1, end_0_label);
23811 /* Increment the address. */
23812 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23814 /* Not needed with an alignment of 2 */
23815 if (align != 2)
23817 emit_label (align_2_label);
23819 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23820 end_0_label);
23822 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23824 emit_label (align_3_label);
23827 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23828 end_0_label);
23830 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23833 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23834 align this loop: it only makes the code larger and does not help
23835 to speed it up. */
23836 emit_label (align_4_label);
23838 mem = change_address (src, SImode, out);
23839 emit_move_insn (scratch, mem);
23840 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23842 /* This formula yields a nonzero result iff one of the bytes is zero.
23843 This saves three branches inside the loop and many cycles. */
23845 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23846 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23847 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23848 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23849 gen_int_mode (0x80808080, SImode)));
23850 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23851 align_4_label);
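/* Editorial note: the sequence above computes the classic zero-byte test,
   equivalent to this standalone C sketch (illustrative only):

     int
     has_zero_byte (unsigned int v)
     {
       return ((v - 0x01010101U) & ~v & 0x80808080U) != 0;
     }

   E.g. for v = 0x11003344: v - 0x01010101 = 0x0FFF3243, ~v = 0xEEFFCCBB,
   and ANDing with 0x80808080 leaves 0x00800000, flagging the zero byte;
   for v = 0x11223344 the expression is 0.  */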
23853 if (TARGET_CMOVE)
23855 rtx reg = gen_reg_rtx (SImode);
23856 rtx reg2 = gen_reg_rtx (Pmode);
23857 emit_move_insn (reg, tmpreg);
23858 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23860 /* If zero is not in the first two bytes, move two bytes forward. */
23861 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23862 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23863 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23864 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23865 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23866 reg,
23867 tmpreg)));
23868 /* Emit lea manually to avoid clobbering of flags. */
23869 emit_insn (gen_rtx_SET (SImode, reg2,
23870 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23872 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23873 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23874 emit_insn (gen_rtx_SET (VOIDmode, out,
23875 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23876 reg2,
23877 out)));
23879 else
23881 rtx end_2_label = gen_label_rtx ();
23882 /* Is zero in the first two bytes? */
23884 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23885 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23886 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23887 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23888 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23889 pc_rtx);
23890 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23891 JUMP_LABEL (tmp) = end_2_label;
23893 /* Not in the first two. Move two bytes forward. */
23894 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23895 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23897 emit_label (end_2_label);
23901 /* Avoid branch in fixing the byte. */
23902 tmpreg = gen_lowpart (QImode, tmpreg);
23903 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23904 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23905 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23906 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23908 emit_label (end_0_label);
23911 /* Expand strlen. */
23913 bool
23914 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23916 rtx addr, scratch1, scratch2, scratch3, scratch4;
23918 /* The generic case of the strlen expander is long. Avoid expanding it
23919 unless TARGET_INLINE_ALL_STRINGOPS. */
23921 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23922 && !TARGET_INLINE_ALL_STRINGOPS
23923 && !optimize_insn_for_size_p ()
23924 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23925 return false;
23927 addr = force_reg (Pmode, XEXP (src, 0));
23928 scratch1 = gen_reg_rtx (Pmode);
23930 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23931 && !optimize_insn_for_size_p ())
23933 /* Well it seems that some optimizer does not combine a call like
23934 foo(strlen(bar), strlen(bar));
23935 when the move and the subtraction are done here. It does calculate
23936 the length just once when these instructions are done inside of
23937 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23938 often used and I use one fewer register for the lifetime of
23939 output_strlen_unroll() this is better. */
23941 emit_move_insn (out, addr);
23943 ix86_expand_strlensi_unroll_1 (out, src, align);
23945 /* strlensi_unroll_1 returns the address of the zero at the end of
23946 the string, like memchr(), so compute the length by subtracting
23947 the start address. */
23948 emit_insn (ix86_gen_sub3 (out, out, addr));
23950 else
23952 rtx unspec;
23954 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23955 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23956 return false;
23958 scratch2 = gen_reg_rtx (Pmode);
23959 scratch3 = gen_reg_rtx (Pmode);
23960 scratch4 = force_reg (Pmode, constm1_rtx);
23962 emit_move_insn (scratch3, addr);
23963 eoschar = force_reg (QImode, eoschar);
23965 src = replace_equiv_address_nv (src, scratch3);
23967 /* If .md starts supporting :P, this can be done in .md. */
23968 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23969 scratch4), UNSPEC_SCAS);
23970 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23971 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23972 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
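/* Editorial note (an interpretation of the two insns above, not text from
   the original sources): assuming scratch1 receives the final count register
   value from the repnz scasb pattern, the arithmetic recovers the length:
   ecx starts at -1 and is decremented once per scanned byte including the
   terminator, so it ends as -(len + 2); then ~ecx == len + 1, and adding -1
   yields len.  */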
23974 return true;
23977 /* For a given symbol (function) construct code to compute the address of its
23978 PLT entry in the large x86-64 PIC model. */
23979 static rtx
23980 construct_plt_address (rtx symbol)
23982 rtx tmp, unspec;
23984 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23985 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
23986 gcc_assert (Pmode == DImode);
23988 tmp = gen_reg_rtx (Pmode);
23989 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23991 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23992 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23993 return tmp;
23997 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23998 rtx callarg2,
23999 rtx pop, bool sibcall)
24001 unsigned int const cregs_size
24002 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24003 rtx vec[3 + cregs_size];
24004 rtx use = NULL, call;
24005 unsigned int vec_len = 0;
24007 if (pop == const0_rtx)
24008 pop = NULL;
24009 gcc_assert (!TARGET_64BIT || !pop);
24011 if (TARGET_MACHO && !TARGET_64BIT)
24013 #if TARGET_MACHO
24014 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24015 fnaddr = machopic_indirect_call_target (fnaddr);
24016 #endif
24018 else
24020 /* Static functions and indirect calls don't need the pic register. */
24021 if (flag_pic
24022 && (!TARGET_64BIT
24023 || (ix86_cmodel == CM_LARGE_PIC
24024 && DEFAULT_ABI != MS_ABI))
24025 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24026 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24027 use_reg (&use, pic_offset_table_rtx);
24030 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24032 rtx al = gen_rtx_REG (QImode, AX_REG);
24033 emit_move_insn (al, callarg2);
24034 use_reg (&use, al);
24037 if (ix86_cmodel == CM_LARGE_PIC
24038 && !TARGET_PECOFF
24039 && MEM_P (fnaddr)
24040 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24041 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24042 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24043 else if (sibcall
24044 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24045 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24047 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24048 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24051 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24052 if (retval)
24053 call = gen_rtx_SET (VOIDmode, retval, call);
24054 vec[vec_len++] = call;
24056 if (pop)
24058 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24059 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24060 vec[vec_len++] = pop;
24063 if (TARGET_64BIT_MS_ABI
24064 && (!callarg2 || INTVAL (callarg2) != -2))
24066 unsigned i;
24068 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24069 UNSPEC_MS_TO_SYSV_CALL);
24071 for (i = 0; i < cregs_size; i++)
24073 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24074 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24076 vec[vec_len++]
24077 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24081 if (vec_len > 1)
24082 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24083 call = emit_call_insn (call);
24084 if (use)
24085 CALL_INSN_FUNCTION_USAGE (call) = use;
24087 return call;
24090 /* Output the assembly for a call instruction. */
24092 const char *
24093 ix86_output_call_insn (rtx insn, rtx call_op)
24095 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24096 bool seh_nop_p = false;
24097 const char *xasm;
24099 if (SIBLING_CALL_P (insn))
24101 if (direct_p)
24102 xasm = "jmp\t%P0";
24103 /* SEH epilogue detection requires the indirect branch case
24104 to include REX.W. */
24105 else if (TARGET_SEH)
24106 xasm = "rex.W jmp %A0";
24107 else
24108 xasm = "jmp\t%A0";
24110 output_asm_insn (xasm, &call_op);
24111 return "";
24114 /* SEH unwinding can require an extra nop to be emitted in several
24115 circumstances. Determine if we have one of those. */
24116 if (TARGET_SEH)
24118 rtx i;
24120 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24122 /* If we get to another real insn, we don't need the nop. */
24123 if (INSN_P (i))
24124 break;
24126 /* If we get to the epilogue note, prevent a catch region from
24127 being adjacent to the standard epilogue sequence. If non-
24128 call-exceptions, we'll have done this during epilogue emission. */
24129 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24130 && !flag_non_call_exceptions
24131 && !can_throw_internal (insn))
24133 seh_nop_p = true;
24134 break;
24138 /* If we didn't find a real insn following the call, prevent the
24139 unwinder from looking into the next function. */
24140 if (i == NULL)
24141 seh_nop_p = true;
24144 if (direct_p)
24145 xasm = "call\t%P0";
24146 else
24147 xasm = "call\t%A0";
24149 output_asm_insn (xasm, &call_op);
24151 if (seh_nop_p)
24152 return "nop";
24154 return "";
24157 /* Clear stack slot assignments remembered from previous functions.
24158 This is called from INIT_EXPANDERS once before RTL is emitted for each
24159 function. */
24161 static struct machine_function *
24162 ix86_init_machine_status (void)
24164 struct machine_function *f;
24166 f = ggc_alloc_cleared_machine_function ();
24167 f->use_fast_prologue_epilogue_nregs = -1;
24168 f->call_abi = ix86_abi;
24170 return f;
24173 /* Return a MEM corresponding to a stack slot with mode MODE.
24174 Allocate a new slot if necessary.
24176 The RTL for a function can have several slots available: N is
24177 which slot to use. */
24180 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24182 struct stack_local_entry *s;
24184 gcc_assert (n < MAX_386_STACK_LOCALS);
24186 for (s = ix86_stack_locals; s; s = s->next)
24187 if (s->mode == mode && s->n == n)
24188 return validize_mem (copy_rtx (s->rtl));
24190 s = ggc_alloc_stack_local_entry ();
24191 s->n = n;
24192 s->mode = mode;
24193 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24195 s->next = ix86_stack_locals;
24196 ix86_stack_locals = s;
24197 return validize_mem (s->rtl);
24200 static void
24201 ix86_instantiate_decls (void)
24203 struct stack_local_entry *s;
24205 for (s = ix86_stack_locals; s; s = s->next)
24206 if (s->rtl != NULL_RTX)
24207 instantiate_decl_rtl (s->rtl);
24210 /* Calculate the length of the memory address in the instruction encoding.
24211 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24212 or other prefixes. We never generate addr32 prefix for LEA insn. */
24215 memory_address_length (rtx addr, bool lea)
24217 struct ix86_address parts;
24218 rtx base, index, disp;
24219 int len;
24220 int ok;
24222 if (GET_CODE (addr) == PRE_DEC
24223 || GET_CODE (addr) == POST_INC
24224 || GET_CODE (addr) == PRE_MODIFY
24225 || GET_CODE (addr) == POST_MODIFY)
24226 return 0;
24228 ok = ix86_decompose_address (addr, &parts);
24229 gcc_assert (ok);
24231 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24233 /* If this is not LEA instruction, add the length of addr32 prefix. */
24234 if (TARGET_64BIT && !lea
24235 && (SImode_address_operand (addr, VOIDmode)
24236 || (parts.base && GET_MODE (parts.base) == SImode)
24237 || (parts.index && GET_MODE (parts.index) == SImode)))
24238 len++;
24240 base = parts.base;
24241 index = parts.index;
24242 disp = parts.disp;
24244 if (base && GET_CODE (base) == SUBREG)
24245 base = SUBREG_REG (base);
24246 if (index && GET_CODE (index) == SUBREG)
24247 index = SUBREG_REG (index);
24249 gcc_assert (base == NULL_RTX || REG_P (base));
24250 gcc_assert (index == NULL_RTX || REG_P (index));
24252 /* Rule of thumb:
24253 - esp as the base always wants an index,
24254 - ebp as the base always wants a displacement,
24255 - r12 as the base always wants an index,
24256 - r13 as the base always wants a displacement. */
24258 /* Register Indirect. */
24259 if (base && !index && !disp)
24261 /* esp (for its index) and ebp (for its displacement) need
24262 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24263 code. */
24264 if (base == arg_pointer_rtx
24265 || base == frame_pointer_rtx
24266 || REGNO (base) == SP_REG
24267 || REGNO (base) == BP_REG
24268 || REGNO (base) == R12_REG
24269 || REGNO (base) == R13_REG)
24270 len++;
24273 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24274 is not disp32, but disp32(%rip), so for disp32
24275 SIB byte is needed, unless print_operand_address
24276 optimizes it into disp32(%rip) or (%rip) is implied
24277 by UNSPEC. */
24278 else if (disp && !base && !index)
24280 len += 4;
24281 if (TARGET_64BIT)
24283 rtx symbol = disp;
24285 if (GET_CODE (disp) == CONST)
24286 symbol = XEXP (disp, 0);
24287 if (GET_CODE (symbol) == PLUS
24288 && CONST_INT_P (XEXP (symbol, 1)))
24289 symbol = XEXP (symbol, 0);
24291 if (GET_CODE (symbol) != LABEL_REF
24292 && (GET_CODE (symbol) != SYMBOL_REF
24293 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24294 && (GET_CODE (symbol) != UNSPEC
24295 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24296 && XINT (symbol, 1) != UNSPEC_PCREL
24297 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24298 len++;
24301 else
24303 /* Find the length of the displacement constant. */
24304 if (disp)
24306 if (base && satisfies_constraint_K (disp))
24307 len += 1;
24308 else
24309 len += 4;
24311 /* ebp always wants a displacement. Similarly r13. */
24312 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24313 len++;
24315 /* An index requires the two-byte modrm form.... */
24316 if (index
24317 /* ...like esp (or r12), which always wants an index. */
24318 || base == arg_pointer_rtx
24319 || base == frame_pointer_rtx
24320 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24321 len++;
24324 return len;
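/* Editorial note: a few worked examples of the rules above (illustrative,
   not from the sources); the result excludes the modrm byte itself:
     (%rax)          -> 0   (plain base)
     (%rsp), (%r12)  -> 1   (SIB byte)
     8(%rbp)         -> 1   (disp8; %rbp alone has no modrm-00 encoding)
     1024(%rax)      -> 4   (disp32)
     sym(%rip)       -> 4   (disp32)
   plus 1 for a non-default segment and, for non-LEA insns in 64-bit code,
   1 for the addr32 prefix when 32-bit address registers are used.  */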
24327 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24328 is set, expect that the insn has an 8-bit immediate alternative. */
24330 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24332 int len = 0;
24333 int i;
24334 extract_insn_cached (insn);
24335 for (i = recog_data.n_operands - 1; i >= 0; --i)
24336 if (CONSTANT_P (recog_data.operand[i]))
24338 enum attr_mode mode = get_attr_mode (insn);
24340 gcc_assert (!len);
24341 if (shortform && CONST_INT_P (recog_data.operand[i]))
24343 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24344 switch (mode)
24346 case MODE_QI:
24347 len = 1;
24348 continue;
24349 case MODE_HI:
24350 ival = trunc_int_for_mode (ival, HImode);
24351 break;
24352 case MODE_SI:
24353 ival = trunc_int_for_mode (ival, SImode);
24354 break;
24355 default:
24356 break;
24358 if (IN_RANGE (ival, -128, 127))
24360 len = 1;
24361 continue;
24364 switch (mode)
24366 case MODE_QI:
24367 len = 1;
24368 break;
24369 case MODE_HI:
24370 len = 2;
24371 break;
24372 case MODE_SI:
24373 len = 4;
24374 break;
24375 /* Immediates for DImode instructions are encoded
24376 as 32-bit sign-extended values. */
24377 case MODE_DI:
24378 len = 4;
24379 break;
24380 default:
24381 fatal_insn ("unknown insn mode", insn);
24384 return len;
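/* Editorial note, worked examples of the rule above (illustrative only):
   with SHORTFORM set, a MODE_SI insn with immediate 100 uses the imm8
   alternative (length 1), immediate 100000 needs imm32 (length 4), and
   MODE_DI immediates outside the imm8 range are encoded as 32-bit
   sign-extended values (length 4).  */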
24387 /* Compute default value for "length_address" attribute. */
24389 ix86_attr_length_address_default (rtx insn)
24391 int i;
24393 if (get_attr_type (insn) == TYPE_LEA)
24395 rtx set = PATTERN (insn), addr;
24397 if (GET_CODE (set) == PARALLEL)
24398 set = XVECEXP (set, 0, 0);
24400 gcc_assert (GET_CODE (set) == SET);
24402 addr = SET_SRC (set);
24404 return memory_address_length (addr, true);
24407 extract_insn_cached (insn);
24408 for (i = recog_data.n_operands - 1; i >= 0; --i)
24409 if (MEM_P (recog_data.operand[i]))
24411 constrain_operands_cached (reload_completed);
24412 if (which_alternative != -1)
24414 const char *constraints = recog_data.constraints[i];
24415 int alt = which_alternative;
24417 while (*constraints == '=' || *constraints == '+')
24418 constraints++;
24419 while (alt-- > 0)
24420 while (*constraints++ != ',')
24422 /* Skip ignored operands. */
24423 if (*constraints == 'X')
24424 continue;
24426 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24428 return 0;
24431 /* Compute default value for "length_vex" attribute. It includes
24432 2 or 3 byte VEX prefix and 1 opcode byte. */
24435 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24437 int i;
24439 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
24440 requires the 3-byte VEX prefix. */
24441 if (!has_0f_opcode || has_vex_w)
24442 return 3 + 1;
24444 /* We can always use 2 byte VEX prefix in 32bit. */
24445 if (!TARGET_64BIT)
24446 return 2 + 1;
24448 extract_insn_cached (insn);
24450 for (i = recog_data.n_operands - 1; i >= 0; --i)
24451 if (REG_P (recog_data.operand[i]))
24453 /* REX.W bit uses 3 byte VEX prefix. */
24454 if (GET_MODE (recog_data.operand[i]) == DImode
24455 && GENERAL_REG_P (recog_data.operand[i]))
24456 return 3 + 1;
24458 else
24460 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24461 if (MEM_P (recog_data.operand[i])
24462 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24463 return 3 + 1;
24466 return 2 + 1;
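/* Editorial note (illustrative, not from the sources): in 64-bit code
   vaddps %xmm1, %xmm2, %xmm9 still fits the 2-byte VEX prefix, since only
   VEX.R is needed for the extended register, giving 2 + 1; whereas
   vaddps (%r9), %xmm2, %xmm9 needs VEX.B and hence the 3-byte prefix
   (3 + 1), and any DImode general-register operand (VEX.W) likewise
   forces 3 + 1.  */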
24469 /* Return the maximum number of instructions a cpu can issue. */
24471 static int
24472 ix86_issue_rate (void)
24474 switch (ix86_tune)
24476 case PROCESSOR_PENTIUM:
24477 case PROCESSOR_ATOM:
24478 case PROCESSOR_SLM:
24479 case PROCESSOR_K6:
24480 case PROCESSOR_BTVER2:
24481 case PROCESSOR_PENTIUM4:
24482 case PROCESSOR_NOCONA:
24483 return 2;
24485 case PROCESSOR_PENTIUMPRO:
24486 case PROCESSOR_ATHLON:
24487 case PROCESSOR_K8:
24488 case PROCESSOR_AMDFAM10:
24489 case PROCESSOR_GENERIC:
24490 case PROCESSOR_BDVER1:
24491 case PROCESSOR_BDVER2:
24492 case PROCESSOR_BDVER3:
24493 case PROCESSOR_BTVER1:
24494 return 3;
24496 case PROCESSOR_CORE2:
24497 case PROCESSOR_COREI7:
24498 case PROCESSOR_HASWELL:
24499 return 4;
24501 default:
24502 return 1;
24506 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24507 by DEP_INSN and nothing else set by DEP_INSN. */
24509 static bool
24510 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24512 rtx set, set2;
24514 /* Simplify the test for uninteresting insns. */
24515 if (insn_type != TYPE_SETCC
24516 && insn_type != TYPE_ICMOV
24517 && insn_type != TYPE_FCMOV
24518 && insn_type != TYPE_IBR)
24519 return false;
24521 if ((set = single_set (dep_insn)) != 0)
24523 set = SET_DEST (set);
24524 set2 = NULL_RTX;
24526 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24527 && XVECLEN (PATTERN (dep_insn), 0) == 2
24528 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24529 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24531 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24532 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24534 else
24535 return false;
24537 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24538 return false;
24540 /* This test is true if the dependent insn reads the flags but
24541 not any other potentially set register. */
24542 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24543 return false;
24545 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24546 return false;
24548 return true;
24551 /* Return true iff USE_INSN has a memory address with operands set by
24552 SET_INSN. */
24554 bool
24555 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24557 int i;
24558 extract_insn_cached (use_insn);
24559 for (i = recog_data.n_operands - 1; i >= 0; --i)
24560 if (MEM_P (recog_data.operand[i]))
24562 rtx addr = XEXP (recog_data.operand[i], 0);
24563 return modified_in_p (addr, set_insn) != 0;
24565 return false;
24568 /* Helper function for exact_store_load_dependency.
24569 Return true if addr is found in insn. */
24570 static bool
24571 exact_dependency_1 (rtx addr, rtx insn)
24573 enum rtx_code code;
24574 const char *format_ptr;
24575 int i, j;
24577 code = GET_CODE (insn);
24578 switch (code)
24580 case MEM:
24581 if (rtx_equal_p (addr, insn))
24582 return true;
24583 break;
24584 case REG:
24585 CASE_CONST_ANY:
24586 case SYMBOL_REF:
24587 case CODE_LABEL:
24588 case PC:
24589 case CC0:
24590 case EXPR_LIST:
24591 return false;
24592 default:
24593 break;
24596 format_ptr = GET_RTX_FORMAT (code);
24597 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24599 switch (*format_ptr++)
24601 case 'e':
24602 if (exact_dependency_1 (addr, XEXP (insn, i)))
24603 return true;
24604 break;
24605 case 'E':
24606 for (j = 0; j < XVECLEN (insn, i); j++)
24607 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24608 return true;
24609 break;
24612 return false;
24615 /* Return true if there exists exact dependency for store & load, i.e.
24616 the same memory address is used in them. */
24617 static bool
24618 exact_store_load_dependency (rtx store, rtx load)
24620 rtx set1, set2;
24622 set1 = single_set (store);
24623 if (!set1)
24624 return false;
24625 if (!MEM_P (SET_DEST (set1)))
24626 return false;
24627 set2 = single_set (load);
24628 if (!set2)
24629 return false;
24630 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24631 return true;
24632 return false;
24635 static int
24636 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24638 enum attr_type insn_type, dep_insn_type;
24639 enum attr_memory memory;
24640 rtx set, set2;
24641 int dep_insn_code_number;
24643 /* Anti and output dependencies have zero cost on all CPUs. */
24644 if (REG_NOTE_KIND (link) != 0)
24645 return 0;
24647 dep_insn_code_number = recog_memoized (dep_insn);
24649 /* If we can't recognize the insns, we can't really do anything. */
24650 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24651 return cost;
24653 insn_type = get_attr_type (insn);
24654 dep_insn_type = get_attr_type (dep_insn);
24656 switch (ix86_tune)
24658 case PROCESSOR_PENTIUM:
24659 /* Address Generation Interlock adds a cycle of latency. */
24660 if (insn_type == TYPE_LEA)
24662 rtx addr = PATTERN (insn);
24664 if (GET_CODE (addr) == PARALLEL)
24665 addr = XVECEXP (addr, 0, 0);
24667 gcc_assert (GET_CODE (addr) == SET);
24669 addr = SET_SRC (addr);
24670 if (modified_in_p (addr, dep_insn))
24671 cost += 1;
24673 else if (ix86_agi_dependent (dep_insn, insn))
24674 cost += 1;
24676 /* ??? Compares pair with jump/setcc. */
24677 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24678 cost = 0;
24680 /* Floating point stores require value to be ready one cycle earlier. */
24681 if (insn_type == TYPE_FMOV
24682 && get_attr_memory (insn) == MEMORY_STORE
24683 && !ix86_agi_dependent (dep_insn, insn))
24684 cost += 1;
24685 break;
24687 case PROCESSOR_PENTIUMPRO:
24688 memory = get_attr_memory (insn);
24690 /* INT->FP conversion is expensive. */
24691 if (get_attr_fp_int_src (dep_insn))
24692 cost += 5;
24694 /* There is one cycle extra latency between an FP op and a store. */
24695 if (insn_type == TYPE_FMOV
24696 && (set = single_set (dep_insn)) != NULL_RTX
24697 && (set2 = single_set (insn)) != NULL_RTX
24698 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24699 && MEM_P (SET_DEST (set2)))
24700 cost += 1;
24702 /* Show ability of reorder buffer to hide latency of load by executing
24703 in parallel with previous instruction in case
24704 previous instruction is not needed to compute the address. */
24705 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24706 && !ix86_agi_dependent (dep_insn, insn))
24708 /* Claim moves to take one cycle, as the core can issue one load
24709 at a time and the next load can start a cycle later. */
24710 if (dep_insn_type == TYPE_IMOV
24711 || dep_insn_type == TYPE_FMOV)
24712 cost = 1;
24713 else if (cost > 1)
24714 cost--;
24716 break;
24718 case PROCESSOR_K6:
24719 memory = get_attr_memory (insn);
24721 /* The esp dependency is resolved before the instruction is really
24722 finished. */
24723 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24724 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24725 return 1;
24727 /* INT->FP conversion is expensive. */
24728 if (get_attr_fp_int_src (dep_insn))
24729 cost += 5;
24731 /* Show the ability of the reorder buffer to hide load latency by executing
24732 the load in parallel with the previous instruction when the previous
24733 instruction is not needed to compute the address. */
24734 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24735 && !ix86_agi_dependent (dep_insn, insn))
24737 /* Claim moves to take one cycle, as the core can issue one load
24738 at a time and the next load can start a cycle later. */
24739 if (dep_insn_type == TYPE_IMOV
24740 || dep_insn_type == TYPE_FMOV)
24741 cost = 1;
24742 else if (cost > 2)
24743 cost -= 2;
24744 else
24745 cost = 1;
24747 break;
24749 case PROCESSOR_ATHLON:
24750 case PROCESSOR_K8:
24751 case PROCESSOR_AMDFAM10:
24752 case PROCESSOR_BDVER1:
24753 case PROCESSOR_BDVER2:
24754 case PROCESSOR_BDVER3:
24755 case PROCESSOR_BTVER1:
24756 case PROCESSOR_BTVER2:
24757 case PROCESSOR_GENERIC:
24758 memory = get_attr_memory (insn);
24760 /* The stack engine allows push&pop instructions to execute in parallel. */
24761 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24762 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24763 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
24764 return 0;
24766 /* Show the ability of the reorder buffer to hide load latency by executing
24767 the load in parallel with the previous instruction when the previous
24768 instruction is not needed to compute the address. */
24769 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24770 && !ix86_agi_dependent (dep_insn, insn))
24772 enum attr_unit unit = get_attr_unit (insn);
24773 int loadcost = 3;
24775 /* Because of the difference between the length of integer and
24776 floating unit pipeline preparation stages, the memory operands
24777 for floating point are cheaper.
24779 ??? For Athlon the difference is most probably 2. */
24780 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24781 loadcost = 3;
24782 else
24783 loadcost = TARGET_ATHLON ? 2 : 0;
24785 if (cost >= loadcost)
24786 cost -= loadcost;
24787 else
24788 cost = 0;
24790 break;
24792 case PROCESSOR_CORE2:
24793 case PROCESSOR_COREI7:
24794 case PROCESSOR_HASWELL:
24795 memory = get_attr_memory (insn);
24797 /* The stack engine allows push&pop instructions to execute in parallel. */
24798 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24799 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24800 return 0;
24802 /* Show the ability of the reorder buffer to hide load latency by executing
24803 the load in parallel with the previous instruction when the previous
24804 instruction is not needed to compute the address. */
24805 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24806 && !ix86_agi_dependent (dep_insn, insn))
24808 if (cost >= 4)
24809 cost -= 4;
24810 else
24811 cost = 0;
24813 break;
24815 case PROCESSOR_SLM:
24816 if (!reload_completed)
24817 return cost;
24819 /* Increase cost of integer loads. */
24820 memory = get_attr_memory (dep_insn);
24821 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24823 enum attr_unit unit = get_attr_unit (dep_insn);
24824 if (unit == UNIT_INTEGER && cost == 1)
24826 if (memory == MEMORY_LOAD)
24827 cost = 3;
24828 else
24830 /* Increase the cost of ld/st for short int types only
24831 because of the store-forwarding issue. */
24832 rtx set = single_set (dep_insn);
24833 if (set && (GET_MODE (SET_DEST (set)) == QImode
24834 || GET_MODE (SET_DEST (set)) == HImode))
24836 /* Increase the cost of the store/load insn if an exact
24837 dependence exists and INSN is a load. */
24838 enum attr_memory insn_memory = get_attr_memory (insn);
24839 if (insn_memory == MEMORY_LOAD
24840 && exact_store_load_dependency (dep_insn, insn))
24841 cost = 3;
24847 default:
24848 break;
24851 return cost;
24854 /* How many alternative schedules to try. This should be as wide as the
24855 scheduling freedom in the DFA, but no wider. Making this value too
24856 large results in extra work for the scheduler. */
24858 static int
24859 ia32_multipass_dfa_lookahead (void)
24861 switch (ix86_tune)
24863 case PROCESSOR_PENTIUM:
24864 return 2;
24866 case PROCESSOR_PENTIUMPRO:
24867 case PROCESSOR_K6:
24868 return 1;
24870 case PROCESSOR_CORE2:
24871 case PROCESSOR_COREI7:
24872 case PROCESSOR_HASWELL:
24873 case PROCESSOR_ATOM:
24874 case PROCESSOR_SLM:
24875 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24876 as the number of instructions that can be executed in one cycle, i.e.,
24877 issue_rate. I wonder why tuning for many CPUs does not do this. */
24878 if (reload_completed)
24879 return ix86_issue_rate ();
24880 /* Don't use lookahead for pre-reload schedule to save compile time. */
24881 return 0;
24883 default:
24884 return 0;
24888 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24889 execution. It is applied if
24890 (1) an IMUL instruction is on the top of the list;
24891 (2) there is exactly one producer of an independent IMUL instruction in
24892 the ready list.
24893 Return the index of the IMUL producer if it was found and -1 otherwise. */
24894 static int
24895 do_reorder_for_imul (rtx *ready, int n_ready)
24897 rtx insn, set, insn1, insn2;
24898 sd_iterator_def sd_it;
24899 dep_t dep;
24900 int index = -1;
24901 int i;
24903 if (ix86_tune != PROCESSOR_ATOM)
24904 return index;
24906 /* Check that IMUL instruction is on the top of ready list. */
24907 insn = ready[n_ready - 1];
24908 set = single_set (insn);
24909 if (!set)
24910 return index;
24911 if (!(GET_CODE (SET_SRC (set)) == MULT
24912 && GET_MODE (SET_SRC (set)) == SImode))
24913 return index;
24915 /* Search for producer of independent IMUL instruction. */
24916 for (i = n_ready - 2; i >= 0; i--)
24918 insn = ready[i];
24919 if (!NONDEBUG_INSN_P (insn))
24920 continue;
24921 /* Skip IMUL instruction. */
24922 insn2 = PATTERN (insn);
24923 if (GET_CODE (insn2) == PARALLEL)
24924 insn2 = XVECEXP (insn2, 0, 0);
24925 if (GET_CODE (insn2) == SET
24926 && GET_CODE (SET_SRC (insn2)) == MULT
24927 && GET_MODE (SET_SRC (insn2)) == SImode)
24928 continue;
24930 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24932 rtx con;
24933 con = DEP_CON (dep);
24934 if (!NONDEBUG_INSN_P (con))
24935 continue;
24936 insn1 = PATTERN (con);
24937 if (GET_CODE (insn1) == PARALLEL)
24938 insn1 = XVECEXP (insn1, 0, 0);
24940 if (GET_CODE (insn1) == SET
24941 && GET_CODE (SET_SRC (insn1)) == MULT
24942 && GET_MODE (SET_SRC (insn1)) == SImode)
24944 sd_iterator_def sd_it1;
24945 dep_t dep1;
24946 /* Check if there is no other dependee for IMUL. */
24947 index = i;
24948 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24950 rtx pro;
24951 pro = DEP_PRO (dep1);
24952 if (!NONDEBUG_INSN_P (pro))
24953 continue;
24954 if (pro != insn)
24955 index = -1;
24957 if (index >= 0)
24958 break;
24961 if (index >= 0)
24962 break;
24964 return index;
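/* For example, with ready = { ..., P, ..., M1 } (M1 on top and an SImode
   IMUL), if P is the only producer feeding another SImode IMUL M2 that is
   not yet ready, the index of P is returned; ix86_sched_reorder then moves
   P to the top of the list so that M2 can follow M1 into the pipelined
   IMUL unit with minimal delay.  */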
24967 /* Try to find the best candidate at the top of the ready list if two insns
24968 have the same priority - a candidate is better if the insns it depends on
24969 were scheduled earlier. Applied for Silvermont only.
24970 Return true if the top 2 insns must be interchanged. */
24971 static bool
24972 swap_top_of_ready_list (rtx *ready, int n_ready)
24974 rtx top = ready[n_ready - 1];
24975 rtx next = ready[n_ready - 2];
24976 rtx set;
24977 sd_iterator_def sd_it;
24978 dep_t dep;
24979 int clock1 = -1;
24980 int clock2 = -1;
24981 #define INSN_TICK(INSN) (HID (INSN)->tick)
24983 if (ix86_tune != PROCESSOR_SLM)
24984 return false;
24986 if (!NONDEBUG_INSN_P (top))
24987 return false;
24988 if (!NONJUMP_INSN_P (top))
24989 return false;
24990 if (!NONDEBUG_INSN_P (next))
24991 return false;
24992 if (!NONJUMP_INSN_P (next))
24993 return false;
24994 set = single_set (top);
24995 if (!set)
24996 return false;
24997 set = single_set (next);
24998 if (!set)
24999 return false;
25001 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25003 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25004 return false;
25005 /* Determine the winner more precisely. */
25006 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25008 rtx pro;
25009 pro = DEP_PRO (dep);
25010 if (!NONDEBUG_INSN_P (pro))
25011 continue;
25012 if (INSN_TICK (pro) > clock1)
25013 clock1 = INSN_TICK (pro);
25015 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25017 rtx pro;
25018 pro = DEP_PRO (dep);
25019 if (!NONDEBUG_INSN_P (pro))
25020 continue;
25021 if (INSN_TICK (pro) > clock2)
25022 clock2 = INSN_TICK (pro);
25025 if (clock1 == clock2)
25027 /* Determine the winner - a load must win. */
25028 enum attr_memory memory1, memory2;
25029 memory1 = get_attr_memory (top);
25030 memory2 = get_attr_memory (next);
25031 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25032 return true;
25034 return (bool) (clock2 < clock1);
25036 return false;
25037 #undef INSN_TICK
25040 /* Perform possible reordering of the ready list for Atom/Silvermont only.
25041 Return the issue rate. */
25042 static int
25043 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25044 int clock_var)
25046 int issue_rate = -1;
25047 int n_ready = *pn_ready;
25048 int i;
25049 rtx insn;
25050 int index = -1;
25052 /* Set up issue rate. */
25053 issue_rate = ix86_issue_rate ();
25055 /* Do reordering for Atom/SLM only. */
25056 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25057 return issue_rate;
25059 /* Nothing to do if ready list contains only 1 instruction. */
25060 if (n_ready <= 1)
25061 return issue_rate;
25063 /* Do reordering for the post-reload scheduler only. */
25064 if (!reload_completed)
25065 return issue_rate;
25067 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25069 if (sched_verbose > 1)
25070 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25071 INSN_UID (ready[index]));
25073 /* Put IMUL producer (ready[index]) at the top of ready list. */
25074 insn = ready[index];
25075 for (i = index; i < n_ready - 1; i++)
25076 ready[i] = ready[i + 1];
25077 ready[n_ready - 1] = insn;
25078 return issue_rate;
25080 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25082 if (sched_verbose > 1)
25083 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25084 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25085 /* Swap 2 top elements of ready list. */
25086 insn = ready[n_ready - 1];
25087 ready[n_ready - 1] = ready[n_ready - 2];
25088 ready[n_ready - 2] = insn;
25090 return issue_rate;
25093 static bool
25094 ix86_class_likely_spilled_p (reg_class_t);
25096 /* Return true if the lhs of INSN is a HW function argument register, and set
25097 IS_SPILLED to true if it is a likely-spilled HW register. */
25098 static bool
25099 insn_is_function_arg (rtx insn, bool* is_spilled)
25101 rtx dst;
25103 if (!NONDEBUG_INSN_P (insn))
25104 return false;
25105 /* Call instructions are not movable, ignore them. */
25106 if (CALL_P (insn))
25107 return false;
25108 insn = PATTERN (insn);
25109 if (GET_CODE (insn) == PARALLEL)
25110 insn = XVECEXP (insn, 0, 0);
25111 if (GET_CODE (insn) != SET)
25112 return false;
25113 dst = SET_DEST (insn);
25114 if (REG_P (dst) && HARD_REGISTER_P (dst)
25115 && ix86_function_arg_regno_p (REGNO (dst)))
25117 /* Is it a likely-spilled HW register? */
25118 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25119 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25120 *is_spilled = true;
25121 return true;
25123 return false;
25126 /* Add output dependencies for a chain of adjacent function arguments, but only
25127 if there is a move to a likely-spilled HW register. Return the first argument
25128 if at least one dependence was added, or NULL otherwise. */
25129 static rtx
25130 add_parameter_dependencies (rtx call, rtx head)
25132 rtx insn;
25133 rtx last = call;
25134 rtx first_arg = NULL;
25135 bool is_spilled = false;
25137 head = PREV_INSN (head);
25139 /* Find the argument-passing instruction nearest to the call. */
25140 while (true)
25142 last = PREV_INSN (last);
25143 if (last == head)
25144 return NULL;
25145 if (!NONDEBUG_INSN_P (last))
25146 continue;
25147 if (insn_is_function_arg (last, &is_spilled))
25148 break;
25149 return NULL;
25152 first_arg = last;
25153 while (true)
25155 insn = PREV_INSN (last);
25156 if (!INSN_P (insn))
25157 break;
25158 if (insn == head)
25159 break;
25160 if (!NONDEBUG_INSN_P (insn))
25162 last = insn;
25163 continue;
25165 if (insn_is_function_arg (insn, &is_spilled))
25167 /* Add an output dependence between two function arguments if the chain
25168 of output arguments contains likely-spilled HW registers. */
25169 if (is_spilled)
25170 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25171 first_arg = last = insn;
25173 else
25174 break;
25176 if (!is_spilled)
25177 return NULL;
25178 return first_arg;
25181 /* Add output or anti dependency from insn to first_arg to restrict its code
25182 motion. */
25183 static void
25184 avoid_func_arg_motion (rtx first_arg, rtx insn)
25186 rtx set;
25187 rtx tmp;
25189 set = single_set (insn);
25190 if (!set)
25191 return;
25192 tmp = SET_DEST (set);
25193 if (REG_P (tmp))
25195 /* Add output dependency to the first function argument. */
25196 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25197 return;
25199 /* Add anti dependency. */
25200 add_dependence (first_arg, insn, REG_DEP_ANTI);
25203 /* Avoid cross-block motion of a function argument by adding a dependency
25204 from the first non-jump instruction in BB. */
25205 static void
25206 add_dependee_for_func_arg (rtx arg, basic_block bb)
25208 rtx insn = BB_END (bb);
25210 while (insn)
25212 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25214 rtx set = single_set (insn);
25215 if (set)
25217 avoid_func_arg_motion (arg, insn);
25218 return;
25221 if (insn == BB_HEAD (bb))
25222 return;
25223 insn = PREV_INSN (insn);
25227 /* Hook for pre-reload schedule - avoid motion of function arguments
25228 passed in likely spilled HW registers. */
25229 static void
25230 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25232 rtx insn;
25233 rtx first_arg = NULL;
25234 if (reload_completed)
25235 return;
25236 while (head != tail && DEBUG_INSN_P (head))
25237 head = NEXT_INSN (head);
25238 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25239 if (INSN_P (insn) && CALL_P (insn))
25241 first_arg = add_parameter_dependencies (insn, head);
25242 if (first_arg)
25244 /* Add a dependee for the first argument to predecessors, but only if the
25245 region contains more than one block. */
25246 basic_block bb = BLOCK_FOR_INSN (insn);
25247 int rgn = CONTAINING_RGN (bb->index);
25248 int nr_blks = RGN_NR_BLOCKS (rgn);
25249 /* Skip trivial regions and region head blocks that can have
25250 predecessors outside of region. */
25251 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25253 edge e;
25254 edge_iterator ei;
25255 /* Assume that region is SCC, i.e. all immediate predecessors
25256 of non-head block are in the same region. */
25257 FOR_EACH_EDGE (e, ei, bb->preds)
25259 /* Avoid creating loop-carried dependencies by using
25260 the topological ordering of the region. */
25261 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25262 add_dependee_for_func_arg (first_arg, e->src);
25265 insn = first_arg;
25266 if (insn == head)
25267 break;
25270 else if (first_arg)
25271 avoid_func_arg_motion (first_arg, insn);
25274 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25275 HW registers to the maximum, to schedule them as soon as possible. These are
25276 moves from function argument registers at the top of the function entry
25277 and moves from function return value registers after call. */
25278 static int
25279 ix86_adjust_priority (rtx insn, int priority)
25281 rtx set;
25283 if (reload_completed)
25284 return priority;
25286 if (!NONDEBUG_INSN_P (insn))
25287 return priority;
25289 set = single_set (insn);
25290 if (set)
25292 rtx tmp = SET_SRC (set);
25293 if (REG_P (tmp)
25294 && HARD_REGISTER_P (tmp)
25295 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25296 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25297 return current_sched_info->sched_max_insns_priority;
25300 return priority;
25303 /* Model decoder of Core 2/i7.
25304 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25305 track the instruction fetch block boundaries and make sure that long
25306 (9+ bytes) instructions are assigned to D0. */
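/* For example, with the parameter values set in ix86_sched_init_global
   below (secondary decoder limit of 8 bytes, 16-byte ifetch block, at most
   6 insns per block), an insn longer than 8 bytes is only accepted as the
   first insn chosen in a cycle, and any insn that would push the running
   byte count past 16 or the insn count past 6 is masked out of ready_try. */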
25308 /* Maximum length of an insn that can be handled by
25309 a secondary decoder unit. '8' for Core 2/i7. */
25310 static int core2i7_secondary_decoder_max_insn_size;
25312 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25313 '16' for Core 2/i7. */
25314 static int core2i7_ifetch_block_size;
25316 /* Maximum number of instructions decoder can handle per cycle.
25317 '6' for Core 2/i7. */
25318 static int core2i7_ifetch_block_max_insns;
25320 typedef struct ix86_first_cycle_multipass_data_ *
25321 ix86_first_cycle_multipass_data_t;
25322 typedef const struct ix86_first_cycle_multipass_data_ *
25323 const_ix86_first_cycle_multipass_data_t;
25325 /* A variable to store target state across calls to max_issue within
25326 one cycle. */
25327 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25328 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25330 /* Initialize DATA. */
25331 static void
25332 core2i7_first_cycle_multipass_init (void *_data)
25334 ix86_first_cycle_multipass_data_t data
25335 = (ix86_first_cycle_multipass_data_t) _data;
25337 data->ifetch_block_len = 0;
25338 data->ifetch_block_n_insns = 0;
25339 data->ready_try_change = NULL;
25340 data->ready_try_change_size = 0;
25343 /* Advancing the cycle; reset ifetch block counts. */
25344 static void
25345 core2i7_dfa_post_advance_cycle (void)
25347 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25349 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25351 data->ifetch_block_len = 0;
25352 data->ifetch_block_n_insns = 0;
25355 static int min_insn_size (rtx);
25357 /* Filter out insns from ready_try that the core will not be able to issue
25358 on current cycle due to decoder. */
25359 static void
25360 core2i7_first_cycle_multipass_filter_ready_try
25361 (const_ix86_first_cycle_multipass_data_t data,
25362 char *ready_try, int n_ready, bool first_cycle_insn_p)
25364 while (n_ready--)
25366 rtx insn;
25367 int insn_size;
25369 if (ready_try[n_ready])
25370 continue;
25372 insn = get_ready_element (n_ready);
25373 insn_size = min_insn_size (insn);
25375 if (/* If this is too long an insn for a secondary decoder ... */
25376 (!first_cycle_insn_p
25377 && insn_size > core2i7_secondary_decoder_max_insn_size)
25378 /* ... or it would not fit into the ifetch block ... */
25379 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25380 /* ... or the decoder is full already ... */
25381 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25382 /* ... mask the insn out. */
25384 ready_try[n_ready] = 1;
25386 if (data->ready_try_change)
25387 bitmap_set_bit (data->ready_try_change, n_ready);
25392 /* Prepare for a new round of multipass lookahead scheduling. */
25393 static void
25394 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25395 bool first_cycle_insn_p)
25397 ix86_first_cycle_multipass_data_t data
25398 = (ix86_first_cycle_multipass_data_t) _data;
25399 const_ix86_first_cycle_multipass_data_t prev_data
25400 = ix86_first_cycle_multipass_data;
25402 /* Restore the state from the end of the previous round. */
25403 data->ifetch_block_len = prev_data->ifetch_block_len;
25404 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25406 /* Filter instructions that cannot be issued on current cycle due to
25407 decoder restrictions. */
25408 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25409 first_cycle_insn_p);
25412 /* INSN is being issued in current solution. Account for its impact on
25413 the decoder model. */
25414 static void
25415 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25416 rtx insn, const void *_prev_data)
25418 ix86_first_cycle_multipass_data_t data
25419 = (ix86_first_cycle_multipass_data_t) _data;
25420 const_ix86_first_cycle_multipass_data_t prev_data
25421 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25423 int insn_size = min_insn_size (insn);
25425 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25426 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25427 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25428 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25430 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25431 if (!data->ready_try_change)
25433 data->ready_try_change = sbitmap_alloc (n_ready);
25434 data->ready_try_change_size = n_ready;
25436 else if (data->ready_try_change_size < n_ready)
25438 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25439 n_ready, 0);
25440 data->ready_try_change_size = n_ready;
25442 bitmap_clear (data->ready_try_change);
25444 /* Filter out insns from ready_try that the core will not be able to issue
25445 on current cycle due to decoder. */
25446 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25447 false);
25450 /* Revert the effect on ready_try. */
25451 static void
25452 core2i7_first_cycle_multipass_backtrack (const void *_data,
25453 char *ready_try,
25454 int n_ready ATTRIBUTE_UNUSED)
25456 const_ix86_first_cycle_multipass_data_t data
25457 = (const_ix86_first_cycle_multipass_data_t) _data;
25458 unsigned int i = 0;
25459 sbitmap_iterator sbi;
25461 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25462 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25464 ready_try[i] = 0;
25468 /* Save the result of multipass lookahead scheduling for the next round. */
25469 static void
25470 core2i7_first_cycle_multipass_end (const void *_data)
25472 const_ix86_first_cycle_multipass_data_t data
25473 = (const_ix86_first_cycle_multipass_data_t) _data;
25474 ix86_first_cycle_multipass_data_t next_data
25475 = ix86_first_cycle_multipass_data;
25477 if (data != NULL)
25479 next_data->ifetch_block_len = data->ifetch_block_len;
25480 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25484 /* Deallocate target data. */
25485 static void
25486 core2i7_first_cycle_multipass_fini (void *_data)
25488 ix86_first_cycle_multipass_data_t data
25489 = (ix86_first_cycle_multipass_data_t) _data;
25491 if (data->ready_try_change)
25493 sbitmap_free (data->ready_try_change);
25494 data->ready_try_change = NULL;
25495 data->ready_try_change_size = 0;
25499 /* Prepare for scheduling pass. */
25500 static void
25501 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25502 int verbose ATTRIBUTE_UNUSED,
25503 int max_uid ATTRIBUTE_UNUSED)
25505 /* Install scheduling hooks for current CPU. Some of these hooks are used
25506 in time-critical parts of the scheduler, so we only set them up when
25507 they are actually used. */
25508 switch (ix86_tune)
25510 case PROCESSOR_CORE2:
25511 case PROCESSOR_COREI7:
25512 case PROCESSOR_HASWELL:
25513 /* Do not perform multipass scheduling for pre-reload schedule
25514 to save compile time. */
25515 if (reload_completed)
25517 targetm.sched.dfa_post_advance_cycle
25518 = core2i7_dfa_post_advance_cycle;
25519 targetm.sched.first_cycle_multipass_init
25520 = core2i7_first_cycle_multipass_init;
25521 targetm.sched.first_cycle_multipass_begin
25522 = core2i7_first_cycle_multipass_begin;
25523 targetm.sched.first_cycle_multipass_issue
25524 = core2i7_first_cycle_multipass_issue;
25525 targetm.sched.first_cycle_multipass_backtrack
25526 = core2i7_first_cycle_multipass_backtrack;
25527 targetm.sched.first_cycle_multipass_end
25528 = core2i7_first_cycle_multipass_end;
25529 targetm.sched.first_cycle_multipass_fini
25530 = core2i7_first_cycle_multipass_fini;
25532 /* Set decoder parameters. */
25533 core2i7_secondary_decoder_max_insn_size = 8;
25534 core2i7_ifetch_block_size = 16;
25535 core2i7_ifetch_block_max_insns = 6;
25536 break;
25538 /* ... Fall through ... */
25539 default:
25540 targetm.sched.dfa_post_advance_cycle = NULL;
25541 targetm.sched.first_cycle_multipass_init = NULL;
25542 targetm.sched.first_cycle_multipass_begin = NULL;
25543 targetm.sched.first_cycle_multipass_issue = NULL;
25544 targetm.sched.first_cycle_multipass_backtrack = NULL;
25545 targetm.sched.first_cycle_multipass_end = NULL;
25546 targetm.sched.first_cycle_multipass_fini = NULL;
25547 break;
25552 /* Compute the alignment given to a constant that is being placed in memory.
25553 EXP is the constant and ALIGN is the alignment that the object would
25554 ordinarily have.
25555 The value of this function is used instead of that alignment to align
25556 the object. */
25559 ix86_constant_alignment (tree exp, int align)
25561 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25562 || TREE_CODE (exp) == INTEGER_CST)
25564 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25565 return 64;
25566 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25567 return 128;
25569 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25570 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25571 return BITS_PER_WORD;
25573 return align;
25576 /* Compute the alignment for a static variable.
25577 TYPE is the data type, and ALIGN is the alignment that
25578 the object would ordinarily have. The value of this function is used
25579 instead of that alignment to align the object. */
25582 ix86_data_alignment (tree type, int align, bool opt)
25584 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25586 if (opt
25587 && AGGREGATE_TYPE_P (type)
25588 && TYPE_SIZE (type)
25589 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25590 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25591 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25592 && align < max_align)
25593 align = max_align;
25595 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25596 to a 16-byte boundary. */
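/* E.g. a 32-byte "char buf[32]" in static storage therefore gets 128-bit
   alignment here on x86-64 whether or not OPT is set, since this increase
   is ABI-mandated rather than a pure optimization.  */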
25597 if (TARGET_64BIT)
25599 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
25600 && TYPE_SIZE (type)
25601 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25602 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25603 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25604 return 128;
25607 if (!opt)
25608 return align;
25610 if (TREE_CODE (type) == ARRAY_TYPE)
25612 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25613 return 64;
25614 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25615 return 128;
25617 else if (TREE_CODE (type) == COMPLEX_TYPE)
25620 if (TYPE_MODE (type) == DCmode && align < 64)
25621 return 64;
25622 if ((TYPE_MODE (type) == XCmode
25623 || TYPE_MODE (type) == TCmode) && align < 128)
25624 return 128;
25626 else if ((TREE_CODE (type) == RECORD_TYPE
25627 || TREE_CODE (type) == UNION_TYPE
25628 || TREE_CODE (type) == QUAL_UNION_TYPE)
25629 && TYPE_FIELDS (type))
25631 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25632 return 64;
25633 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25634 return 128;
25636 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25637 || TREE_CODE (type) == INTEGER_TYPE)
25639 if (TYPE_MODE (type) == DFmode && align < 64)
25640 return 64;
25641 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25642 return 128;
25645 return align;
25648 /* Compute the alignment for a local variable or a stack slot. EXP is
25649 the data type or decl itself, MODE is the widest mode available and
25650 ALIGN is the alignment that the object would ordinarily have. The
25651 value of this macro is used instead of that alignment to align the
25652 object. */
25654 unsigned int
25655 ix86_local_alignment (tree exp, enum machine_mode mode,
25656 unsigned int align)
25658 tree type, decl;
25660 if (exp && DECL_P (exp))
25662 type = TREE_TYPE (exp);
25663 decl = exp;
25665 else
25667 type = exp;
25668 decl = NULL;
25671 /* Don't do dynamic stack realignment for long long objects with
25672 -mpreferred-stack-boundary=2. */
25673 if (!TARGET_64BIT
25674 && align == 64
25675 && ix86_preferred_stack_boundary < 64
25676 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25677 && (!type || !TYPE_USER_ALIGN (type))
25678 && (!decl || !DECL_USER_ALIGN (decl)))
25679 align = 32;
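/* I.e. a DImode local gets only 32-bit alignment here, so that code built
   with -mpreferred-stack-boundary=2 does not trigger dynamic stack
   realignment just for a long long; ix86_minimum_alignment below makes the
   same exception.  */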
25681 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25682 register in MODE. We will return the largest alignment of XF
25683 and DF. */
25684 if (!type)
25686 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25687 align = GET_MODE_ALIGNMENT (DFmode);
25688 return align;
25691 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25692 to a 16-byte boundary. The exact wording is:
25694 An array uses the same alignment as its elements, except that a local or
25695 global array variable of length at least 16 bytes or
25696 a C99 variable-length array variable always has alignment of at least 16 bytes.
25698 This was added to allow use of aligned SSE instructions on arrays. This
25699 rule is meant for static storage (where the compiler cannot do the analysis
25700 by itself). We follow it for automatic variables only when convenient.
25701 We fully control everything in the function being compiled and functions from
25702 other units cannot rely on the alignment.
25704 Exclude the va_list type. It is the common case of a local array where
25705 we cannot benefit from the alignment.
25707 TODO: Probably one should optimize for size only when var is not escaping. */
25708 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25709 && TARGET_SSE)
25711 if (AGGREGATE_TYPE_P (type)
25712 && (va_list_type_node == NULL_TREE
25713 || (TYPE_MAIN_VARIANT (type)
25714 != TYPE_MAIN_VARIANT (va_list_type_node)))
25715 && TYPE_SIZE (type)
25716 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25717 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25718 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25719 return 128;
25721 if (TREE_CODE (type) == ARRAY_TYPE)
25723 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25724 return 64;
25725 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25726 return 128;
25728 else if (TREE_CODE (type) == COMPLEX_TYPE)
25730 if (TYPE_MODE (type) == DCmode && align < 64)
25731 return 64;
25732 if ((TYPE_MODE (type) == XCmode
25733 || TYPE_MODE (type) == TCmode) && align < 128)
25734 return 128;
25736 else if ((TREE_CODE (type) == RECORD_TYPE
25737 || TREE_CODE (type) == UNION_TYPE
25738 || TREE_CODE (type) == QUAL_UNION_TYPE)
25739 && TYPE_FIELDS (type))
25741 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25742 return 64;
25743 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25744 return 128;
25746 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25747 || TREE_CODE (type) == INTEGER_TYPE)
25750 if (TYPE_MODE (type) == DFmode && align < 64)
25751 return 64;
25752 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25753 return 128;
25755 return align;
25758 /* Compute the minimum required alignment for dynamic stack realignment
25759 purposes for a local variable, parameter or a stack slot. EXP is
25760 the data type or decl itself, MODE is its mode and ALIGN is the
25761 alignment that the object would ordinarily have. */
25763 unsigned int
25764 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25765 unsigned int align)
25767 tree type, decl;
25769 if (exp && DECL_P (exp))
25771 type = TREE_TYPE (exp);
25772 decl = exp;
25774 else
25776 type = exp;
25777 decl = NULL;
25780 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25781 return align;
25783 /* Don't do dynamic stack realignment for long long objects with
25784 -mpreferred-stack-boundary=2. */
25785 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25786 && (!type || !TYPE_USER_ALIGN (type))
25787 && (!decl || !DECL_USER_ALIGN (decl)))
25788 return 32;
25790 return align;
25793 /* Find a location for the static chain incoming to a nested function.
25794 This is a register, unless all free registers are used by arguments. */
25796 static rtx
25797 ix86_static_chain (const_tree fndecl, bool incoming_p)
25799 unsigned regno;
25801 if (!DECL_STATIC_CHAIN (fndecl))
25802 return NULL;
25804 if (TARGET_64BIT)
25806 /* We always use R10 in 64-bit mode. */
25807 regno = R10_REG;
25809 else
25811 tree fntype;
25812 unsigned int ccvt;
25814 /* By default in 32-bit mode we use ECX to pass the static chain. */
25815 regno = CX_REG;
25817 fntype = TREE_TYPE (fndecl);
25818 ccvt = ix86_get_callcvt (fntype);
25819 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25821 /* Fastcall functions use ecx/edx for arguments, which leaves
25822 us with EAX for the static chain.
25823 Thiscall functions use ecx for arguments, which also
25824 leaves us with EAX for the static chain. */
25825 regno = AX_REG;
25827 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25829 /* Thiscall functions use ecx for arguments, which leaves
25830 us with EAX and EDX for the static chain.
25831 We use EAX for ABI compatibility. */
25832 regno = AX_REG;
25834 else if (ix86_function_regparm (fntype, fndecl) == 3)
25836 /* For regparm 3, we have no free call-clobbered registers in
25837 which to store the static chain. In order to implement this,
25838 we have the trampoline push the static chain to the stack.
25839 However, we can't push a value below the return address when
25840 we call the nested function directly, so we have to use an
25841 alternate entry point. For this we use ESI, and have the
25842 alternate entry point push ESI, so that things appear the
25843 same once we're executing the nested function. */
25844 if (incoming_p)
25846 if (fndecl == current_function_decl)
25847 ix86_static_chain_on_stack = true;
25848 return gen_frame_mem (SImode,
25849 plus_constant (Pmode,
25850 arg_pointer_rtx, -8));
25852 regno = SI_REG;
25856 return gen_rtx_REG (Pmode, regno);
25859 /* Emit RTL insns to initialize the variable parts of a trampoline.
25860 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25861 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25862 to be passed to the target function. */
25864 static void
25865 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25867 rtx mem, fnaddr;
25868 int opcode;
25869 int offset = 0;
25871 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25873 if (TARGET_64BIT)
25875 int size;
25877 /* Load the function address into r11. Try to load the address using
25878 the shorter movl instead of movabs. We may want to support
25879 movq for kernel mode, but the kernel does not use trampolines at
25880 the moment. FNADDR is a 32-bit address and may not be in
25881 DImode when ptr_mode == SImode. Always use movl in this
25882 case. */
25883 if (ptr_mode == SImode
25884 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25886 fnaddr = copy_addr_to_reg (fnaddr);
25888 mem = adjust_address (m_tramp, HImode, offset);
25889 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25891 mem = adjust_address (m_tramp, SImode, offset + 2);
25892 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25893 offset += 6;
25895 else
25897 mem = adjust_address (m_tramp, HImode, offset);
25898 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25900 mem = adjust_address (m_tramp, DImode, offset + 2);
25901 emit_move_insn (mem, fnaddr);
25902 offset += 10;
25905 /* Load static chain using movabs to r10. Use the shorter movl
25906 instead of movabs when ptr_mode == SImode. */
25907 if (ptr_mode == SImode)
25909 opcode = 0xba41;
25910 size = 6;
25912 else
25914 opcode = 0xba49;
25915 size = 10;
25918 mem = adjust_address (m_tramp, HImode, offset);
25919 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25921 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25922 emit_move_insn (mem, chain_value);
25923 offset += size;
25925 /* Jump to r11; the last (unused) byte is a nop, only there to
25926 pad the write out to a single 32-bit store. */
25927 mem = adjust_address (m_tramp, SImode, offset);
25928 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25929 offset += 4;
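/* The 64-bit trampoline emitted above is therefore
   41 BB <imm32>  movl  $fnaddr, %r11d   (or 49 BB <imm64> movabs $fnaddr, %r11)
   41 BA <imm32>  movl  $chain, %r10d    (or 49 BA <imm64> movabs $chain, %r10)
   49 FF E3       jmp   *%r11
   90             nop  (pads the final write to a full 32-bit store)
   matching the little-endian opcode constants used above.  */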
25931 else
25933 rtx disp, chain;
25935 /* Depending on the static chain location, either load a register
25936 with a constant, or push the constant to the stack. All of the
25937 instructions are the same size. */
25938 chain = ix86_static_chain (fndecl, true);
25939 if (REG_P (chain))
25941 switch (REGNO (chain))
25943 case AX_REG:
25944 opcode = 0xb8; break;
25945 case CX_REG:
25946 opcode = 0xb9; break;
25947 default:
25948 gcc_unreachable ();
25951 else
25952 opcode = 0x68;
25954 mem = adjust_address (m_tramp, QImode, offset);
25955 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25957 mem = adjust_address (m_tramp, SImode, offset + 1);
25958 emit_move_insn (mem, chain_value);
25959 offset += 5;
25961 mem = adjust_address (m_tramp, QImode, offset);
25962 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25964 mem = adjust_address (m_tramp, SImode, offset + 1);
25966 /* Compute offset from the end of the jmp to the target function.
25967 In the case in which the trampoline stores the static chain on
25968 the stack, we need to skip the first insn which pushes the
25969 (call-saved) register static chain; this push is 1 byte. */
25970 offset += 5;
25971 disp = expand_binop (SImode, sub_optab, fnaddr,
25972 plus_constant (Pmode, XEXP (m_tramp, 0),
25973 offset - (MEM_P (chain) ? 1 : 0)),
25974 NULL_RTX, 1, OPTAB_DIRECT);
25975 emit_move_insn (mem, disp);
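/* The 32-bit trampoline emitted above is therefore
   B8/B9 <imm32>  movl  $chain, %eax/%ecx   (or 68 <imm32> pushl $chain)
   E9 <rel32>     jmp   fnaddr
   with the rel32 displacement taken from the end of the jmp; in the push
   case it targets FNADDR + 1 to skip the function's one-byte push of the
   static chain register.  */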
25978 gcc_assert (offset <= TRAMPOLINE_SIZE);
25980 #ifdef HAVE_ENABLE_EXECUTE_STACK
25981 #ifdef CHECK_EXECUTE_STACK_ENABLED
25982 if (CHECK_EXECUTE_STACK_ENABLED)
25983 #endif
25984 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25985 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25986 #endif
25989 /* The following file contains several enumerations and data structures
25990 built from the definitions in i386-builtin-types.def. */
25992 #include "i386-builtin-types.inc"
25994 /* Table for the ix86 builtin non-function types. */
25995 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25997 /* Retrieve an element from the above table, building some of
25998 the types lazily. */
26000 static tree
26001 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26003 unsigned int index;
26004 tree type, itype;
26006 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26008 type = ix86_builtin_type_tab[(int) tcode];
26009 if (type != NULL)
26010 return type;
26012 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26013 if (tcode <= IX86_BT_LAST_VECT)
26015 enum machine_mode mode;
26017 index = tcode - IX86_BT_LAST_PRIM - 1;
26018 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26019 mode = ix86_builtin_type_vect_mode[index];
26021 type = build_vector_type_for_mode (itype, mode);
26023 else
26025 int quals;
26027 index = tcode - IX86_BT_LAST_VECT - 1;
26028 if (tcode <= IX86_BT_LAST_PTR)
26029 quals = TYPE_UNQUALIFIED;
26030 else
26031 quals = TYPE_QUAL_CONST;
26033 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26034 if (quals != TYPE_UNQUALIFIED)
26035 itype = build_qualified_type (itype, quals);
26037 type = build_pointer_type (itype);
26040 ix86_builtin_type_tab[(int) tcode] = type;
26041 return type;
26044 /* Table for the ix86 builtin function types. */
26045 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26047 /* Retrieve an element from the above table, building some of
26048 the types lazily. */
26050 static tree
26051 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26053 tree type;
26055 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26057 type = ix86_builtin_func_type_tab[(int) tcode];
26058 if (type != NULL)
26059 return type;
26061 if (tcode <= IX86_BT_LAST_FUNC)
26063 unsigned start = ix86_builtin_func_start[(int) tcode];
26064 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26065 tree rtype, atype, args = void_list_node;
26066 unsigned i;
26068 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
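/* ix86_builtin_func_args[start] holds the return type; the entries up to
   AFTER are the argument types, walked in reverse below so that tree_cons
   builds the TYPE_ARG_TYPES list in source order, terminated by
   void_list_node.  */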
26069 for (i = after - 1; i > start; --i)
26071 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26072 args = tree_cons (NULL, atype, args);
26075 type = build_function_type (rtype, args);
26077 else
26079 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26080 enum ix86_builtin_func_type icode;
26082 icode = ix86_builtin_func_alias_base[index];
26083 type = ix86_get_builtin_func_type (icode);
26086 ix86_builtin_func_type_tab[(int) tcode] = type;
26087 return type;
26091 /* Codes for all the SSE/MMX builtins. */
26092 enum ix86_builtins
26094 IX86_BUILTIN_ADDPS,
26095 IX86_BUILTIN_ADDSS,
26096 IX86_BUILTIN_DIVPS,
26097 IX86_BUILTIN_DIVSS,
26098 IX86_BUILTIN_MULPS,
26099 IX86_BUILTIN_MULSS,
26100 IX86_BUILTIN_SUBPS,
26101 IX86_BUILTIN_SUBSS,
26103 IX86_BUILTIN_CMPEQPS,
26104 IX86_BUILTIN_CMPLTPS,
26105 IX86_BUILTIN_CMPLEPS,
26106 IX86_BUILTIN_CMPGTPS,
26107 IX86_BUILTIN_CMPGEPS,
26108 IX86_BUILTIN_CMPNEQPS,
26109 IX86_BUILTIN_CMPNLTPS,
26110 IX86_BUILTIN_CMPNLEPS,
26111 IX86_BUILTIN_CMPNGTPS,
26112 IX86_BUILTIN_CMPNGEPS,
26113 IX86_BUILTIN_CMPORDPS,
26114 IX86_BUILTIN_CMPUNORDPS,
26115 IX86_BUILTIN_CMPEQSS,
26116 IX86_BUILTIN_CMPLTSS,
26117 IX86_BUILTIN_CMPLESS,
26118 IX86_BUILTIN_CMPNEQSS,
26119 IX86_BUILTIN_CMPNLTSS,
26120 IX86_BUILTIN_CMPNLESS,
26121 IX86_BUILTIN_CMPORDSS,
26122 IX86_BUILTIN_CMPUNORDSS,
26124 IX86_BUILTIN_COMIEQSS,
26125 IX86_BUILTIN_COMILTSS,
26126 IX86_BUILTIN_COMILESS,
26127 IX86_BUILTIN_COMIGTSS,
26128 IX86_BUILTIN_COMIGESS,
26129 IX86_BUILTIN_COMINEQSS,
26130 IX86_BUILTIN_UCOMIEQSS,
26131 IX86_BUILTIN_UCOMILTSS,
26132 IX86_BUILTIN_UCOMILESS,
26133 IX86_BUILTIN_UCOMIGTSS,
26134 IX86_BUILTIN_UCOMIGESS,
26135 IX86_BUILTIN_UCOMINEQSS,
26137 IX86_BUILTIN_CVTPI2PS,
26138 IX86_BUILTIN_CVTPS2PI,
26139 IX86_BUILTIN_CVTSI2SS,
26140 IX86_BUILTIN_CVTSI642SS,
26141 IX86_BUILTIN_CVTSS2SI,
26142 IX86_BUILTIN_CVTSS2SI64,
26143 IX86_BUILTIN_CVTTPS2PI,
26144 IX86_BUILTIN_CVTTSS2SI,
26145 IX86_BUILTIN_CVTTSS2SI64,
26147 IX86_BUILTIN_MAXPS,
26148 IX86_BUILTIN_MAXSS,
26149 IX86_BUILTIN_MINPS,
26150 IX86_BUILTIN_MINSS,
26152 IX86_BUILTIN_LOADUPS,
26153 IX86_BUILTIN_STOREUPS,
26154 IX86_BUILTIN_MOVSS,
26156 IX86_BUILTIN_MOVHLPS,
26157 IX86_BUILTIN_MOVLHPS,
26158 IX86_BUILTIN_LOADHPS,
26159 IX86_BUILTIN_LOADLPS,
26160 IX86_BUILTIN_STOREHPS,
26161 IX86_BUILTIN_STORELPS,
26163 IX86_BUILTIN_MASKMOVQ,
26164 IX86_BUILTIN_MOVMSKPS,
26165 IX86_BUILTIN_PMOVMSKB,
26167 IX86_BUILTIN_MOVNTPS,
26168 IX86_BUILTIN_MOVNTQ,
26170 IX86_BUILTIN_LOADDQU,
26171 IX86_BUILTIN_STOREDQU,
26173 IX86_BUILTIN_PACKSSWB,
26174 IX86_BUILTIN_PACKSSDW,
26175 IX86_BUILTIN_PACKUSWB,
26177 IX86_BUILTIN_PADDB,
26178 IX86_BUILTIN_PADDW,
26179 IX86_BUILTIN_PADDD,
26180 IX86_BUILTIN_PADDQ,
26181 IX86_BUILTIN_PADDSB,
26182 IX86_BUILTIN_PADDSW,
26183 IX86_BUILTIN_PADDUSB,
26184 IX86_BUILTIN_PADDUSW,
26185 IX86_BUILTIN_PSUBB,
26186 IX86_BUILTIN_PSUBW,
26187 IX86_BUILTIN_PSUBD,
26188 IX86_BUILTIN_PSUBQ,
26189 IX86_BUILTIN_PSUBSB,
26190 IX86_BUILTIN_PSUBSW,
26191 IX86_BUILTIN_PSUBUSB,
26192 IX86_BUILTIN_PSUBUSW,
26194 IX86_BUILTIN_PAND,
26195 IX86_BUILTIN_PANDN,
26196 IX86_BUILTIN_POR,
26197 IX86_BUILTIN_PXOR,
26199 IX86_BUILTIN_PAVGB,
26200 IX86_BUILTIN_PAVGW,
26202 IX86_BUILTIN_PCMPEQB,
26203 IX86_BUILTIN_PCMPEQW,
26204 IX86_BUILTIN_PCMPEQD,
26205 IX86_BUILTIN_PCMPGTB,
26206 IX86_BUILTIN_PCMPGTW,
26207 IX86_BUILTIN_PCMPGTD,
26209 IX86_BUILTIN_PMADDWD,
26211 IX86_BUILTIN_PMAXSW,
26212 IX86_BUILTIN_PMAXUB,
26213 IX86_BUILTIN_PMINSW,
26214 IX86_BUILTIN_PMINUB,
26216 IX86_BUILTIN_PMULHUW,
26217 IX86_BUILTIN_PMULHW,
26218 IX86_BUILTIN_PMULLW,
26220 IX86_BUILTIN_PSADBW,
26221 IX86_BUILTIN_PSHUFW,
26223 IX86_BUILTIN_PSLLW,
26224 IX86_BUILTIN_PSLLD,
26225 IX86_BUILTIN_PSLLQ,
26226 IX86_BUILTIN_PSRAW,
26227 IX86_BUILTIN_PSRAD,
26228 IX86_BUILTIN_PSRLW,
26229 IX86_BUILTIN_PSRLD,
26230 IX86_BUILTIN_PSRLQ,
26231 IX86_BUILTIN_PSLLWI,
26232 IX86_BUILTIN_PSLLDI,
26233 IX86_BUILTIN_PSLLQI,
26234 IX86_BUILTIN_PSRAWI,
26235 IX86_BUILTIN_PSRADI,
26236 IX86_BUILTIN_PSRLWI,
26237 IX86_BUILTIN_PSRLDI,
26238 IX86_BUILTIN_PSRLQI,
26240 IX86_BUILTIN_PUNPCKHBW,
26241 IX86_BUILTIN_PUNPCKHWD,
26242 IX86_BUILTIN_PUNPCKHDQ,
26243 IX86_BUILTIN_PUNPCKLBW,
26244 IX86_BUILTIN_PUNPCKLWD,
26245 IX86_BUILTIN_PUNPCKLDQ,
26247 IX86_BUILTIN_SHUFPS,
26249 IX86_BUILTIN_RCPPS,
26250 IX86_BUILTIN_RCPSS,
26251 IX86_BUILTIN_RSQRTPS,
26252 IX86_BUILTIN_RSQRTPS_NR,
26253 IX86_BUILTIN_RSQRTSS,
26254 IX86_BUILTIN_RSQRTF,
26255 IX86_BUILTIN_SQRTPS,
26256 IX86_BUILTIN_SQRTPS_NR,
26257 IX86_BUILTIN_SQRTSS,
26259 IX86_BUILTIN_UNPCKHPS,
26260 IX86_BUILTIN_UNPCKLPS,
26262 IX86_BUILTIN_ANDPS,
26263 IX86_BUILTIN_ANDNPS,
26264 IX86_BUILTIN_ORPS,
26265 IX86_BUILTIN_XORPS,
26267 IX86_BUILTIN_EMMS,
26268 IX86_BUILTIN_LDMXCSR,
26269 IX86_BUILTIN_STMXCSR,
26270 IX86_BUILTIN_SFENCE,
26272 IX86_BUILTIN_FXSAVE,
26273 IX86_BUILTIN_FXRSTOR,
26274 IX86_BUILTIN_FXSAVE64,
26275 IX86_BUILTIN_FXRSTOR64,
26277 IX86_BUILTIN_XSAVE,
26278 IX86_BUILTIN_XRSTOR,
26279 IX86_BUILTIN_XSAVE64,
26280 IX86_BUILTIN_XRSTOR64,
26282 IX86_BUILTIN_XSAVEOPT,
26283 IX86_BUILTIN_XSAVEOPT64,
26285 /* 3DNow! Original */
26286 IX86_BUILTIN_FEMMS,
26287 IX86_BUILTIN_PAVGUSB,
26288 IX86_BUILTIN_PF2ID,
26289 IX86_BUILTIN_PFACC,
26290 IX86_BUILTIN_PFADD,
26291 IX86_BUILTIN_PFCMPEQ,
26292 IX86_BUILTIN_PFCMPGE,
26293 IX86_BUILTIN_PFCMPGT,
26294 IX86_BUILTIN_PFMAX,
26295 IX86_BUILTIN_PFMIN,
26296 IX86_BUILTIN_PFMUL,
26297 IX86_BUILTIN_PFRCP,
26298 IX86_BUILTIN_PFRCPIT1,
26299 IX86_BUILTIN_PFRCPIT2,
26300 IX86_BUILTIN_PFRSQIT1,
26301 IX86_BUILTIN_PFRSQRT,
26302 IX86_BUILTIN_PFSUB,
26303 IX86_BUILTIN_PFSUBR,
26304 IX86_BUILTIN_PI2FD,
26305 IX86_BUILTIN_PMULHRW,
26307 /* 3DNow! Athlon Extensions */
26308 IX86_BUILTIN_PF2IW,
26309 IX86_BUILTIN_PFNACC,
26310 IX86_BUILTIN_PFPNACC,
26311 IX86_BUILTIN_PI2FW,
26312 IX86_BUILTIN_PSWAPDSI,
26313 IX86_BUILTIN_PSWAPDSF,
26315 /* SSE2 */
26316 IX86_BUILTIN_ADDPD,
26317 IX86_BUILTIN_ADDSD,
26318 IX86_BUILTIN_DIVPD,
26319 IX86_BUILTIN_DIVSD,
26320 IX86_BUILTIN_MULPD,
26321 IX86_BUILTIN_MULSD,
26322 IX86_BUILTIN_SUBPD,
26323 IX86_BUILTIN_SUBSD,
26325 IX86_BUILTIN_CMPEQPD,
26326 IX86_BUILTIN_CMPLTPD,
26327 IX86_BUILTIN_CMPLEPD,
26328 IX86_BUILTIN_CMPGTPD,
26329 IX86_BUILTIN_CMPGEPD,
26330 IX86_BUILTIN_CMPNEQPD,
26331 IX86_BUILTIN_CMPNLTPD,
26332 IX86_BUILTIN_CMPNLEPD,
26333 IX86_BUILTIN_CMPNGTPD,
26334 IX86_BUILTIN_CMPNGEPD,
26335 IX86_BUILTIN_CMPORDPD,
26336 IX86_BUILTIN_CMPUNORDPD,
26337 IX86_BUILTIN_CMPEQSD,
26338 IX86_BUILTIN_CMPLTSD,
26339 IX86_BUILTIN_CMPLESD,
26340 IX86_BUILTIN_CMPNEQSD,
26341 IX86_BUILTIN_CMPNLTSD,
26342 IX86_BUILTIN_CMPNLESD,
26343 IX86_BUILTIN_CMPORDSD,
26344 IX86_BUILTIN_CMPUNORDSD,
26346 IX86_BUILTIN_COMIEQSD,
26347 IX86_BUILTIN_COMILTSD,
26348 IX86_BUILTIN_COMILESD,
26349 IX86_BUILTIN_COMIGTSD,
26350 IX86_BUILTIN_COMIGESD,
26351 IX86_BUILTIN_COMINEQSD,
26352 IX86_BUILTIN_UCOMIEQSD,
26353 IX86_BUILTIN_UCOMILTSD,
26354 IX86_BUILTIN_UCOMILESD,
26355 IX86_BUILTIN_UCOMIGTSD,
26356 IX86_BUILTIN_UCOMIGESD,
26357 IX86_BUILTIN_UCOMINEQSD,
26359 IX86_BUILTIN_MAXPD,
26360 IX86_BUILTIN_MAXSD,
26361 IX86_BUILTIN_MINPD,
26362 IX86_BUILTIN_MINSD,
26364 IX86_BUILTIN_ANDPD,
26365 IX86_BUILTIN_ANDNPD,
26366 IX86_BUILTIN_ORPD,
26367 IX86_BUILTIN_XORPD,
26369 IX86_BUILTIN_SQRTPD,
26370 IX86_BUILTIN_SQRTSD,
26372 IX86_BUILTIN_UNPCKHPD,
26373 IX86_BUILTIN_UNPCKLPD,
26375 IX86_BUILTIN_SHUFPD,
26377 IX86_BUILTIN_LOADUPD,
26378 IX86_BUILTIN_STOREUPD,
26379 IX86_BUILTIN_MOVSD,
26381 IX86_BUILTIN_LOADHPD,
26382 IX86_BUILTIN_LOADLPD,
26384 IX86_BUILTIN_CVTDQ2PD,
26385 IX86_BUILTIN_CVTDQ2PS,
26387 IX86_BUILTIN_CVTPD2DQ,
26388 IX86_BUILTIN_CVTPD2PI,
26389 IX86_BUILTIN_CVTPD2PS,
26390 IX86_BUILTIN_CVTTPD2DQ,
26391 IX86_BUILTIN_CVTTPD2PI,
26393 IX86_BUILTIN_CVTPI2PD,
26394 IX86_BUILTIN_CVTSI2SD,
26395 IX86_BUILTIN_CVTSI642SD,
26397 IX86_BUILTIN_CVTSD2SI,
26398 IX86_BUILTIN_CVTSD2SI64,
26399 IX86_BUILTIN_CVTSD2SS,
26400 IX86_BUILTIN_CVTSS2SD,
26401 IX86_BUILTIN_CVTTSD2SI,
26402 IX86_BUILTIN_CVTTSD2SI64,
26404 IX86_BUILTIN_CVTPS2DQ,
26405 IX86_BUILTIN_CVTPS2PD,
26406 IX86_BUILTIN_CVTTPS2DQ,
26408 IX86_BUILTIN_MOVNTI,
26409 IX86_BUILTIN_MOVNTI64,
26410 IX86_BUILTIN_MOVNTPD,
26411 IX86_BUILTIN_MOVNTDQ,
26413 IX86_BUILTIN_MOVQ128,
26415 /* SSE2 MMX */
26416 IX86_BUILTIN_MASKMOVDQU,
26417 IX86_BUILTIN_MOVMSKPD,
26418 IX86_BUILTIN_PMOVMSKB128,
26420 IX86_BUILTIN_PACKSSWB128,
26421 IX86_BUILTIN_PACKSSDW128,
26422 IX86_BUILTIN_PACKUSWB128,
26424 IX86_BUILTIN_PADDB128,
26425 IX86_BUILTIN_PADDW128,
26426 IX86_BUILTIN_PADDD128,
26427 IX86_BUILTIN_PADDQ128,
26428 IX86_BUILTIN_PADDSB128,
26429 IX86_BUILTIN_PADDSW128,
26430 IX86_BUILTIN_PADDUSB128,
26431 IX86_BUILTIN_PADDUSW128,
26432 IX86_BUILTIN_PSUBB128,
26433 IX86_BUILTIN_PSUBW128,
26434 IX86_BUILTIN_PSUBD128,
26435 IX86_BUILTIN_PSUBQ128,
26436 IX86_BUILTIN_PSUBSB128,
26437 IX86_BUILTIN_PSUBSW128,
26438 IX86_BUILTIN_PSUBUSB128,
26439 IX86_BUILTIN_PSUBUSW128,
26441 IX86_BUILTIN_PAND128,
26442 IX86_BUILTIN_PANDN128,
26443 IX86_BUILTIN_POR128,
26444 IX86_BUILTIN_PXOR128,
26446 IX86_BUILTIN_PAVGB128,
26447 IX86_BUILTIN_PAVGW128,
26449 IX86_BUILTIN_PCMPEQB128,
26450 IX86_BUILTIN_PCMPEQW128,
26451 IX86_BUILTIN_PCMPEQD128,
26452 IX86_BUILTIN_PCMPGTB128,
26453 IX86_BUILTIN_PCMPGTW128,
26454 IX86_BUILTIN_PCMPGTD128,
26456 IX86_BUILTIN_PMADDWD128,
26458 IX86_BUILTIN_PMAXSW128,
26459 IX86_BUILTIN_PMAXUB128,
26460 IX86_BUILTIN_PMINSW128,
26461 IX86_BUILTIN_PMINUB128,
26463 IX86_BUILTIN_PMULUDQ,
26464 IX86_BUILTIN_PMULUDQ128,
26465 IX86_BUILTIN_PMULHUW128,
26466 IX86_BUILTIN_PMULHW128,
26467 IX86_BUILTIN_PMULLW128,
26469 IX86_BUILTIN_PSADBW128,
26470 IX86_BUILTIN_PSHUFHW,
26471 IX86_BUILTIN_PSHUFLW,
26472 IX86_BUILTIN_PSHUFD,
26474 IX86_BUILTIN_PSLLDQI128,
26475 IX86_BUILTIN_PSLLWI128,
26476 IX86_BUILTIN_PSLLDI128,
26477 IX86_BUILTIN_PSLLQI128,
26478 IX86_BUILTIN_PSRAWI128,
26479 IX86_BUILTIN_PSRADI128,
26480 IX86_BUILTIN_PSRLDQI128,
26481 IX86_BUILTIN_PSRLWI128,
26482 IX86_BUILTIN_PSRLDI128,
26483 IX86_BUILTIN_PSRLQI128,
26485 IX86_BUILTIN_PSLLDQ128,
26486 IX86_BUILTIN_PSLLW128,
26487 IX86_BUILTIN_PSLLD128,
26488 IX86_BUILTIN_PSLLQ128,
26489 IX86_BUILTIN_PSRAW128,
26490 IX86_BUILTIN_PSRAD128,
26491 IX86_BUILTIN_PSRLW128,
26492 IX86_BUILTIN_PSRLD128,
26493 IX86_BUILTIN_PSRLQ128,
26495 IX86_BUILTIN_PUNPCKHBW128,
26496 IX86_BUILTIN_PUNPCKHWD128,
26497 IX86_BUILTIN_PUNPCKHDQ128,
26498 IX86_BUILTIN_PUNPCKHQDQ128,
26499 IX86_BUILTIN_PUNPCKLBW128,
26500 IX86_BUILTIN_PUNPCKLWD128,
26501 IX86_BUILTIN_PUNPCKLDQ128,
26502 IX86_BUILTIN_PUNPCKLQDQ128,
26504 IX86_BUILTIN_CLFLUSH,
26505 IX86_BUILTIN_MFENCE,
26506 IX86_BUILTIN_LFENCE,
26507 IX86_BUILTIN_PAUSE,
26509 IX86_BUILTIN_BSRSI,
26510 IX86_BUILTIN_BSRDI,
26511 IX86_BUILTIN_RDPMC,
26512 IX86_BUILTIN_RDTSC,
26513 IX86_BUILTIN_RDTSCP,
26514 IX86_BUILTIN_ROLQI,
26515 IX86_BUILTIN_ROLHI,
26516 IX86_BUILTIN_RORQI,
26517 IX86_BUILTIN_RORHI,
26519 /* SSE3. */
26520 IX86_BUILTIN_ADDSUBPS,
26521 IX86_BUILTIN_HADDPS,
26522 IX86_BUILTIN_HSUBPS,
26523 IX86_BUILTIN_MOVSHDUP,
26524 IX86_BUILTIN_MOVSLDUP,
26525 IX86_BUILTIN_ADDSUBPD,
26526 IX86_BUILTIN_HADDPD,
26527 IX86_BUILTIN_HSUBPD,
26528 IX86_BUILTIN_LDDQU,
26530 IX86_BUILTIN_MONITOR,
26531 IX86_BUILTIN_MWAIT,
26533 /* SSSE3. */
26534 IX86_BUILTIN_PHADDW,
26535 IX86_BUILTIN_PHADDD,
26536 IX86_BUILTIN_PHADDSW,
26537 IX86_BUILTIN_PHSUBW,
26538 IX86_BUILTIN_PHSUBD,
26539 IX86_BUILTIN_PHSUBSW,
26540 IX86_BUILTIN_PMADDUBSW,
26541 IX86_BUILTIN_PMULHRSW,
26542 IX86_BUILTIN_PSHUFB,
26543 IX86_BUILTIN_PSIGNB,
26544 IX86_BUILTIN_PSIGNW,
26545 IX86_BUILTIN_PSIGND,
26546 IX86_BUILTIN_PALIGNR,
26547 IX86_BUILTIN_PABSB,
26548 IX86_BUILTIN_PABSW,
26549 IX86_BUILTIN_PABSD,
26551 IX86_BUILTIN_PHADDW128,
26552 IX86_BUILTIN_PHADDD128,
26553 IX86_BUILTIN_PHADDSW128,
26554 IX86_BUILTIN_PHSUBW128,
26555 IX86_BUILTIN_PHSUBD128,
26556 IX86_BUILTIN_PHSUBSW128,
26557 IX86_BUILTIN_PMADDUBSW128,
26558 IX86_BUILTIN_PMULHRSW128,
26559 IX86_BUILTIN_PSHUFB128,
26560 IX86_BUILTIN_PSIGNB128,
26561 IX86_BUILTIN_PSIGNW128,
26562 IX86_BUILTIN_PSIGND128,
26563 IX86_BUILTIN_PALIGNR128,
26564 IX86_BUILTIN_PABSB128,
26565 IX86_BUILTIN_PABSW128,
26566 IX86_BUILTIN_PABSD128,
26568 /* AMDFAM10 - SSE4A New Instructions. */
26569 IX86_BUILTIN_MOVNTSD,
26570 IX86_BUILTIN_MOVNTSS,
26571 IX86_BUILTIN_EXTRQI,
26572 IX86_BUILTIN_EXTRQ,
26573 IX86_BUILTIN_INSERTQI,
26574 IX86_BUILTIN_INSERTQ,
26576 /* SSE4.1. */
26577 IX86_BUILTIN_BLENDPD,
26578 IX86_BUILTIN_BLENDPS,
26579 IX86_BUILTIN_BLENDVPD,
26580 IX86_BUILTIN_BLENDVPS,
26581 IX86_BUILTIN_PBLENDVB128,
26582 IX86_BUILTIN_PBLENDW128,
26584 IX86_BUILTIN_DPPD,
26585 IX86_BUILTIN_DPPS,
26587 IX86_BUILTIN_INSERTPS128,
26589 IX86_BUILTIN_MOVNTDQA,
26590 IX86_BUILTIN_MPSADBW128,
26591 IX86_BUILTIN_PACKUSDW128,
26592 IX86_BUILTIN_PCMPEQQ,
26593 IX86_BUILTIN_PHMINPOSUW128,
26595 IX86_BUILTIN_PMAXSB128,
26596 IX86_BUILTIN_PMAXSD128,
26597 IX86_BUILTIN_PMAXUD128,
26598 IX86_BUILTIN_PMAXUW128,
26600 IX86_BUILTIN_PMINSB128,
26601 IX86_BUILTIN_PMINSD128,
26602 IX86_BUILTIN_PMINUD128,
26603 IX86_BUILTIN_PMINUW128,
26605 IX86_BUILTIN_PMOVSXBW128,
26606 IX86_BUILTIN_PMOVSXBD128,
26607 IX86_BUILTIN_PMOVSXBQ128,
26608 IX86_BUILTIN_PMOVSXWD128,
26609 IX86_BUILTIN_PMOVSXWQ128,
26610 IX86_BUILTIN_PMOVSXDQ128,
26612 IX86_BUILTIN_PMOVZXBW128,
26613 IX86_BUILTIN_PMOVZXBD128,
26614 IX86_BUILTIN_PMOVZXBQ128,
26615 IX86_BUILTIN_PMOVZXWD128,
26616 IX86_BUILTIN_PMOVZXWQ128,
26617 IX86_BUILTIN_PMOVZXDQ128,
26619 IX86_BUILTIN_PMULDQ128,
26620 IX86_BUILTIN_PMULLD128,
26622 IX86_BUILTIN_ROUNDSD,
26623 IX86_BUILTIN_ROUNDSS,
26625 IX86_BUILTIN_ROUNDPD,
26626 IX86_BUILTIN_ROUNDPS,
26628 IX86_BUILTIN_FLOORPD,
26629 IX86_BUILTIN_CEILPD,
26630 IX86_BUILTIN_TRUNCPD,
26631 IX86_BUILTIN_RINTPD,
26632 IX86_BUILTIN_ROUNDPD_AZ,
26634 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26635 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26636 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26638 IX86_BUILTIN_FLOORPS,
26639 IX86_BUILTIN_CEILPS,
26640 IX86_BUILTIN_TRUNCPS,
26641 IX86_BUILTIN_RINTPS,
26642 IX86_BUILTIN_ROUNDPS_AZ,
26644 IX86_BUILTIN_FLOORPS_SFIX,
26645 IX86_BUILTIN_CEILPS_SFIX,
26646 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26648 IX86_BUILTIN_PTESTZ,
26649 IX86_BUILTIN_PTESTC,
26650 IX86_BUILTIN_PTESTNZC,
26652 IX86_BUILTIN_VEC_INIT_V2SI,
26653 IX86_BUILTIN_VEC_INIT_V4HI,
26654 IX86_BUILTIN_VEC_INIT_V8QI,
26655 IX86_BUILTIN_VEC_EXT_V2DF,
26656 IX86_BUILTIN_VEC_EXT_V2DI,
26657 IX86_BUILTIN_VEC_EXT_V4SF,
26658 IX86_BUILTIN_VEC_EXT_V4SI,
26659 IX86_BUILTIN_VEC_EXT_V8HI,
26660 IX86_BUILTIN_VEC_EXT_V2SI,
26661 IX86_BUILTIN_VEC_EXT_V4HI,
26662 IX86_BUILTIN_VEC_EXT_V16QI,
26663 IX86_BUILTIN_VEC_SET_V2DI,
26664 IX86_BUILTIN_VEC_SET_V4SF,
26665 IX86_BUILTIN_VEC_SET_V4SI,
26666 IX86_BUILTIN_VEC_SET_V8HI,
26667 IX86_BUILTIN_VEC_SET_V4HI,
26668 IX86_BUILTIN_VEC_SET_V16QI,
26670 IX86_BUILTIN_VEC_PACK_SFIX,
26671 IX86_BUILTIN_VEC_PACK_SFIX256,
26673 /* SSE4.2. */
26674 IX86_BUILTIN_CRC32QI,
26675 IX86_BUILTIN_CRC32HI,
26676 IX86_BUILTIN_CRC32SI,
26677 IX86_BUILTIN_CRC32DI,
26679 IX86_BUILTIN_PCMPESTRI128,
26680 IX86_BUILTIN_PCMPESTRM128,
26681 IX86_BUILTIN_PCMPESTRA128,
26682 IX86_BUILTIN_PCMPESTRC128,
26683 IX86_BUILTIN_PCMPESTRO128,
26684 IX86_BUILTIN_PCMPESTRS128,
26685 IX86_BUILTIN_PCMPESTRZ128,
26686 IX86_BUILTIN_PCMPISTRI128,
26687 IX86_BUILTIN_PCMPISTRM128,
26688 IX86_BUILTIN_PCMPISTRA128,
26689 IX86_BUILTIN_PCMPISTRC128,
26690 IX86_BUILTIN_PCMPISTRO128,
26691 IX86_BUILTIN_PCMPISTRS128,
26692 IX86_BUILTIN_PCMPISTRZ128,
26694 IX86_BUILTIN_PCMPGTQ,
26696 /* AES instructions */
26697 IX86_BUILTIN_AESENC128,
26698 IX86_BUILTIN_AESENCLAST128,
26699 IX86_BUILTIN_AESDEC128,
26700 IX86_BUILTIN_AESDECLAST128,
26701 IX86_BUILTIN_AESIMC128,
26702 IX86_BUILTIN_AESKEYGENASSIST128,
26704 /* PCLMUL instruction */
26705 IX86_BUILTIN_PCLMULQDQ128,
26707 /* AVX */
26708 IX86_BUILTIN_ADDPD256,
26709 IX86_BUILTIN_ADDPS256,
26710 IX86_BUILTIN_ADDSUBPD256,
26711 IX86_BUILTIN_ADDSUBPS256,
26712 IX86_BUILTIN_ANDPD256,
26713 IX86_BUILTIN_ANDPS256,
26714 IX86_BUILTIN_ANDNPD256,
26715 IX86_BUILTIN_ANDNPS256,
26716 IX86_BUILTIN_BLENDPD256,
26717 IX86_BUILTIN_BLENDPS256,
26718 IX86_BUILTIN_BLENDVPD256,
26719 IX86_BUILTIN_BLENDVPS256,
26720 IX86_BUILTIN_DIVPD256,
26721 IX86_BUILTIN_DIVPS256,
26722 IX86_BUILTIN_DPPS256,
26723 IX86_BUILTIN_HADDPD256,
26724 IX86_BUILTIN_HADDPS256,
26725 IX86_BUILTIN_HSUBPD256,
26726 IX86_BUILTIN_HSUBPS256,
26727 IX86_BUILTIN_MAXPD256,
26728 IX86_BUILTIN_MAXPS256,
26729 IX86_BUILTIN_MINPD256,
26730 IX86_BUILTIN_MINPS256,
26731 IX86_BUILTIN_MULPD256,
26732 IX86_BUILTIN_MULPS256,
26733 IX86_BUILTIN_ORPD256,
26734 IX86_BUILTIN_ORPS256,
26735 IX86_BUILTIN_SHUFPD256,
26736 IX86_BUILTIN_SHUFPS256,
26737 IX86_BUILTIN_SUBPD256,
26738 IX86_BUILTIN_SUBPS256,
26739 IX86_BUILTIN_XORPD256,
26740 IX86_BUILTIN_XORPS256,
26741 IX86_BUILTIN_CMPSD,
26742 IX86_BUILTIN_CMPSS,
26743 IX86_BUILTIN_CMPPD,
26744 IX86_BUILTIN_CMPPS,
26745 IX86_BUILTIN_CMPPD256,
26746 IX86_BUILTIN_CMPPS256,
26747 IX86_BUILTIN_CVTDQ2PD256,
26748 IX86_BUILTIN_CVTDQ2PS256,
26749 IX86_BUILTIN_CVTPD2PS256,
26750 IX86_BUILTIN_CVTPS2DQ256,
26751 IX86_BUILTIN_CVTPS2PD256,
26752 IX86_BUILTIN_CVTTPD2DQ256,
26753 IX86_BUILTIN_CVTPD2DQ256,
26754 IX86_BUILTIN_CVTTPS2DQ256,
26755 IX86_BUILTIN_EXTRACTF128PD256,
26756 IX86_BUILTIN_EXTRACTF128PS256,
26757 IX86_BUILTIN_EXTRACTF128SI256,
26758 IX86_BUILTIN_VZEROALL,
26759 IX86_BUILTIN_VZEROUPPER,
26760 IX86_BUILTIN_VPERMILVARPD,
26761 IX86_BUILTIN_VPERMILVARPS,
26762 IX86_BUILTIN_VPERMILVARPD256,
26763 IX86_BUILTIN_VPERMILVARPS256,
26764 IX86_BUILTIN_VPERMILPD,
26765 IX86_BUILTIN_VPERMILPS,
26766 IX86_BUILTIN_VPERMILPD256,
26767 IX86_BUILTIN_VPERMILPS256,
26768 IX86_BUILTIN_VPERMIL2PD,
26769 IX86_BUILTIN_VPERMIL2PS,
26770 IX86_BUILTIN_VPERMIL2PD256,
26771 IX86_BUILTIN_VPERMIL2PS256,
26772 IX86_BUILTIN_VPERM2F128PD256,
26773 IX86_BUILTIN_VPERM2F128PS256,
26774 IX86_BUILTIN_VPERM2F128SI256,
26775 IX86_BUILTIN_VBROADCASTSS,
26776 IX86_BUILTIN_VBROADCASTSD256,
26777 IX86_BUILTIN_VBROADCASTSS256,
26778 IX86_BUILTIN_VBROADCASTPD256,
26779 IX86_BUILTIN_VBROADCASTPS256,
26780 IX86_BUILTIN_VINSERTF128PD256,
26781 IX86_BUILTIN_VINSERTF128PS256,
26782 IX86_BUILTIN_VINSERTF128SI256,
26783 IX86_BUILTIN_LOADUPD256,
26784 IX86_BUILTIN_LOADUPS256,
26785 IX86_BUILTIN_STOREUPD256,
26786 IX86_BUILTIN_STOREUPS256,
26787 IX86_BUILTIN_LDDQU256,
26788 IX86_BUILTIN_MOVNTDQ256,
26789 IX86_BUILTIN_MOVNTPD256,
26790 IX86_BUILTIN_MOVNTPS256,
26791 IX86_BUILTIN_LOADDQU256,
26792 IX86_BUILTIN_STOREDQU256,
26793 IX86_BUILTIN_MASKLOADPD,
26794 IX86_BUILTIN_MASKLOADPS,
26795 IX86_BUILTIN_MASKSTOREPD,
26796 IX86_BUILTIN_MASKSTOREPS,
26797 IX86_BUILTIN_MASKLOADPD256,
26798 IX86_BUILTIN_MASKLOADPS256,
26799 IX86_BUILTIN_MASKSTOREPD256,
26800 IX86_BUILTIN_MASKSTOREPS256,
26801 IX86_BUILTIN_MOVSHDUP256,
26802 IX86_BUILTIN_MOVSLDUP256,
26803 IX86_BUILTIN_MOVDDUP256,
26805 IX86_BUILTIN_SQRTPD256,
26806 IX86_BUILTIN_SQRTPS256,
26807 IX86_BUILTIN_SQRTPS_NR256,
26808 IX86_BUILTIN_RSQRTPS256,
26809 IX86_BUILTIN_RSQRTPS_NR256,
26811 IX86_BUILTIN_RCPPS256,
26813 IX86_BUILTIN_ROUNDPD256,
26814 IX86_BUILTIN_ROUNDPS256,
26816 IX86_BUILTIN_FLOORPD256,
26817 IX86_BUILTIN_CEILPD256,
26818 IX86_BUILTIN_TRUNCPD256,
26819 IX86_BUILTIN_RINTPD256,
26820 IX86_BUILTIN_ROUNDPD_AZ256,
26822 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26823 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26824 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26826 IX86_BUILTIN_FLOORPS256,
26827 IX86_BUILTIN_CEILPS256,
26828 IX86_BUILTIN_TRUNCPS256,
26829 IX86_BUILTIN_RINTPS256,
26830 IX86_BUILTIN_ROUNDPS_AZ256,
26832 IX86_BUILTIN_FLOORPS_SFIX256,
26833 IX86_BUILTIN_CEILPS_SFIX256,
26834 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26836 IX86_BUILTIN_UNPCKHPD256,
26837 IX86_BUILTIN_UNPCKLPD256,
26838 IX86_BUILTIN_UNPCKHPS256,
26839 IX86_BUILTIN_UNPCKLPS256,
26841 IX86_BUILTIN_SI256_SI,
26842 IX86_BUILTIN_PS256_PS,
26843 IX86_BUILTIN_PD256_PD,
26844 IX86_BUILTIN_SI_SI256,
26845 IX86_BUILTIN_PS_PS256,
26846 IX86_BUILTIN_PD_PD256,
26848 IX86_BUILTIN_VTESTZPD,
26849 IX86_BUILTIN_VTESTCPD,
26850 IX86_BUILTIN_VTESTNZCPD,
26851 IX86_BUILTIN_VTESTZPS,
26852 IX86_BUILTIN_VTESTCPS,
26853 IX86_BUILTIN_VTESTNZCPS,
26854 IX86_BUILTIN_VTESTZPD256,
26855 IX86_BUILTIN_VTESTCPD256,
26856 IX86_BUILTIN_VTESTNZCPD256,
26857 IX86_BUILTIN_VTESTZPS256,
26858 IX86_BUILTIN_VTESTCPS256,
26859 IX86_BUILTIN_VTESTNZCPS256,
26860 IX86_BUILTIN_PTESTZ256,
26861 IX86_BUILTIN_PTESTC256,
26862 IX86_BUILTIN_PTESTNZC256,
26864 IX86_BUILTIN_MOVMSKPD256,
26865 IX86_BUILTIN_MOVMSKPS256,
26867 /* AVX2 */
26868 IX86_BUILTIN_MPSADBW256,
26869 IX86_BUILTIN_PABSB256,
26870 IX86_BUILTIN_PABSW256,
26871 IX86_BUILTIN_PABSD256,
26872 IX86_BUILTIN_PACKSSDW256,
26873 IX86_BUILTIN_PACKSSWB256,
26874 IX86_BUILTIN_PACKUSDW256,
26875 IX86_BUILTIN_PACKUSWB256,
26876 IX86_BUILTIN_PADDB256,
26877 IX86_BUILTIN_PADDW256,
26878 IX86_BUILTIN_PADDD256,
26879 IX86_BUILTIN_PADDQ256,
26880 IX86_BUILTIN_PADDSB256,
26881 IX86_BUILTIN_PADDSW256,
26882 IX86_BUILTIN_PADDUSB256,
26883 IX86_BUILTIN_PADDUSW256,
26884 IX86_BUILTIN_PALIGNR256,
26885 IX86_BUILTIN_AND256I,
26886 IX86_BUILTIN_ANDNOT256I,
26887 IX86_BUILTIN_PAVGB256,
26888 IX86_BUILTIN_PAVGW256,
26889 IX86_BUILTIN_PBLENDVB256,
26890 IX86_BUILTIN_PBLENDVW256,
26891 IX86_BUILTIN_PCMPEQB256,
26892 IX86_BUILTIN_PCMPEQW256,
26893 IX86_BUILTIN_PCMPEQD256,
26894 IX86_BUILTIN_PCMPEQQ256,
26895 IX86_BUILTIN_PCMPGTB256,
26896 IX86_BUILTIN_PCMPGTW256,
26897 IX86_BUILTIN_PCMPGTD256,
26898 IX86_BUILTIN_PCMPGTQ256,
26899 IX86_BUILTIN_PHADDW256,
26900 IX86_BUILTIN_PHADDD256,
26901 IX86_BUILTIN_PHADDSW256,
26902 IX86_BUILTIN_PHSUBW256,
26903 IX86_BUILTIN_PHSUBD256,
26904 IX86_BUILTIN_PHSUBSW256,
26905 IX86_BUILTIN_PMADDUBSW256,
26906 IX86_BUILTIN_PMADDWD256,
26907 IX86_BUILTIN_PMAXSB256,
26908 IX86_BUILTIN_PMAXSW256,
26909 IX86_BUILTIN_PMAXSD256,
26910 IX86_BUILTIN_PMAXUB256,
26911 IX86_BUILTIN_PMAXUW256,
26912 IX86_BUILTIN_PMAXUD256,
26913 IX86_BUILTIN_PMINSB256,
26914 IX86_BUILTIN_PMINSW256,
26915 IX86_BUILTIN_PMINSD256,
26916 IX86_BUILTIN_PMINUB256,
26917 IX86_BUILTIN_PMINUW256,
26918 IX86_BUILTIN_PMINUD256,
26919 IX86_BUILTIN_PMOVMSKB256,
26920 IX86_BUILTIN_PMOVSXBW256,
26921 IX86_BUILTIN_PMOVSXBD256,
26922 IX86_BUILTIN_PMOVSXBQ256,
26923 IX86_BUILTIN_PMOVSXWD256,
26924 IX86_BUILTIN_PMOVSXWQ256,
26925 IX86_BUILTIN_PMOVSXDQ256,
26926 IX86_BUILTIN_PMOVZXBW256,
26927 IX86_BUILTIN_PMOVZXBD256,
26928 IX86_BUILTIN_PMOVZXBQ256,
26929 IX86_BUILTIN_PMOVZXWD256,
26930 IX86_BUILTIN_PMOVZXWQ256,
26931 IX86_BUILTIN_PMOVZXDQ256,
26932 IX86_BUILTIN_PMULDQ256,
26933 IX86_BUILTIN_PMULHRSW256,
26934 IX86_BUILTIN_PMULHUW256,
26935 IX86_BUILTIN_PMULHW256,
26936 IX86_BUILTIN_PMULLW256,
26937 IX86_BUILTIN_PMULLD256,
26938 IX86_BUILTIN_PMULUDQ256,
26939 IX86_BUILTIN_POR256,
26940 IX86_BUILTIN_PSADBW256,
26941 IX86_BUILTIN_PSHUFB256,
26942 IX86_BUILTIN_PSHUFD256,
26943 IX86_BUILTIN_PSHUFHW256,
26944 IX86_BUILTIN_PSHUFLW256,
26945 IX86_BUILTIN_PSIGNB256,
26946 IX86_BUILTIN_PSIGNW256,
26947 IX86_BUILTIN_PSIGND256,
26948 IX86_BUILTIN_PSLLDQI256,
26949 IX86_BUILTIN_PSLLWI256,
26950 IX86_BUILTIN_PSLLW256,
26951 IX86_BUILTIN_PSLLDI256,
26952 IX86_BUILTIN_PSLLD256,
26953 IX86_BUILTIN_PSLLQI256,
26954 IX86_BUILTIN_PSLLQ256,
26955 IX86_BUILTIN_PSRAWI256,
26956 IX86_BUILTIN_PSRAW256,
26957 IX86_BUILTIN_PSRADI256,
26958 IX86_BUILTIN_PSRAD256,
26959 IX86_BUILTIN_PSRLDQI256,
26960 IX86_BUILTIN_PSRLWI256,
26961 IX86_BUILTIN_PSRLW256,
26962 IX86_BUILTIN_PSRLDI256,
26963 IX86_BUILTIN_PSRLD256,
26964 IX86_BUILTIN_PSRLQI256,
26965 IX86_BUILTIN_PSRLQ256,
26966 IX86_BUILTIN_PSUBB256,
26967 IX86_BUILTIN_PSUBW256,
26968 IX86_BUILTIN_PSUBD256,
26969 IX86_BUILTIN_PSUBQ256,
26970 IX86_BUILTIN_PSUBSB256,
26971 IX86_BUILTIN_PSUBSW256,
26972 IX86_BUILTIN_PSUBUSB256,
26973 IX86_BUILTIN_PSUBUSW256,
26974 IX86_BUILTIN_PUNPCKHBW256,
26975 IX86_BUILTIN_PUNPCKHWD256,
26976 IX86_BUILTIN_PUNPCKHDQ256,
26977 IX86_BUILTIN_PUNPCKHQDQ256,
26978 IX86_BUILTIN_PUNPCKLBW256,
26979 IX86_BUILTIN_PUNPCKLWD256,
26980 IX86_BUILTIN_PUNPCKLDQ256,
26981 IX86_BUILTIN_PUNPCKLQDQ256,
26982 IX86_BUILTIN_PXOR256,
26983 IX86_BUILTIN_MOVNTDQA256,
26984 IX86_BUILTIN_VBROADCASTSS_PS,
26985 IX86_BUILTIN_VBROADCASTSS_PS256,
26986 IX86_BUILTIN_VBROADCASTSD_PD256,
26987 IX86_BUILTIN_VBROADCASTSI256,
26988 IX86_BUILTIN_PBLENDD256,
26989 IX86_BUILTIN_PBLENDD128,
26990 IX86_BUILTIN_PBROADCASTB256,
26991 IX86_BUILTIN_PBROADCASTW256,
26992 IX86_BUILTIN_PBROADCASTD256,
26993 IX86_BUILTIN_PBROADCASTQ256,
26994 IX86_BUILTIN_PBROADCASTB128,
26995 IX86_BUILTIN_PBROADCASTW128,
26996 IX86_BUILTIN_PBROADCASTD128,
26997 IX86_BUILTIN_PBROADCASTQ128,
26998 IX86_BUILTIN_VPERMVARSI256,
26999 IX86_BUILTIN_VPERMDF256,
27000 IX86_BUILTIN_VPERMVARSF256,
27001 IX86_BUILTIN_VPERMDI256,
27002 IX86_BUILTIN_VPERMTI256,
27003 IX86_BUILTIN_VEXTRACT128I256,
27004 IX86_BUILTIN_VINSERT128I256,
27005 IX86_BUILTIN_MASKLOADD,
27006 IX86_BUILTIN_MASKLOADQ,
27007 IX86_BUILTIN_MASKLOADD256,
27008 IX86_BUILTIN_MASKLOADQ256,
27009 IX86_BUILTIN_MASKSTORED,
27010 IX86_BUILTIN_MASKSTOREQ,
27011 IX86_BUILTIN_MASKSTORED256,
27012 IX86_BUILTIN_MASKSTOREQ256,
27013 IX86_BUILTIN_PSLLVV4DI,
27014 IX86_BUILTIN_PSLLVV2DI,
27015 IX86_BUILTIN_PSLLVV8SI,
27016 IX86_BUILTIN_PSLLVV4SI,
27017 IX86_BUILTIN_PSRAVV8SI,
27018 IX86_BUILTIN_PSRAVV4SI,
27019 IX86_BUILTIN_PSRLVV4DI,
27020 IX86_BUILTIN_PSRLVV2DI,
27021 IX86_BUILTIN_PSRLVV8SI,
27022 IX86_BUILTIN_PSRLVV4SI,
27024 IX86_BUILTIN_GATHERSIV2DF,
27025 IX86_BUILTIN_GATHERSIV4DF,
27026 IX86_BUILTIN_GATHERDIV2DF,
27027 IX86_BUILTIN_GATHERDIV4DF,
27028 IX86_BUILTIN_GATHERSIV4SF,
27029 IX86_BUILTIN_GATHERSIV8SF,
27030 IX86_BUILTIN_GATHERDIV4SF,
27031 IX86_BUILTIN_GATHERDIV8SF,
27032 IX86_BUILTIN_GATHERSIV2DI,
27033 IX86_BUILTIN_GATHERSIV4DI,
27034 IX86_BUILTIN_GATHERDIV2DI,
27035 IX86_BUILTIN_GATHERDIV4DI,
27036 IX86_BUILTIN_GATHERSIV4SI,
27037 IX86_BUILTIN_GATHERSIV8SI,
27038 IX86_BUILTIN_GATHERDIV4SI,
27039 IX86_BUILTIN_GATHERDIV8SI,
27041 /* Alternate 4 element gather for the vectorizer where
27042 all operands are 32-byte wide. */
27043 IX86_BUILTIN_GATHERALTSIV4DF,
27044 IX86_BUILTIN_GATHERALTDIV8SF,
27045 IX86_BUILTIN_GATHERALTSIV4DI,
27046 IX86_BUILTIN_GATHERALTDIV8SI,
27048 /* TFmode support builtins. */
27049 IX86_BUILTIN_INFQ,
27050 IX86_BUILTIN_HUGE_VALQ,
27051 IX86_BUILTIN_FABSQ,
27052 IX86_BUILTIN_COPYSIGNQ,
27054 /* Vectorizer support builtins. */
27055 IX86_BUILTIN_CPYSGNPS,
27056 IX86_BUILTIN_CPYSGNPD,
27057 IX86_BUILTIN_CPYSGNPS256,
27058 IX86_BUILTIN_CPYSGNPD256,
27060 /* FMA4 instructions. */
27061 IX86_BUILTIN_VFMADDSS,
27062 IX86_BUILTIN_VFMADDSD,
27063 IX86_BUILTIN_VFMADDPS,
27064 IX86_BUILTIN_VFMADDPD,
27065 IX86_BUILTIN_VFMADDPS256,
27066 IX86_BUILTIN_VFMADDPD256,
27067 IX86_BUILTIN_VFMADDSUBPS,
27068 IX86_BUILTIN_VFMADDSUBPD,
27069 IX86_BUILTIN_VFMADDSUBPS256,
27070 IX86_BUILTIN_VFMADDSUBPD256,
27072 /* FMA3 instructions. */
27073 IX86_BUILTIN_VFMADDSS3,
27074 IX86_BUILTIN_VFMADDSD3,
27076 /* XOP instructions. */
27077 IX86_BUILTIN_VPCMOV,
27078 IX86_BUILTIN_VPCMOV_V2DI,
27079 IX86_BUILTIN_VPCMOV_V4SI,
27080 IX86_BUILTIN_VPCMOV_V8HI,
27081 IX86_BUILTIN_VPCMOV_V16QI,
27082 IX86_BUILTIN_VPCMOV_V4SF,
27083 IX86_BUILTIN_VPCMOV_V2DF,
27084 IX86_BUILTIN_VPCMOV256,
27085 IX86_BUILTIN_VPCMOV_V4DI256,
27086 IX86_BUILTIN_VPCMOV_V8SI256,
27087 IX86_BUILTIN_VPCMOV_V16HI256,
27088 IX86_BUILTIN_VPCMOV_V32QI256,
27089 IX86_BUILTIN_VPCMOV_V8SF256,
27090 IX86_BUILTIN_VPCMOV_V4DF256,
27092 IX86_BUILTIN_VPPERM,
27094 IX86_BUILTIN_VPMACSSWW,
27095 IX86_BUILTIN_VPMACSWW,
27096 IX86_BUILTIN_VPMACSSWD,
27097 IX86_BUILTIN_VPMACSWD,
27098 IX86_BUILTIN_VPMACSSDD,
27099 IX86_BUILTIN_VPMACSDD,
27100 IX86_BUILTIN_VPMACSSDQL,
27101 IX86_BUILTIN_VPMACSSDQH,
27102 IX86_BUILTIN_VPMACSDQL,
27103 IX86_BUILTIN_VPMACSDQH,
27104 IX86_BUILTIN_VPMADCSSWD,
27105 IX86_BUILTIN_VPMADCSWD,
27107 IX86_BUILTIN_VPHADDBW,
27108 IX86_BUILTIN_VPHADDBD,
27109 IX86_BUILTIN_VPHADDBQ,
27110 IX86_BUILTIN_VPHADDWD,
27111 IX86_BUILTIN_VPHADDWQ,
27112 IX86_BUILTIN_VPHADDDQ,
27113 IX86_BUILTIN_VPHADDUBW,
27114 IX86_BUILTIN_VPHADDUBD,
27115 IX86_BUILTIN_VPHADDUBQ,
27116 IX86_BUILTIN_VPHADDUWD,
27117 IX86_BUILTIN_VPHADDUWQ,
27118 IX86_BUILTIN_VPHADDUDQ,
27119 IX86_BUILTIN_VPHSUBBW,
27120 IX86_BUILTIN_VPHSUBWD,
27121 IX86_BUILTIN_VPHSUBDQ,
27123 IX86_BUILTIN_VPROTB,
27124 IX86_BUILTIN_VPROTW,
27125 IX86_BUILTIN_VPROTD,
27126 IX86_BUILTIN_VPROTQ,
27127 IX86_BUILTIN_VPROTB_IMM,
27128 IX86_BUILTIN_VPROTW_IMM,
27129 IX86_BUILTIN_VPROTD_IMM,
27130 IX86_BUILTIN_VPROTQ_IMM,
27132 IX86_BUILTIN_VPSHLB,
27133 IX86_BUILTIN_VPSHLW,
27134 IX86_BUILTIN_VPSHLD,
27135 IX86_BUILTIN_VPSHLQ,
27136 IX86_BUILTIN_VPSHAB,
27137 IX86_BUILTIN_VPSHAW,
27138 IX86_BUILTIN_VPSHAD,
27139 IX86_BUILTIN_VPSHAQ,
27141 IX86_BUILTIN_VFRCZSS,
27142 IX86_BUILTIN_VFRCZSD,
27143 IX86_BUILTIN_VFRCZPS,
27144 IX86_BUILTIN_VFRCZPD,
27145 IX86_BUILTIN_VFRCZPS256,
27146 IX86_BUILTIN_VFRCZPD256,
27148 IX86_BUILTIN_VPCOMEQUB,
27149 IX86_BUILTIN_VPCOMNEUB,
27150 IX86_BUILTIN_VPCOMLTUB,
27151 IX86_BUILTIN_VPCOMLEUB,
27152 IX86_BUILTIN_VPCOMGTUB,
27153 IX86_BUILTIN_VPCOMGEUB,
27154 IX86_BUILTIN_VPCOMFALSEUB,
27155 IX86_BUILTIN_VPCOMTRUEUB,
27157 IX86_BUILTIN_VPCOMEQUW,
27158 IX86_BUILTIN_VPCOMNEUW,
27159 IX86_BUILTIN_VPCOMLTUW,
27160 IX86_BUILTIN_VPCOMLEUW,
27161 IX86_BUILTIN_VPCOMGTUW,
27162 IX86_BUILTIN_VPCOMGEUW,
27163 IX86_BUILTIN_VPCOMFALSEUW,
27164 IX86_BUILTIN_VPCOMTRUEUW,
27166 IX86_BUILTIN_VPCOMEQUD,
27167 IX86_BUILTIN_VPCOMNEUD,
27168 IX86_BUILTIN_VPCOMLTUD,
27169 IX86_BUILTIN_VPCOMLEUD,
27170 IX86_BUILTIN_VPCOMGTUD,
27171 IX86_BUILTIN_VPCOMGEUD,
27172 IX86_BUILTIN_VPCOMFALSEUD,
27173 IX86_BUILTIN_VPCOMTRUEUD,
27175 IX86_BUILTIN_VPCOMEQUQ,
27176 IX86_BUILTIN_VPCOMNEUQ,
27177 IX86_BUILTIN_VPCOMLTUQ,
27178 IX86_BUILTIN_VPCOMLEUQ,
27179 IX86_BUILTIN_VPCOMGTUQ,
27180 IX86_BUILTIN_VPCOMGEUQ,
27181 IX86_BUILTIN_VPCOMFALSEUQ,
27182 IX86_BUILTIN_VPCOMTRUEUQ,
27184 IX86_BUILTIN_VPCOMEQB,
27185 IX86_BUILTIN_VPCOMNEB,
27186 IX86_BUILTIN_VPCOMLTB,
27187 IX86_BUILTIN_VPCOMLEB,
27188 IX86_BUILTIN_VPCOMGTB,
27189 IX86_BUILTIN_VPCOMGEB,
27190 IX86_BUILTIN_VPCOMFALSEB,
27191 IX86_BUILTIN_VPCOMTRUEB,
27193 IX86_BUILTIN_VPCOMEQW,
27194 IX86_BUILTIN_VPCOMNEW,
27195 IX86_BUILTIN_VPCOMLTW,
27196 IX86_BUILTIN_VPCOMLEW,
27197 IX86_BUILTIN_VPCOMGTW,
27198 IX86_BUILTIN_VPCOMGEW,
27199 IX86_BUILTIN_VPCOMFALSEW,
27200 IX86_BUILTIN_VPCOMTRUEW,
27202 IX86_BUILTIN_VPCOMEQD,
27203 IX86_BUILTIN_VPCOMNED,
27204 IX86_BUILTIN_VPCOMLTD,
27205 IX86_BUILTIN_VPCOMLED,
27206 IX86_BUILTIN_VPCOMGTD,
27207 IX86_BUILTIN_VPCOMGED,
27208 IX86_BUILTIN_VPCOMFALSED,
27209 IX86_BUILTIN_VPCOMTRUED,
27211 IX86_BUILTIN_VPCOMEQQ,
27212 IX86_BUILTIN_VPCOMNEQ,
27213 IX86_BUILTIN_VPCOMLTQ,
27214 IX86_BUILTIN_VPCOMLEQ,
27215 IX86_BUILTIN_VPCOMGTQ,
27216 IX86_BUILTIN_VPCOMGEQ,
27217 IX86_BUILTIN_VPCOMFALSEQ,
27218 IX86_BUILTIN_VPCOMTRUEQ,
27220 /* LWP instructions. */
27221 IX86_BUILTIN_LLWPCB,
27222 IX86_BUILTIN_SLWPCB,
27223 IX86_BUILTIN_LWPVAL32,
27224 IX86_BUILTIN_LWPVAL64,
27225 IX86_BUILTIN_LWPINS32,
27226 IX86_BUILTIN_LWPINS64,
27228 IX86_BUILTIN_CLZS,
27230 /* RTM */
27231 IX86_BUILTIN_XBEGIN,
27232 IX86_BUILTIN_XEND,
27233 IX86_BUILTIN_XABORT,
27234 IX86_BUILTIN_XTEST,
27236 /* BMI instructions. */
27237 IX86_BUILTIN_BEXTR32,
27238 IX86_BUILTIN_BEXTR64,
27239 IX86_BUILTIN_CTZS,
27241 /* TBM instructions. */
27242 IX86_BUILTIN_BEXTRI32,
27243 IX86_BUILTIN_BEXTRI64,
27245 /* BMI2 instructions. */
27246 IX86_BUILTIN_BZHI32,
27247 IX86_BUILTIN_BZHI64,
27248 IX86_BUILTIN_PDEP32,
27249 IX86_BUILTIN_PDEP64,
27250 IX86_BUILTIN_PEXT32,
27251 IX86_BUILTIN_PEXT64,
27253 /* ADX instructions. */
27254 IX86_BUILTIN_ADDCARRYX32,
27255 IX86_BUILTIN_ADDCARRYX64,
27257 /* FSGSBASE instructions. */
27258 IX86_BUILTIN_RDFSBASE32,
27259 IX86_BUILTIN_RDFSBASE64,
27260 IX86_BUILTIN_RDGSBASE32,
27261 IX86_BUILTIN_RDGSBASE64,
27262 IX86_BUILTIN_WRFSBASE32,
27263 IX86_BUILTIN_WRFSBASE64,
27264 IX86_BUILTIN_WRGSBASE32,
27265 IX86_BUILTIN_WRGSBASE64,
27267 /* RDRND instructions. */
27268 IX86_BUILTIN_RDRAND16_STEP,
27269 IX86_BUILTIN_RDRAND32_STEP,
27270 IX86_BUILTIN_RDRAND64_STEP,
27272 /* RDSEED instructions. */
27273 IX86_BUILTIN_RDSEED16_STEP,
27274 IX86_BUILTIN_RDSEED32_STEP,
27275 IX86_BUILTIN_RDSEED64_STEP,
27277 /* F16C instructions. */
27278 IX86_BUILTIN_CVTPH2PS,
27279 IX86_BUILTIN_CVTPH2PS256,
27280 IX86_BUILTIN_CVTPS2PH,
27281 IX86_BUILTIN_CVTPS2PH256,
27283 /* CFString built-in for darwin */
27284 IX86_BUILTIN_CFSTRING,
27286 /* Builtins to get CPU type and supported features. */
27287 IX86_BUILTIN_CPU_INIT,
27288 IX86_BUILTIN_CPU_IS,
27289 IX86_BUILTIN_CPU_SUPPORTS,
27291 IX86_BUILTIN_MAX
27292 };
27294 /* Table for the ix86 builtin decls. */
27295 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27297 /* Table of all of the builtin functions that are possible with different ISAs
27298 but are waiting to be built until a function is declared to use that
27299 ISA. */
27300 struct builtin_isa {
27301 const char *name; /* function name */
27302 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27303 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27304 bool const_p; /* true if the declaration is constant */
27305 bool set_and_not_built_p; /* true while the builtin is recorded but its decl is deferred */
27306 };
27308 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27311 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27312 of which isa_flags to use in the ix86_builtins_isa array. Stores the
27313 function decl in the ix86_builtins array. Returns the function decl or
27314 NULL_TREE, if the builtin was not added.
27316 If the front end has a special hook for builtin functions, delay adding
27317 builtin functions that aren't in the current ISA until the ISA is changed
27318 with function specific optimization. Doing so can save about 300K for the
27319 default compiler. When the builtin is expanded, check at that time whether
27320 it is valid.
27322 If the front end doesn't have a special hook, record all builtins, even
27323 those whose ISA is not currently enabled, in case the user uses function
27324 specific options for a different ISA; this avoids scope errors when a
27325 builtin is added in the middle of a function scope. */
27327 static inline tree
27328 def_builtin (HOST_WIDE_INT mask, const char *name,
27329 enum ix86_builtin_func_type tcode,
27330 enum ix86_builtins code)
27331 {
27332 tree decl = NULL_TREE;
27334 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27335 {
27336 ix86_builtins_isa[(int) code].isa = mask;
27338 mask &= ~OPTION_MASK_ISA_64BIT;
27339 if (mask == 0
27340 || (mask & ix86_isa_flags) != 0
27341 || (lang_hooks.builtin_function
27342 == lang_hooks.builtin_function_ext_scope))
27344 {
27345 tree type = ix86_get_builtin_func_type (tcode);
27346 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27347 NULL, NULL_TREE);
27348 ix86_builtins[(int) code] = decl;
27349 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27350 }
27351 else
27352 {
27353 ix86_builtins[(int) code] = NULL_TREE;
27354 ix86_builtins_isa[(int) code].tcode = tcode;
27355 ix86_builtins_isa[(int) code].name = name;
27356 ix86_builtins_isa[(int) code].const_p = false;
27357 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27358 }
27359 }
27361 return decl;
27362 }
27364 /* Like def_builtin, but also marks the function decl "const". */
27366 static inline tree
27367 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27368 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27369 {
27370 tree decl = def_builtin (mask, name, tcode, code);
27371 if (decl)
27372 TREE_READONLY (decl) = 1;
27373 else
27374 ix86_builtins_isa[(int) code].const_p = true;
27376 return decl;
27377 }
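/* Illustrative sketch (not part of the original machinery; the wrapper
   function name is hypothetical): the table-driven initialisation code
   later in this file effectively boils down to calls of the shape below.
   The builtin chosen here, __builtin_ia32_addps, and its
   V4SF_FTYPE_V4SF_V4SF signature are taken from the bdesc_args table
   further down.  */
#if 0
static void
example_define_one_builtin (void)
{
  /* Declare __builtin_ia32_addps as (V4SF, V4SF) -> V4SF, available only
     when SSE is enabled, and mark the decl const (no side effects).  */
  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
		     V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);
}
#endif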
27379 /* Add any new builtin functions for a given ISA that may not have been
27380 declared. This saves a bit of space compared to adding all of the
27381 declarations to the tree, even if we didn't use them. */
27383 static void
27384 ix86_add_new_builtins (HOST_WIDE_INT isa)
27385 {
27386 int i;
27388 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27389 {
27390 if ((ix86_builtins_isa[i].isa & isa) != 0
27391 && ix86_builtins_isa[i].set_and_not_built_p)
27392 {
27393 tree decl, type;
27395 /* Don't define the builtin again. */
27396 ix86_builtins_isa[i].set_and_not_built_p = false;
27398 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27399 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27400 type, i, BUILT_IN_MD, NULL,
27401 NULL_TREE);
27403 ix86_builtins[i] = decl;
27404 if (ix86_builtins_isa[i].const_p)
27405 TREE_READONLY (decl) = 1;
27406 }
27407 }
27408 }
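/* Illustrative sketch of how the deferred builtins get materialised; this
   is an assumption about the surrounding option/attribute handling, not
   code taken from this file, and the helper name is hypothetical.  Once
   additional ISA flags are switched on (e.g. by a target("...") attribute),
   a call of roughly this shape is expected.  */
#if 0
static void
example_enable_new_isa (HOST_WIDE_INT extra_isa)
{
  ix86_isa_flags |= extra_isa;
  /* Declare every builtin that def_builtin recorded but deferred because
     its ISA was not enabled at the time.  */
  ix86_add_new_builtins (ix86_isa_flags);
}
#endif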
27410 /* Bits for builtin_description.flag. */
27412 /* Set when we don't support the comparison natively, and should
27413 swap the comparison operands in order to support it. */
27414 #define BUILTIN_DESC_SWAP_OPERANDS 1
27416 struct builtin_description
27417 {
27418 const HOST_WIDE_INT mask; /* ISA option mask the builtin requires */
27419 const enum insn_code icode; /* insn pattern used to expand it */
27420 const char *const name; /* __builtin_ia32_* name exposed to the user */
27421 const enum ix86_builtins code; /* enum ix86_builtins value */
27422 const enum rtx_code comparison; /* comparison code, where applicable */
27423 const int flag; /* extra per-table data: function type, mode or BUILTIN_DESC_* bits */
27424 };
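/* Illustrative sketch of how BUILTIN_DESC_SWAP_OPERANDS is meant to be
   honoured when expanding an entry; the helper below is hypothetical, but
   swap_condition is the generic RTL routine that returns the comparison
   code holding when the two operands are exchanged.  */
#if 0
static void
example_apply_swap (const struct builtin_description *d,
		    rtx *op0, rtx *op1, enum rtx_code *comparison)
{
  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
    {
      /* Exchange the operands and adjust the comparison to match,
	 so e.g. "a > b" can be emitted as "b < a".  */
      rtx tmp = *op0;
      *op0 = *op1;
      *op1 = tmp;
      *comparison = swap_condition (*comparison);
    }
}
#endif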
27426 static const struct builtin_description bdesc_comi[] =
27427 {
27428 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27429 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27430 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27431 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27432 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27433 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27434 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27435 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27436 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27437 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27438 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27439 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27444 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27450 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27452 };
27454 static const struct builtin_description bdesc_pcmpestr[] =
27455 {
27456 /* SSE4.2 */
27457 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27458 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27459 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27460 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27461 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27462 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27463 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27464 };
27466 static const struct builtin_description bdesc_pcmpistr[] =
27467 {
27468 /* SSE4.2 */
27469 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27470 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27471 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27472 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27473 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27474 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27475 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27476 };
27478 /* Special builtins with variable number of arguments. */
27479 static const struct builtin_description bdesc_special_args[] =
27480 {
27481 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27482 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27483 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27485 /* MMX */
27486 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27488 /* 3DNow! */
27489 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27491 /* FXSR, XSAVE and XSAVEOPT */
27492 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27493 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27494 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27495 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27496 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27498 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27499 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27500 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27501 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27502 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27504 /* SSE */
27505 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27506 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27507 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27509 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27510 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27511 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27512 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27514 /* SSE or 3DNow!A */
27515 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27516 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27518 /* SSE2 */
27519 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27522 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27524 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27526 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27527 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27533 /* SSE3 */
27534 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27536 /* SSE4.1 */
27537 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27539 /* SSE4A */
27540 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27541 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27543 /* AVX */
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27574 /* AVX2 */
27575 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27576 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27577 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27578 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27579 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27580 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27581 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27582 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27583 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27585 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27586 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27587 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27588 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27589 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27590 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27592 /* FSGSBASE */
27593 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27594 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27595 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27596 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27597 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27598 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27599 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27600 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27602 /* RTM */
27603 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27604 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27605 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
27606 };
27608 /* Builtins with variable number of arguments. */
27609 static const struct builtin_description bdesc_args[] =
27610 {
27611 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27612 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27613 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27614 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27615 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27616 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27617 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27619 /* MMX */
27620 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27621 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27622 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27623 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27624 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27625 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27627 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27628 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27629 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27630 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27631 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27632 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27633 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27634 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27636 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27637 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27639 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27640 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27641 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27642 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27644 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27645 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27646 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27647 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27648 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27649 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27651 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27652 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27653 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27654 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27655 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27656 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27658 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27659 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27660 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27662 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27664 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27665 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27666 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27667 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27668 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27669 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27671 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27672 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27673 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27674 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27675 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27676 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27678 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27679 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27680 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27681 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27683 /* 3DNow! */
27684 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27685 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27686 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27687 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27689 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27690 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27691 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27692 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27693 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27694 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27695 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27696 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27697 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27698 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27699 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27700 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27701 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27702 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27703 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27705 /* 3DNow!A */
27706 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27707 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27708 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27709 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27710 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27711 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27713 /* SSE */
27714 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27715 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27716 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27717 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27718 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27719 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27720 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27721 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27722 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27723 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27724 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27725 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27727 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27729 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27730 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27731 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27732 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27733 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27734 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27735 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27736 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27738 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27739 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27740 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27741 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27742 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27743 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27744 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27745 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27746 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27747 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27748 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27749 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27750 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27751 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27752 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27753 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27754 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27755 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27756 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27757 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27759 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27760 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27761 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27764 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27766 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27767 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27769 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27773 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27774 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27775 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27777 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27778 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27779 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27781 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27783 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27784 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27785 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27787 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27788 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
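/* The two entries above pass 0 instead of a "__builtin_ia32_*" string:
   such rows only associate an insn pattern and a function type with the
   IX86_BUILTIN_* code, and the user-visible names (presumably
   __builtin_fabsq and __builtin_copysignq for the __float128 type) are
   registered separately by the builtin-initialization code rather than
   from this table.  */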
27790 /* SSE MMX or 3Dnow!A */
27791 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27792 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27793 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27795 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27796 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27797 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27798 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27800 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27801 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27803 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27805 /* SSE2 */
27806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27811 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27812 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27817 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27822 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27823 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27824 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27825 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27828 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27829 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27831 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27832 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27833 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27834 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27836 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27838 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27840 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27841 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27843 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27844 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27845 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27847 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27848 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27849 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27851 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27852 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27853 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27854 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27862 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27866 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27868 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27869 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27871 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27873 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27874 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27875 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27877 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27879 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27880 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27881 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27882 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27883 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27884 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27885 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27886 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27888 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27890 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27892 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27893 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27895 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27897 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27898 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27900 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27902 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27903 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27906 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27915 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27916 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27917 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27918 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27920 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27921 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27922 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27923 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27924 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27925 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27926 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27927 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27929 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27930 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27931 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27933 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27934 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27936 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27937 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27939 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27941 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27942 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27943 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27944 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27946 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27947 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27948 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27949 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27950 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27951 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27952 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27954 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27955 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27956 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27957 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27958 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27959 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27960 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27962 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27963 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27964 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27965 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
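/* Shift-count conventions used in the rows above: the _SI_COUNT
   prototypes take a plain integer count (the *i128 forms), the
   _V8HI_V8HI_COUNT/_V4SI_V4SI_COUNT/_V2DI_V2DI_COUNT prototypes take
   the count in the low quadword of a second vector operand, and the
   _INT_CONVERT forms (pslldqi128/psrldqi128) appear to map the V2DI
   operand onto the one-element TImode patterns so the whole 128-bit
   register is shifted; the <emmintrin.h> wrappers are expected to
   scale the byte count of _mm_slli_si128/_mm_srli_si128 to bits.
   A hedged usage sketch, assuming these builtin names as declared here:

     __v8hi v, c;
     __v8hi byimm = __builtin_ia32_psllwi128 (v, 3);
     __v8hi byvec = __builtin_ia32_psllw128 (v, c);  */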
27967 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27968 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27969 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27973 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27975 /* SSE2 MMX */
27976 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27977 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27979 /* SSE3 */
27980 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27981 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27983 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27984 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27985 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27986 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27987 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27988 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27990 /* SSSE3 */
27991 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27992 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27993 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27994 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27995 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27996 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27998 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27999 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28000 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28001 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28002 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28003 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28004 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28005 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28006 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28007 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28008 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28009 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28010 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28011 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28012 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28013 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28014 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28015 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28016 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28017 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28018 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28019 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28020 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28021 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28023 /* SSSE3. */
28024 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28025 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
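/* As with the PSLLDQ/PSRLDQ rows, the palignr entries use the
   _INT_CONVERT prototypes because the underlying patterns work on
   TImode/DImode values rather than the builtin's vector modes; the
   <tmmintrin.h> wrappers are expected to scale the byte offset to
   bits, e.g. _mm_alignr_epi8 (x, y, n) passing n * 8 as the last
   operand of __builtin_ia32_palignr128.  */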
28027 /* SSE4.1 */
28028 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28029 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28030 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28031 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28032 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28033 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28034 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28035 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28036 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28037 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28039 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28040 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28041 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28042 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28043 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28044 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28045 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28046 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28047 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28048 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28049 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28050 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28051 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28053 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28054 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28055 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28056 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28057 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28058 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28059 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28060 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28061 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28062 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28063 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28064 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28066 /* SSE4.1 */
28067 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28068 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28069 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28070 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28072 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28073 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28074 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28075 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28077 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28078 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28080 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28081 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28083 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28084 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28085 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28086 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28088 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28089 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28091 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28092 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
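/* The floor/ceil/trunc/rint rows above reuse the CODE_FOR_sse4_1_round*
   patterns: because these builtins take no comparison, the slot that
   normally holds an rtx comparison code instead carries the
   ROUNDPS/ROUNDPD rounding-mode immediate (ROUND_FLOOR, ROUND_CEIL,
   ROUND_TRUNC, ROUND_MXCSR), and the _ROUND suffix on the prototype
   tells the expander to append that immediate itself instead of taking
   it from the caller.  Roughly, __builtin_ia32_floorpd (x) and
   __builtin_ia32_roundpd (x, 0x01) should emit the same ROUNDPD,
   assuming ROUND_FLOOR matches the _MM_FROUND_FLOOR encoding 0x01.  */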
28094 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28095 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28096 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
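/* PTEST sets ZF from the AND of its operands and CF from the AND-NOT;
   the three rows above share one pattern and use the comparison-code
   slot to select which flag the builtin returns: EQ reads ZF (ptestz),
   LTU reads CF (ptestc), and GTU tests that both flags are clear
   (ptestnzc).  The same convention reappears below for the AVX
   VTESTPS/VTESTPD and 256-bit PTEST entries.  A minimal usage sketch
   via the usual intrinsic, which is expected to expand to
   __builtin_ia32_ptestz128:

     if (_mm_testz_si128 (mask, mask))
       ;  handle the all-zero case  */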
28098 /* SSE4.2 */
28099 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28100 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28101 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28102 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28103 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
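/* The CRC32 rows accumulate a CRC-32C (Castagnoli) value one chunk at a
   time: each builtin takes the running CRC and the next 8/16/32/64-bit
   chunk and returns the updated CRC.  A hedged byte-wise sketch,
   assuming these builtin names are available under -msse4.2:

     unsigned int crc = ~0u;
     for (size_t i = 0; i < len; i++)
       crc = __builtin_ia32_crc32qi (crc, buf[i]);
     crc = ~crc;  final inversion per the usual CRC-32C convention  */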
28105 /* SSE4A */
28106 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28107 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28108 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28109 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28111 /* AES */
28112 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28113 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28115 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28116 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28117 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28118 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28120 /* PCLMUL */
28121 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
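/* Like the AES rows, the PCLMUL entry has no name string here; the
   IX86_BUILTIN_PCLMULQDQ128 code is presumably given its
   __builtin_ia32_pclmulqdq128 name elsewhere, guarded by the PCLMUL
   option in addition to the SSE2 mask recorded in this table.  The
   immediate selects which 64-bit halves of the two operands are
   carry-less multiplied, e.g. (a sketch)

     __v2di r = __builtin_ia32_pclmulqdq128 (a, b, 0x00);

   multiplies the low quadwords of a and b.  */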
28123 /* AVX */
28124 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28125 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28128 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28129 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28132 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28134 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28135 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28138 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28139 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28140 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28141 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28142 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28143 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28144 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28145 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28146 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28147 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28148 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28149 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28154 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28156 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28157 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28158 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28159 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28160 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28162 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28163 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28164 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28165 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28166 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28169 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28170 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28172 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28173 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28174 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28175 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28177 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28178 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28179 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
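/* In the conversion rows above, the "cvtt" variants (the fix_trunc*
   patterns) truncate toward zero, while the plain "cvt" variants (the
   fix_notrunc / avx_cvtpd2dq256 patterns) round according to the
   current MXCSR rounding mode, matching the CVTT* versus CVT*
   instruction pairs.  For instance, __builtin_ia32_cvttpd2dq256 (x)
   behaves like C's truncating double-to-int conversion, while
   __builtin_ia32_cvtpd2dq256 (x) rounds to nearest-even under the
   default MXCSR state.  */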
28180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28181 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28182 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28183 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28184 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28185 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28186 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28187 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28188 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28189 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28191 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28192 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28193 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28195 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28196 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28197 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28198 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28199 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28201 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28203 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28204 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28206 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28207 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28208 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28209 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28211 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28212 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28214 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28215 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28217 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28218 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28219 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28220 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28222 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28223 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28225 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28226 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28228 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28229 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28230 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28231 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28234 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28235 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28236 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28237 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28238 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
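/* The six rows above implement the 128<->256-bit "cast" operations:
   si256_si/ps256_ps/pd256_pd place a 128-bit value in the low lane of a
   256-bit vector (leaving the upper lane undefined), and the
   vec_extract_lo_* entries take the low 128-bit lane back out.  These
   are presumably what the _mm256_castsi128_si256 /
   _mm256_castsi256_si128 family of intrinsics map to, which is why no
   shuffle instruction needs to be generated for them.  */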
28240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28241 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28242 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28243 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28244 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28245 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28246 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28247 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28248 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28249 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28250 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28251 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28252 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28253 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28254 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28256 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28257 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28259 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28260 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28262 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28264 /* AVX2 */
28265 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28266 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28267 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28268 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28269 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28270 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28271 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28272 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28273 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28274 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28275 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28276 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28277 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28278 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28282 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28286 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28290 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28291 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28292 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28293 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28294 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28295 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28296 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28297 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28298 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28299 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28300 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28301 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28302 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28303 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28304 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28305 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28306 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28307 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28308 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28309 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28310 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28311 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28312 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28313 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28314 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28315 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28316 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28317 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28318 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28319 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28320 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28321 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28322 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28323 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28324 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28325 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28326 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28327 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28328 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28329 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28330 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28331 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28332 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28333 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28334 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28335 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28336 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28337 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28338 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28339 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28340 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28341 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28342 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28343 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28344 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28345 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28346 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28347 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28348 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28349 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28350 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28351 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28352 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28353 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28354 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28355 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28356 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28357 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28358 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28359 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28360 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28361 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28362 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28363 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28364 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28365 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28366 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28367 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28368 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28369 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28370 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28371 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28372 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28373 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28374 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28375 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28376 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28377 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28378 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28379 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28380 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28381 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28382 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28383 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28384 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28385 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28386 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28387 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28388 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28389 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28390 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28391 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28392 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28393 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28394 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28395 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28396 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28397 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28398 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28399 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28400 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28401 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28402 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28403 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28404 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28405 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28406 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28407 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28408 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28409 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28410 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28412 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28414 /* BMI */
28415 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28416 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28417 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28419 /* TBM */
28420 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28421 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28423 /* F16C */
28424 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28425 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28426 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28427 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28429 /* BMI2 */
28430 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28431 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28432 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28433 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28434 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28435 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28436 };
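/* A sketch of how such an entry is used (the prototype below is inferred
   from the UINT_FTYPE_UINT_UINT signature; the _bzhi_u32 wrapper name is
   assumed to come from bmi2intrin.h): "__builtin_ia32_bzhi_si" is exposed
   roughly as

       unsigned int __builtin_ia32_bzhi_si (unsigned int, unsigned int);

   wrapped by _bzhi_u32 in the intrinsics header, and registered through
   def_builtin_const in ix86_init_mmx_sse_builtins further below.  */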
28438 /* FMA4 and XOP. */
28439 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28440 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28441 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28442 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28443 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28444 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28445 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28446 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28447 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28448 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28449 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28450 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28451 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28452 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28453 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28454 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28455 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28456 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28457 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28458 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28459 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28460 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28461 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28462 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28463 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28464 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28465 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28466 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28467 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28468 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28469 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28470 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28471 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28472 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28473 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28474 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28475 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28476 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28477 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28478 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28479 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28480 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28481 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28482 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28483 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28484 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28485 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28486 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28487 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28488 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28489 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28490 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
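/* A minimal illustration of the aliases above (the v4sf spelling is just a
   placeholder for the 4 x float vector type; the real tree type is built by
   ix86_get_builtin_func_type): an entry declared with MULTI_ARG_3_SF, i.e.
   V4SF_FTYPE_V4SF_V4SF_V4SF, corresponds roughly to the prototype

       v4sf __builtin_ia32_vfmaddss (v4sf, v4sf, v4sf);

   so the table below only has to name the alias, not spell the type out.  */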
28492 static const struct builtin_description bdesc_multi_arg[] =
28493 {
28494 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28495 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28496 UNKNOWN, (int)MULTI_ARG_3_SF },
28497 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28498 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28499 UNKNOWN, (int)MULTI_ARG_3_DF },
28501 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28502 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28503 UNKNOWN, (int)MULTI_ARG_3_SF },
28504 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28505 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28506 UNKNOWN, (int)MULTI_ARG_3_DF },
28508 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28509 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28510 UNKNOWN, (int)MULTI_ARG_3_SF },
28511 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28512 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28513 UNKNOWN, (int)MULTI_ARG_3_DF },
28514 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28515 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28516 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28517 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28518 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28519 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28521 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28522 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28523 UNKNOWN, (int)MULTI_ARG_3_SF },
28524 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28525 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28526 UNKNOWN, (int)MULTI_ARG_3_DF },
28527 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28528 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28529 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28530 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28531 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28532 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28535 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28539 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28540 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28543 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28556 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28559 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28561 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28567 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28570 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28571 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28572 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28573 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28574 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28575 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28576 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28577 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28578 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28579 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28580 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28582 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28583 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28584 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28585 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28586 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28587 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28589 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28590 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28591 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28592 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28593 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28594 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28595 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28596 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28597 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28598 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28599 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28600 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28601 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28602 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28603 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28605 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28606 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28607 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28608 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28609 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28610 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28611 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28613 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28614 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28615 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28616 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28617 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28618 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28619 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28621 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28622 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28623 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28624 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28625 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28626 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28627 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28629 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28630 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28631 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28632 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28633 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28634 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28635 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28637 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28638 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28639 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28640 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28641 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28642 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28643 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28645 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28646 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28647 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28648 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28649 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28650 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28651 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28653 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28654 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28655 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28656 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28657 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28658 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28659 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28661 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28662 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28663 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28664 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28665 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28666 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28667 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28669 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28670 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28671 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28672 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28673 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28674 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28675 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28676 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28678 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28679 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28680 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28681 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28682 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28683 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28684 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28685 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28687 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28688 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28689 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28690 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28691 };
28694 /* TM vector builtins. */
28696 /* Reuse the existing x86-specific `struct builtin_description' because
28697 we're lazy. Add casts to make them fit. */
28698 static const struct builtin_description bdesc_tm[] =
28699 {
28700 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28701 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28702 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28703 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28704 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28705 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28706 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28708 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28709 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28710 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28711 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28712 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28713 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28714 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28716 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28717 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28718 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28719 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28720 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28721 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28722 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28724 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28725 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28726 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
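/* A rough reading of one bdesc_tm entry (illustrative; v4sf stands in for
   the 4 x float vector type): the SSE line for "__builtin__ITM_WM128" pairs
   that name with BUILT_IN_TM_STORE_M128 and the VOID_FTYPE_PV4SF_V4SF
   signature, i.e. approximately

       void __builtin__ITM_WM128 (v4sf *, v4sf);

   a transactional 128-bit vector store, gated by the OPTION_MASK_ISA_SSE
   bit in the registration loop below.  */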
28729 /* TM callbacks. */
28731 /* Return the builtin decl needed to load a vector of TYPE. */
28733 static tree
28734 ix86_builtin_tm_load (tree type)
28735 {
28736 if (TREE_CODE (type) == VECTOR_TYPE)
28737 {
28738 switch (tree_low_cst (TYPE_SIZE (type), 1))
28739 {
28740 case 64:
28741 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28742 case 128:
28743 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28744 case 256:
28745 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28746 }
28747 }
28748 return NULL_TREE;
28749 }
28751 /* Return the builtin decl needed to store a vector of TYPE. */
28753 static tree
28754 ix86_builtin_tm_store (tree type)
28755 {
28756 if (TREE_CODE (type) == VECTOR_TYPE)
28757 {
28758 switch (tree_low_cst (TYPE_SIZE (type), 1))
28759 {
28760 case 64:
28761 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28762 case 128:
28763 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28764 case 256:
28765 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28766 }
28767 }
28768 return NULL_TREE;
28769 }
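/* Both helpers above are simple selectors (a usage sketch; VECTYPE is a
   hypothetical vector type node supplied by the caller):

       tree fndecl = ix86_builtin_tm_load (vectype);

   yields the BUILT_IN_TM_LOAD_M128 decl for a 128-bit vector type and
   NULL_TREE when the size is not 64, 128 or 256 bits; they are presumably
   installed as the target's transactional vector load/store hooks elsewhere
   in this file.  */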
28771 /* Initialize the transactional memory vector load/store builtins. */
28773 static void
28774 ix86_init_tm_builtins (void)
28776 enum ix86_builtin_func_type ftype;
28777 const struct builtin_description *d;
28778 size_t i;
28779 tree decl;
28780 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28781 tree attrs_log, attrs_type_log;
28783 if (!flag_tm)
28784 return;
28786 /* If there are no builtins defined, we must be compiling in a
28787 language without trans-mem support. */
28788 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28789 return;
28791 /* Use whatever attributes a normal TM load has. */
28792 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28793 attrs_load = DECL_ATTRIBUTES (decl);
28794 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28795 /* Use whatever attributes a normal TM store has. */
28796 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28797 attrs_store = DECL_ATTRIBUTES (decl);
28798 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28799 /* Use whatever attributes a normal TM log has. */
28800 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28801 attrs_log = DECL_ATTRIBUTES (decl);
28802 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28804 for (i = 0, d = bdesc_tm;
28805 i < ARRAY_SIZE (bdesc_tm);
28806 i++, d++)
28808 if ((d->mask & ix86_isa_flags) != 0
28809 || (lang_hooks.builtin_function
28810 == lang_hooks.builtin_function_ext_scope))
28812 tree type, attrs, attrs_type;
28813 enum built_in_function code = (enum built_in_function) d->code;
28815 ftype = (enum ix86_builtin_func_type) d->flag;
28816 type = ix86_get_builtin_func_type (ftype);
28818 if (BUILTIN_TM_LOAD_P (code))
28820 attrs = attrs_load;
28821 attrs_type = attrs_type_load;
28823 else if (BUILTIN_TM_STORE_P (code))
28825 attrs = attrs_store;
28826 attrs_type = attrs_type_store;
28828 else
28830 attrs = attrs_log;
28831 attrs_type = attrs_type_log;
28833 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28834 /* The builtin name without the "__builtin_"
28835 prefix, for calling it directly. */
28836 d->name + strlen ("__builtin_"),
28837 attrs);
28838 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28839 set the TYPE_ATTRIBUTES. */
28840 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28842 set_builtin_decl (code, decl, false);
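/* Illustrative sketch only: because add_builtin_function is passed
   d->name + strlen ("__builtin_"), an entry such as "__builtin__ITM_RM128"
   is also directly callable as _ITM_RM128.  Under -fgnu-tm, a transactional
   region over vector data, e.g.

     __transaction_atomic { *dst = *src; }   // dst, src of a 128-bit vector type

   may then be instrumented with these _ITM_RM128/_ITM_WM128 entry points
   (an assumption about the TM lowering pass; the exact instrumentation is
   decided there, not here).  */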
28847 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28848 not in the current target ISA, so that the user can compile particular
28849 modules with target-specific options that differ from the command-line
28850 options. */
28851 static void
28852 ix86_init_mmx_sse_builtins (void)
28854 const struct builtin_description * d;
28855 enum ix86_builtin_func_type ftype;
28856 size_t i;
28858 /* Add all special builtins with variable number of operands. */
28859 for (i = 0, d = bdesc_special_args;
28860 i < ARRAY_SIZE (bdesc_special_args);
28861 i++, d++)
28863 if (d->name == 0)
28864 continue;
28866 ftype = (enum ix86_builtin_func_type) d->flag;
28867 def_builtin (d->mask, d->name, ftype, d->code);
28870 /* Add all builtins with variable number of operands. */
28871 for (i = 0, d = bdesc_args;
28872 i < ARRAY_SIZE (bdesc_args);
28873 i++, d++)
28875 if (d->name == 0)
28876 continue;
28878 ftype = (enum ix86_builtin_func_type) d->flag;
28879 def_builtin_const (d->mask, d->name, ftype, d->code);
28882 /* pcmpestr[im] insns. */
28883 for (i = 0, d = bdesc_pcmpestr;
28884 i < ARRAY_SIZE (bdesc_pcmpestr);
28885 i++, d++)
28887 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28888 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28889 else
28890 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28891 def_builtin_const (d->mask, d->name, ftype, d->code);
28894 /* pcmpistr[im] insns. */
28895 for (i = 0, d = bdesc_pcmpistr;
28896 i < ARRAY_SIZE (bdesc_pcmpistr);
28897 i++, d++)
28899 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28900 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28901 else
28902 ftype = INT_FTYPE_V16QI_V16QI_INT;
28903 def_builtin_const (d->mask, d->name, ftype, d->code);
28906 /* comi/ucomi insns. */
28907 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28909 if (d->mask == OPTION_MASK_ISA_SSE2)
28910 ftype = INT_FTYPE_V2DF_V2DF;
28911 else
28912 ftype = INT_FTYPE_V4SF_V4SF;
28913 def_builtin_const (d->mask, d->name, ftype, d->code);
28916 /* SSE */
28917 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28918 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28919 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28920 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28922 /* SSE or 3DNow!A */
28923 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28924 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28925 IX86_BUILTIN_MASKMOVQ);
28927 /* SSE2 */
28928 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28929 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28931 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28932 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28933 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28934 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28936 /* SSE3. */
28937 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28938 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28939 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28940 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28942 /* AES */
28943 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28944 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28945 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28946 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28947 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28948 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28949 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28950 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28951 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28952 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28953 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28954 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28956 /* PCLMUL */
28957 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28958 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28960 /* RDRND */
28961 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28962 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28963 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28964 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28965 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28966 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28967 IX86_BUILTIN_RDRAND64_STEP);
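/* Usage sketch (illustrative only): each *_step builtin returns nonzero
   on success and stores the hardware random value through its pointer
   argument, so callers typically retry on failure:

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;   // RDRAND can transiently fail; retry

   __builtin_ia32_rdrand16_step and __builtin_ia32_rdrand64_step follow
   the same pattern for 16-bit and 64-bit values.  */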
28969 /* AVX2 */
28970 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28971 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28972 IX86_BUILTIN_GATHERSIV2DF);
28974 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28975 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28976 IX86_BUILTIN_GATHERSIV4DF);
28978 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28979 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28980 IX86_BUILTIN_GATHERDIV2DF);
28982 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28983 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28984 IX86_BUILTIN_GATHERDIV4DF);
28986 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28987 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28988 IX86_BUILTIN_GATHERSIV4SF);
28990 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28991 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28992 IX86_BUILTIN_GATHERSIV8SF);
28994 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28995 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28996 IX86_BUILTIN_GATHERDIV4SF);
28998 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28999 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29000 IX86_BUILTIN_GATHERDIV8SF);
29002 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29003 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29004 IX86_BUILTIN_GATHERSIV2DI);
29006 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29007 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29008 IX86_BUILTIN_GATHERSIV4DI);
29010 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29011 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29012 IX86_BUILTIN_GATHERDIV2DI);
29014 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29015 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29016 IX86_BUILTIN_GATHERDIV4DI);
29018 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29019 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29020 IX86_BUILTIN_GATHERSIV4SI);
29022 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29023 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29024 IX86_BUILTIN_GATHERSIV8SI);
29026 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29027 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29028 IX86_BUILTIN_GATHERDIV4SI);
29030 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29031 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29032 IX86_BUILTIN_GATHERDIV8SI);
29034 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29035 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29036 IX86_BUILTIN_GATHERALTSIV4DF);
29038 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29039 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29040 IX86_BUILTIN_GATHERALTDIV8SF);
29042 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29043 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29044 IX86_BUILTIN_GATHERALTSIV4DI);
29046 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29047 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29048 IX86_BUILTIN_GATHERALTDIV8SI);
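/* Sketch of the user-visible side (an assumption about the wrapper layer;
   the actual intrinsic definitions live in avx2intrin.h, not here):

     #include <immintrin.h>
     __m256d
     gather4 (const double *base, __m128i idx)
     {
       // expands to __builtin_ia32_gathersiv4df with an all-ones mask
       return _mm256_i32gather_pd (base, idx, 8);
     }
*/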
29050 /* RTM. */
29051 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29052 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29054 /* MMX access to the vec_init patterns. */
29055 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29056 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29058 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29059 V4HI_FTYPE_HI_HI_HI_HI,
29060 IX86_BUILTIN_VEC_INIT_V4HI);
29062 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29063 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29064 IX86_BUILTIN_VEC_INIT_V8QI);
29066 /* Access to the vec_extract patterns. */
29067 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29068 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29069 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29070 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29071 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29072 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29073 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29074 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29075 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29076 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29078 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29079 "__builtin_ia32_vec_ext_v4hi",
29080 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29082 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29083 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29085 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29086 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29088 /* Access to the vec_set patterns. */
29089 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29090 "__builtin_ia32_vec_set_v2di",
29091 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29093 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29094 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29096 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29097 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29099 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29100 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29102 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29103 "__builtin_ia32_vec_set_v4hi",
29104 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29106 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29107 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29109 /* RDSEED */
29110 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29111 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29112 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29113 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29114 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29115 "__builtin_ia32_rdseed_di_step",
29116 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29118 /* ADCX */
29119 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29120 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29121 def_builtin (OPTION_MASK_ISA_64BIT,
29122 "__builtin_ia32_addcarryx_u64",
29123 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29124 IX86_BUILTIN_ADDCARRYX64);
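/* Usage sketch (illustrative fragment; a0, b0, a1, b1 are assumed unsigned
   ints): the addcarryx builtins take a carry-in, two operands and a result
   pointer, and return the carry-out, so a multi-word addition chains them:

     unsigned int lo, hi;
     unsigned char c;
     c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
     c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);
*/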
29126 /* Add FMA4 multi-arg argument instructions */
29127 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29129 if (d->name == 0)
29130 continue;
29132 ftype = (enum ix86_builtin_func_type) d->flag;
29133 def_builtin_const (d->mask, d->name, ftype, d->code);
29137 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29138 to return a pointer to VERSION_DECL if the outcome of the expression
29139 formed by PREDICATE_CHAIN is true. This function will be called during
29140 version dispatch to decide which function version to execute. It returns
29141 the basic block at the end, to which more conditions can be added. */
29143 static basic_block
29144 add_condition_to_bb (tree function_decl, tree version_decl,
29145 tree predicate_chain, basic_block new_bb)
29147 gimple return_stmt;
29148 tree convert_expr, result_var;
29149 gimple convert_stmt;
29150 gimple call_cond_stmt;
29151 gimple if_else_stmt;
29153 basic_block bb1, bb2, bb3;
29154 edge e12, e23;
29156 tree cond_var, and_expr_var = NULL_TREE;
29157 gimple_seq gseq;
29159 tree predicate_decl, predicate_arg;
29161 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29163 gcc_assert (new_bb != NULL);
29164 gseq = bb_seq (new_bb);
29167 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29168 build_fold_addr_expr (version_decl));
29169 result_var = create_tmp_var (ptr_type_node, NULL);
29170 convert_stmt = gimple_build_assign (result_var, convert_expr);
29171 return_stmt = gimple_build_return (result_var);
29173 if (predicate_chain == NULL_TREE)
29175 gimple_seq_add_stmt (&gseq, convert_stmt);
29176 gimple_seq_add_stmt (&gseq, return_stmt);
29177 set_bb_seq (new_bb, gseq);
29178 gimple_set_bb (convert_stmt, new_bb);
29179 gimple_set_bb (return_stmt, new_bb);
29180 pop_cfun ();
29181 return new_bb;
29184 while (predicate_chain != NULL)
29186 cond_var = create_tmp_var (integer_type_node, NULL);
29187 predicate_decl = TREE_PURPOSE (predicate_chain);
29188 predicate_arg = TREE_VALUE (predicate_chain);
29189 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29190 gimple_call_set_lhs (call_cond_stmt, cond_var);
29192 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29193 gimple_set_bb (call_cond_stmt, new_bb);
29194 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29196 predicate_chain = TREE_CHAIN (predicate_chain);
29198 if (and_expr_var == NULL)
29199 and_expr_var = cond_var;
29200 else
29202 gimple assign_stmt;
29203 /* Use MIN_EXPR to check whether any integer is zero:
29204 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
29205 assign_stmt = gimple_build_assign (and_expr_var,
29206 build2 (MIN_EXPR, integer_type_node,
29207 cond_var, and_expr_var));
29209 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29210 gimple_set_bb (assign_stmt, new_bb);
29211 gimple_seq_add_stmt (&gseq, assign_stmt);
29215 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29216 integer_zero_node,
29217 NULL_TREE, NULL_TREE);
29218 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29219 gimple_set_bb (if_else_stmt, new_bb);
29220 gimple_seq_add_stmt (&gseq, if_else_stmt);
29222 gimple_seq_add_stmt (&gseq, convert_stmt);
29223 gimple_seq_add_stmt (&gseq, return_stmt);
29224 set_bb_seq (new_bb, gseq);
29226 bb1 = new_bb;
29227 e12 = split_block (bb1, if_else_stmt);
29228 bb2 = e12->dest;
29229 e12->flags &= ~EDGE_FALLTHRU;
29230 e12->flags |= EDGE_TRUE_VALUE;
29232 e23 = split_block (bb2, return_stmt);
29234 gimple_set_bb (convert_stmt, bb2);
29235 gimple_set_bb (return_stmt, bb2);
29237 bb3 = e23->dest;
29238 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29240 remove_edge (e23);
29241 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29243 pop_cfun ();
29245 return bb3;
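/* Roughly, each call to this function appends a block of the following
   shape to the resolver body (a sketch derived from the code above):

     cond_1 = predicate_1 (arg_1);
     ...
     and_expr = MIN_EXPR <cond_n, and_expr>;
     if (and_expr > 0)
       return (void *) &version_decl;
     // otherwise fall through to the conditions added by later calls
*/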
29248 /* This parses the attribute arguments to target in DECL and determines
29249 the right builtin to use to match the platform specification.
29250 It returns the priority value for this version decl. If PREDICATE_LIST
29251 is not NULL, it stores the list of cpu features that need to be checked
29252 before dispatching this function. */
29254 static unsigned int
29255 get_builtin_code_for_version (tree decl, tree *predicate_list)
29257 tree attrs;
29258 struct cl_target_option cur_target;
29259 tree target_node;
29260 struct cl_target_option *new_target;
29261 const char *arg_str = NULL;
29262 const char *attrs_str = NULL;
29263 char *tok_str = NULL;
29264 char *token;
29266 /* Priority of i386 features, greater value is higher priority. This is
29267 used to decide the order in which function dispatch must happen. For
29268 instance, a version specialized for SSE4.2 should be checked for dispatch
29269 before a version for SSE3, as SSE4.2 implies SSE3. */
29270 enum feature_priority
29272 P_ZERO = 0,
29273 P_MMX,
29274 P_SSE,
29275 P_SSE2,
29276 P_SSE3,
29277 P_SSSE3,
29278 P_PROC_SSSE3,
29279 P_SSE4_a,
29280 P_PROC_SSE4_a,
29281 P_SSE4_1,
29282 P_SSE4_2,
29283 P_PROC_SSE4_2,
29284 P_POPCNT,
29285 P_AVX,
29286 P_AVX2,
29287 P_FMA,
29288 P_PROC_FMA
29291 enum feature_priority priority = P_ZERO;
29293 /* These are the target attribute strings for which a dispatcher is
29294 available, from fold_builtin_cpu. */
29296 static struct _feature_list
29298 const char *const name;
29299 const enum feature_priority priority;
29301 const feature_list[] =
29303 {"mmx", P_MMX},
29304 {"sse", P_SSE},
29305 {"sse2", P_SSE2},
29306 {"sse3", P_SSE3},
29307 {"ssse3", P_SSSE3},
29308 {"sse4.1", P_SSE4_1},
29309 {"sse4.2", P_SSE4_2},
29310 {"popcnt", P_POPCNT},
29311 {"avx", P_AVX},
29312 {"avx2", P_AVX2}
29316 static unsigned int NUM_FEATURES
29317 = sizeof (feature_list) / sizeof (struct _feature_list);
29319 unsigned int i;
29321 tree predicate_chain = NULL_TREE;
29322 tree predicate_decl, predicate_arg;
29324 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29325 gcc_assert (attrs != NULL);
29327 attrs = TREE_VALUE (TREE_VALUE (attrs));
29329 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29330 attrs_str = TREE_STRING_POINTER (attrs);
29332 /* Return priority zero for default function. */
29333 if (strcmp (attrs_str, "default") == 0)
29334 return 0;
29336 /* Handle arch= if specified. For priority, set it to be 1 more than
29337 the best instruction set the processor can handle. For instance, if
29338 there is a version for atom and a version for ssse3 (the highest ISA
29339 priority for atom), the atom version must be checked for dispatch
29340 before the ssse3 version. */
29341 if (strstr (attrs_str, "arch=") != NULL)
29343 cl_target_option_save (&cur_target, &global_options);
29344 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29345 &global_options_set);
29347 gcc_assert (target_node);
29348 new_target = TREE_TARGET_OPTION (target_node);
29349 gcc_assert (new_target);
29351 if (new_target->arch_specified && new_target->arch > 0)
29353 switch (new_target->arch)
29355 case PROCESSOR_CORE2:
29356 arg_str = "core2";
29357 priority = P_PROC_SSSE3;
29358 break;
29359 case PROCESSOR_COREI7:
29360 arg_str = "corei7";
29361 priority = P_PROC_SSE4_2;
29362 break;
29363 case PROCESSOR_ATOM:
29364 arg_str = "atom";
29365 priority = P_PROC_SSSE3;
29366 break;
29367 case PROCESSOR_AMDFAM10:
29368 arg_str = "amdfam10h";
29369 priority = P_PROC_SSE4_a;
29370 break;
29371 case PROCESSOR_BDVER1:
29372 arg_str = "bdver1";
29373 priority = P_PROC_FMA;
29374 break;
29375 case PROCESSOR_BDVER2:
29376 arg_str = "bdver2";
29377 priority = P_PROC_FMA;
29378 break;
29382 cl_target_option_restore (&global_options, &cur_target);
29384 if (predicate_list && arg_str == NULL)
29386 error_at (DECL_SOURCE_LOCATION (decl),
29387 "No dispatcher found for the versioning attributes");
29388 return 0;
29391 if (predicate_list)
29393 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29394 /* For a C string literal the length includes the trailing NULL. */
29395 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29396 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29397 predicate_chain);
29401 /* Process feature name. */
29402 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29403 strcpy (tok_str, attrs_str);
29404 token = strtok (tok_str, ",");
29405 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29407 while (token != NULL)
29409 /* Do not process "arch=" */
29410 if (strncmp (token, "arch=", 5) == 0)
29412 token = strtok (NULL, ",");
29413 continue;
29415 for (i = 0; i < NUM_FEATURES; ++i)
29417 if (strcmp (token, feature_list[i].name) == 0)
29419 if (predicate_list)
29421 predicate_arg = build_string_literal (
29422 strlen (feature_list[i].name) + 1,
29423 feature_list[i].name);
29424 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29425 predicate_chain);
29427 /* Find the maximum priority feature. */
29428 if (feature_list[i].priority > priority)
29429 priority = feature_list[i].priority;
29431 break;
29434 if (predicate_list && i == NUM_FEATURES)
29436 error_at (DECL_SOURCE_LOCATION (decl),
29437 "No dispatcher found for %s", token);
29438 return 0;
29440 token = strtok (NULL, ",");
29442 free (tok_str);
29444 if (predicate_list && predicate_chain == NULL_TREE)
29446 error_at (DECL_SOURCE_LOCATION (decl),
29447 "No dispatcher found for the versioning attributes : %s",
29448 attrs_str);
29449 return 0;
29451 else if (predicate_list)
29453 predicate_chain = nreverse (predicate_chain);
29454 *predicate_list = predicate_chain;
29457 return priority;
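/* Worked example (sketch): for a version declared with
   __attribute__ ((target ("arch=corei7,popcnt"))), the predicate list
   built here contains

     __builtin_cpu_is ("corei7"), __builtin_cpu_supports ("popcnt")

   and the returned priority is the maximum of the priorities implied by
   the arch= value and the listed features.  */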
29460 /* This compares the priority of target features in function DECL1
29461 and DECL2. It returns positive value if DECL1 is higher priority,
29462 negative value if DECL2 is higher priority and 0 if they are the
29463 same. */
29465 static int
29466 ix86_compare_version_priority (tree decl1, tree decl2)
29468 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29469 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29471 return (int)priority1 - (int)priority2;
29474 /* V1 and V2 point to function versions with different priorities
29475 based on the target ISA. This function compares their priorities. */
29477 static int
29478 feature_compare (const void *v1, const void *v2)
29480 typedef struct _function_version_info
29482 tree version_decl;
29483 tree predicate_chain;
29484 unsigned int dispatch_priority;
29485 } function_version_info;
29487 const function_version_info c1 = *(const function_version_info *)v1;
29488 const function_version_info c2 = *(const function_version_info *)v2;
29489 return (c2.dispatch_priority - c1.dispatch_priority);
29492 /* This function generates the dispatch function for
29493 multi-versioned functions. DISPATCH_DECL is the function which will
29494 contain the dispatch logic. FNDECLS are the function choices for
29495 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29496 in DISPATCH_DECL in which the dispatch code is generated. */
29498 static int
29499 dispatch_function_versions (tree dispatch_decl,
29500 void *fndecls_p,
29501 basic_block *empty_bb)
29503 tree default_decl;
29504 gimple ifunc_cpu_init_stmt;
29505 gimple_seq gseq;
29506 int ix;
29507 tree ele;
29508 vec<tree> *fndecls;
29509 unsigned int num_versions = 0;
29510 unsigned int actual_versions = 0;
29511 unsigned int i;
29513 struct _function_version_info
29515 tree version_decl;
29516 tree predicate_chain;
29517 unsigned int dispatch_priority;
29518 }*function_version_info;
29520 gcc_assert (dispatch_decl != NULL
29521 && fndecls_p != NULL
29522 && empty_bb != NULL);
29524 /* fndecls_p is actually a vector. */
29525 fndecls = static_cast<vec<tree> *> (fndecls_p);
29527 /* At least one more version other than the default. */
29528 num_versions = fndecls->length ();
29529 gcc_assert (num_versions >= 2);
29531 function_version_info = (struct _function_version_info *)
29532 XNEWVEC (struct _function_version_info, (num_versions - 1));
29534 /* The first version in the vector is the default decl. */
29535 default_decl = (*fndecls)[0];
29537 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29539 gseq = bb_seq (*empty_bb);
29540 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29541 constructors, so explicitly call __builtin_cpu_init here. */
29542 ifunc_cpu_init_stmt = gimple_build_call_vec (
29543 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29544 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29545 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29546 set_bb_seq (*empty_bb, gseq);
29548 pop_cfun ();
29551 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29553 tree version_decl = ele;
29554 tree predicate_chain = NULL_TREE;
29555 unsigned int priority;
29556 /* Get attribute string, parse it and find the right predicate decl.
29557 The predicate function could be a lengthy combination of many
29558 features, like arch-type and various isa-variants. */
29559 priority = get_builtin_code_for_version (version_decl,
29560 &predicate_chain);
29562 if (predicate_chain == NULL_TREE)
29563 continue;
29565 function_version_info [actual_versions].version_decl = version_decl;
29566 function_version_info [actual_versions].predicate_chain
29567 = predicate_chain;
29568 function_version_info [actual_versions].dispatch_priority = priority;
29569 actual_versions++;
29572 /* Sort the versions according to descending order of dispatch priority. The
29573 priority is based on the ISA. This is not a perfect solution. There
29574 could still be ambiguity. If more than one function version is suitable
29575 to execute, which one should be dispatched? In future, allow the user
29576 to specify a dispatch priority next to the version. */
29577 qsort (function_version_info, actual_versions,
29578 sizeof (struct _function_version_info), feature_compare);
29580 for (i = 0; i < actual_versions; ++i)
29581 *empty_bb = add_condition_to_bb (dispatch_decl,
29582 function_version_info[i].version_decl,
29583 function_version_info[i].predicate_chain,
29584 *empty_bb);
29586 /* Dispatch the default version at the end. */
29587 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29588 NULL, *empty_bb);
29590 free (function_version_info);
29591 return 0;
29594 /* Comparator function used by qsort to sort the attribute
29595 specification strings of the "target" attribute. */
29597 static int
29598 attr_strcmp (const void *v1, const void *v2)
29600 const char *c1 = *(char *const*)v1;
29601 const char *c2 = *(char *const*)v2;
29602 return strcmp (c1, c2);
29605 /* ARGLIST is the argument to target attribute. This function tokenizes
29606 the comma separated arguments, sorts them and returns a string which
29607 is a unique identifier for the comma separated arguments. It also
29608 replaces non-identifier characters "=,-" with "_". */
29610 static char *
29611 sorted_attr_string (tree arglist)
29613 tree arg;
29614 size_t str_len_sum = 0;
29615 char **args = NULL;
29616 char *attr_str, *ret_str;
29617 char *attr = NULL;
29618 unsigned int argnum = 1;
29619 unsigned int i;
29621 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29623 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29624 size_t len = strlen (str);
29625 str_len_sum += len + 1;
29626 if (arg != arglist)
29627 argnum++;
29628 for (i = 0; i < strlen (str); i++)
29629 if (str[i] == ',')
29630 argnum++;
29633 attr_str = XNEWVEC (char, str_len_sum);
29634 str_len_sum = 0;
29635 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29637 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29638 size_t len = strlen (str);
29639 memcpy (attr_str + str_len_sum, str, len);
29640 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29641 str_len_sum += len + 1;
29644 /* Replace "=,-" with "_". */
29645 for (i = 0; i < strlen (attr_str); i++)
29646 if (attr_str[i] == '=' || attr_str[i]== '-')
29647 attr_str[i] = '_';
29649 if (argnum == 1)
29650 return attr_str;
29652 args = XNEWVEC (char *, argnum);
29654 i = 0;
29655 attr = strtok (attr_str, ",");
29656 while (attr != NULL)
29658 args[i] = attr;
29659 i++;
29660 attr = strtok (NULL, ",");
29663 qsort (args, argnum, sizeof (char *), attr_strcmp);
29665 ret_str = XNEWVEC (char, str_len_sum);
29666 str_len_sum = 0;
29667 for (i = 0; i < argnum; i++)
29669 size_t len = strlen (args[i]);
29670 memcpy (ret_str + str_len_sum, args[i], len);
29671 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29672 str_len_sum += len + 1;
29675 XDELETEVEC (args);
29676 XDELETEVEC (attr_str);
29677 return ret_str;
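/* Example (sketch): for __attribute__ ((target ("sse4.2,arch=corei7")))
   the tokens become "sse4.2" and "arch_corei7" after the '='/'-'
   replacement, and sorting them yields the unique string
   "arch_corei7_sse4.2".  */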
29680 /* This function changes the assembler name for functions that are
29681 versions. If DECL is a function version and has a "target"
29682 attribute, it appends the attribute string to its assembler name. */
29684 static tree
29685 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29687 tree version_attr;
29688 const char *orig_name, *version_string;
29689 char *attr_str, *assembler_name;
29691 if (DECL_DECLARED_INLINE_P (decl)
29692 && lookup_attribute ("gnu_inline",
29693 DECL_ATTRIBUTES (decl)))
29694 error_at (DECL_SOURCE_LOCATION (decl),
29695 "Function versions cannot be marked as gnu_inline,"
29696 " bodies have to be generated");
29698 if (DECL_VIRTUAL_P (decl)
29699 || DECL_VINDEX (decl))
29700 sorry ("Virtual function multiversioning not supported");
29702 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29704 /* The target attribute string cannot be NULL. */
29705 gcc_assert (version_attr != NULL_TREE);
29707 orig_name = IDENTIFIER_POINTER (id);
29708 version_string
29709 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29711 if (strcmp (version_string, "default") == 0)
29712 return id;
29714 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29715 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29717 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29719 /* Allow assembler name to be modified if already set. */
29720 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29721 SET_DECL_RTL (decl, NULL);
29723 tree ret = get_identifier (assembler_name);
29724 XDELETEVEC (attr_str);
29725 XDELETEVEC (assembler_name);
29726 return ret;
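/* Example (sketch): a version of foo declared with
   __attribute__ ((target ("sse4.2,arch=corei7"))) gets the assembler name
   "foo.arch_corei7_sse4.2", while the "default" version keeps its
   original assembler name.  */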
29729 /* This function returns true if FN1 and FN2 are versions of the same function,
29730 that is, the target strings of the function decls are different. This assumes
29731 that FN1 and FN2 have the same signature. */
29733 static bool
29734 ix86_function_versions (tree fn1, tree fn2)
29736 tree attr1, attr2;
29737 char *target1, *target2;
29738 bool result;
29740 if (TREE_CODE (fn1) != FUNCTION_DECL
29741 || TREE_CODE (fn2) != FUNCTION_DECL)
29742 return false;
29744 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29745 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29747 /* At least one function decl should have the target attribute specified. */
29748 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29749 return false;
29751 /* Diagnose missing target attribute if one of the decls is already
29752 multi-versioned. */
29753 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29755 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29757 if (attr2 != NULL_TREE)
29759 tree tem = fn1;
29760 fn1 = fn2;
29761 fn2 = tem;
29762 attr1 = attr2;
29764 error_at (DECL_SOURCE_LOCATION (fn2),
29765 "missing %<target%> attribute for multi-versioned %D",
29766 fn2);
29767 inform (DECL_SOURCE_LOCATION (fn1),
29768 "previous declaration of %D", fn1);
29769 /* Prevent diagnosing of the same error multiple times. */
29770 DECL_ATTRIBUTES (fn2)
29771 = tree_cons (get_identifier ("target"),
29772 copy_node (TREE_VALUE (attr1)),
29773 DECL_ATTRIBUTES (fn2));
29775 return false;
29778 target1 = sorted_attr_string (TREE_VALUE (attr1));
29779 target2 = sorted_attr_string (TREE_VALUE (attr2));
29781 /* The sorted target strings must be different for fn1 and fn2
29782 to be versions. */
29783 if (strcmp (target1, target2) == 0)
29784 result = false;
29785 else
29786 result = true;
29788 XDELETEVEC (target1);
29789 XDELETEVEC (target2);
29791 return result;
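/* User-level sketch of the declarations this hook is asked about:

     __attribute__ ((target ("default")))  int foo (void);
     __attribute__ ((target ("sse4.2")))   int foo (void);
     __attribute__ ((target ("arch=atom"))) int foo (void);

   The decls share a signature but have different sorted target strings,
   so each pair is treated as two versions of the same function.  */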
29794 static tree
29795 ix86_mangle_decl_assembler_name (tree decl, tree id)
29797 /* For function version, add the target suffix to the assembler name. */
29798 if (TREE_CODE (decl) == FUNCTION_DECL
29799 && DECL_FUNCTION_VERSIONED (decl))
29800 id = ix86_mangle_function_version_assembler_name (decl, id);
29801 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29802 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29803 #endif
29805 return id;
29808 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29809 is true, append the full path name of the source file. */
29811 static char *
29812 make_name (tree decl, const char *suffix, bool make_unique)
29814 char *global_var_name;
29815 int name_len;
29816 const char *name;
29817 const char *unique_name = NULL;
29819 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29821 /* Get a unique name that can be used globally without any chances
29822 of collision at link time. */
29823 if (make_unique)
29824 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29826 name_len = strlen (name) + strlen (suffix) + 2;
29828 if (make_unique)
29829 name_len += strlen (unique_name) + 1;
29830 global_var_name = XNEWVEC (char, name_len);
29832 /* Use '.' to concatenate names as it is demangler friendly. */
29833 if (make_unique)
29834 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29835 suffix);
29836 else
29837 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29839 return global_var_name;
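/* Example (sketch): make_name (foo, "resolver", false) produces
   "foo.resolver"; with MAKE_UNIQUE true a file-scope component obtained
   from get_file_function_name is inserted, giving "foo.<unique>.resolver".  */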
29842 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29844 /* Make a dispatcher declaration for the multi-versioned function DECL.
29845 Calls to DECL function will be replaced with calls to the dispatcher
29846 by the front-end. Return the decl created. */
29848 static tree
29849 make_dispatcher_decl (const tree decl)
29851 tree func_decl;
29852 char *func_name;
29853 tree fn_type, func_type;
29854 bool is_uniq = false;
29856 if (TREE_PUBLIC (decl) == 0)
29857 is_uniq = true;
29859 func_name = make_name (decl, "ifunc", is_uniq);
29861 fn_type = TREE_TYPE (decl);
29862 func_type = build_function_type (TREE_TYPE (fn_type),
29863 TYPE_ARG_TYPES (fn_type));
29865 func_decl = build_fn_decl (func_name, func_type);
29866 XDELETEVEC (func_name);
29867 TREE_USED (func_decl) = 1;
29868 DECL_CONTEXT (func_decl) = NULL_TREE;
29869 DECL_INITIAL (func_decl) = error_mark_node;
29870 DECL_ARTIFICIAL (func_decl) = 1;
29871 /* Mark this func as external, the resolver will flip it again if
29872 it gets generated. */
29873 DECL_EXTERNAL (func_decl) = 1;
29874 /* This will be of type IFUNC; IFUNCs have to be externally visible. */
29875 TREE_PUBLIC (func_decl) = 1;
29877 return func_decl;
29880 #endif
29882 /* Returns true if decl is multi-versioned and DECL is the default function,
29889 that is, it is not tagged with a target-specific optimization. */
29885 static bool
29886 is_function_default_version (const tree decl)
29888 if (TREE_CODE (decl) != FUNCTION_DECL
29889 || !DECL_FUNCTION_VERSIONED (decl))
29890 return false;
29891 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29892 gcc_assert (attr);
29893 attr = TREE_VALUE (TREE_VALUE (attr));
29894 return (TREE_CODE (attr) == STRING_CST
29895 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29898 /* Make a dispatcher declaration for the multi-versioned function DECL.
29899 Calls to DECL function will be replaced with calls to the dispatcher
29900 by the front-end. Returns the decl of the dispatcher function. */
29902 static tree
29903 ix86_get_function_versions_dispatcher (void *decl)
29905 tree fn = (tree) decl;
29906 struct cgraph_node *node = NULL;
29907 struct cgraph_node *default_node = NULL;
29908 struct cgraph_function_version_info *node_v = NULL;
29909 struct cgraph_function_version_info *first_v = NULL;
29911 tree dispatch_decl = NULL;
29913 struct cgraph_function_version_info *default_version_info = NULL;
29915 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29917 node = cgraph_get_node (fn);
29918 gcc_assert (node != NULL);
29920 node_v = get_cgraph_node_version (node);
29921 gcc_assert (node_v != NULL);
29923 if (node_v->dispatcher_resolver != NULL)
29924 return node_v->dispatcher_resolver;
29926 /* Find the default version and make it the first node. */
29927 first_v = node_v;
29928 /* Go to the beginning of the chain. */
29929 while (first_v->prev != NULL)
29930 first_v = first_v->prev;
29931 default_version_info = first_v;
29932 while (default_version_info != NULL)
29934 if (is_function_default_version
29935 (default_version_info->this_node->symbol.decl))
29936 break;
29937 default_version_info = default_version_info->next;
29940 /* If there is no default node, just return NULL. */
29941 if (default_version_info == NULL)
29942 return NULL;
29944 /* Make default info the first node. */
29945 if (first_v != default_version_info)
29947 default_version_info->prev->next = default_version_info->next;
29948 if (default_version_info->next)
29949 default_version_info->next->prev = default_version_info->prev;
29950 first_v->prev = default_version_info;
29951 default_version_info->next = first_v;
29952 default_version_info->prev = NULL;
29955 default_node = default_version_info->this_node;
29957 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29958 if (targetm.has_ifunc_p ())
29960 struct cgraph_function_version_info *it_v = NULL;
29961 struct cgraph_node *dispatcher_node = NULL;
29962 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29964 /* Right now, the dispatching is done via ifunc. */
29965 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29967 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29968 gcc_assert (dispatcher_node != NULL);
29969 dispatcher_node->dispatcher_function = 1;
29970 dispatcher_version_info
29971 = insert_new_cgraph_node_version (dispatcher_node);
29972 dispatcher_version_info->next = default_version_info;
29973 dispatcher_node->symbol.definition = 1;
29975 /* Set the dispatcher for all the versions. */
29976 it_v = default_version_info;
29977 while (it_v != NULL)
29979 it_v->dispatcher_resolver = dispatch_decl;
29980 it_v = it_v->next;
29983 else
29984 #endif
29986 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29987 "multiversioning needs ifunc which is not supported "
29988 "on this target");
29991 return dispatch_decl;
29994 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29995 it to CHAIN. */
29997 static tree
29998 make_attribute (const char *name, const char *arg_name, tree chain)
30000 tree attr_name;
30001 tree attr_arg_name;
30002 tree attr_args;
30003 tree attr;
30005 attr_name = get_identifier (name);
30006 attr_arg_name = build_string (strlen (arg_name), arg_name);
30007 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30008 attr = tree_cons (attr_name, attr_args, chain);
30009 return attr;
30012 /* Make the resolver function decl to dispatch the versions of
30013 a multi-versioned function, DEFAULT_DECL. Create an
30014 empty basic block in the resolver and store the pointer in
30015 EMPTY_BB. Return the decl of the resolver function. */
30017 static tree
30018 make_resolver_func (const tree default_decl,
30019 const tree dispatch_decl,
30020 basic_block *empty_bb)
30022 char *resolver_name;
30023 tree decl, type, decl_name, t;
30024 bool is_uniq = false;
30026 /* IFUNCs have to be globally visible. So, if the default_decl is
30027 not, then the name of the IFUNC should be made unique. */
30028 if (TREE_PUBLIC (default_decl) == 0)
30029 is_uniq = true;
30031 /* Append the filename to the resolver function if the versions are
30032 not externally visible. This is because the resolver function has
30033 to be externally visible for the loader to find it. So, appending
30034 the filename will prevent conflicts with a resolver function from
30035 another module which is based on the same version name. */
30036 resolver_name = make_name (default_decl, "resolver", is_uniq);
30038 /* The resolver function should return a (void *). */
30039 type = build_function_type_list (ptr_type_node, NULL_TREE);
30041 decl = build_fn_decl (resolver_name, type);
30042 decl_name = get_identifier (resolver_name);
30043 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30045 DECL_NAME (decl) = decl_name;
30046 TREE_USED (decl) = 1;
30047 DECL_ARTIFICIAL (decl) = 1;
30048 DECL_IGNORED_P (decl) = 0;
30049 /* IFUNC resolvers have to be externally visible. */
30050 TREE_PUBLIC (decl) = 1;
30051 DECL_UNINLINABLE (decl) = 1;
30053 /* Resolver is not external, body is generated. */
30054 DECL_EXTERNAL (decl) = 0;
30055 DECL_EXTERNAL (dispatch_decl) = 0;
30057 DECL_CONTEXT (decl) = NULL_TREE;
30058 DECL_INITIAL (decl) = make_node (BLOCK);
30059 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30061 if (DECL_COMDAT_GROUP (default_decl)
30062 || TREE_PUBLIC (default_decl))
30064 /* In this case, each translation unit with a call to this
30065 versioned function will put out a resolver. Ensure it
30066 is comdat to keep just one copy. */
30067 DECL_COMDAT (decl) = 1;
30068 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30070 /* Build result decl and add to function_decl. */
30071 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30072 DECL_ARTIFICIAL (t) = 1;
30073 DECL_IGNORED_P (t) = 1;
30074 DECL_RESULT (decl) = t;
30076 gimplify_function_tree (decl);
30077 push_cfun (DECL_STRUCT_FUNCTION (decl));
30078 *empty_bb = init_lowered_empty_function (decl, false);
30080 cgraph_add_new_function (decl, true);
30081 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30083 pop_cfun ();
30085 gcc_assert (dispatch_decl != NULL);
30086 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30087 DECL_ATTRIBUTES (dispatch_decl)
30088 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30090 /* Create the alias for dispatch to resolver here. */
30091 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30092 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30093 XDELETEVEC (resolver_name);
30094 return decl;
30097 /* Generate the dispatching code body to dispatch multi-versioned function
30098 DECL. The target hook is called to process the "target" attributes and
30099 provide the code to dispatch the right function at run-time. NODE points
30100 to the dispatcher decl whose body will be created. */
30102 static tree
30103 ix86_generate_version_dispatcher_body (void *node_p)
30105 tree resolver_decl;
30106 basic_block empty_bb;
30107 vec<tree> fn_ver_vec = vNULL;
30108 tree default_ver_decl;
30109 struct cgraph_node *versn;
30110 struct cgraph_node *node;
30112 struct cgraph_function_version_info *node_version_info = NULL;
30113 struct cgraph_function_version_info *versn_info = NULL;
30115 node = (cgraph_node *)node_p;
30117 node_version_info = get_cgraph_node_version (node);
30118 gcc_assert (node->dispatcher_function
30119 && node_version_info != NULL);
30121 if (node_version_info->dispatcher_resolver)
30122 return node_version_info->dispatcher_resolver;
30124 /* The first version in the chain corresponds to the default version. */
30125 default_ver_decl = node_version_info->next->this_node->symbol.decl;
30127 /* node is going to be an alias, so remove the finalized bit. */
30128 node->symbol.definition = false;
30130 resolver_decl = make_resolver_func (default_ver_decl,
30131 node->symbol.decl, &empty_bb);
30133 node_version_info->dispatcher_resolver = resolver_decl;
30135 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30137 fn_ver_vec.create (2);
30139 for (versn_info = node_version_info->next; versn_info;
30140 versn_info = versn_info->next)
30142 versn = versn_info->this_node;
30143 /* Check for virtual functions here again, as by this time it should
30144 have been determined if this function needs a vtable index or
30145 not. This happens for methods in derived classes that override
30146 virtual methods in base classes but are not explicitly marked as
30147 virtual. */
30148 if (DECL_VINDEX (versn->symbol.decl))
30149 sorry ("Virtual function multiversioning not supported");
30151 fn_ver_vec.safe_push (versn->symbol.decl);
30154 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30155 fn_ver_vec.release ();
30156 rebuild_cgraph_edges ();
30157 pop_cfun ();
30158 return resolver_decl;
30160 /* This builds the processor_model struct type defined in
30161 libgcc/config/i386/cpuinfo.c */
30163 static tree
30164 build_processor_model_struct (void)
30166 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30167 "__cpu_features"};
30168 tree field = NULL_TREE, field_chain = NULL_TREE;
30169 int i;
30170 tree type = make_node (RECORD_TYPE);
30172 /* The first 3 fields are unsigned int. */
30173 for (i = 0; i < 3; ++i)
30175 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30176 get_identifier (field_name[i]), unsigned_type_node);
30177 if (field_chain != NULL_TREE)
30178 DECL_CHAIN (field) = field_chain;
30179 field_chain = field;
30182 /* The last field is an array of unsigned integers of size one. */
30183 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30184 get_identifier (field_name[3]),
30185 build_array_type (unsigned_type_node,
30186 build_index_type (size_one_node)));
30187 if (field_chain != NULL_TREE)
30188 DECL_CHAIN (field) = field_chain;
30189 field_chain = field;
30191 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30192 return type;
30195 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30197 static tree
30198 make_var_decl (tree type, const char *name)
30200 tree new_decl;
30202 new_decl = build_decl (UNKNOWN_LOCATION,
30203 VAR_DECL,
30204 get_identifier(name),
30205 type);
30207 DECL_EXTERNAL (new_decl) = 1;
30208 TREE_STATIC (new_decl) = 1;
30209 TREE_PUBLIC (new_decl) = 1;
30210 DECL_INITIAL (new_decl) = 0;
30211 DECL_ARTIFICIAL (new_decl) = 0;
30212 DECL_PRESERVE_P (new_decl) = 1;
30214 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30215 assemble_variable (new_decl, 0, 0, 0);
30217 return new_decl;
30220 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30221 into a check against the data defined in libgcc/config/i386/cpuinfo.c. */
30223 static tree
30224 fold_builtin_cpu (tree fndecl, tree *args)
30226 unsigned int i;
30227 enum ix86_builtins fn_code = (enum ix86_builtins)
30228 DECL_FUNCTION_CODE (fndecl);
30229 tree param_string_cst = NULL;
30231 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30232 enum processor_features
30234 F_CMOV = 0,
30235 F_MMX,
30236 F_POPCNT,
30237 F_SSE,
30238 F_SSE2,
30239 F_SSE3,
30240 F_SSSE3,
30241 F_SSE4_1,
30242 F_SSE4_2,
30243 F_AVX,
30244 F_AVX2,
30245 F_MAX
30248 /* These are the values for vendor types and cpu types and subtypes
30249 in cpuinfo.c. Cpu types and subtypes have the corresponding
30250 start value subtracted from them. */
30251 enum processor_model
30253 M_INTEL = 1,
30254 M_AMD,
30255 M_CPU_TYPE_START,
30256 M_INTEL_ATOM,
30257 M_INTEL_CORE2,
30258 M_INTEL_COREI7,
30259 M_AMDFAM10H,
30260 M_AMDFAM15H,
30261 M_INTEL_SLM,
30262 M_CPU_SUBTYPE_START,
30263 M_INTEL_COREI7_NEHALEM,
30264 M_INTEL_COREI7_WESTMERE,
30265 M_INTEL_COREI7_SANDYBRIDGE,
30266 M_AMDFAM10H_BARCELONA,
30267 M_AMDFAM10H_SHANGHAI,
30268 M_AMDFAM10H_ISTANBUL,
30269 M_AMDFAM15H_BDVER1,
30270 M_AMDFAM15H_BDVER2,
30271 M_AMDFAM15H_BDVER3
30274 static struct _arch_names_table
30276 const char *const name;
30277 const enum processor_model model;
30279 const arch_names_table[] =
30281 {"amd", M_AMD},
30282 {"intel", M_INTEL},
30283 {"atom", M_INTEL_ATOM},
30284 {"slm", M_INTEL_SLM},
30285 {"core2", M_INTEL_CORE2},
30286 {"corei7", M_INTEL_COREI7},
30287 {"nehalem", M_INTEL_COREI7_NEHALEM},
30288 {"westmere", M_INTEL_COREI7_WESTMERE},
30289 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30290 {"amdfam10h", M_AMDFAM10H},
30291 {"barcelona", M_AMDFAM10H_BARCELONA},
30292 {"shanghai", M_AMDFAM10H_SHANGHAI},
30293 {"istanbul", M_AMDFAM10H_ISTANBUL},
30294 {"amdfam15h", M_AMDFAM15H},
30295 {"bdver1", M_AMDFAM15H_BDVER1},
30296 {"bdver2", M_AMDFAM15H_BDVER2},
30297 {"bdver3", M_AMDFAM15H_BDVER3},
30300 static struct _isa_names_table
30302 const char *const name;
30303 const enum processor_features feature;
30305 const isa_names_table[] =
30307 {"cmov", F_CMOV},
30308 {"mmx", F_MMX},
30309 {"popcnt", F_POPCNT},
30310 {"sse", F_SSE},
30311 {"sse2", F_SSE2},
30312 {"sse3", F_SSE3},
30313 {"ssse3", F_SSSE3},
30314 {"sse4.1", F_SSE4_1},
30315 {"sse4.2", F_SSE4_2},
30316 {"avx", F_AVX},
30317 {"avx2", F_AVX2}
30320 tree __processor_model_type = build_processor_model_struct ();
30321 tree __cpu_model_var = make_var_decl (__processor_model_type,
30322 "__cpu_model");
30325 varpool_add_new_variable (__cpu_model_var);
30327 gcc_assert ((args != NULL) && (*args != NULL));
30329 param_string_cst = *args;
30330 while (param_string_cst
30331 && TREE_CODE (param_string_cst) != STRING_CST)
30333 /* *args must be an expr that can contain other EXPRS leading to a
30334 STRING_CST. */
30335 if (!EXPR_P (param_string_cst))
30337 error ("Parameter to builtin must be a string constant or literal");
30338 return integer_zero_node;
30340 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30343 gcc_assert (param_string_cst);
30345 if (fn_code == IX86_BUILTIN_CPU_IS)
30347 tree ref;
30348 tree field;
30349 tree final;
30351 unsigned int field_val = 0;
30352 unsigned int NUM_ARCH_NAMES
30353 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30355 for (i = 0; i < NUM_ARCH_NAMES; i++)
30356 if (strcmp (arch_names_table[i].name,
30357 TREE_STRING_POINTER (param_string_cst)) == 0)
30358 break;
30360 if (i == NUM_ARCH_NAMES)
30362 error ("Parameter to builtin not valid: %s",
30363 TREE_STRING_POINTER (param_string_cst));
30364 return integer_zero_node;
30367 field = TYPE_FIELDS (__processor_model_type);
30368 field_val = arch_names_table[i].model;
30370 /* CPU types are stored in the next field. */
30371 if (field_val > M_CPU_TYPE_START
30372 && field_val < M_CPU_SUBTYPE_START)
30374 field = DECL_CHAIN (field);
30375 field_val -= M_CPU_TYPE_START;
30378 /* CPU subtypes are stored in the next field. */
30379 if (field_val > M_CPU_SUBTYPE_START)
30381 field = DECL_CHAIN ( DECL_CHAIN (field));
30382 field_val -= M_CPU_SUBTYPE_START;
30385 /* Get the appropriate field in __cpu_model. */
30386 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30387 field, NULL_TREE);
30389 /* Check the value. */
30390 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30391 build_int_cstu (unsigned_type_node, field_val));
30392 return build1 (CONVERT_EXPR, integer_type_node, final);
30394 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30396 tree ref;
30397 tree array_elt;
30398 tree field;
30399 tree final;
30401 unsigned int field_val = 0;
30402 unsigned int NUM_ISA_NAMES
30403 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30405 for (i = 0; i < NUM_ISA_NAMES; i++)
30406 if (strcmp (isa_names_table[i].name,
30407 TREE_STRING_POINTER (param_string_cst)) == 0)
30408 break;
30410 if (i == NUM_ISA_NAMES)
30412 error ("Parameter to builtin not valid: %s",
30413 TREE_STRING_POINTER (param_string_cst));
30414 return integer_zero_node;
30417 field = TYPE_FIELDS (__processor_model_type);
30418 /* Get the last field, which is __cpu_features. */
30419 while (DECL_CHAIN (field))
30420 field = DECL_CHAIN (field);
30422 /* Get the appropriate field: __cpu_model.__cpu_features */
30423 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30424 field, NULL_TREE);
30426 /* Access the 0th element of __cpu_features array. */
30427 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30428 integer_zero_node, NULL_TREE, NULL_TREE);
30430 field_val = (1 << isa_names_table[i].feature);
30431 /* Return __cpu_model.__cpu_features[0] & field_val */
30432 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30433 build_int_cstu (unsigned_type_node, field_val));
30434 return build1 (CONVERT_EXPR, integer_type_node, final);
30436 gcc_unreachable ();
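/* Folding sketch derived from the code above:

     __builtin_cpu_is ("amd")        ->  __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2") ->  __cpu_model.__cpu_features[0]
                                           & (1 << F_AVX2)

   with the result converted back to int.  */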
30439 static tree
30440 ix86_fold_builtin (tree fndecl, int n_args,
30441 tree *args, bool ignore ATTRIBUTE_UNUSED)
30443 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30445 enum ix86_builtins fn_code = (enum ix86_builtins)
30446 DECL_FUNCTION_CODE (fndecl);
30447 if (fn_code == IX86_BUILTIN_CPU_IS
30448 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30450 gcc_assert (n_args == 1);
30451 return fold_builtin_cpu (fndecl, args);
30455 #ifdef SUBTARGET_FOLD_BUILTIN
30456 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30457 #endif
30459 return NULL_TREE;
30462 /* Make builtins to detect cpu type and features supported. NAME is
30463 the builtin name, CODE is the builtin code, and FTYPE is the function
30464 type of the builtin. */
30466 static void
30467 make_cpu_type_builtin (const char* name, int code,
30468 enum ix86_builtin_func_type ftype, bool is_const)
30470 tree decl;
30471 tree type;
30473 type = ix86_get_builtin_func_type (ftype);
30474 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30475 NULL, NULL_TREE);
30476 gcc_assert (decl != NULL_TREE);
30477 ix86_builtins[(int) code] = decl;
30478 TREE_READONLY (decl) = is_const;
30481 /* Make builtins to get CPU type and features supported. The created
30482 builtins are:
30484 __builtin_cpu_init (), to detect cpu type and features,
30485 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30486 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30489 static void
30490 ix86_init_platform_type_builtins (void)
30492 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30493 INT_FTYPE_VOID, false);
30494 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30495 INT_FTYPE_PCCHAR, true);
30496 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30497 INT_FTYPE_PCCHAR, true);
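/* Typical user-level usage of the three builtins registered above
   (a sketch; calling __builtin_cpu_init first ensures the CPU data is
   populated before it is queried):

     int
     use_avx_path (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_is ("corei7")
              && __builtin_cpu_supports ("avx");
     }
*/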
30500 /* Internal method for ix86_init_builtins. */
30502 static void
30503 ix86_init_builtins_va_builtins_abi (void)
30505 tree ms_va_ref, sysv_va_ref;
30506 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30507 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30508 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30509 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30511 if (!TARGET_64BIT)
30512 return;
30513 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30514 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30515 ms_va_ref = build_reference_type (ms_va_list_type_node);
30516 sysv_va_ref =
30517 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30519 fnvoid_va_end_ms =
30520 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30521 fnvoid_va_start_ms =
30522 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30523 fnvoid_va_end_sysv =
30524 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30525 fnvoid_va_start_sysv =
30526 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30527 NULL_TREE);
30528 fnvoid_va_copy_ms =
30529 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30530 NULL_TREE);
30531 fnvoid_va_copy_sysv =
30532 build_function_type_list (void_type_node, sysv_va_ref,
30533 sysv_va_ref, NULL_TREE);
30535 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30536 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30537 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30538 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30539 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30540 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30541 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30542 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30543 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30544 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30545 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30546 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
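/* A hedged sketch of how the builtins registered above surface in user
   code (illustrative only, not part of this file): on x86-64, a variadic
   function carrying the ms_abi attribute walks its arguments through the
   __builtin_ms_va_* entry points, e.g.

     int __attribute__ ((ms_abi))
     sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
*/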
30549 static void
30550 ix86_init_builtin_types (void)
30552 tree float128_type_node, float80_type_node;
30554 /* The __float80 type. */
30555 float80_type_node = long_double_type_node;
30556 if (TYPE_MODE (float80_type_node) != XFmode)
30558 /* long double is not the 80-bit XFmode type here; build a distinct type for __float80. */
30559 float80_type_node = make_node (REAL_TYPE);
30561 TYPE_PRECISION (float80_type_node) = 80;
30562 layout_type (float80_type_node);
30564 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30566 /* The __float128 type. */
30567 float128_type_node = make_node (REAL_TYPE);
30568 TYPE_PRECISION (float128_type_node) = 128;
30569 layout_type (float128_type_node);
30570 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30572 /* This macro is built by i386-builtin-types.awk. */
30573 DEFINE_BUILTIN_PRIMITIVE_TYPES;
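/* A usage sketch (not part of this file): once registered, the type names
   and the matching GCC literal suffixes are visible to user code on x86:

     __float128 q = 1.0Q;      TFmode, 128-bit quad precision
     __float80  e = 1.0W;      XFmode, 80-bit extended precision
*/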
30576 static void
30577 ix86_init_builtins (void)
30579 tree t;
30581 ix86_init_builtin_types ();
30583 /* Builtins to get CPU type and features. */
30584 ix86_init_platform_type_builtins ();
30586 /* TFmode support builtins. */
30587 def_builtin_const (0, "__builtin_infq",
30588 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30589 def_builtin_const (0, "__builtin_huge_valq",
30590 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30592 /* We will expand them to a normal call if SSE isn't available, since
30593 they are used by libgcc. */
30594 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30595 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30596 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30597 TREE_READONLY (t) = 1;
30598 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30600 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30601 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30602 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30603 TREE_READONLY (t) = 1;
30604 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30606 ix86_init_tm_builtins ();
30607 ix86_init_mmx_sse_builtins ();
30609 if (TARGET_LP64)
30610 ix86_init_builtins_va_builtins_abi ();
30612 #ifdef SUBTARGET_INIT_BUILTINS
30613 SUBTARGET_INIT_BUILTINS;
30614 #endif
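/* Illustrative use of the TFmode builtins registered above (a sketch,
   not part of this file); when SSE is unavailable they fall back to the
   __fabstf2 / __copysigntf3 library calls named above:

     __float128 x   = __builtin_fabsq (-2.0Q);
     __float128 y   = __builtin_copysignq (3.0Q, -1.0Q);
     __float128 inf = __builtin_infq ();
*/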
30617 /* Return the ix86 builtin for CODE. */
30619 static tree
30620 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30622 if (code >= IX86_BUILTIN_MAX)
30623 return error_mark_node;
30625 return ix86_builtins[code];
30628 /* Errors in the source file can cause expand_expr to return const0_rtx
30629 where we expect a vector. To avoid crashing, use one of the vector
30630 clear instructions. */
30631 static rtx
30632 safe_vector_operand (rtx x, enum machine_mode mode)
30634 if (x == const0_rtx)
30635 x = CONST0_RTX (mode);
30636 return x;
30639 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30641 static rtx
30642 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30644 rtx pat;
30645 tree arg0 = CALL_EXPR_ARG (exp, 0);
30646 tree arg1 = CALL_EXPR_ARG (exp, 1);
30647 rtx op0 = expand_normal (arg0);
30648 rtx op1 = expand_normal (arg1);
30649 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30650 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30651 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30653 if (VECTOR_MODE_P (mode0))
30654 op0 = safe_vector_operand (op0, mode0);
30655 if (VECTOR_MODE_P (mode1))
30656 op1 = safe_vector_operand (op1, mode1);
30658 if (optimize || !target
30659 || GET_MODE (target) != tmode
30660 || !insn_data[icode].operand[0].predicate (target, tmode))
30661 target = gen_reg_rtx (tmode);
30663 if (GET_MODE (op1) == SImode && mode1 == TImode)
30665 rtx x = gen_reg_rtx (V4SImode);
30666 emit_insn (gen_sse2_loadd (x, op1));
30667 op1 = gen_lowpart (TImode, x);
30670 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30671 op0 = copy_to_mode_reg (mode0, op0);
30672 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30673 op1 = copy_to_mode_reg (mode1, op1);
30675 pat = GEN_FCN (icode) (target, op0, op1);
30676 if (! pat)
30677 return 0;
30679 emit_insn (pat);
30681 return target;
30684 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30686 static rtx
30687 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30688 enum ix86_builtin_func_type m_type,
30689 enum rtx_code sub_code)
30691 rtx pat;
30692 int i;
30693 int nargs;
30694 bool comparison_p = false;
30695 bool tf_p = false;
30696 bool last_arg_constant = false;
30697 int num_memory = 0;
30698 struct {
30699 rtx op;
30700 enum machine_mode mode;
30701 } args[4];
30703 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30705 switch (m_type)
30707 case MULTI_ARG_4_DF2_DI_I:
30708 case MULTI_ARG_4_DF2_DI_I1:
30709 case MULTI_ARG_4_SF2_SI_I:
30710 case MULTI_ARG_4_SF2_SI_I1:
30711 nargs = 4;
30712 last_arg_constant = true;
30713 break;
30715 case MULTI_ARG_3_SF:
30716 case MULTI_ARG_3_DF:
30717 case MULTI_ARG_3_SF2:
30718 case MULTI_ARG_3_DF2:
30719 case MULTI_ARG_3_DI:
30720 case MULTI_ARG_3_SI:
30721 case MULTI_ARG_3_SI_DI:
30722 case MULTI_ARG_3_HI:
30723 case MULTI_ARG_3_HI_SI:
30724 case MULTI_ARG_3_QI:
30725 case MULTI_ARG_3_DI2:
30726 case MULTI_ARG_3_SI2:
30727 case MULTI_ARG_3_HI2:
30728 case MULTI_ARG_3_QI2:
30729 nargs = 3;
30730 break;
30732 case MULTI_ARG_2_SF:
30733 case MULTI_ARG_2_DF:
30734 case MULTI_ARG_2_DI:
30735 case MULTI_ARG_2_SI:
30736 case MULTI_ARG_2_HI:
30737 case MULTI_ARG_2_QI:
30738 nargs = 2;
30739 break;
30741 case MULTI_ARG_2_DI_IMM:
30742 case MULTI_ARG_2_SI_IMM:
30743 case MULTI_ARG_2_HI_IMM:
30744 case MULTI_ARG_2_QI_IMM:
30745 nargs = 2;
30746 last_arg_constant = true;
30747 break;
30749 case MULTI_ARG_1_SF:
30750 case MULTI_ARG_1_DF:
30751 case MULTI_ARG_1_SF2:
30752 case MULTI_ARG_1_DF2:
30753 case MULTI_ARG_1_DI:
30754 case MULTI_ARG_1_SI:
30755 case MULTI_ARG_1_HI:
30756 case MULTI_ARG_1_QI:
30757 case MULTI_ARG_1_SI_DI:
30758 case MULTI_ARG_1_HI_DI:
30759 case MULTI_ARG_1_HI_SI:
30760 case MULTI_ARG_1_QI_DI:
30761 case MULTI_ARG_1_QI_SI:
30762 case MULTI_ARG_1_QI_HI:
30763 nargs = 1;
30764 break;
30766 case MULTI_ARG_2_DI_CMP:
30767 case MULTI_ARG_2_SI_CMP:
30768 case MULTI_ARG_2_HI_CMP:
30769 case MULTI_ARG_2_QI_CMP:
30770 nargs = 2;
30771 comparison_p = true;
30772 break;
30774 case MULTI_ARG_2_SF_TF:
30775 case MULTI_ARG_2_DF_TF:
30776 case MULTI_ARG_2_DI_TF:
30777 case MULTI_ARG_2_SI_TF:
30778 case MULTI_ARG_2_HI_TF:
30779 case MULTI_ARG_2_QI_TF:
30780 nargs = 2;
30781 tf_p = true;
30782 break;
30784 default:
30785 gcc_unreachable ();
30788 if (optimize || !target
30789 || GET_MODE (target) != tmode
30790 || !insn_data[icode].operand[0].predicate (target, tmode))
30791 target = gen_reg_rtx (tmode);
30793 gcc_assert (nargs <= 4);
30795 for (i = 0; i < nargs; i++)
30797 tree arg = CALL_EXPR_ARG (exp, i);
30798 rtx op = expand_normal (arg);
30799 int adjust = (comparison_p) ? 1 : 0;
30800 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30802 if (last_arg_constant && i == nargs - 1)
30804 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30806 enum insn_code new_icode = icode;
30807 switch (icode)
30809 case CODE_FOR_xop_vpermil2v2df3:
30810 case CODE_FOR_xop_vpermil2v4sf3:
30811 case CODE_FOR_xop_vpermil2v4df3:
30812 case CODE_FOR_xop_vpermil2v8sf3:
30813 error ("the last argument must be a 2-bit immediate");
30814 return gen_reg_rtx (tmode);
30815 case CODE_FOR_xop_rotlv2di3:
30816 new_icode = CODE_FOR_rotlv2di3;
30817 goto xop_rotl;
30818 case CODE_FOR_xop_rotlv4si3:
30819 new_icode = CODE_FOR_rotlv4si3;
30820 goto xop_rotl;
30821 case CODE_FOR_xop_rotlv8hi3:
30822 new_icode = CODE_FOR_rotlv8hi3;
30823 goto xop_rotl;
30824 case CODE_FOR_xop_rotlv16qi3:
30825 new_icode = CODE_FOR_rotlv16qi3;
30826 xop_rotl:
30827 if (CONST_INT_P (op))
30829 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30830 op = GEN_INT (INTVAL (op) & mask);
30831 gcc_checking_assert
30832 (insn_data[icode].operand[i + 1].predicate (op, mode));
30834 else
30836 gcc_checking_assert
30837 (nargs == 2
30838 && insn_data[new_icode].operand[0].mode == tmode
30839 && insn_data[new_icode].operand[1].mode == tmode
30840 && insn_data[new_icode].operand[2].mode == mode
30841 && insn_data[new_icode].operand[0].predicate
30842 == insn_data[icode].operand[0].predicate
30843 && insn_data[new_icode].operand[1].predicate
30844 == insn_data[icode].operand[1].predicate);
30845 icode = new_icode;
30846 goto non_constant;
30848 break;
30849 default:
30850 gcc_unreachable ();
30854 else
30856 non_constant:
30857 if (VECTOR_MODE_P (mode))
30858 op = safe_vector_operand (op, mode);
30860 /* If we aren't optimizing, only allow one memory operand to be
30861 generated. */
30862 if (memory_operand (op, mode))
30863 num_memory++;
30865 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30867 if (optimize
30868 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30869 || num_memory > 1)
30870 op = force_reg (mode, op);
30873 args[i].op = op;
30874 args[i].mode = mode;
30877 switch (nargs)
30879 case 1:
30880 pat = GEN_FCN (icode) (target, args[0].op);
30881 break;
30883 case 2:
30884 if (tf_p)
30885 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30886 GEN_INT ((int)sub_code));
30887 else if (! comparison_p)
30888 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30889 else
30891 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30892 args[0].op,
30893 args[1].op);
30895 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30897 break;
30899 case 3:
30900 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30901 break;
30903 case 4:
30904 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30905 break;
30907 default:
30908 gcc_unreachable ();
30911 if (! pat)
30912 return 0;
30914 emit_insn (pat);
30915 return target;
30918 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30919 insns with vec_merge. */
30921 static rtx
30922 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30923 rtx target)
30925 rtx pat;
30926 tree arg0 = CALL_EXPR_ARG (exp, 0);
30927 rtx op1, op0 = expand_normal (arg0);
30928 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30929 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30931 if (optimize || !target
30932 || GET_MODE (target) != tmode
30933 || !insn_data[icode].operand[0].predicate (target, tmode))
30934 target = gen_reg_rtx (tmode);
30936 if (VECTOR_MODE_P (mode0))
30937 op0 = safe_vector_operand (op0, mode0);
30939 if ((optimize && !register_operand (op0, mode0))
30940 || !insn_data[icode].operand[1].predicate (op0, mode0))
30941 op0 = copy_to_mode_reg (mode0, op0);
30943 op1 = op0;
30944 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30945 op1 = copy_to_mode_reg (mode0, op1);
30947 pat = GEN_FCN (icode) (target, op0, op1);
30948 if (! pat)
30949 return 0;
30950 emit_insn (pat);
30951 return target;
30954 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30956 static rtx
30957 ix86_expand_sse_compare (const struct builtin_description *d,
30958 tree exp, rtx target, bool swap)
30960 rtx pat;
30961 tree arg0 = CALL_EXPR_ARG (exp, 0);
30962 tree arg1 = CALL_EXPR_ARG (exp, 1);
30963 rtx op0 = expand_normal (arg0);
30964 rtx op1 = expand_normal (arg1);
30965 rtx op2;
30966 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30967 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30968 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30969 enum rtx_code comparison = d->comparison;
30971 if (VECTOR_MODE_P (mode0))
30972 op0 = safe_vector_operand (op0, mode0);
30973 if (VECTOR_MODE_P (mode1))
30974 op1 = safe_vector_operand (op1, mode1);
30976 /* Swap operands if we have a comparison that isn't available in
30977 hardware. */
30978 if (swap)
30980 rtx tmp = gen_reg_rtx (mode1);
30981 emit_move_insn (tmp, op1);
30982 op1 = op0;
30983 op0 = tmp;
30986 if (optimize || !target
30987 || GET_MODE (target) != tmode
30988 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30989 target = gen_reg_rtx (tmode);
30991 if ((optimize && !register_operand (op0, mode0))
30992 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30993 op0 = copy_to_mode_reg (mode0, op0);
30994 if ((optimize && !register_operand (op1, mode1))
30995 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30996 op1 = copy_to_mode_reg (mode1, op1);
30998 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30999 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31000 if (! pat)
31001 return 0;
31002 emit_insn (pat);
31003 return target;
31006 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31008 static rtx
31009 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31010 rtx target)
31012 rtx pat;
31013 tree arg0 = CALL_EXPR_ARG (exp, 0);
31014 tree arg1 = CALL_EXPR_ARG (exp, 1);
31015 rtx op0 = expand_normal (arg0);
31016 rtx op1 = expand_normal (arg1);
31017 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31018 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31019 enum rtx_code comparison = d->comparison;
31021 if (VECTOR_MODE_P (mode0))
31022 op0 = safe_vector_operand (op0, mode0);
31023 if (VECTOR_MODE_P (mode1))
31024 op1 = safe_vector_operand (op1, mode1);
31026 /* Swap operands if we have a comparison that isn't available in
31027 hardware. */
31028 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31030 rtx tmp = op1;
31031 op1 = op0;
31032 op0 = tmp;
31035 target = gen_reg_rtx (SImode);
31036 emit_move_insn (target, const0_rtx);
31037 target = gen_rtx_SUBREG (QImode, target, 0);
31039 if ((optimize && !register_operand (op0, mode0))
31040 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31041 op0 = copy_to_mode_reg (mode0, op0);
31042 if ((optimize && !register_operand (op1, mode1))
31043 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31044 op1 = copy_to_mode_reg (mode1, op1);
31046 pat = GEN_FCN (d->icode) (op0, op1);
31047 if (! pat)
31048 return 0;
31049 emit_insn (pat);
31050 emit_insn (gen_rtx_SET (VOIDmode,
31051 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31052 gen_rtx_fmt_ee (comparison, QImode,
31053 SET_DEST (pat),
31054 const0_rtx)));
31056 return SUBREG_REG (target);
31059 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31061 static rtx
31062 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31063 rtx target)
31065 rtx pat;
31066 tree arg0 = CALL_EXPR_ARG (exp, 0);
31067 rtx op1, op0 = expand_normal (arg0);
31068 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31069 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31071 if (optimize || target == 0
31072 || GET_MODE (target) != tmode
31073 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31074 target = gen_reg_rtx (tmode);
31076 if (VECTOR_MODE_P (mode0))
31077 op0 = safe_vector_operand (op0, mode0);
31079 if ((optimize && !register_operand (op0, mode0))
31080 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31081 op0 = copy_to_mode_reg (mode0, op0);
31083 op1 = GEN_INT (d->comparison);
31085 pat = GEN_FCN (d->icode) (target, op0, op1);
31086 if (! pat)
31087 return 0;
31088 emit_insn (pat);
31089 return target;
31092 static rtx
31093 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31094 tree exp, rtx target)
31096 rtx pat;
31097 tree arg0 = CALL_EXPR_ARG (exp, 0);
31098 tree arg1 = CALL_EXPR_ARG (exp, 1);
31099 rtx op0 = expand_normal (arg0);
31100 rtx op1 = expand_normal (arg1);
31101 rtx op2;
31102 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31103 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31104 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31106 if (optimize || target == 0
31107 || GET_MODE (target) != tmode
31108 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31109 target = gen_reg_rtx (tmode);
31111 op0 = safe_vector_operand (op0, mode0);
31112 op1 = safe_vector_operand (op1, mode1);
31114 if ((optimize && !register_operand (op0, mode0))
31115 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31116 op0 = copy_to_mode_reg (mode0, op0);
31117 if ((optimize && !register_operand (op1, mode1))
31118 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31119 op1 = copy_to_mode_reg (mode1, op1);
31121 op2 = GEN_INT (d->comparison);
31123 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31124 if (! pat)
31125 return 0;
31126 emit_insn (pat);
31127 return target;
31130 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31132 static rtx
31133 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31134 rtx target)
31136 rtx pat;
31137 tree arg0 = CALL_EXPR_ARG (exp, 0);
31138 tree arg1 = CALL_EXPR_ARG (exp, 1);
31139 rtx op0 = expand_normal (arg0);
31140 rtx op1 = expand_normal (arg1);
31141 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31142 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31143 enum rtx_code comparison = d->comparison;
31145 if (VECTOR_MODE_P (mode0))
31146 op0 = safe_vector_operand (op0, mode0);
31147 if (VECTOR_MODE_P (mode1))
31148 op1 = safe_vector_operand (op1, mode1);
31150 target = gen_reg_rtx (SImode);
31151 emit_move_insn (target, const0_rtx);
31152 target = gen_rtx_SUBREG (QImode, target, 0);
31154 if ((optimize && !register_operand (op0, mode0))
31155 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31156 op0 = copy_to_mode_reg (mode0, op0);
31157 if ((optimize && !register_operand (op1, mode1))
31158 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31159 op1 = copy_to_mode_reg (mode1, op1);
31161 pat = GEN_FCN (d->icode) (op0, op1);
31162 if (! pat)
31163 return 0;
31164 emit_insn (pat);
31165 emit_insn (gen_rtx_SET (VOIDmode,
31166 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31167 gen_rtx_fmt_ee (comparison, QImode,
31168 SET_DEST (pat),
31169 const0_rtx)));
31171 return SUBREG_REG (target);
31174 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31176 static rtx
31177 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31178 tree exp, rtx target)
31180 rtx pat;
31181 tree arg0 = CALL_EXPR_ARG (exp, 0);
31182 tree arg1 = CALL_EXPR_ARG (exp, 1);
31183 tree arg2 = CALL_EXPR_ARG (exp, 2);
31184 tree arg3 = CALL_EXPR_ARG (exp, 3);
31185 tree arg4 = CALL_EXPR_ARG (exp, 4);
31186 rtx scratch0, scratch1;
31187 rtx op0 = expand_normal (arg0);
31188 rtx op1 = expand_normal (arg1);
31189 rtx op2 = expand_normal (arg2);
31190 rtx op3 = expand_normal (arg3);
31191 rtx op4 = expand_normal (arg4);
31192 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31194 tmode0 = insn_data[d->icode].operand[0].mode;
31195 tmode1 = insn_data[d->icode].operand[1].mode;
31196 modev2 = insn_data[d->icode].operand[2].mode;
31197 modei3 = insn_data[d->icode].operand[3].mode;
31198 modev4 = insn_data[d->icode].operand[4].mode;
31199 modei5 = insn_data[d->icode].operand[5].mode;
31200 modeimm = insn_data[d->icode].operand[6].mode;
31202 if (VECTOR_MODE_P (modev2))
31203 op0 = safe_vector_operand (op0, modev2);
31204 if (VECTOR_MODE_P (modev4))
31205 op2 = safe_vector_operand (op2, modev4);
31207 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31208 op0 = copy_to_mode_reg (modev2, op0);
31209 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31210 op1 = copy_to_mode_reg (modei3, op1);
31211 if ((optimize && !register_operand (op2, modev4))
31212 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31213 op2 = copy_to_mode_reg (modev4, op2);
31214 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31215 op3 = copy_to_mode_reg (modei5, op3);
31217 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31219 error ("the fifth argument must be an 8-bit immediate");
31220 return const0_rtx;
31223 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31225 if (optimize || !target
31226 || GET_MODE (target) != tmode0
31227 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31228 target = gen_reg_rtx (tmode0);
31230 scratch1 = gen_reg_rtx (tmode1);
31232 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31234 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31236 if (optimize || !target
31237 || GET_MODE (target) != tmode1
31238 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31239 target = gen_reg_rtx (tmode1);
31241 scratch0 = gen_reg_rtx (tmode0);
31243 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31245 else
31247 gcc_assert (d->flag);
31249 scratch0 = gen_reg_rtx (tmode0);
31250 scratch1 = gen_reg_rtx (tmode1);
31252 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31255 if (! pat)
31256 return 0;
31258 emit_insn (pat);
31260 if (d->flag)
31262 target = gen_reg_rtx (SImode);
31263 emit_move_insn (target, const0_rtx);
31264 target = gen_rtx_SUBREG (QImode, target, 0);
31266 emit_insn
31267 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31268 gen_rtx_fmt_ee (EQ, QImode,
31269 gen_rtx_REG ((enum machine_mode) d->flag,
31270 FLAGS_REG),
31271 const0_rtx)));
31272 return SUBREG_REG (target);
31274 else
31275 return target;
31279 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31281 static rtx
31282 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31283 tree exp, rtx target)
31285 rtx pat;
31286 tree arg0 = CALL_EXPR_ARG (exp, 0);
31287 tree arg1 = CALL_EXPR_ARG (exp, 1);
31288 tree arg2 = CALL_EXPR_ARG (exp, 2);
31289 rtx scratch0, scratch1;
31290 rtx op0 = expand_normal (arg0);
31291 rtx op1 = expand_normal (arg1);
31292 rtx op2 = expand_normal (arg2);
31293 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31295 tmode0 = insn_data[d->icode].operand[0].mode;
31296 tmode1 = insn_data[d->icode].operand[1].mode;
31297 modev2 = insn_data[d->icode].operand[2].mode;
31298 modev3 = insn_data[d->icode].operand[3].mode;
31299 modeimm = insn_data[d->icode].operand[4].mode;
31301 if (VECTOR_MODE_P (modev2))
31302 op0 = safe_vector_operand (op0, modev2);
31303 if (VECTOR_MODE_P (modev3))
31304 op1 = safe_vector_operand (op1, modev3);
31306 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31307 op0 = copy_to_mode_reg (modev2, op0);
31308 if ((optimize && !register_operand (op1, modev3))
31309 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31310 op1 = copy_to_mode_reg (modev3, op1);
31312 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31314 error ("the third argument must be an 8-bit immediate");
31315 return const0_rtx;
31318 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31320 if (optimize || !target
31321 || GET_MODE (target) != tmode0
31322 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31323 target = gen_reg_rtx (tmode0);
31325 scratch1 = gen_reg_rtx (tmode1);
31327 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31329 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31331 if (optimize || !target
31332 || GET_MODE (target) != tmode1
31333 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31334 target = gen_reg_rtx (tmode1);
31336 scratch0 = gen_reg_rtx (tmode0);
31338 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31340 else
31342 gcc_assert (d->flag);
31344 scratch0 = gen_reg_rtx (tmode0);
31345 scratch1 = gen_reg_rtx (tmode1);
31347 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31350 if (! pat)
31351 return 0;
31353 emit_insn (pat);
31355 if (d->flag)
31357 target = gen_reg_rtx (SImode);
31358 emit_move_insn (target, const0_rtx);
31359 target = gen_rtx_SUBREG (QImode, target, 0);
31361 emit_insn
31362 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31363 gen_rtx_fmt_ee (EQ, QImode,
31364 gen_rtx_REG ((enum machine_mode) d->flag,
31365 FLAGS_REG),
31366 const0_rtx)));
31367 return SUBREG_REG (target);
31369 else
31370 return target;
31373 /* Subroutine of ix86_expand_builtin to take care of insns with
31374 variable number of operands. */
31376 static rtx
31377 ix86_expand_args_builtin (const struct builtin_description *d,
31378 tree exp, rtx target)
31380 rtx pat, real_target;
31381 unsigned int i, nargs;
31382 unsigned int nargs_constant = 0;
31383 int num_memory = 0;
31384 struct
31386 rtx op;
31387 enum machine_mode mode;
31388 } args[4];
31389 bool last_arg_count = false;
31390 enum insn_code icode = d->icode;
31391 const struct insn_data_d *insn_p = &insn_data[icode];
31392 enum machine_mode tmode = insn_p->operand[0].mode;
31393 enum machine_mode rmode = VOIDmode;
31394 bool swap = false;
31395 enum rtx_code comparison = d->comparison;
31397 switch ((enum ix86_builtin_func_type) d->flag)
31399 case V2DF_FTYPE_V2DF_ROUND:
31400 case V4DF_FTYPE_V4DF_ROUND:
31401 case V4SF_FTYPE_V4SF_ROUND:
31402 case V8SF_FTYPE_V8SF_ROUND:
31403 case V4SI_FTYPE_V4SF_ROUND:
31404 case V8SI_FTYPE_V8SF_ROUND:
31405 return ix86_expand_sse_round (d, exp, target);
31406 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31407 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31408 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31409 case INT_FTYPE_V8SF_V8SF_PTEST:
31410 case INT_FTYPE_V4DI_V4DI_PTEST:
31411 case INT_FTYPE_V4DF_V4DF_PTEST:
31412 case INT_FTYPE_V4SF_V4SF_PTEST:
31413 case INT_FTYPE_V2DI_V2DI_PTEST:
31414 case INT_FTYPE_V2DF_V2DF_PTEST:
31415 return ix86_expand_sse_ptest (d, exp, target);
31416 case FLOAT128_FTYPE_FLOAT128:
31417 case FLOAT_FTYPE_FLOAT:
31418 case INT_FTYPE_INT:
31419 case UINT64_FTYPE_INT:
31420 case UINT16_FTYPE_UINT16:
31421 case INT64_FTYPE_INT64:
31422 case INT64_FTYPE_V4SF:
31423 case INT64_FTYPE_V2DF:
31424 case INT_FTYPE_V16QI:
31425 case INT_FTYPE_V8QI:
31426 case INT_FTYPE_V8SF:
31427 case INT_FTYPE_V4DF:
31428 case INT_FTYPE_V4SF:
31429 case INT_FTYPE_V2DF:
31430 case INT_FTYPE_V32QI:
31431 case V16QI_FTYPE_V16QI:
31432 case V8SI_FTYPE_V8SF:
31433 case V8SI_FTYPE_V4SI:
31434 case V8HI_FTYPE_V8HI:
31435 case V8HI_FTYPE_V16QI:
31436 case V8QI_FTYPE_V8QI:
31437 case V8SF_FTYPE_V8SF:
31438 case V8SF_FTYPE_V8SI:
31439 case V8SF_FTYPE_V4SF:
31440 case V8SF_FTYPE_V8HI:
31441 case V4SI_FTYPE_V4SI:
31442 case V4SI_FTYPE_V16QI:
31443 case V4SI_FTYPE_V4SF:
31444 case V4SI_FTYPE_V8SI:
31445 case V4SI_FTYPE_V8HI:
31446 case V4SI_FTYPE_V4DF:
31447 case V4SI_FTYPE_V2DF:
31448 case V4HI_FTYPE_V4HI:
31449 case V4DF_FTYPE_V4DF:
31450 case V4DF_FTYPE_V4SI:
31451 case V4DF_FTYPE_V4SF:
31452 case V4DF_FTYPE_V2DF:
31453 case V4SF_FTYPE_V4SF:
31454 case V4SF_FTYPE_V4SI:
31455 case V4SF_FTYPE_V8SF:
31456 case V4SF_FTYPE_V4DF:
31457 case V4SF_FTYPE_V8HI:
31458 case V4SF_FTYPE_V2DF:
31459 case V2DI_FTYPE_V2DI:
31460 case V2DI_FTYPE_V16QI:
31461 case V2DI_FTYPE_V8HI:
31462 case V2DI_FTYPE_V4SI:
31463 case V2DF_FTYPE_V2DF:
31464 case V2DF_FTYPE_V4SI:
31465 case V2DF_FTYPE_V4DF:
31466 case V2DF_FTYPE_V4SF:
31467 case V2DF_FTYPE_V2SI:
31468 case V2SI_FTYPE_V2SI:
31469 case V2SI_FTYPE_V4SF:
31470 case V2SI_FTYPE_V2SF:
31471 case V2SI_FTYPE_V2DF:
31472 case V2SF_FTYPE_V2SF:
31473 case V2SF_FTYPE_V2SI:
31474 case V32QI_FTYPE_V32QI:
31475 case V32QI_FTYPE_V16QI:
31476 case V16HI_FTYPE_V16HI:
31477 case V16HI_FTYPE_V8HI:
31478 case V8SI_FTYPE_V8SI:
31479 case V16HI_FTYPE_V16QI:
31480 case V8SI_FTYPE_V16QI:
31481 case V4DI_FTYPE_V16QI:
31482 case V8SI_FTYPE_V8HI:
31483 case V4DI_FTYPE_V8HI:
31484 case V4DI_FTYPE_V4SI:
31485 case V4DI_FTYPE_V2DI:
31486 nargs = 1;
31487 break;
31488 case V4SF_FTYPE_V4SF_VEC_MERGE:
31489 case V2DF_FTYPE_V2DF_VEC_MERGE:
31490 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31491 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31492 case V16QI_FTYPE_V16QI_V16QI:
31493 case V16QI_FTYPE_V8HI_V8HI:
31494 case V8QI_FTYPE_V8QI_V8QI:
31495 case V8QI_FTYPE_V4HI_V4HI:
31496 case V8HI_FTYPE_V8HI_V8HI:
31497 case V8HI_FTYPE_V16QI_V16QI:
31498 case V8HI_FTYPE_V4SI_V4SI:
31499 case V8SF_FTYPE_V8SF_V8SF:
31500 case V8SF_FTYPE_V8SF_V8SI:
31501 case V4SI_FTYPE_V4SI_V4SI:
31502 case V4SI_FTYPE_V8HI_V8HI:
31503 case V4SI_FTYPE_V4SF_V4SF:
31504 case V4SI_FTYPE_V2DF_V2DF:
31505 case V4HI_FTYPE_V4HI_V4HI:
31506 case V4HI_FTYPE_V8QI_V8QI:
31507 case V4HI_FTYPE_V2SI_V2SI:
31508 case V4DF_FTYPE_V4DF_V4DF:
31509 case V4DF_FTYPE_V4DF_V4DI:
31510 case V4SF_FTYPE_V4SF_V4SF:
31511 case V4SF_FTYPE_V4SF_V4SI:
31512 case V4SF_FTYPE_V4SF_V2SI:
31513 case V4SF_FTYPE_V4SF_V2DF:
31514 case V4SF_FTYPE_V4SF_DI:
31515 case V4SF_FTYPE_V4SF_SI:
31516 case V2DI_FTYPE_V2DI_V2DI:
31517 case V2DI_FTYPE_V16QI_V16QI:
31518 case V2DI_FTYPE_V4SI_V4SI:
31519 case V2UDI_FTYPE_V4USI_V4USI:
31520 case V2DI_FTYPE_V2DI_V16QI:
31521 case V2DI_FTYPE_V2DF_V2DF:
31522 case V2SI_FTYPE_V2SI_V2SI:
31523 case V2SI_FTYPE_V4HI_V4HI:
31524 case V2SI_FTYPE_V2SF_V2SF:
31525 case V2DF_FTYPE_V2DF_V2DF:
31526 case V2DF_FTYPE_V2DF_V4SF:
31527 case V2DF_FTYPE_V2DF_V2DI:
31528 case V2DF_FTYPE_V2DF_DI:
31529 case V2DF_FTYPE_V2DF_SI:
31530 case V2SF_FTYPE_V2SF_V2SF:
31531 case V1DI_FTYPE_V1DI_V1DI:
31532 case V1DI_FTYPE_V8QI_V8QI:
31533 case V1DI_FTYPE_V2SI_V2SI:
31534 case V32QI_FTYPE_V16HI_V16HI:
31535 case V16HI_FTYPE_V8SI_V8SI:
31536 case V32QI_FTYPE_V32QI_V32QI:
31537 case V16HI_FTYPE_V32QI_V32QI:
31538 case V16HI_FTYPE_V16HI_V16HI:
31539 case V8SI_FTYPE_V4DF_V4DF:
31540 case V8SI_FTYPE_V8SI_V8SI:
31541 case V8SI_FTYPE_V16HI_V16HI:
31542 case V4DI_FTYPE_V4DI_V4DI:
31543 case V4DI_FTYPE_V8SI_V8SI:
31544 case V4UDI_FTYPE_V8USI_V8USI:
31545 if (comparison == UNKNOWN)
31546 return ix86_expand_binop_builtin (icode, exp, target);
31547 nargs = 2;
31548 break;
31549 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31550 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31551 gcc_assert (comparison != UNKNOWN);
31552 nargs = 2;
31553 swap = true;
31554 break;
31555 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31556 case V16HI_FTYPE_V16HI_SI_COUNT:
31557 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31558 case V8SI_FTYPE_V8SI_SI_COUNT:
31559 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31560 case V4DI_FTYPE_V4DI_INT_COUNT:
31561 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31562 case V8HI_FTYPE_V8HI_SI_COUNT:
31563 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31564 case V4SI_FTYPE_V4SI_SI_COUNT:
31565 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31566 case V4HI_FTYPE_V4HI_SI_COUNT:
31567 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31568 case V2DI_FTYPE_V2DI_SI_COUNT:
31569 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31570 case V2SI_FTYPE_V2SI_SI_COUNT:
31571 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31572 case V1DI_FTYPE_V1DI_SI_COUNT:
31573 nargs = 2;
31574 last_arg_count = true;
31575 break;
31576 case UINT64_FTYPE_UINT64_UINT64:
31577 case UINT_FTYPE_UINT_UINT:
31578 case UINT_FTYPE_UINT_USHORT:
31579 case UINT_FTYPE_UINT_UCHAR:
31580 case UINT16_FTYPE_UINT16_INT:
31581 case UINT8_FTYPE_UINT8_INT:
31582 nargs = 2;
31583 break;
31584 case V2DI_FTYPE_V2DI_INT_CONVERT:
31585 nargs = 2;
31586 rmode = V1TImode;
31587 nargs_constant = 1;
31588 break;
31589 case V4DI_FTYPE_V4DI_INT_CONVERT:
31590 nargs = 2;
31591 rmode = V2TImode;
31592 nargs_constant = 1;
31593 break;
31594 case V8HI_FTYPE_V8HI_INT:
31595 case V8HI_FTYPE_V8SF_INT:
31596 case V8HI_FTYPE_V4SF_INT:
31597 case V8SF_FTYPE_V8SF_INT:
31598 case V4SI_FTYPE_V4SI_INT:
31599 case V4SI_FTYPE_V8SI_INT:
31600 case V4HI_FTYPE_V4HI_INT:
31601 case V4DF_FTYPE_V4DF_INT:
31602 case V4SF_FTYPE_V4SF_INT:
31603 case V4SF_FTYPE_V8SF_INT:
31604 case V2DI_FTYPE_V2DI_INT:
31605 case V2DF_FTYPE_V2DF_INT:
31606 case V2DF_FTYPE_V4DF_INT:
31607 case V16HI_FTYPE_V16HI_INT:
31608 case V8SI_FTYPE_V8SI_INT:
31609 case V4DI_FTYPE_V4DI_INT:
31610 case V2DI_FTYPE_V4DI_INT:
31611 nargs = 2;
31612 nargs_constant = 1;
31613 break;
31614 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31615 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31616 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31617 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31618 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31619 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31620 nargs = 3;
31621 break;
31622 case V32QI_FTYPE_V32QI_V32QI_INT:
31623 case V16HI_FTYPE_V16HI_V16HI_INT:
31624 case V16QI_FTYPE_V16QI_V16QI_INT:
31625 case V4DI_FTYPE_V4DI_V4DI_INT:
31626 case V8HI_FTYPE_V8HI_V8HI_INT:
31627 case V8SI_FTYPE_V8SI_V8SI_INT:
31628 case V8SI_FTYPE_V8SI_V4SI_INT:
31629 case V8SF_FTYPE_V8SF_V8SF_INT:
31630 case V8SF_FTYPE_V8SF_V4SF_INT:
31631 case V4SI_FTYPE_V4SI_V4SI_INT:
31632 case V4DF_FTYPE_V4DF_V4DF_INT:
31633 case V4DF_FTYPE_V4DF_V2DF_INT:
31634 case V4SF_FTYPE_V4SF_V4SF_INT:
31635 case V2DI_FTYPE_V2DI_V2DI_INT:
31636 case V4DI_FTYPE_V4DI_V2DI_INT:
31637 case V2DF_FTYPE_V2DF_V2DF_INT:
31638 nargs = 3;
31639 nargs_constant = 1;
31640 break;
31641 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31642 nargs = 3;
31643 rmode = V4DImode;
31644 nargs_constant = 1;
31645 break;
31646 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31647 nargs = 3;
31648 rmode = V2DImode;
31649 nargs_constant = 1;
31650 break;
31651 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31652 nargs = 3;
31653 rmode = DImode;
31654 nargs_constant = 1;
31655 break;
31656 case V2DI_FTYPE_V2DI_UINT_UINT:
31657 nargs = 3;
31658 nargs_constant = 2;
31659 break;
31660 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31661 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31662 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31663 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31664 nargs = 4;
31665 nargs_constant = 1;
31666 break;
31667 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31668 nargs = 4;
31669 nargs_constant = 2;
31670 break;
31671 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31672 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31673 nargs = 4;
31674 break;
31675 default:
31676 gcc_unreachable ();
31679 gcc_assert (nargs <= ARRAY_SIZE (args));
31681 if (comparison != UNKNOWN)
31683 gcc_assert (nargs == 2);
31684 return ix86_expand_sse_compare (d, exp, target, swap);
31687 if (rmode == VOIDmode || rmode == tmode)
31689 if (optimize
31690 || target == 0
31691 || GET_MODE (target) != tmode
31692 || !insn_p->operand[0].predicate (target, tmode))
31693 target = gen_reg_rtx (tmode);
31694 real_target = target;
31696 else
31698 target = gen_reg_rtx (rmode);
31699 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31702 for (i = 0; i < nargs; i++)
31704 tree arg = CALL_EXPR_ARG (exp, i);
31705 rtx op = expand_normal (arg);
31706 enum machine_mode mode = insn_p->operand[i + 1].mode;
31707 bool match = insn_p->operand[i + 1].predicate (op, mode);
31709 if (last_arg_count && (i + 1) == nargs)
31711 /* SIMD shift insns take either an 8-bit immediate or a
31712 register as the count. But the builtin functions take an int as
31713 the count. If the count doesn't match, we put it in a register. */
31714 if (!match)
31716 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31717 if (!insn_p->operand[i + 1].predicate (op, mode))
31718 op = copy_to_reg (op);
31721 else if ((nargs - i) <= nargs_constant)
31723 if (!match)
31724 switch (icode)
31726 case CODE_FOR_avx2_inserti128:
31727 case CODE_FOR_avx2_extracti128:
31728 error ("the last argument must be a 1-bit immediate");
31729 return const0_rtx;
31731 case CODE_FOR_sse4_1_roundsd:
31732 case CODE_FOR_sse4_1_roundss:
31734 case CODE_FOR_sse4_1_roundpd:
31735 case CODE_FOR_sse4_1_roundps:
31736 case CODE_FOR_avx_roundpd256:
31737 case CODE_FOR_avx_roundps256:
31739 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31740 case CODE_FOR_sse4_1_roundps_sfix:
31741 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31742 case CODE_FOR_avx_roundps_sfix256:
31744 case CODE_FOR_sse4_1_blendps:
31745 case CODE_FOR_avx_blendpd256:
31746 case CODE_FOR_avx_vpermilv4df:
31747 error ("the last argument must be a 4-bit immediate");
31748 return const0_rtx;
31750 case CODE_FOR_sse4_1_blendpd:
31751 case CODE_FOR_avx_vpermilv2df:
31752 case CODE_FOR_xop_vpermil2v2df3:
31753 case CODE_FOR_xop_vpermil2v4sf3:
31754 case CODE_FOR_xop_vpermil2v4df3:
31755 case CODE_FOR_xop_vpermil2v8sf3:
31756 error ("the last argument must be a 2-bit immediate");
31757 return const0_rtx;
31759 case CODE_FOR_avx_vextractf128v4df:
31760 case CODE_FOR_avx_vextractf128v8sf:
31761 case CODE_FOR_avx_vextractf128v8si:
31762 case CODE_FOR_avx_vinsertf128v4df:
31763 case CODE_FOR_avx_vinsertf128v8sf:
31764 case CODE_FOR_avx_vinsertf128v8si:
31765 error ("the last argument must be a 1-bit immediate");
31766 return const0_rtx;
31768 case CODE_FOR_avx_vmcmpv2df3:
31769 case CODE_FOR_avx_vmcmpv4sf3:
31770 case CODE_FOR_avx_cmpv2df3:
31771 case CODE_FOR_avx_cmpv4sf3:
31772 case CODE_FOR_avx_cmpv4df3:
31773 case CODE_FOR_avx_cmpv8sf3:
31774 error ("the last argument must be a 5-bit immediate");
31775 return const0_rtx;
31777 default:
31778 switch (nargs_constant)
31780 case 2:
31781 if ((nargs - i) == nargs_constant)
31783 error ("the next to last argument must be an 8-bit immediate");
31784 break;
31786 case 1:
31787 error ("the last argument must be an 8-bit immediate");
31788 break;
31789 default:
31790 gcc_unreachable ();
31792 return const0_rtx;
31795 else
31797 if (VECTOR_MODE_P (mode))
31798 op = safe_vector_operand (op, mode);
31800 /* If we aren't optimizing, only allow one memory operand to
31801 be generated. */
31802 if (memory_operand (op, mode))
31803 num_memory++;
31805 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31807 if (optimize || !match || num_memory > 1)
31808 op = copy_to_mode_reg (mode, op);
31810 else
31812 op = copy_to_reg (op);
31813 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31817 args[i].op = op;
31818 args[i].mode = mode;
31821 switch (nargs)
31823 case 1:
31824 pat = GEN_FCN (icode) (real_target, args[0].op);
31825 break;
31826 case 2:
31827 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31828 break;
31829 case 3:
31830 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31831 args[2].op);
31832 break;
31833 case 4:
31834 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31835 args[2].op, args[3].op);
31836 break;
31837 default:
31838 gcc_unreachable ();
31841 if (! pat)
31842 return 0;
31844 emit_insn (pat);
31845 return target;
31848 /* Subroutine of ix86_expand_builtin to take care of special insns
31849 with variable number of operands. */
31851 static rtx
31852 ix86_expand_special_args_builtin (const struct builtin_description *d,
31853 tree exp, rtx target)
31855 tree arg;
31856 rtx pat, op;
31857 unsigned int i, nargs, arg_adjust, memory;
31858 struct
31860 rtx op;
31861 enum machine_mode mode;
31862 } args[3];
31863 enum insn_code icode = d->icode;
31864 bool last_arg_constant = false;
31865 const struct insn_data_d *insn_p = &insn_data[icode];
31866 enum machine_mode tmode = insn_p->operand[0].mode;
31867 enum { load, store } klass;
31869 switch ((enum ix86_builtin_func_type) d->flag)
31871 case VOID_FTYPE_VOID:
31872 emit_insn (GEN_FCN (icode) (target));
31873 return 0;
31874 case VOID_FTYPE_UINT64:
31875 case VOID_FTYPE_UNSIGNED:
31876 nargs = 0;
31877 klass = store;
31878 memory = 0;
31879 break;
31881 case INT_FTYPE_VOID:
31882 case UINT64_FTYPE_VOID:
31883 case UNSIGNED_FTYPE_VOID:
31884 nargs = 0;
31885 klass = load;
31886 memory = 0;
31887 break;
31888 case UINT64_FTYPE_PUNSIGNED:
31889 case V2DI_FTYPE_PV2DI:
31890 case V4DI_FTYPE_PV4DI:
31891 case V32QI_FTYPE_PCCHAR:
31892 case V16QI_FTYPE_PCCHAR:
31893 case V8SF_FTYPE_PCV4SF:
31894 case V8SF_FTYPE_PCFLOAT:
31895 case V4SF_FTYPE_PCFLOAT:
31896 case V4DF_FTYPE_PCV2DF:
31897 case V4DF_FTYPE_PCDOUBLE:
31898 case V2DF_FTYPE_PCDOUBLE:
31899 case VOID_FTYPE_PVOID:
31900 nargs = 1;
31901 klass = load;
31902 memory = 0;
31903 break;
31904 case VOID_FTYPE_PV2SF_V4SF:
31905 case VOID_FTYPE_PV4DI_V4DI:
31906 case VOID_FTYPE_PV2DI_V2DI:
31907 case VOID_FTYPE_PCHAR_V32QI:
31908 case VOID_FTYPE_PCHAR_V16QI:
31909 case VOID_FTYPE_PFLOAT_V8SF:
31910 case VOID_FTYPE_PFLOAT_V4SF:
31911 case VOID_FTYPE_PDOUBLE_V4DF:
31912 case VOID_FTYPE_PDOUBLE_V2DF:
31913 case VOID_FTYPE_PLONGLONG_LONGLONG:
31914 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31915 case VOID_FTYPE_PINT_INT:
31916 nargs = 1;
31917 klass = store;
31918 /* Reserve memory operand for target. */
31919 memory = ARRAY_SIZE (args);
31920 break;
31921 case V4SF_FTYPE_V4SF_PCV2SF:
31922 case V2DF_FTYPE_V2DF_PCDOUBLE:
31923 nargs = 2;
31924 klass = load;
31925 memory = 1;
31926 break;
31927 case V8SF_FTYPE_PCV8SF_V8SI:
31928 case V4DF_FTYPE_PCV4DF_V4DI:
31929 case V4SF_FTYPE_PCV4SF_V4SI:
31930 case V2DF_FTYPE_PCV2DF_V2DI:
31931 case V8SI_FTYPE_PCV8SI_V8SI:
31932 case V4DI_FTYPE_PCV4DI_V4DI:
31933 case V4SI_FTYPE_PCV4SI_V4SI:
31934 case V2DI_FTYPE_PCV2DI_V2DI:
31935 nargs = 2;
31936 klass = load;
31937 memory = 0;
31938 break;
31939 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31940 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31941 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31942 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31943 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31944 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31945 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31946 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31947 nargs = 2;
31948 klass = store;
31949 /* Reserve memory operand for target. */
31950 memory = ARRAY_SIZE (args);
31951 break;
31952 case VOID_FTYPE_UINT_UINT_UINT:
31953 case VOID_FTYPE_UINT64_UINT_UINT:
31954 case UCHAR_FTYPE_UINT_UINT_UINT:
31955 case UCHAR_FTYPE_UINT64_UINT_UINT:
31956 nargs = 3;
31957 klass = load;
31958 memory = ARRAY_SIZE (args);
31959 last_arg_constant = true;
31960 break;
31961 default:
31962 gcc_unreachable ();
31965 gcc_assert (nargs <= ARRAY_SIZE (args));
31967 if (klass == store)
31969 arg = CALL_EXPR_ARG (exp, 0);
31970 op = expand_normal (arg);
31971 gcc_assert (target == 0);
31972 if (memory)
31974 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31975 target = gen_rtx_MEM (tmode, op);
31977 else
31978 target = force_reg (tmode, op);
31979 arg_adjust = 1;
31981 else
31983 arg_adjust = 0;
31984 if (optimize
31985 || target == 0
31986 || !register_operand (target, tmode)
31987 || GET_MODE (target) != tmode)
31988 target = gen_reg_rtx (tmode);
31991 for (i = 0; i < nargs; i++)
31993 enum machine_mode mode = insn_p->operand[i + 1].mode;
31994 bool match;
31996 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31997 op = expand_normal (arg);
31998 match = insn_p->operand[i + 1].predicate (op, mode);
32000 if (last_arg_constant && (i + 1) == nargs)
32002 if (!match)
32004 if (icode == CODE_FOR_lwp_lwpvalsi3
32005 || icode == CODE_FOR_lwp_lwpinssi3
32006 || icode == CODE_FOR_lwp_lwpvaldi3
32007 || icode == CODE_FOR_lwp_lwpinsdi3)
32008 error ("the last argument must be a 32-bit immediate");
32009 else
32010 error ("the last argument must be an 8-bit immediate");
32011 return const0_rtx;
32014 else
32016 if (i == memory)
32018 /* This must be the memory operand. */
32019 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32020 op = gen_rtx_MEM (mode, op);
32021 gcc_assert (GET_MODE (op) == mode
32022 || GET_MODE (op) == VOIDmode);
32024 else
32026 /* This must be a register. */
32027 if (VECTOR_MODE_P (mode))
32028 op = safe_vector_operand (op, mode);
32030 gcc_assert (GET_MODE (op) == mode
32031 || GET_MODE (op) == VOIDmode);
32032 op = copy_to_mode_reg (mode, op);
32036 args[i].op = op;
32037 args[i].mode = mode;
32040 switch (nargs)
32042 case 0:
32043 pat = GEN_FCN (icode) (target);
32044 break;
32045 case 1:
32046 pat = GEN_FCN (icode) (target, args[0].op);
32047 break;
32048 case 2:
32049 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32050 break;
32051 case 3:
32052 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32053 break;
32054 default:
32055 gcc_unreachable ();
32058 if (! pat)
32059 return 0;
32060 emit_insn (pat);
32061 return klass == store ? 0 : target;
32064 /* Return the integer constant in ARG. Constrain it to be in the range
32065 of the subparts of VEC_TYPE; issue an error if not. */
32067 static int
32068 get_element_number (tree vec_type, tree arg)
32070 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32072 if (!host_integerp (arg, 1)
32073 || (elt = tree_low_cst (arg, 1), elt > max))
32075 error ("selector must be an integer constant in the range 0..%wi", max);
32076 return 0;
32079 return elt;
32082 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32083 ix86_expand_vector_init. We DO have language-level syntax for this, in
32084 the form of (type){ init-list }. Except that since we can't place emms
32085 instructions from inside the compiler, we can't allow the use of MMX
32086 registers unless the user explicitly asks for it. So we do *not* define
32087 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32088 we have builtins invoked by mmintrin.h that give us license to emit
32089 these sorts of instructions. */
32091 static rtx
32092 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32094 enum machine_mode tmode = TYPE_MODE (type);
32095 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32096 int i, n_elt = GET_MODE_NUNITS (tmode);
32097 rtvec v = rtvec_alloc (n_elt);
32099 gcc_assert (VECTOR_MODE_P (tmode));
32100 gcc_assert (call_expr_nargs (exp) == n_elt);
32102 for (i = 0; i < n_elt; ++i)
32104 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32105 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32108 if (!target || !register_operand (target, tmode))
32109 target = gen_reg_rtx (tmode);
32111 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32112 return target;
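/* A sketch of how mmintrin.h is expected to reach this path (illustrative,
   not a verbatim copy of that header):

     __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }
*/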
32115 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32116 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32117 had a language-level syntax for referencing vector elements. */
32119 static rtx
32120 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32122 enum machine_mode tmode, mode0;
32123 tree arg0, arg1;
32124 int elt;
32125 rtx op0;
32127 arg0 = CALL_EXPR_ARG (exp, 0);
32128 arg1 = CALL_EXPR_ARG (exp, 1);
32130 op0 = expand_normal (arg0);
32131 elt = get_element_number (TREE_TYPE (arg0), arg1);
32133 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32134 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32135 gcc_assert (VECTOR_MODE_P (mode0));
32137 op0 = force_reg (mode0, op0);
32139 if (optimize || !target || !register_operand (target, tmode))
32140 target = gen_reg_rtx (tmode);
32142 ix86_expand_vector_extract (true, target, op0, elt);
32144 return target;
32147 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32148 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32149 a language-level syntax for referencing vector elements. */
32151 static rtx
32152 ix86_expand_vec_set_builtin (tree exp)
32154 enum machine_mode tmode, mode1;
32155 tree arg0, arg1, arg2;
32156 int elt;
32157 rtx op0, op1, target;
32159 arg0 = CALL_EXPR_ARG (exp, 0);
32160 arg1 = CALL_EXPR_ARG (exp, 1);
32161 arg2 = CALL_EXPR_ARG (exp, 2);
32163 tmode = TYPE_MODE (TREE_TYPE (arg0));
32164 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32165 gcc_assert (VECTOR_MODE_P (tmode));
32167 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32168 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32169 elt = get_element_number (TREE_TYPE (arg0), arg2);
32171 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32172 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32174 op0 = force_reg (tmode, op0);
32175 op1 = force_reg (mode1, op1);
32177 /* OP0 is the source of these builtin functions and shouldn't be
32178 modified. Create a copy, use it and return it as target. */
32179 target = gen_reg_rtx (tmode);
32180 emit_move_insn (target, op0);
32181 ix86_expand_vector_set (true, target, op1, elt);
32183 return target;
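/* Hedged sketches of how the SSE intrinsic headers are expected to use
   the vec_ext / vec_set wrappers (illustrative, not verbatim):

     _mm_extract_epi16 (__m128i __A, int __N)
       -> (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) __A, __N)

     _mm_insert_epi16 (__m128i __A, int __D, int __N)
       -> (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A, __D, __N)
*/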
32186 /* Expand an expression EXP that calls a built-in function,
32187 with result going to TARGET if that's convenient
32188 (and in mode MODE if that's convenient).
32189 SUBTARGET may be used as the target for computing one of EXP's operands.
32190 IGNORE is nonzero if the value is to be ignored. */
32192 static rtx
32193 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32194 enum machine_mode mode, int ignore)
32196 const struct builtin_description *d;
32197 size_t i;
32198 enum insn_code icode;
32199 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32200 tree arg0, arg1, arg2, arg3, arg4;
32201 rtx op0, op1, op2, op3, op4, pat, insn;
32202 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32203 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32205 /* For CPU builtins that can be folded, fold first and expand the fold. */
32206 switch (fcode)
32208 case IX86_BUILTIN_CPU_INIT:
32210 /* Make it call __cpu_indicator_init in libgcc. */
32211 tree call_expr, fndecl, type;
32212 type = build_function_type_list (integer_type_node, NULL_TREE);
32213 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32214 call_expr = build_call_expr (fndecl, 0);
32215 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32217 case IX86_BUILTIN_CPU_IS:
32218 case IX86_BUILTIN_CPU_SUPPORTS:
32220 tree arg0 = CALL_EXPR_ARG (exp, 0);
32221 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32222 gcc_assert (fold_expr != NULL_TREE);
32223 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32227 /* Determine whether the builtin function is available under the current ISA.
32228 Originally the builtin was not created if it wasn't applicable to the
32229 current ISA based on the command-line switches. With function-specific
32230 options, we need to check, in the context of the function making the call,
32231 whether it is supported. */
32232 if (ix86_builtins_isa[fcode].isa
32233 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32235 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32236 NULL, (enum fpmath_unit) 0, false);
32238 if (!opts)
32239 error ("%qE needs unknown isa option", fndecl);
32240 else
32242 gcc_assert (opts != NULL);
32243 error ("%qE needs isa option %s", fndecl, opts);
32244 free (opts);
32246 return const0_rtx;
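/* For example (a sketch, not part of this file): in a unit compiled
   without -msse4.2, the call below is still accepted because the target
   attribute enables the ISA for this one function, whereas the same call
   from a plain function would trip the "needs isa option" error above.

     __attribute__ ((target ("sse4.2")))
     unsigned int
     crc_step (unsigned int c, unsigned int v)
     {
       return __builtin_ia32_crc32si (c, v);
     }
*/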
32249 switch (fcode)
32251 case IX86_BUILTIN_MASKMOVQ:
32252 case IX86_BUILTIN_MASKMOVDQU:
32253 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32254 ? CODE_FOR_mmx_maskmovq
32255 : CODE_FOR_sse2_maskmovdqu);
32256 /* Note the arg order is different from the operand order. */
32257 arg1 = CALL_EXPR_ARG (exp, 0);
32258 arg2 = CALL_EXPR_ARG (exp, 1);
32259 arg0 = CALL_EXPR_ARG (exp, 2);
32260 op0 = expand_normal (arg0);
32261 op1 = expand_normal (arg1);
32262 op2 = expand_normal (arg2);
32263 mode0 = insn_data[icode].operand[0].mode;
32264 mode1 = insn_data[icode].operand[1].mode;
32265 mode2 = insn_data[icode].operand[2].mode;
32267 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32268 op0 = gen_rtx_MEM (mode1, op0);
32270 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32271 op0 = copy_to_mode_reg (mode0, op0);
32272 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32273 op1 = copy_to_mode_reg (mode1, op1);
32274 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32275 op2 = copy_to_mode_reg (mode2, op2);
32276 pat = GEN_FCN (icode) (op0, op1, op2);
32277 if (! pat)
32278 return 0;
32279 emit_insn (pat);
32280 return 0;
32282 case IX86_BUILTIN_LDMXCSR:
32283 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32284 target = assign_386_stack_local (SImode, SLOT_TEMP);
32285 emit_move_insn (target, op0);
32286 emit_insn (gen_sse_ldmxcsr (target));
32287 return 0;
32289 case IX86_BUILTIN_STMXCSR:
32290 target = assign_386_stack_local (SImode, SLOT_TEMP);
32291 emit_insn (gen_sse_stmxcsr (target));
32292 return copy_to_mode_reg (SImode, target);
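/* xmmintrin.h is expected to expose these two cases roughly as follows
   (a sketch, not a verbatim copy of that header):

     unsigned int _mm_getcsr (void)              { return __builtin_ia32_stmxcsr (); }
     void         _mm_setcsr (unsigned int __I)  { __builtin_ia32_ldmxcsr (__I); }
*/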
32294 case IX86_BUILTIN_CLFLUSH:
32295 arg0 = CALL_EXPR_ARG (exp, 0);
32296 op0 = expand_normal (arg0);
32297 icode = CODE_FOR_sse2_clflush;
32298 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32299 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32301 emit_insn (gen_sse2_clflush (op0));
32302 return 0;
32304 case IX86_BUILTIN_MONITOR:
32305 arg0 = CALL_EXPR_ARG (exp, 0);
32306 arg1 = CALL_EXPR_ARG (exp, 1);
32307 arg2 = CALL_EXPR_ARG (exp, 2);
32308 op0 = expand_normal (arg0);
32309 op1 = expand_normal (arg1);
32310 op2 = expand_normal (arg2);
32311 if (!REG_P (op0))
32312 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32313 if (!REG_P (op1))
32314 op1 = copy_to_mode_reg (SImode, op1);
32315 if (!REG_P (op2))
32316 op2 = copy_to_mode_reg (SImode, op2);
32317 emit_insn (ix86_gen_monitor (op0, op1, op2));
32318 return 0;
32320 case IX86_BUILTIN_MWAIT:
32321 arg0 = CALL_EXPR_ARG (exp, 0);
32322 arg1 = CALL_EXPR_ARG (exp, 1);
32323 op0 = expand_normal (arg0);
32324 op1 = expand_normal (arg1);
32325 if (!REG_P (op0))
32326 op0 = copy_to_mode_reg (SImode, op0);
32327 if (!REG_P (op1))
32328 op1 = copy_to_mode_reg (SImode, op1);
32329 emit_insn (gen_sse3_mwait (op0, op1));
32330 return 0;
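/* pmmintrin.h wraps these two builtins as _mm_monitor and _mm_mwait,
   roughly (a sketch, not verbatim):

     _mm_monitor (void const *__P, unsigned int __E, unsigned int __H)
       -> __builtin_ia32_monitor (__P, __E, __H)
     _mm_mwait (unsigned int __E, unsigned int __H)
       -> __builtin_ia32_mwait (__E, __H)
*/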
32332 case IX86_BUILTIN_VEC_INIT_V2SI:
32333 case IX86_BUILTIN_VEC_INIT_V4HI:
32334 case IX86_BUILTIN_VEC_INIT_V8QI:
32335 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32337 case IX86_BUILTIN_VEC_EXT_V2DF:
32338 case IX86_BUILTIN_VEC_EXT_V2DI:
32339 case IX86_BUILTIN_VEC_EXT_V4SF:
32340 case IX86_BUILTIN_VEC_EXT_V4SI:
32341 case IX86_BUILTIN_VEC_EXT_V8HI:
32342 case IX86_BUILTIN_VEC_EXT_V2SI:
32343 case IX86_BUILTIN_VEC_EXT_V4HI:
32344 case IX86_BUILTIN_VEC_EXT_V16QI:
32345 return ix86_expand_vec_ext_builtin (exp, target);
32347 case IX86_BUILTIN_VEC_SET_V2DI:
32348 case IX86_BUILTIN_VEC_SET_V4SF:
32349 case IX86_BUILTIN_VEC_SET_V4SI:
32350 case IX86_BUILTIN_VEC_SET_V8HI:
32351 case IX86_BUILTIN_VEC_SET_V4HI:
32352 case IX86_BUILTIN_VEC_SET_V16QI:
32353 return ix86_expand_vec_set_builtin (exp);
32355 case IX86_BUILTIN_INFQ:
32356 case IX86_BUILTIN_HUGE_VALQ:
32358 REAL_VALUE_TYPE inf;
32359 rtx tmp;
32361 real_inf (&inf);
32362 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32364 tmp = validize_mem (force_const_mem (mode, tmp));
32366 if (target == 0)
32367 target = gen_reg_rtx (mode);
32369 emit_move_insn (target, tmp);
32370 return target;
32373 case IX86_BUILTIN_RDPMC:
32374 case IX86_BUILTIN_RDTSC:
32375 case IX86_BUILTIN_RDTSCP:
32377 op0 = gen_reg_rtx (DImode);
32378 op1 = gen_reg_rtx (DImode);
32380 if (fcode == IX86_BUILTIN_RDPMC)
32382 arg0 = CALL_EXPR_ARG (exp, 0);
32383 op2 = expand_normal (arg0);
32384 if (!register_operand (op2, SImode))
32385 op2 = copy_to_mode_reg (SImode, op2);
32387 insn = (TARGET_64BIT
32388 ? gen_rdpmc_rex64 (op0, op1, op2)
32389 : gen_rdpmc (op0, op2));
32390 emit_insn (insn);
32392 else if (fcode == IX86_BUILTIN_RDTSC)
32394 insn = (TARGET_64BIT
32395 ? gen_rdtsc_rex64 (op0, op1)
32396 : gen_rdtsc (op0));
32397 emit_insn (insn);
32399 else
32401 op2 = gen_reg_rtx (SImode);
32403 insn = (TARGET_64BIT
32404 ? gen_rdtscp_rex64 (op0, op1, op2)
32405 : gen_rdtscp (op0, op2));
32406 emit_insn (insn);
32408 arg0 = CALL_EXPR_ARG (exp, 0);
32409 op4 = expand_normal (arg0);
32410 if (!address_operand (op4, VOIDmode))
32412 op4 = convert_memory_address (Pmode, op4);
32413 op4 = copy_addr_to_reg (op4);
32415 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32418 if (target == 0)
32420 /* mode is VOIDmode if __builtin_rd* has been called
32421 without lhs. */
32422 if (mode == VOIDmode)
32423 return target;
32424 target = gen_reg_rtx (mode);
32427 if (TARGET_64BIT)
32429 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32430 op1, 1, OPTAB_DIRECT);
32431 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32432 op0, 1, OPTAB_DIRECT);
32435 emit_move_insn (target, op0);
32436 return target;
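/* Typical user-level use of the builtins handled above (a sketch):

     unsigned long long t0 = __builtin_ia32_rdtsc ();
     unsigned int aux;
     unsigned long long t1 = __builtin_ia32_rdtscp (&aux);

   ia32intrin.h is expected to wrap the first as __rdtsc ().  */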
32438 case IX86_BUILTIN_FXSAVE:
32439 case IX86_BUILTIN_FXRSTOR:
32440 case IX86_BUILTIN_FXSAVE64:
32441 case IX86_BUILTIN_FXRSTOR64:
32442 switch (fcode)
32444 case IX86_BUILTIN_FXSAVE:
32445 icode = CODE_FOR_fxsave;
32446 break;
32447 case IX86_BUILTIN_FXRSTOR:
32448 icode = CODE_FOR_fxrstor;
32449 break;
32450 case IX86_BUILTIN_FXSAVE64:
32451 icode = CODE_FOR_fxsave64;
32452 break;
32453 case IX86_BUILTIN_FXRSTOR64:
32454 icode = CODE_FOR_fxrstor64;
32455 break;
32456 default:
32457 gcc_unreachable ();
32460 arg0 = CALL_EXPR_ARG (exp, 0);
32461 op0 = expand_normal (arg0);
32463 if (!address_operand (op0, VOIDmode))
32465 op0 = convert_memory_address (Pmode, op0);
32466 op0 = copy_addr_to_reg (op0);
32468 op0 = gen_rtx_MEM (BLKmode, op0);
32470 pat = GEN_FCN (icode) (op0);
32471 if (pat)
32472 emit_insn (pat);
32473 return 0;
32475 case IX86_BUILTIN_XSAVE:
32476 case IX86_BUILTIN_XRSTOR:
32477 case IX86_BUILTIN_XSAVE64:
32478 case IX86_BUILTIN_XRSTOR64:
32479 case IX86_BUILTIN_XSAVEOPT:
32480 case IX86_BUILTIN_XSAVEOPT64:
32481 arg0 = CALL_EXPR_ARG (exp, 0);
32482 arg1 = CALL_EXPR_ARG (exp, 1);
32483 op0 = expand_normal (arg0);
32484 op1 = expand_normal (arg1);
32486 if (!address_operand (op0, VOIDmode))
32488 op0 = convert_memory_address (Pmode, op0);
32489 op0 = copy_addr_to_reg (op0);
32491 op0 = gen_rtx_MEM (BLKmode, op0);
32493 op1 = force_reg (DImode, op1);
32495 if (TARGET_64BIT)
32497 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32498 NULL, 1, OPTAB_DIRECT);
32499 switch (fcode)
32501 case IX86_BUILTIN_XSAVE:
32502 icode = CODE_FOR_xsave_rex64;
32503 break;
32504 case IX86_BUILTIN_XRSTOR:
32505 icode = CODE_FOR_xrstor_rex64;
32506 break;
32507 case IX86_BUILTIN_XSAVE64:
32508 icode = CODE_FOR_xsave64;
32509 break;
32510 case IX86_BUILTIN_XRSTOR64:
32511 icode = CODE_FOR_xrstor64;
32512 break;
32513 case IX86_BUILTIN_XSAVEOPT:
32514 icode = CODE_FOR_xsaveopt_rex64;
32515 break;
32516 case IX86_BUILTIN_XSAVEOPT64:
32517 icode = CODE_FOR_xsaveopt64;
32518 break;
32519 default:
32520 gcc_unreachable ();
32523 op2 = gen_lowpart (SImode, op2);
32524 op1 = gen_lowpart (SImode, op1);
32525 pat = GEN_FCN (icode) (op0, op1, op2);
32527 else
32529 switch (fcode)
32531 case IX86_BUILTIN_XSAVE:
32532 icode = CODE_FOR_xsave;
32533 break;
32534 case IX86_BUILTIN_XRSTOR:
32535 icode = CODE_FOR_xrstor;
32536 break;
32537 case IX86_BUILTIN_XSAVEOPT:
32538 icode = CODE_FOR_xsaveopt;
32539 break;
32540 default:
32541 gcc_unreachable ();
32543 pat = GEN_FCN (icode) (op0, op1);
32546 if (pat)
32547 emit_insn (pat);
32548 return 0;
32550 case IX86_BUILTIN_LLWPCB:
32551 arg0 = CALL_EXPR_ARG (exp, 0);
32552 op0 = expand_normal (arg0);
32553 icode = CODE_FOR_lwp_llwpcb;
32554 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32555 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32556 emit_insn (gen_lwp_llwpcb (op0));
32557 return 0;
32559 case IX86_BUILTIN_SLWPCB:
32560 icode = CODE_FOR_lwp_slwpcb;
32561 if (!target
32562 || !insn_data[icode].operand[0].predicate (target, Pmode))
32563 target = gen_reg_rtx (Pmode);
32564 emit_insn (gen_lwp_slwpcb (target));
32565 return target;
32567 case IX86_BUILTIN_BEXTRI32:
32568 case IX86_BUILTIN_BEXTRI64:
32569 arg0 = CALL_EXPR_ARG (exp, 0);
32570 arg1 = CALL_EXPR_ARG (exp, 1);
32571 op0 = expand_normal (arg0);
32572 op1 = expand_normal (arg1);
32573 icode = (fcode == IX86_BUILTIN_BEXTRI32
32574 ? CODE_FOR_tbm_bextri_si
32575 : CODE_FOR_tbm_bextri_di);
32576 if (!CONST_INT_P (op1))
32578 error ("last argument must be an immediate");
32579 return const0_rtx;
32581 else
32583 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32584 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32585 op1 = GEN_INT (length);
32586 op2 = GEN_INT (lsb_index);
32587 pat = GEN_FCN (icode) (target, op0, op1, op2);
32588 if (pat)
32589 emit_insn (pat);
32590 return target;
32593 case IX86_BUILTIN_RDRAND16_STEP:
32594 icode = CODE_FOR_rdrandhi_1;
32595 mode0 = HImode;
32596 goto rdrand_step;
32598 case IX86_BUILTIN_RDRAND32_STEP:
32599 icode = CODE_FOR_rdrandsi_1;
32600 mode0 = SImode;
32601 goto rdrand_step;
32603 case IX86_BUILTIN_RDRAND64_STEP:
32604 icode = CODE_FOR_rdranddi_1;
32605 mode0 = DImode;
32607 rdrand_step:
32608 op0 = gen_reg_rtx (mode0);
32609 emit_insn (GEN_FCN (icode) (op0));
32611 arg0 = CALL_EXPR_ARG (exp, 0);
32612 op1 = expand_normal (arg0);
32613 if (!address_operand (op1, VOIDmode))
32615 op1 = convert_memory_address (Pmode, op1);
32616 op1 = copy_addr_to_reg (op1);
32618 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32620 op1 = gen_reg_rtx (SImode);
32621 emit_move_insn (op1, CONST1_RTX (SImode));
32623 /* Emit SImode conditional move. */
32624 if (mode0 == HImode)
32626 op2 = gen_reg_rtx (SImode);
32627 emit_insn (gen_zero_extendhisi2 (op2, op0));
32629 else if (mode0 == SImode)
32630 op2 = op0;
32631 else
32632 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32634 if (target == 0)
32635 target = gen_reg_rtx (SImode);
32637 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32638 const0_rtx);
32639 emit_insn (gen_rtx_SET (VOIDmode, target,
32640 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32641 return target;
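/* Editorial note, not part of the original source: a minimal usage sketch
   for the step builtins expanded above, assuming the <immintrin.h> wrapper
   _rdrand32_step, which forwards to __builtin_ia32_rdrand32_step and whose
   0/1 result comes from the carry-flag conditional move emitted above:

     unsigned int r;
     while (!_rdrand32_step (&r))
       ;  // retry until the hardware reports success
*/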
32643 case IX86_BUILTIN_RDSEED16_STEP:
32644 icode = CODE_FOR_rdseedhi_1;
32645 mode0 = HImode;
32646 goto rdseed_step;
32648 case IX86_BUILTIN_RDSEED32_STEP:
32649 icode = CODE_FOR_rdseedsi_1;
32650 mode0 = SImode;
32651 goto rdseed_step;
32653 case IX86_BUILTIN_RDSEED64_STEP:
32654 icode = CODE_FOR_rdseeddi_1;
32655 mode0 = DImode;
32657 rdseed_step:
32658 op0 = gen_reg_rtx (mode0);
32659 emit_insn (GEN_FCN (icode) (op0));
32661 arg0 = CALL_EXPR_ARG (exp, 0);
32662 op1 = expand_normal (arg0);
32663 if (!address_operand (op1, VOIDmode))
32665 op1 = convert_memory_address (Pmode, op1);
32666 op1 = copy_addr_to_reg (op1);
32668 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32670 op2 = gen_reg_rtx (QImode);
32672 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32673 const0_rtx);
32674 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32676 if (target == 0)
32677 target = gen_reg_rtx (SImode);
32679 emit_insn (gen_zero_extendqisi2 (target, op2));
32680 return target;
32682 case IX86_BUILTIN_ADDCARRYX32:
32683 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32684 mode0 = SImode;
32685 goto addcarryx;
32687 case IX86_BUILTIN_ADDCARRYX64:
32688 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32689 mode0 = DImode;
32691 addcarryx:
32692 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32693 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32694 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32695 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32697 op0 = gen_reg_rtx (QImode);
32699 /* Generate CF from input operand. */
32700 op1 = expand_normal (arg0);
32701 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32702 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32704 /* Generate an ADCX (or ADC) instruction to compute X + Y + CF. */
32705 op2 = expand_normal (arg1);
32706 op3 = expand_normal (arg2);
32708 if (!REG_P (op2))
32709 op2 = copy_to_mode_reg (mode0, op2);
32710 if (!REG_P (op3))
32711 op3 = copy_to_mode_reg (mode0, op3);
32713 op0 = gen_reg_rtx (mode0);
32715 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32716 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32717 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32719 /* Store the result. */
32720 op4 = expand_normal (arg3);
32721 if (!address_operand (op4, VOIDmode))
32723 op4 = convert_memory_address (Pmode, op4);
32724 op4 = copy_addr_to_reg (op4);
32726 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32728 /* Return current CF value. */
32729 if (target == 0)
32730 target = gen_reg_rtx (QImode);
32732 PUT_MODE (pat, QImode);
32733 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32734 return target;
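/* Editorial note, not part of the original source: a minimal usage sketch
   for the ADCX/ADC expansion above, assuming the <adxintrin.h> wrapper
   _addcarryx_u32 (carry-in, src1, src2, sum-out), e.g. a two-limb add of
   a1:a0 and b1:b0:

     unsigned int lo, hi;
     unsigned char c;
     c = _addcarryx_u32 (0, a0, b0, &lo);  // low limbs, no carry in
     c = _addcarryx_u32 (c, a1, b1, &hi);  // high limbs plus carry
*/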
32736 case IX86_BUILTIN_GATHERSIV2DF:
32737 icode = CODE_FOR_avx2_gathersiv2df;
32738 goto gather_gen;
32739 case IX86_BUILTIN_GATHERSIV4DF:
32740 icode = CODE_FOR_avx2_gathersiv4df;
32741 goto gather_gen;
32742 case IX86_BUILTIN_GATHERDIV2DF:
32743 icode = CODE_FOR_avx2_gatherdiv2df;
32744 goto gather_gen;
32745 case IX86_BUILTIN_GATHERDIV4DF:
32746 icode = CODE_FOR_avx2_gatherdiv4df;
32747 goto gather_gen;
32748 case IX86_BUILTIN_GATHERSIV4SF:
32749 icode = CODE_FOR_avx2_gathersiv4sf;
32750 goto gather_gen;
32751 case IX86_BUILTIN_GATHERSIV8SF:
32752 icode = CODE_FOR_avx2_gathersiv8sf;
32753 goto gather_gen;
32754 case IX86_BUILTIN_GATHERDIV4SF:
32755 icode = CODE_FOR_avx2_gatherdiv4sf;
32756 goto gather_gen;
32757 case IX86_BUILTIN_GATHERDIV8SF:
32758 icode = CODE_FOR_avx2_gatherdiv8sf;
32759 goto gather_gen;
32760 case IX86_BUILTIN_GATHERSIV2DI:
32761 icode = CODE_FOR_avx2_gathersiv2di;
32762 goto gather_gen;
32763 case IX86_BUILTIN_GATHERSIV4DI:
32764 icode = CODE_FOR_avx2_gathersiv4di;
32765 goto gather_gen;
32766 case IX86_BUILTIN_GATHERDIV2DI:
32767 icode = CODE_FOR_avx2_gatherdiv2di;
32768 goto gather_gen;
32769 case IX86_BUILTIN_GATHERDIV4DI:
32770 icode = CODE_FOR_avx2_gatherdiv4di;
32771 goto gather_gen;
32772 case IX86_BUILTIN_GATHERSIV4SI:
32773 icode = CODE_FOR_avx2_gathersiv4si;
32774 goto gather_gen;
32775 case IX86_BUILTIN_GATHERSIV8SI:
32776 icode = CODE_FOR_avx2_gathersiv8si;
32777 goto gather_gen;
32778 case IX86_BUILTIN_GATHERDIV4SI:
32779 icode = CODE_FOR_avx2_gatherdiv4si;
32780 goto gather_gen;
32781 case IX86_BUILTIN_GATHERDIV8SI:
32782 icode = CODE_FOR_avx2_gatherdiv8si;
32783 goto gather_gen;
32784 case IX86_BUILTIN_GATHERALTSIV4DF:
32785 icode = CODE_FOR_avx2_gathersiv4df;
32786 goto gather_gen;
32787 case IX86_BUILTIN_GATHERALTDIV8SF:
32788 icode = CODE_FOR_avx2_gatherdiv8sf;
32789 goto gather_gen;
32790 case IX86_BUILTIN_GATHERALTSIV4DI:
32791 icode = CODE_FOR_avx2_gathersiv4di;
32792 goto gather_gen;
32793 case IX86_BUILTIN_GATHERALTDIV8SI:
32794 icode = CODE_FOR_avx2_gatherdiv8si;
32795 goto gather_gen;
32797 gather_gen:
32798 arg0 = CALL_EXPR_ARG (exp, 0);
32799 arg1 = CALL_EXPR_ARG (exp, 1);
32800 arg2 = CALL_EXPR_ARG (exp, 2);
32801 arg3 = CALL_EXPR_ARG (exp, 3);
32802 arg4 = CALL_EXPR_ARG (exp, 4);
32803 op0 = expand_normal (arg0);
32804 op1 = expand_normal (arg1);
32805 op2 = expand_normal (arg2);
32806 op3 = expand_normal (arg3);
32807 op4 = expand_normal (arg4);
32808 /* Note the arg order is different from the operand order. */
32809 mode0 = insn_data[icode].operand[1].mode;
32810 mode2 = insn_data[icode].operand[3].mode;
32811 mode3 = insn_data[icode].operand[4].mode;
32812 mode4 = insn_data[icode].operand[5].mode;
32814 if (target == NULL_RTX
32815 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32816 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32817 else
32818 subtarget = target;
32820 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32821 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32823 rtx half = gen_reg_rtx (V4SImode);
32824 if (!nonimmediate_operand (op2, V8SImode))
32825 op2 = copy_to_mode_reg (V8SImode, op2);
32826 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32827 op2 = half;
32829 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32830 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32832 rtx (*gen) (rtx, rtx);
32833 rtx half = gen_reg_rtx (mode0);
32834 if (mode0 == V4SFmode)
32835 gen = gen_vec_extract_lo_v8sf;
32836 else
32837 gen = gen_vec_extract_lo_v8si;
32838 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32839 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32840 emit_insn (gen (half, op0));
32841 op0 = half;
32842 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32843 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32844 emit_insn (gen (half, op3));
32845 op3 = half;
32848 /* Force the memory operand to use only a base register here. But we
32849 don't want to do this to the memory operands of other builtin
32850 functions. */
32851 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32853 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32854 op0 = copy_to_mode_reg (mode0, op0);
32855 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32856 op1 = copy_to_mode_reg (Pmode, op1);
32857 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32858 op2 = copy_to_mode_reg (mode2, op2);
32859 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32860 op3 = copy_to_mode_reg (mode3, op3);
32861 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32863 error ("last argument must be scale 1, 2, 4, 8");
32864 return const0_rtx;
32867 /* Optimize. If the mask is known to have the high (sign) bit set
32868 in every element, replace op0 with pc_rtx to signal that the
32869 instruction overwrites the whole destination and doesn't use its
32870 previous contents. */
32871 if (optimize)
32873 if (TREE_CODE (arg3) == VECTOR_CST)
32875 unsigned int negative = 0;
32876 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32878 tree cst = VECTOR_CST_ELT (arg3, i);
32879 if (TREE_CODE (cst) == INTEGER_CST
32880 && tree_int_cst_sign_bit (cst))
32881 negative++;
32882 else if (TREE_CODE (cst) == REAL_CST
32883 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32884 negative++;
32886 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32887 op0 = pc_rtx;
32889 else if (TREE_CODE (arg3) == SSA_NAME)
32891 /* Recognize also when mask is like:
32892 __v2df src = _mm_setzero_pd ();
32893 __v2df mask = _mm_cmpeq_pd (src, src);
32895 __v8sf src = _mm256_setzero_ps ();
32896 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32897 as that is a cheaper way to load all ones into
32898 a register than having to load a constant from
32899 memory. */
32900 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32901 if (is_gimple_call (def_stmt))
32903 tree fndecl = gimple_call_fndecl (def_stmt);
32904 if (fndecl
32905 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32906 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32908 case IX86_BUILTIN_CMPPD:
32909 case IX86_BUILTIN_CMPPS:
32910 case IX86_BUILTIN_CMPPD256:
32911 case IX86_BUILTIN_CMPPS256:
32912 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32913 break;
32914 /* FALLTHRU */
32915 case IX86_BUILTIN_CMPEQPD:
32916 case IX86_BUILTIN_CMPEQPS:
32917 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32918 && initializer_zerop (gimple_call_arg (def_stmt,
32919 1)))
32920 op0 = pc_rtx;
32921 break;
32922 default:
32923 break;
32929 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32930 if (! pat)
32931 return const0_rtx;
32932 emit_insn (pat);
32934 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32935 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32937 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32938 ? V4SFmode : V4SImode;
32939 if (target == NULL_RTX)
32940 target = gen_reg_rtx (tmode);
32941 if (tmode == V4SFmode)
32942 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32943 else
32944 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32946 else
32947 target = subtarget;
32949 return target;
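/* Editorial note, not part of the original source: the pc_rtx optimization
   above fires, for instance, when the mask argument is a VECTOR_CST such
   as { -1, -1, -1, -1 } (every element has its sign bit set), or when the
   mask was built with the self-comparison idiom quoted in the comment,
   e.g. (a sketch assuming <immintrin.h>):

     __m256d all_ones
       = _mm256_cmp_pd (_mm256_setzero_pd (), _mm256_setzero_pd (),
                        _CMP_EQ_OQ);
*/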
32951 case IX86_BUILTIN_XABORT:
32952 icode = CODE_FOR_xabort;
32953 arg0 = CALL_EXPR_ARG (exp, 0);
32954 op0 = expand_normal (arg0);
32955 mode0 = insn_data[icode].operand[0].mode;
32956 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32958 error ("the xabort's argument must be an 8-bit immediate");
32959 return const0_rtx;
32961 emit_insn (gen_xabort (op0));
32962 return 0;
32964 default:
32965 break;
32968 for (i = 0, d = bdesc_special_args;
32969 i < ARRAY_SIZE (bdesc_special_args);
32970 i++, d++)
32971 if (d->code == fcode)
32972 return ix86_expand_special_args_builtin (d, exp, target);
32974 for (i = 0, d = bdesc_args;
32975 i < ARRAY_SIZE (bdesc_args);
32976 i++, d++)
32977 if (d->code == fcode)
32978 switch (fcode)
32980 case IX86_BUILTIN_FABSQ:
32981 case IX86_BUILTIN_COPYSIGNQ:
32982 if (!TARGET_SSE)
32983 /* Emit a normal call if SSE isn't available. */
32984 return expand_call (exp, target, ignore);
32985 default:
32986 return ix86_expand_args_builtin (d, exp, target);
32989 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32990 if (d->code == fcode)
32991 return ix86_expand_sse_comi (d, exp, target);
32993 for (i = 0, d = bdesc_pcmpestr;
32994 i < ARRAY_SIZE (bdesc_pcmpestr);
32995 i++, d++)
32996 if (d->code == fcode)
32997 return ix86_expand_sse_pcmpestr (d, exp, target);
32999 for (i = 0, d = bdesc_pcmpistr;
33000 i < ARRAY_SIZE (bdesc_pcmpistr);
33001 i++, d++)
33002 if (d->code == fcode)
33003 return ix86_expand_sse_pcmpistr (d, exp, target);
33005 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33006 if (d->code == fcode)
33007 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33008 (enum ix86_builtin_func_type)
33009 d->flag, d->comparison);
33011 gcc_unreachable ();
33014 /* Returns a function decl for a vectorized version of the builtin function
33015 with builtin function code FN and the result vector type TYPE, or NULL_TREE
33016 if it is not available. */
33018 static tree
33019 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33020 tree type_in)
33022 enum machine_mode in_mode, out_mode;
33023 int in_n, out_n;
33024 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33026 if (TREE_CODE (type_out) != VECTOR_TYPE
33027 || TREE_CODE (type_in) != VECTOR_TYPE
33028 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33029 return NULL_TREE;
33031 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33032 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33033 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33034 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33036 switch (fn)
33038 case BUILT_IN_SQRT:
33039 if (out_mode == DFmode && in_mode == DFmode)
33041 if (out_n == 2 && in_n == 2)
33042 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33043 else if (out_n == 4 && in_n == 4)
33044 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33046 break;
33048 case BUILT_IN_SQRTF:
33049 if (out_mode == SFmode && in_mode == SFmode)
33051 if (out_n == 4 && in_n == 4)
33052 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33053 else if (out_n == 8 && in_n == 8)
33054 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33056 break;
33058 case BUILT_IN_IFLOOR:
33059 case BUILT_IN_LFLOOR:
33060 case BUILT_IN_LLFLOOR:
33061 /* The round insn does not trap on denormals. */
33062 if (flag_trapping_math || !TARGET_ROUND)
33063 break;
33065 if (out_mode == SImode && in_mode == DFmode)
33067 if (out_n == 4 && in_n == 2)
33068 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33069 else if (out_n == 8 && in_n == 4)
33070 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33072 break;
33074 case BUILT_IN_IFLOORF:
33075 case BUILT_IN_LFLOORF:
33076 case BUILT_IN_LLFLOORF:
33077 /* The round insn does not trap on denormals. */
33078 if (flag_trapping_math || !TARGET_ROUND)
33079 break;
33081 if (out_mode == SImode && in_mode == SFmode)
33083 if (out_n == 4 && in_n == 4)
33084 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33085 else if (out_n == 8 && in_n == 8)
33086 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33088 break;
33090 case BUILT_IN_ICEIL:
33091 case BUILT_IN_LCEIL:
33092 case BUILT_IN_LLCEIL:
33093 /* The round insn does not trap on denormals. */
33094 if (flag_trapping_math || !TARGET_ROUND)
33095 break;
33097 if (out_mode == SImode && in_mode == DFmode)
33099 if (out_n == 4 && in_n == 2)
33100 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33101 else if (out_n == 8 && in_n == 4)
33102 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33104 break;
33106 case BUILT_IN_ICEILF:
33107 case BUILT_IN_LCEILF:
33108 case BUILT_IN_LLCEILF:
33109 /* The round insn does not trap on denormals. */
33110 if (flag_trapping_math || !TARGET_ROUND)
33111 break;
33113 if (out_mode == SImode && in_mode == SFmode)
33115 if (out_n == 4 && in_n == 4)
33116 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33117 else if (out_n == 8 && in_n == 8)
33118 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33120 break;
33122 case BUILT_IN_IRINT:
33123 case BUILT_IN_LRINT:
33124 case BUILT_IN_LLRINT:
33125 if (out_mode == SImode && in_mode == DFmode)
33127 if (out_n == 4 && in_n == 2)
33128 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33129 else if (out_n == 8 && in_n == 4)
33130 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33132 break;
33134 case BUILT_IN_IRINTF:
33135 case BUILT_IN_LRINTF:
33136 case BUILT_IN_LLRINTF:
33137 if (out_mode == SImode && in_mode == SFmode)
33139 if (out_n == 4 && in_n == 4)
33140 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33141 else if (out_n == 8 && in_n == 8)
33142 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33144 break;
33146 case BUILT_IN_IROUND:
33147 case BUILT_IN_LROUND:
33148 case BUILT_IN_LLROUND:
33149 /* The round insn does not trap on denormals. */
33150 if (flag_trapping_math || !TARGET_ROUND)
33151 break;
33153 if (out_mode == SImode && in_mode == DFmode)
33155 if (out_n == 4 && in_n == 2)
33156 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33157 else if (out_n == 8 && in_n == 4)
33158 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33160 break;
33162 case BUILT_IN_IROUNDF:
33163 case BUILT_IN_LROUNDF:
33164 case BUILT_IN_LLROUNDF:
33165 /* The round insn does not trap on denormals. */
33166 if (flag_trapping_math || !TARGET_ROUND)
33167 break;
33169 if (out_mode == SImode && in_mode == SFmode)
33171 if (out_n == 4 && in_n == 4)
33172 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33173 else if (out_n == 8 && in_n == 8)
33174 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33176 break;
33178 case BUILT_IN_COPYSIGN:
33179 if (out_mode == DFmode && in_mode == DFmode)
33181 if (out_n == 2 && in_n == 2)
33182 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33183 else if (out_n == 4 && in_n == 4)
33184 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33186 break;
33188 case BUILT_IN_COPYSIGNF:
33189 if (out_mode == SFmode && in_mode == SFmode)
33191 if (out_n == 4 && in_n == 4)
33192 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33193 else if (out_n == 8 && in_n == 8)
33194 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33196 break;
33198 case BUILT_IN_FLOOR:
33199 /* The round insn does not trap on denormals. */
33200 if (flag_trapping_math || !TARGET_ROUND)
33201 break;
33203 if (out_mode == DFmode && in_mode == DFmode)
33205 if (out_n == 2 && in_n == 2)
33206 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33207 else if (out_n == 4 && in_n == 4)
33208 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33210 break;
33212 case BUILT_IN_FLOORF:
33213 /* The round insn does not trap on denormals. */
33214 if (flag_trapping_math || !TARGET_ROUND)
33215 break;
33217 if (out_mode == SFmode && in_mode == SFmode)
33219 if (out_n == 4 && in_n == 4)
33220 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33221 else if (out_n == 8 && in_n == 8)
33222 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33224 break;
33226 case BUILT_IN_CEIL:
33227 /* The round insn does not trap on denormals. */
33228 if (flag_trapping_math || !TARGET_ROUND)
33229 break;
33231 if (out_mode == DFmode && in_mode == DFmode)
33233 if (out_n == 2 && in_n == 2)
33234 return ix86_builtins[IX86_BUILTIN_CEILPD];
33235 else if (out_n == 4 && in_n == 4)
33236 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33238 break;
33240 case BUILT_IN_CEILF:
33241 /* The round insn does not trap on denormals. */
33242 if (flag_trapping_math || !TARGET_ROUND)
33243 break;
33245 if (out_mode == SFmode && in_mode == SFmode)
33247 if (out_n == 4 && in_n == 4)
33248 return ix86_builtins[IX86_BUILTIN_CEILPS];
33249 else if (out_n == 8 && in_n == 8)
33250 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33252 break;
33254 case BUILT_IN_TRUNC:
33255 /* The round insn does not trap on denormals. */
33256 if (flag_trapping_math || !TARGET_ROUND)
33257 break;
33259 if (out_mode == DFmode && in_mode == DFmode)
33261 if (out_n == 2 && in_n == 2)
33262 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33263 else if (out_n == 4 && in_n == 4)
33264 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33266 break;
33268 case BUILT_IN_TRUNCF:
33269 /* The round insn does not trap on denormals. */
33270 if (flag_trapping_math || !TARGET_ROUND)
33271 break;
33273 if (out_mode == SFmode && in_mode == SFmode)
33275 if (out_n == 4 && in_n == 4)
33276 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33277 else if (out_n == 8 && in_n == 8)
33278 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33280 break;
33282 case BUILT_IN_RINT:
33283 /* The round insn does not trap on denormals. */
33284 if (flag_trapping_math || !TARGET_ROUND)
33285 break;
33287 if (out_mode == DFmode && in_mode == DFmode)
33289 if (out_n == 2 && in_n == 2)
33290 return ix86_builtins[IX86_BUILTIN_RINTPD];
33291 else if (out_n == 4 && in_n == 4)
33292 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33294 break;
33296 case BUILT_IN_RINTF:
33297 /* The round insn does not trap on denormals. */
33298 if (flag_trapping_math || !TARGET_ROUND)
33299 break;
33301 if (out_mode == SFmode && in_mode == SFmode)
33303 if (out_n == 4 && in_n == 4)
33304 return ix86_builtins[IX86_BUILTIN_RINTPS];
33305 else if (out_n == 8 && in_n == 8)
33306 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33308 break;
33310 case BUILT_IN_ROUND:
33311 /* The round insn does not trap on denormals. */
33312 if (flag_trapping_math || !TARGET_ROUND)
33313 break;
33315 if (out_mode == DFmode && in_mode == DFmode)
33317 if (out_n == 2 && in_n == 2)
33318 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33319 else if (out_n == 4 && in_n == 4)
33320 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33322 break;
33324 case BUILT_IN_ROUNDF:
33325 /* The round insn does not trap on denormals. */
33326 if (flag_trapping_math || !TARGET_ROUND)
33327 break;
33329 if (out_mode == SFmode && in_mode == SFmode)
33331 if (out_n == 4 && in_n == 4)
33332 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33333 else if (out_n == 8 && in_n == 8)
33334 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33336 break;
33338 case BUILT_IN_FMA:
33339 if (out_mode == DFmode && in_mode == DFmode)
33341 if (out_n == 2 && in_n == 2)
33342 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33343 if (out_n == 4 && in_n == 4)
33344 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33346 break;
33348 case BUILT_IN_FMAF:
33349 if (out_mode == SFmode && in_mode == SFmode)
33351 if (out_n == 4 && in_n == 4)
33352 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33353 if (out_n == 8 && in_n == 8)
33354 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33356 break;
33358 default:
33359 break;
33362 /* Dispatch to a handler for a vectorization library. */
33363 if (ix86_veclib_handler)
33364 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33365 type_in);
33367 return NULL_TREE;
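/* Editorial note, not part of the original source: a minimal sketch of a
   loop the vectorizer can map through the hook above; with 2-element
   DFmode vectors, BUILT_IN_SQRT resolves to IX86_BUILTIN_SQRTPD.  The
   function name is hypothetical:

     void vsqrt (double *a, const double *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = __builtin_sqrt (b[i]);
     }
*/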
33370 /* Handler for an SVML-style interface to
33371 a library with vectorized intrinsics. */
33373 static tree
33374 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33376 char name[20];
33377 tree fntype, new_fndecl, args;
33378 unsigned arity;
33379 const char *bname;
33380 enum machine_mode el_mode, in_mode;
33381 int n, in_n;
33383 /* The SVML is suitable for unsafe math only. */
33384 if (!flag_unsafe_math_optimizations)
33385 return NULL_TREE;
33387 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33388 n = TYPE_VECTOR_SUBPARTS (type_out);
33389 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33390 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33391 if (el_mode != in_mode
33392 || n != in_n)
33393 return NULL_TREE;
33395 switch (fn)
33397 case BUILT_IN_EXP:
33398 case BUILT_IN_LOG:
33399 case BUILT_IN_LOG10:
33400 case BUILT_IN_POW:
33401 case BUILT_IN_TANH:
33402 case BUILT_IN_TAN:
33403 case BUILT_IN_ATAN:
33404 case BUILT_IN_ATAN2:
33405 case BUILT_IN_ATANH:
33406 case BUILT_IN_CBRT:
33407 case BUILT_IN_SINH:
33408 case BUILT_IN_SIN:
33409 case BUILT_IN_ASINH:
33410 case BUILT_IN_ASIN:
33411 case BUILT_IN_COSH:
33412 case BUILT_IN_COS:
33413 case BUILT_IN_ACOSH:
33414 case BUILT_IN_ACOS:
33415 if (el_mode != DFmode || n != 2)
33416 return NULL_TREE;
33417 break;
33419 case BUILT_IN_EXPF:
33420 case BUILT_IN_LOGF:
33421 case BUILT_IN_LOG10F:
33422 case BUILT_IN_POWF:
33423 case BUILT_IN_TANHF:
33424 case BUILT_IN_TANF:
33425 case BUILT_IN_ATANF:
33426 case BUILT_IN_ATAN2F:
33427 case BUILT_IN_ATANHF:
33428 case BUILT_IN_CBRTF:
33429 case BUILT_IN_SINHF:
33430 case BUILT_IN_SINF:
33431 case BUILT_IN_ASINHF:
33432 case BUILT_IN_ASINF:
33433 case BUILT_IN_COSHF:
33434 case BUILT_IN_COSF:
33435 case BUILT_IN_ACOSHF:
33436 case BUILT_IN_ACOSF:
33437 if (el_mode != SFmode || n != 4)
33438 return NULL_TREE;
33439 break;
33441 default:
33442 return NULL_TREE;
33445 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33447 if (fn == BUILT_IN_LOGF)
33448 strcpy (name, "vmlsLn4");
33449 else if (fn == BUILT_IN_LOG)
33450 strcpy (name, "vmldLn2");
33451 else if (n == 4)
33453 sprintf (name, "vmls%s", bname+10);
33454 name[strlen (name)-1] = '4';
33456 else
33457 sprintf (name, "vmld%s2", bname+10);
33459 /* Convert to uppercase. */
33460 name[4] &= ~0x20;
33462 arity = 0;
33463 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33464 args;
33465 args = TREE_CHAIN (args))
33466 arity++;
33468 if (arity == 1)
33469 fntype = build_function_type_list (type_out, type_in, NULL);
33470 else
33471 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33473 /* Build a function declaration for the vectorized function. */
33474 new_fndecl = build_decl (BUILTINS_LOCATION,
33475 FUNCTION_DECL, get_identifier (name), fntype);
33476 TREE_PUBLIC (new_fndecl) = 1;
33477 DECL_EXTERNAL (new_fndecl) = 1;
33478 DECL_IS_NOVOPS (new_fndecl) = 1;
33479 TREE_READONLY (new_fndecl) = 1;
33481 return new_fndecl;
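/* Editorial note, not part of the original source: worked examples of the
   SVML name mangling implemented above.  "__builtin_sinf" with 4-element
   SFmode vectors becomes "vmlsSin4", "__builtin_sin" with 2-element DFmode
   vectors becomes "vmldSin2", and the log functions are special-cased to
   "vmlsLn4" and "vmldLn2".  */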
33484 /* Handler for an ACML-style interface to
33485 a library with vectorized intrinsics. */
33487 static tree
33488 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33490 char name[20] = "__vr.._";
33491 tree fntype, new_fndecl, args;
33492 unsigned arity;
33493 const char *bname;
33494 enum machine_mode el_mode, in_mode;
33495 int n, in_n;
33497 /* The ACML is 64-bit only and suitable for unsafe math only, as
33498 it does not correctly support parts of IEEE with the required
33499 precision, such as denormals. */
33500 if (!TARGET_64BIT
33501 || !flag_unsafe_math_optimizations)
33502 return NULL_TREE;
33504 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33505 n = TYPE_VECTOR_SUBPARTS (type_out);
33506 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33507 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33508 if (el_mode != in_mode
33509 || n != in_n)
33510 return NULL_TREE;
33512 switch (fn)
33514 case BUILT_IN_SIN:
33515 case BUILT_IN_COS:
33516 case BUILT_IN_EXP:
33517 case BUILT_IN_LOG:
33518 case BUILT_IN_LOG2:
33519 case BUILT_IN_LOG10:
33520 name[4] = 'd';
33521 name[5] = '2';
33522 if (el_mode != DFmode
33523 || n != 2)
33524 return NULL_TREE;
33525 break;
33527 case BUILT_IN_SINF:
33528 case BUILT_IN_COSF:
33529 case BUILT_IN_EXPF:
33530 case BUILT_IN_POWF:
33531 case BUILT_IN_LOGF:
33532 case BUILT_IN_LOG2F:
33533 case BUILT_IN_LOG10F:
33534 name[4] = 's';
33535 name[5] = '4';
33536 if (el_mode != SFmode
33537 || n != 4)
33538 return NULL_TREE;
33539 break;
33541 default:
33542 return NULL_TREE;
33545 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33546 sprintf (name + 7, "%s", bname+10);
33548 arity = 0;
33549 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33550 args;
33551 args = TREE_CHAIN (args))
33552 arity++;
33554 if (arity == 1)
33555 fntype = build_function_type_list (type_out, type_in, NULL);
33556 else
33557 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33559 /* Build a function declaration for the vectorized function. */
33560 new_fndecl = build_decl (BUILTINS_LOCATION,
33561 FUNCTION_DECL, get_identifier (name), fntype);
33562 TREE_PUBLIC (new_fndecl) = 1;
33563 DECL_EXTERNAL (new_fndecl) = 1;
33564 DECL_IS_NOVOPS (new_fndecl) = 1;
33565 TREE_READONLY (new_fndecl) = 1;
33567 return new_fndecl;
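/* Editorial note, not part of the original source: worked examples of the
   ACML name mangling implemented above.  "__builtin_sin" with 2-element
   DFmode vectors becomes "__vrd2_sin", and "__builtin_cosf" with 4-element
   SFmode vectors becomes "__vrs4_cosf".  */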
33570 /* Returns a decl of a function that implements gather load with
33571 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
33572 Return NULL_TREE if it is not available. */
33574 static tree
33575 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33576 const_tree index_type, int scale)
33578 bool si;
33579 enum ix86_builtins code;
33581 if (! TARGET_AVX2)
33582 return NULL_TREE;
33584 if ((TREE_CODE (index_type) != INTEGER_TYPE
33585 && !POINTER_TYPE_P (index_type))
33586 || (TYPE_MODE (index_type) != SImode
33587 && TYPE_MODE (index_type) != DImode))
33588 return NULL_TREE;
33590 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33591 return NULL_TREE;
33593 /* The v*gather* insns sign-extend the index to pointer mode. */
33594 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33595 && TYPE_UNSIGNED (index_type))
33596 return NULL_TREE;
33598 if (scale <= 0
33599 || scale > 8
33600 || (scale & (scale - 1)) != 0)
33601 return NULL_TREE;
33603 si = TYPE_MODE (index_type) == SImode;
33604 switch (TYPE_MODE (mem_vectype))
33606 case V2DFmode:
33607 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33608 break;
33609 case V4DFmode:
33610 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33611 break;
33612 case V2DImode:
33613 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33614 break;
33615 case V4DImode:
33616 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33617 break;
33618 case V4SFmode:
33619 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33620 break;
33621 case V8SFmode:
33622 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33623 break;
33624 case V4SImode:
33625 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33626 break;
33627 case V8SImode:
33628 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33629 break;
33630 default:
33631 return NULL_TREE;
33634 return ix86_builtins[code];
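/* Editorial note, not part of the original source: a minimal sketch of an
   indexed load the vectorizer can turn into one of the gather builtins
   selected above when -mavx2 is enabled.  The function name is
   hypothetical:

     void gather_load (double *restrict out, const double *base,
                       const int *idx, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = base[idx[i]];
     }
*/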
33637 /* Returns a decl for a target-specific builtin that implements the
33638 reciprocal of the function, or NULL_TREE if not available. */
33640 static tree
33641 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33642 bool sqrt ATTRIBUTE_UNUSED)
33644 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33645 && flag_finite_math_only && !flag_trapping_math
33646 && flag_unsafe_math_optimizations))
33647 return NULL_TREE;
33649 if (md_fn)
33650 /* Machine dependent builtins. */
33651 switch (fn)
33653 /* Vectorized version of sqrt to rsqrt conversion. */
33654 case IX86_BUILTIN_SQRTPS_NR:
33655 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33657 case IX86_BUILTIN_SQRTPS_NR256:
33658 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33660 default:
33661 return NULL_TREE;
33663 else
33664 /* Normal builtins. */
33665 switch (fn)
33667 /* Sqrt to rsqrt conversion. */
33668 case BUILT_IN_SQRTF:
33669 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33671 default:
33672 return NULL_TREE;
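/* Editorial note, not part of the original source: a minimal sketch of a
   computation eligible for the sqrt-to-rsqrt rewrite enabled above, on an
   SSE-math target under -ffast-math (which sets the math flags tested at
   the top of the function).  The function name is hypothetical:

     float recip_sqrt (float x)
     {
       return 1.0f / __builtin_sqrtf (x);
     }
*/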
33676 /* Helper for avx_vpermilps256_operand et al. This is also used by
33677 the expansion functions to turn the parallel back into a mask.
33678 The return value is 0 for no match and the imm8+1 for a match. */
33680 int
33681 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33683 unsigned i, nelt = GET_MODE_NUNITS (mode);
33684 unsigned mask = 0;
33685 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33687 if (XVECLEN (par, 0) != (int) nelt)
33688 return 0;
33690 /* Validate that all of the elements are constants, and not totally
33691 out of range. Copy the data into an integral array to make the
33692 subsequent checks easier. */
33693 for (i = 0; i < nelt; ++i)
33695 rtx er = XVECEXP (par, 0, i);
33696 unsigned HOST_WIDE_INT ei;
33698 if (!CONST_INT_P (er))
33699 return 0;
33700 ei = INTVAL (er);
33701 if (ei >= nelt)
33702 return 0;
33703 ipar[i] = ei;
33706 switch (mode)
33708 case V4DFmode:
33709 /* In the 256-bit DFmode case, we can only move elements within
33710 a 128-bit lane. */
33711 for (i = 0; i < 2; ++i)
33713 if (ipar[i] >= 2)
33714 return 0;
33715 mask |= ipar[i] << i;
33717 for (i = 2; i < 4; ++i)
33719 if (ipar[i] < 2)
33720 return 0;
33721 mask |= (ipar[i] - 2) << i;
33723 break;
33725 case V8SFmode:
33726 /* In the 256-bit SFmode case, we have full freedom of movement
33727 within the low 128-bit lane, but the high 128-bit lane must
33728 mirror the exact same pattern. */
33729 for (i = 0; i < 4; ++i)
33730 if (ipar[i] + 4 != ipar[i + 4])
33731 return 0;
33732 nelt = 4;
33733 /* FALLTHRU */
33735 case V2DFmode:
33736 case V4SFmode:
33737 /* In the 128-bit case, we've full freedom in the placement of
33738 the elements from the source operand. */
33739 for (i = 0; i < nelt; ++i)
33740 mask |= ipar[i] << (i * (nelt / 2));
33741 break;
33743 default:
33744 gcc_unreachable ();
33747 /* Make sure success has a non-zero value by adding one. */
33748 return mask + 1;
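/* Editorial note, not part of the original source: a worked example of the
   encoding above for V4SFmode.  A parallel selecting elements 3, 2, 1, 0
   gives mask = 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1b, so the function
   returns 0x1c (imm8 + 1).  */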
33751 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33752 the expansion functions to turn the parallel back into a mask.
33753 The return value is 0 for no match and the imm8+1 for a match. */
33755 int
33756 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33758 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33759 unsigned mask = 0;
33760 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33762 if (XVECLEN (par, 0) != (int) nelt)
33763 return 0;
33765 /* Validate that all of the elements are constants, and not totally
33766 out of range. Copy the data into an integral array to make the
33767 subsequent checks easier. */
33768 for (i = 0; i < nelt; ++i)
33770 rtx er = XVECEXP (par, 0, i);
33771 unsigned HOST_WIDE_INT ei;
33773 if (!CONST_INT_P (er))
33774 return 0;
33775 ei = INTVAL (er);
33776 if (ei >= 2 * nelt)
33777 return 0;
33778 ipar[i] = ei;
33781 /* Validate that each half of the permute selects consecutive elements. */
33782 for (i = 0; i < nelt2 - 1; ++i)
33783 if (ipar[i] + 1 != ipar[i + 1])
33784 return 0;
33785 for (i = nelt2; i < nelt - 1; ++i)
33786 if (ipar[i] + 1 != ipar[i + 1])
33787 return 0;
33789 /* Reconstruct the mask. */
33790 for (i = 0; i < 2; ++i)
33792 unsigned e = ipar[i * nelt2];
33793 if (e % nelt2)
33794 return 0;
33795 e /= nelt2;
33796 mask |= e << (i * 4);
33799 /* Make sure success has a non-zero value by adding one. */
33800 return mask + 1;
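/* Editorial note, not part of the original source: a worked example of the
   encoding above for V8SFmode.  A parallel selecting elements 4..7 and
   12..15 has consecutive halves starting at elements 4 and 12, giving
   selector values 1 and 3, so mask = 1 | (3 << 4) = 0x31 and the function
   returns 0x32 (imm8 + 1).  */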
33803 /* Store OPERAND to memory after reload has completed. This means
33804 that we can't easily use assign_stack_local. */
33805 rtx
33806 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33808 rtx result;
33810 gcc_assert (reload_completed);
33811 if (ix86_using_red_zone ())
33813 result = gen_rtx_MEM (mode,
33814 gen_rtx_PLUS (Pmode,
33815 stack_pointer_rtx,
33816 GEN_INT (-RED_ZONE_SIZE)));
33817 emit_move_insn (result, operand);
33819 else if (TARGET_64BIT)
33821 switch (mode)
33823 case HImode:
33824 case SImode:
33825 operand = gen_lowpart (DImode, operand);
33826 /* FALLTHRU */
33827 case DImode:
33828 emit_insn (
33829 gen_rtx_SET (VOIDmode,
33830 gen_rtx_MEM (DImode,
33831 gen_rtx_PRE_DEC (DImode,
33832 stack_pointer_rtx)),
33833 operand));
33834 break;
33835 default:
33836 gcc_unreachable ();
33838 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33840 else
33842 switch (mode)
33844 case DImode:
33846 rtx operands[2];
33847 split_double_mode (mode, &operand, 1, operands, operands + 1);
33848 emit_insn (
33849 gen_rtx_SET (VOIDmode,
33850 gen_rtx_MEM (SImode,
33851 gen_rtx_PRE_DEC (Pmode,
33852 stack_pointer_rtx)),
33853 operands[1]));
33854 emit_insn (
33855 gen_rtx_SET (VOIDmode,
33856 gen_rtx_MEM (SImode,
33857 gen_rtx_PRE_DEC (Pmode,
33858 stack_pointer_rtx)),
33859 operands[0]));
33861 break;
33862 case HImode:
33863 /* Store HImodes as SImodes. */
33864 operand = gen_lowpart (SImode, operand);
33865 /* FALLTHRU */
33866 case SImode:
33867 emit_insn (
33868 gen_rtx_SET (VOIDmode,
33869 gen_rtx_MEM (GET_MODE (operand),
33870 gen_rtx_PRE_DEC (SImode,
33871 stack_pointer_rtx)),
33872 operand));
33873 break;
33874 default:
33875 gcc_unreachable ();
33877 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33879 return result;
33882 /* Free the operand from memory. */
33883 void
33884 ix86_free_from_memory (enum machine_mode mode)
33886 if (!ix86_using_red_zone ())
33888 int size;
33890 if (mode == DImode || TARGET_64BIT)
33891 size = 8;
33892 else
33893 size = 4;
33894 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33895 to a pop or add instruction if registers are available. */
33896 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33897 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33898 GEN_INT (size))));
33902 /* Return a register priority for hard reg REGNO. */
33903 static int
33904 ix86_register_priority (int hard_regno)
33906 /* ebp and r13 as the base always want a displacement, and r12 as the
33907 base always wants an index. So discourage their use in an
33908 address. */
33909 if (hard_regno == R12_REG || hard_regno == R13_REG)
33910 return 0;
33911 if (hard_regno == BP_REG)
33912 return 1;
33913 /* New x86-64 int registers result in bigger code size. Discourage
33914 them. */
33915 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33916 return 2;
33917 /* New x86-64 SSE registers result in bigger code size. Discourage
33918 them. */
33919 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33920 return 2;
33921 /* Usage of AX register results in smaller code. Prefer it. */
33922 if (hard_regno == 0)
33923 return 4;
33924 return 3;
33927 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33929 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33930 QImode must go into class Q_REGS.
33931 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33932 movdf to do mem-to-mem moves through integer regs. */
33934 static reg_class_t
33935 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33937 enum machine_mode mode = GET_MODE (x);
33939 /* We're only allowed to return a subclass of CLASS. Many of the
33940 following checks fail for NO_REGS, so eliminate that early. */
33941 if (regclass == NO_REGS)
33942 return NO_REGS;
33944 /* All classes can load zeros. */
33945 if (x == CONST0_RTX (mode))
33946 return regclass;
33948 /* Force constants into memory if we are loading a (nonzero) constant into
33949 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
33950 instructions to load from a constant. */
33951 if (CONSTANT_P (x)
33952 && (MAYBE_MMX_CLASS_P (regclass)
33953 || MAYBE_SSE_CLASS_P (regclass)
33954 || MAYBE_MASK_CLASS_P (regclass)))
33955 return NO_REGS;
33957 /* Prefer SSE regs only, if we can use them for math. */
33958 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33959 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33961 /* Floating-point constants need more complex checks. */
33962 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33964 /* General regs can load everything. */
33965 if (reg_class_subset_p (regclass, GENERAL_REGS))
33966 return regclass;
33968 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33969 zero above. We only want to wind up preferring 80387 registers if
33970 we plan on doing computation with them. */
33971 if (TARGET_80387
33972 && standard_80387_constant_p (x) > 0)
33974 /* Limit class to non-sse. */
33975 if (regclass == FLOAT_SSE_REGS)
33976 return FLOAT_REGS;
33977 if (regclass == FP_TOP_SSE_REGS)
33978 return FP_TOP_REG;
33979 if (regclass == FP_SECOND_SSE_REGS)
33980 return FP_SECOND_REG;
33981 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33982 return regclass;
33985 return NO_REGS;
33988 /* Generally when we see PLUS here, it's the function invariant
33989 (plus soft-fp const_int), which can only be computed into general
33990 regs. */
33991 if (GET_CODE (x) == PLUS)
33992 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33994 /* QImode constants are easy to load, but non-constant QImode data
33995 must go into Q_REGS. */
33996 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33998 if (reg_class_subset_p (regclass, Q_REGS))
33999 return regclass;
34000 if (reg_class_subset_p (Q_REGS, regclass))
34001 return Q_REGS;
34002 return NO_REGS;
34005 return regclass;
34008 /* Discourage putting floating-point values in SSE registers unless
34009 SSE math is being used, and likewise for the 387 registers. */
34010 static reg_class_t
34011 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34013 enum machine_mode mode = GET_MODE (x);
34015 /* Restrict the output reload class to the register bank that we are doing
34016 math on. If we would like not to return a subset of CLASS, reject this
34017 alternative: if reload cannot do this, it will still use its choice. */
34018 mode = GET_MODE (x);
34019 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34020 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34022 if (X87_FLOAT_MODE_P (mode))
34024 if (regclass == FP_TOP_SSE_REGS)
34025 return FP_TOP_REG;
34026 else if (regclass == FP_SECOND_SSE_REGS)
34027 return FP_SECOND_REG;
34028 else
34029 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34032 return regclass;
34035 static reg_class_t
34036 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34037 enum machine_mode mode, secondary_reload_info *sri)
34039 /* Double-word spills from general registers to non-offsettable memory
34040 references (zero-extended addresses) require special handling. */
34041 if (TARGET_64BIT
34042 && MEM_P (x)
34043 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34044 && INTEGER_CLASS_P (rclass)
34045 && !offsettable_memref_p (x))
34047 sri->icode = (in_p
34048 ? CODE_FOR_reload_noff_load
34049 : CODE_FOR_reload_noff_store);
34050 /* Add the cost of moving address to a temporary. */
34051 sri->extra_cost = 1;
34053 return NO_REGS;
34056 /* QImode spills from non-QI registers require an
34057 intermediate register on 32-bit targets. */
34058 if (mode == QImode
34059 && (MAYBE_MASK_CLASS_P (rclass)
34060 || (!TARGET_64BIT && !in_p
34061 && INTEGER_CLASS_P (rclass)
34062 && MAYBE_NON_Q_CLASS_P (rclass))))
34064 int regno;
34066 if (REG_P (x))
34067 regno = REGNO (x);
34068 else
34069 regno = -1;
34071 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34072 regno = true_regnum (x);
34074 /* Return Q_REGS if the operand is in memory. */
34075 if (regno == -1)
34076 return Q_REGS;
34079 /* This condition handles corner case where an expression involving
34080 pointers gets vectorized. We're trying to use the address of a
34081 stack slot as a vector initializer.
34083 (set (reg:V2DI 74 [ vect_cst_.2 ])
34084 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34086 Eventually frame gets turned into sp+offset like this:
34088 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34089 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34090 (const_int 392 [0x188]))))
34092 That later gets turned into:
34094 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34095 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34096 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34098 We'll have the following reload recorded:
34100 Reload 0: reload_in (DI) =
34101 (plus:DI (reg/f:DI 7 sp)
34102 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34103 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34104 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34105 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34106 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34107 reload_reg_rtx: (reg:V2DI 22 xmm1)
34109 This isn't going to work since SSE instructions can't handle scalar
34110 additions. Returning GENERAL_REGS forces the addition into an integer
34111 register, and reload can handle subsequent reloads without problems. */
34113 if (in_p && GET_CODE (x) == PLUS
34114 && SSE_CLASS_P (rclass)
34115 && SCALAR_INT_MODE_P (mode))
34116 return GENERAL_REGS;
34118 return NO_REGS;
34121 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34123 static bool
34124 ix86_class_likely_spilled_p (reg_class_t rclass)
34126 switch (rclass)
34128 case AREG:
34129 case DREG:
34130 case CREG:
34131 case BREG:
34132 case AD_REGS:
34133 case SIREG:
34134 case DIREG:
34135 case SSE_FIRST_REG:
34136 case FP_TOP_REG:
34137 case FP_SECOND_REG:
34138 return true;
34140 default:
34141 break;
34144 return false;
34147 /* If we are copying between general and FP registers, we need a memory
34148 location. The same is true for SSE and MMX registers.
34150 To optimize register_move_cost performance, allow an inline variant.
34152 The macro can't work reliably when one of the CLASSES is a class containing
34153 registers from multiple units (SSE, MMX, integer). We avoid this by never
34154 combining those units in a single alternative in the machine description.
34155 Ensure that this constraint holds to avoid unexpected surprises.
34157 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34158 enforce these sanity checks. */
34160 static inline bool
34161 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34162 enum machine_mode mode, int strict)
34164 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34165 return false;
34166 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34167 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34168 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34169 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34170 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34171 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34173 gcc_assert (!strict || lra_in_progress);
34174 return true;
34177 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34178 return true;
34180 /* ??? This is a lie. We do have moves between mmx/general, and for
34181 mmx/sse2. But by saying we need secondary memory we discourage the
34182 register allocator from using the mmx registers unless needed. */
34183 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34184 return true;
34186 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34188 /* SSE1 doesn't have any direct moves from other classes. */
34189 if (!TARGET_SSE2)
34190 return true;
34192 /* If the target says that inter-unit moves are more expensive
34193 than moving through memory, then don't generate them. */
34194 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34195 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34196 return true;
34198 /* Between SSE and general, we have moves no larger than word size. */
34199 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34200 return true;
34203 return false;
34206 bool
34207 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34208 enum machine_mode mode, int strict)
34210 return inline_secondary_memory_needed (class1, class2, mode, strict);
34213 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34215 On the 80386, this is the size of MODE in words,
34216 except in the FP regs, where a single reg is always enough. */
34218 static unsigned char
34219 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34221 if (MAYBE_INTEGER_CLASS_P (rclass))
34223 if (mode == XFmode)
34224 return (TARGET_64BIT ? 2 : 3);
34225 else if (mode == XCmode)
34226 return (TARGET_64BIT ? 4 : 6);
34227 else
34228 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34230 else
34232 if (COMPLEX_MODE_P (mode))
34233 return 2;
34234 else
34235 return 1;
34239 /* Return true if the registers in CLASS cannot represent the change from
34240 modes FROM to TO. */
34242 bool
34243 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34244 enum reg_class regclass)
34246 if (from == to)
34247 return false;
34249 /* x87 registers can't do subreg at all, as all values are reformatted
34250 to extended precision. */
34251 if (MAYBE_FLOAT_CLASS_P (regclass))
34252 return true;
34254 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34256 /* Vector registers do not support QI or HImode loads. If we don't
34257 disallow a change to these modes, reload will assume it's ok to
34258 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34259 the vec_dupv4hi pattern. */
34260 if (GET_MODE_SIZE (from) < 4)
34261 return true;
34263 /* Vector registers do not support subreg with nonzero offsets, which
34264 are otherwise valid for integer registers. Since we can't see
34265 whether we have a nonzero offset from here, prohibit all
34266 nonparadoxical subregs changing size. */
34267 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34268 return true;
34271 return false;
34274 /* Return the cost of moving data of mode M between a
34275 register and memory. A value of 2 is the default; this cost is
34276 relative to those in `REGISTER_MOVE_COST'.
34278 This function is used extensively by register_move_cost, which is used to
34279 build tables at startup. Make it inline in this case.
34280 When IN is 2, return the maximum of the in and out move costs.
34282 If moving between registers and memory is more expensive than
34283 between two registers, you should define this macro to express the
34284 relative cost.
34286 Also model the increased cost of moving QImode registers in non
34287 Q_REGS classes. */
34289 static inline int
34290 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34291 int in)
34293 int cost;
34294 if (FLOAT_CLASS_P (regclass))
34296 int index;
34297 switch (mode)
34299 case SFmode:
34300 index = 0;
34301 break;
34302 case DFmode:
34303 index = 1;
34304 break;
34305 case XFmode:
34306 index = 2;
34307 break;
34308 default:
34309 return 100;
34311 if (in == 2)
34312 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34313 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34315 if (SSE_CLASS_P (regclass))
34317 int index;
34318 switch (GET_MODE_SIZE (mode))
34320 case 4:
34321 index = 0;
34322 break;
34323 case 8:
34324 index = 1;
34325 break;
34326 case 16:
34327 index = 2;
34328 break;
34329 default:
34330 return 100;
34332 if (in == 2)
34333 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34334 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34336 if (MMX_CLASS_P (regclass))
34338 int index;
34339 switch (GET_MODE_SIZE (mode))
34341 case 4:
34342 index = 0;
34343 break;
34344 case 8:
34345 index = 1;
34346 break;
34347 default:
34348 return 100;
34350 if (in == 2)
34351 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34352 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34354 switch (GET_MODE_SIZE (mode))
34356 case 1:
34357 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34359 if (!in)
34360 return ix86_cost->int_store[0];
34361 if (TARGET_PARTIAL_REG_DEPENDENCY
34362 && optimize_function_for_speed_p (cfun))
34363 cost = ix86_cost->movzbl_load;
34364 else
34365 cost = ix86_cost->int_load[0];
34366 if (in == 2)
34367 return MAX (cost, ix86_cost->int_store[0]);
34368 return cost;
34370 else
34372 if (in == 2)
34373 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34374 if (in)
34375 return ix86_cost->movzbl_load;
34376 else
34377 return ix86_cost->int_store[0] + 4;
34379 break;
34380 case 2:
34381 if (in == 2)
34382 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34383 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34384 default:
34385 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34386 if (mode == TFmode)
34387 mode = XFmode;
34388 if (in == 2)
34389 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34390 else if (in)
34391 cost = ix86_cost->int_load[2];
34392 else
34393 cost = ix86_cost->int_store[2];
34394 return (cost * (((int) GET_MODE_SIZE (mode)
34395 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34399 static int
34400 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34401 bool in)
34403 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34407 /* Return the cost of moving data from a register in class CLASS1 to
34408 one in class CLASS2.
34410 It is not required that the cost always equal 2 when FROM is the same as TO;
34411 on some machines it is expensive to move between registers if they are not
34412 general registers. */
34414 static int
34415 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34416 reg_class_t class2_i)
34418 enum reg_class class1 = (enum reg_class) class1_i;
34419 enum reg_class class2 = (enum reg_class) class2_i;
34421 /* In case we require secondary memory, compute cost of the store followed
34422 by load. In order to avoid bad register allocation choices, we need
34423 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34425 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34427 int cost = 1;
34429 cost += inline_memory_move_cost (mode, class1, 2);
34430 cost += inline_memory_move_cost (mode, class2, 2);
34432 /* In case of copying from a general purpose register we may emit multiple
34433 stores followed by a single load, causing a memory size mismatch stall.
34434 Count this as an arbitrarily high cost of 20. */
34435 if (targetm.class_max_nregs (class1, mode)
34436 > targetm.class_max_nregs (class2, mode))
34437 cost += 20;
34439 /* In the case of FP/MMX moves, the registers actually overlap, and we
34440 have to switch modes in order to treat them differently. */
34441 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34442 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34443 cost += 20;
34445 return cost;
34448 /* Moves between SSE/MMX and integer unit are expensive. */
34449 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34450 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34452 /* ??? By keeping the returned value relatively high, we limit the number
34453 of moves between integer and MMX/SSE registers for all targets.
34454 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
34455 where integer modes in MMX/SSE registers are not tieable
34456 because of missing QImode and HImode moves to, from or between
34457 MMX/SSE registers. */
34458 return MAX (8, ix86_cost->mmxsse_to_integer);
34460 if (MAYBE_FLOAT_CLASS_P (class1))
34461 return ix86_cost->fp_move;
34462 if (MAYBE_SSE_CLASS_P (class1))
34463 return ix86_cost->sse_move;
34464 if (MAYBE_MMX_CLASS_P (class1))
34465 return ix86_cost->mmx_move;
34466 return 2;
34469 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34470 MODE. */
34472 bool
34473 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34475 /* Flags, and only flags, can hold CCmode values. */
34476 if (CC_REGNO_P (regno))
34477 return GET_MODE_CLASS (mode) == MODE_CC;
34478 if (GET_MODE_CLASS (mode) == MODE_CC
34479 || GET_MODE_CLASS (mode) == MODE_RANDOM
34480 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34481 return false;
34482 if (STACK_REGNO_P (regno))
34483 return VALID_FP_MODE_P (mode);
34484 if (MASK_REGNO_P (regno))
34485 return VALID_MASK_REG_MODE (mode);
34486 if (SSE_REGNO_P (regno))
34488 /* We implement the move patterns for all vector modes into and
34489 out of SSE registers, even when no operation instructions
34490 are available. */
34492 /* For AVX-512 we allow, regardless of regno:
34493 - XI mode
34494 - any 512-bit wide vector mode
34495 - any scalar mode. */
34496 if (TARGET_AVX512F
34497 && (mode == XImode
34498 || VALID_AVX512F_REG_MODE (mode)
34499 || VALID_AVX512F_SCALAR_MODE (mode)))
34500 return true;
34502 /* xmm16-xmm31 are only available for AVX-512. */
34503 if (EXT_REX_SSE_REGNO_P (regno))
34504 return false;
34506 /* OImode move is available only when AVX is enabled. */
34507 return ((TARGET_AVX && mode == OImode)
34508 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34509 || VALID_SSE_REG_MODE (mode)
34510 || VALID_SSE2_REG_MODE (mode)
34511 || VALID_MMX_REG_MODE (mode)
34512 || VALID_MMX_REG_MODE_3DNOW (mode));
34514 if (MMX_REGNO_P (regno))
34516 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34517 so if the register is available at all, then we can move data of
34518 the given mode into or out of it. */
34519 return (VALID_MMX_REG_MODE (mode)
34520 || VALID_MMX_REG_MODE_3DNOW (mode));
34523 if (mode == QImode)
34525 /* Take care for QImode values - they can be in non-QI regs,
34526 but then they do cause partial register stalls. */
34527 if (ANY_QI_REGNO_P (regno))
34528 return true;
34529 if (!TARGET_PARTIAL_REG_STALL)
34530 return true;
34531 /* LRA checks if the hard register is OK for the given mode.
34532 QImode values can live in non-QI regs, so we allow all
34533 registers here. */
34534 if (lra_in_progress)
34535 return true;
34536 return !can_create_pseudo_p ();
34538 /* We handle both integer and floats in the general purpose registers. */
34539 else if (VALID_INT_MODE_P (mode))
34540 return true;
34541 else if (VALID_FP_MODE_P (mode))
34542 return true;
34543 else if (VALID_DFP_MODE_P (mode))
34544 return true;
34545 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34546 on to use that value in smaller contexts, this can easily force a
34547 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34548 supporting DImode, allow it. */
34549 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34550 return true;
34552 return false;
34555 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34556 tieable integer mode. */
34558 static bool
34559 ix86_tieable_integer_mode_p (enum machine_mode mode)
34561 switch (mode)
34563 case HImode:
34564 case SImode:
34565 return true;
34567 case QImode:
34568 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34570 case DImode:
34571 return TARGET_64BIT;
34573 default:
34574 return false;
34578 /* Return true if MODE1 is accessible in a register that can hold MODE2
34579 without copying. That is, all register classes that can hold MODE2
34580 can also hold MODE1. */
34582 bool
34583 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34585 if (mode1 == mode2)
34586 return true;
34588 if (ix86_tieable_integer_mode_p (mode1)
34589 && ix86_tieable_integer_mode_p (mode2))
34590 return true;
34592 /* MODE2 being XFmode implies fp stack or general regs, which means we
34593 can tie any smaller floating point modes to it. Note that we do not
34594 tie this with TFmode. */
34595 if (mode2 == XFmode)
34596 return mode1 == SFmode || mode1 == DFmode;
34598 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34599 that we can tie it with SFmode. */
34600 if (mode2 == DFmode)
34601 return mode1 == SFmode;
34603 /* If MODE2 is only appropriate for an SSE register, then tie with
34604 any other mode acceptable to SSE registers. */
34605 if (GET_MODE_SIZE (mode2) == 32
34606 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34607 return (GET_MODE_SIZE (mode1) == 32
34608 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34609 if (GET_MODE_SIZE (mode2) == 16
34610 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34611 return (GET_MODE_SIZE (mode1) == 16
34612 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34614 /* If MODE2 is appropriate for an MMX register, then tie
34615 with any other mode acceptable to MMX registers. */
34616 if (GET_MODE_SIZE (mode2) == 8
34617 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34618 return (GET_MODE_SIZE (mode1) == 8
34619 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34621 return false;
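/* For instance, ix86_modes_tieable_p (SFmode, XFmode) is true: every class
   that can hold XFmode (x87 stack or general regs) can also hold SFmode.
   The reverse query, ix86_modes_tieable_p (XFmode, SFmode), is false,
   because SSE registers can hold SFmode but not XFmode.  (Illustration
   derived from the checks above, not an exhaustive table.)  */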
34624 /* Return the cost of moving between two registers of mode MODE. */
34626 static int
34627 ix86_set_reg_reg_cost (enum machine_mode mode)
34629 unsigned int units = UNITS_PER_WORD;
34631 switch (GET_MODE_CLASS (mode))
34633 default:
34634 break;
34636 case MODE_CC:
34637 units = GET_MODE_SIZE (CCmode);
34638 break;
34640 case MODE_FLOAT:
34641 if ((TARGET_SSE && mode == TFmode)
34642 || (TARGET_80387 && mode == XFmode)
34643 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34644 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34645 units = GET_MODE_SIZE (mode);
34646 break;
34648 case MODE_COMPLEX_FLOAT:
34649 if ((TARGET_SSE && mode == TCmode)
34650 || (TARGET_80387 && mode == XCmode)
34651 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34652 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34653 units = GET_MODE_SIZE (mode);
34654 break;
34656 case MODE_VECTOR_INT:
34657 case MODE_VECTOR_FLOAT:
34658 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
34659 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34660 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34661 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34662 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34663 units = GET_MODE_SIZE (mode);
34666 /* Return the cost of moving between two registers of mode MODE,
34667 assuming that the move will be in pieces of at most UNITS bytes. */
34668 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
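/* Worked example (assuming a 32-bit target, UNITS_PER_WORD == 4): a DImode
   register-to-register set that cannot use SSE/MMX keeps units == 4, so the
   cost is COSTS_N_INSNS ((8 + 4 - 1) / 4) == COSTS_N_INSNS (2), i.e. two
   word-sized moves.  A V4SFmode set with SSE enabled uses units == 16 and
   is costed as a single insn.  */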
34671 /* Compute a (partial) cost for rtx X. Return true if the complete
34672 cost has been computed, and false if subexpressions should be
34673 scanned. In either case, *TOTAL contains the cost result. */
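/* For example, a plain register copy (set (reg:SI r1) (reg:SI r2)) is costed
   by the SET case below via ix86_set_reg_reg_cost and the scan stops; for a
   SET from a more complex expression the hook returns false so the operands
   are costed recursively by rtx_cost.  (Illustrative summary of the cases
   that follow.)  */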
34675 static bool
34676 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34677 bool speed)
34679 enum rtx_code code = (enum rtx_code) code_i;
34680 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34681 enum machine_mode mode = GET_MODE (x);
34682 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34684 switch (code)
34686 case SET:
34687 if (register_operand (SET_DEST (x), VOIDmode)
34688 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34690 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34691 return true;
34693 return false;
34695 case CONST_INT:
34696 case CONST:
34697 case LABEL_REF:
34698 case SYMBOL_REF:
34699 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34700 *total = 3;
34701 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34702 *total = 2;
34703 else if (flag_pic && SYMBOLIC_CONST (x)
34704 && (!TARGET_64BIT
34705 || (GET_CODE (x) != LABEL_REF
34706 && (GET_CODE (x) != SYMBOL_REF
34707 || !SYMBOL_REF_LOCAL_P (x)))))
34708 *total = 1;
34709 else
34710 *total = 0;
34711 return true;
34713 case CONST_DOUBLE:
34714 if (mode == VOIDmode)
34716 *total = 0;
34717 return true;
34719 switch (standard_80387_constant_p (x))
34721 case 1: /* 0.0 */
34722 *total = 1;
34723 return true;
34724 default: /* Other constants */
34725 *total = 2;
34726 return true;
34727 case 0:
34728 case -1:
34729 break;
34731 if (SSE_FLOAT_MODE_P (mode))
34733 case CONST_VECTOR:
34734 switch (standard_sse_constant_p (x))
34736 case 0:
34737 break;
34738 case 1: /* 0: xor eliminates false dependency */
34739 *total = 0;
34740 return true;
34741 default: /* -1: cmp contains false dependency */
34742 *total = 1;
34743 return true;
34746 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34747 it'll probably end up. Add a penalty for size. */
34748 *total = (COSTS_N_INSNS (1)
34749 + (flag_pic != 0 && !TARGET_64BIT)
34750 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34751 return true;
34753 case ZERO_EXTEND:
34754 /* The zero extension is often completely free on x86_64, so make
34755 it as cheap as possible. */
34756 if (TARGET_64BIT && mode == DImode
34757 && GET_MODE (XEXP (x, 0)) == SImode)
34758 *total = 1;
34759 else if (TARGET_ZERO_EXTEND_WITH_AND)
34760 *total = cost->add;
34761 else
34762 *total = cost->movzx;
34763 return false;
34765 case SIGN_EXTEND:
34766 *total = cost->movsx;
34767 return false;
34769 case ASHIFT:
34770 if (SCALAR_INT_MODE_P (mode)
34771 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34772 && CONST_INT_P (XEXP (x, 1)))
34774 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34775 if (value == 1)
34777 *total = cost->add;
34778 return false;
34780 if ((value == 2 || value == 3)
34781 && cost->lea <= cost->shift_const)
34783 *total = cost->lea;
34784 return false;
34787 /* FALLTHRU */
34789 case ROTATE:
34790 case ASHIFTRT:
34791 case LSHIFTRT:
34792 case ROTATERT:
34793 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34795 /* ??? Should be SSE vector operation cost. */
34796 /* At least for published AMD latencies, this really is the same
34797 as the latency for a simple fpu operation like fabs. */
34798 /* V*QImode is emulated with 1-11 insns. */
34799 if (mode == V16QImode || mode == V32QImode)
34801 int count = 11;
34802 if (TARGET_XOP && mode == V16QImode)
34804 /* For XOP we use vpshab, which requires a broadcast of the
34805 value to the variable shift insn. For constants this
34806 means a V16QImode constant in memory; even when we can perform the
34807 shift with one insn, set the cost to prefer paddb. */
34808 if (CONSTANT_P (XEXP (x, 1)))
34810 *total = (cost->fabs
34811 + rtx_cost (XEXP (x, 0), code, 0, speed)
34812 + (speed ? 2 : COSTS_N_BYTES (16)));
34813 return true;
34815 count = 3;
34817 else if (TARGET_SSSE3)
34818 count = 7;
34819 *total = cost->fabs * count;
34821 else
34822 *total = cost->fabs;
34824 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34826 if (CONST_INT_P (XEXP (x, 1)))
34828 if (INTVAL (XEXP (x, 1)) > 32)
34829 *total = cost->shift_const + COSTS_N_INSNS (2);
34830 else
34831 *total = cost->shift_const * 2;
34833 else
34835 if (GET_CODE (XEXP (x, 1)) == AND)
34836 *total = cost->shift_var * 2;
34837 else
34838 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34841 else
34843 if (CONST_INT_P (XEXP (x, 1)))
34844 *total = cost->shift_const;
34845 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34846 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34848 /* Return the cost after shift-and truncation. */
34849 *total = cost->shift_var;
34850 return true;
34852 else
34853 *total = cost->shift_var;
34855 return false;
34857 case FMA:
34859 rtx sub;
34861 gcc_assert (FLOAT_MODE_P (mode));
34862 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
34864 /* ??? SSE scalar/vector cost should be used here. */
34865 /* ??? Bald assumption that fma has the same cost as fmul. */
34866 *total = cost->fmul;
34867 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34869 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34870 sub = XEXP (x, 0);
34871 if (GET_CODE (sub) == NEG)
34872 sub = XEXP (sub, 0);
34873 *total += rtx_cost (sub, FMA, 0, speed);
34875 sub = XEXP (x, 2);
34876 if (GET_CODE (sub) == NEG)
34877 sub = XEXP (sub, 0);
34878 *total += rtx_cost (sub, FMA, 2, speed);
34879 return true;
34882 case MULT:
34883 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34885 /* ??? SSE scalar cost should be used here. */
34886 *total = cost->fmul;
34887 return false;
34889 else if (X87_FLOAT_MODE_P (mode))
34891 *total = cost->fmul;
34892 return false;
34894 else if (FLOAT_MODE_P (mode))
34896 /* ??? SSE vector cost should be used here. */
34897 *total = cost->fmul;
34898 return false;
34900 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34902 /* V*QImode is emulated with 7-13 insns. */
34903 if (mode == V16QImode || mode == V32QImode)
34905 int extra = 11;
34906 if (TARGET_XOP && mode == V16QImode)
34907 extra = 5;
34908 else if (TARGET_SSSE3)
34909 extra = 6;
34910 *total = cost->fmul * 2 + cost->fabs * extra;
34912 /* V*DImode is emulated with 5-8 insns. */
34913 else if (mode == V2DImode || mode == V4DImode)
34915 if (TARGET_XOP && mode == V2DImode)
34916 *total = cost->fmul * 2 + cost->fabs * 3;
34917 else
34918 *total = cost->fmul * 3 + cost->fabs * 5;
34920 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34921 insns, including two PMULUDQ. */
34922 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34923 *total = cost->fmul * 2 + cost->fabs * 5;
34924 else
34925 *total = cost->fmul;
34926 return false;
34928 else
34930 rtx op0 = XEXP (x, 0);
34931 rtx op1 = XEXP (x, 1);
34932 int nbits;
34933 if (CONST_INT_P (XEXP (x, 1)))
34935 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34936 for (nbits = 0; value != 0; value &= value - 1)
34937 nbits++;
34939 else
34940 /* This is arbitrary. */
34941 nbits = 7;
34943 /* Compute costs correctly for widening multiplication. */
34944 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34945 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34946 == GET_MODE_SIZE (mode))
34948 int is_mulwiden = 0;
34949 enum machine_mode inner_mode = GET_MODE (op0);
34951 if (GET_CODE (op0) == GET_CODE (op1))
34952 is_mulwiden = 1, op1 = XEXP (op1, 0);
34953 else if (CONST_INT_P (op1))
34955 if (GET_CODE (op0) == SIGN_EXTEND)
34956 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34957 == INTVAL (op1);
34958 else
34959 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34962 if (is_mulwiden)
34963 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34966 *total = (cost->mult_init[MODE_INDEX (mode)]
34967 + nbits * cost->mult_bit
34968 + rtx_cost (op0, outer_code, opno, speed)
34969 + rtx_cost (op1, outer_code, opno, speed));
34971 return true;
34974 case DIV:
34975 case UDIV:
34976 case MOD:
34977 case UMOD:
34978 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34979 /* ??? SSE cost should be used here. */
34980 *total = cost->fdiv;
34981 else if (X87_FLOAT_MODE_P (mode))
34982 *total = cost->fdiv;
34983 else if (FLOAT_MODE_P (mode))
34984 /* ??? SSE vector cost should be used here. */
34985 *total = cost->fdiv;
34986 else
34987 *total = cost->divide[MODE_INDEX (mode)];
34988 return false;
34990 case PLUS:
34991 if (GET_MODE_CLASS (mode) == MODE_INT
34992 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34994 if (GET_CODE (XEXP (x, 0)) == PLUS
34995 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34996 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34997 && CONSTANT_P (XEXP (x, 1)))
34999 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35000 if (val == 2 || val == 4 || val == 8)
35002 *total = cost->lea;
35003 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35004 outer_code, opno, speed);
35005 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35006 outer_code, opno, speed);
35007 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35008 return true;
35011 else if (GET_CODE (XEXP (x, 0)) == MULT
35012 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35014 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35015 if (val == 2 || val == 4 || val == 8)
35017 *total = cost->lea;
35018 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35019 outer_code, opno, speed);
35020 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35021 return true;
35024 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35026 *total = cost->lea;
35027 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35028 outer_code, opno, speed);
35029 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35030 outer_code, opno, speed);
35031 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35032 return true;
35035 /* FALLTHRU */
35037 case MINUS:
35038 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35040 /* ??? SSE cost should be used here. */
35041 *total = cost->fadd;
35042 return false;
35044 else if (X87_FLOAT_MODE_P (mode))
35046 *total = cost->fadd;
35047 return false;
35049 else if (FLOAT_MODE_P (mode))
35051 /* ??? SSE vector cost should be used here. */
35052 *total = cost->fadd;
35053 return false;
35055 /* FALLTHRU */
35057 case AND:
35058 case IOR:
35059 case XOR:
35060 if (GET_MODE_CLASS (mode) == MODE_INT
35061 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35063 *total = (cost->add * 2
35064 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35065 << (GET_MODE (XEXP (x, 0)) != DImode))
35066 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35067 << (GET_MODE (XEXP (x, 1)) != DImode)));
35068 return true;
35070 /* FALLTHRU */
35072 case NEG:
35073 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35075 /* ??? SSE cost should be used here. */
35076 *total = cost->fchs;
35077 return false;
35079 else if (X87_FLOAT_MODE_P (mode))
35081 *total = cost->fchs;
35082 return false;
35084 else if (FLOAT_MODE_P (mode))
35086 /* ??? SSE vector cost should be used here. */
35087 *total = cost->fchs;
35088 return false;
35090 /* FALLTHRU */
35092 case NOT:
35093 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35095 /* ??? Should be SSE vector operation cost. */
35096 /* At least for published AMD latencies, this really is the same
35097 as the latency for a simple fpu operation like fabs. */
35098 *total = cost->fabs;
35100 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35101 *total = cost->add * 2;
35102 else
35103 *total = cost->add;
35104 return false;
35106 case COMPARE:
35107 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35108 && XEXP (XEXP (x, 0), 1) == const1_rtx
35109 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35110 && XEXP (x, 1) == const0_rtx)
35112 /* This kind of construct is implemented using test[bwl].
35113 Treat it as if we had an AND. */
35114 *total = (cost->add
35115 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35116 + rtx_cost (const1_rtx, outer_code, opno, speed));
35117 return true;
35119 return false;
35121 case FLOAT_EXTEND:
35122 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35123 *total = 0;
35124 return false;
35126 case ABS:
35127 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35128 /* ??? SSE cost should be used here. */
35129 *total = cost->fabs;
35130 else if (X87_FLOAT_MODE_P (mode))
35131 *total = cost->fabs;
35132 else if (FLOAT_MODE_P (mode))
35133 /* ??? SSE vector cost should be used here. */
35134 *total = cost->fabs;
35135 return false;
35137 case SQRT:
35138 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35139 /* ??? SSE cost should be used here. */
35140 *total = cost->fsqrt;
35141 else if (X87_FLOAT_MODE_P (mode))
35142 *total = cost->fsqrt;
35143 else if (FLOAT_MODE_P (mode))
35144 /* ??? SSE vector cost should be used here. */
35145 *total = cost->fsqrt;
35146 return false;
35148 case UNSPEC:
35149 if (XINT (x, 1) == UNSPEC_TP)
35150 *total = 0;
35151 return false;
35153 case VEC_SELECT:
35154 case VEC_CONCAT:
35155 case VEC_MERGE:
35156 case VEC_DUPLICATE:
35157 /* ??? Assume all of these vector manipulation patterns are
35158 recognizable. In which case they all pretty much have the
35159 same cost. */
35160 *total = cost->fabs;
35161 return true;
35163 default:
35164 return false;
35168 #if TARGET_MACHO
35170 static int current_machopic_label_num;
35172 /* Given a symbol name and its associated stub, write out the
35173 definition of the stub. */
35175 void
35176 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35178 unsigned int length;
35179 char *binder_name, *symbol_name, lazy_ptr_name[32];
35180 int label = ++current_machopic_label_num;
35182 /* For 64-bit we shouldn't get here. */
35183 gcc_assert (!TARGET_64BIT);
35185 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35186 symb = targetm.strip_name_encoding (symb);
35188 length = strlen (stub);
35189 binder_name = XALLOCAVEC (char, length + 32);
35190 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35192 length = strlen (symb);
35193 symbol_name = XALLOCAVEC (char, length + 32);
35194 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35196 sprintf (lazy_ptr_name, "L%d$lz", label);
35198 if (MACHOPIC_ATT_STUB)
35199 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35200 else if (MACHOPIC_PURE)
35201 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35202 else
35203 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35205 fprintf (file, "%s:\n", stub);
35206 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35208 if (MACHOPIC_ATT_STUB)
35210 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35212 else if (MACHOPIC_PURE)
35214 /* PIC stub. */
35215 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35216 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35217 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35218 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35219 label, lazy_ptr_name, label);
35220 fprintf (file, "\tjmp\t*%%ecx\n");
35222 else
35223 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35225 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35226 it needs no stub-binding-helper. */
35227 if (MACHOPIC_ATT_STUB)
35228 return;
35230 fprintf (file, "%s:\n", binder_name);
35232 if (MACHOPIC_PURE)
35234 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35235 fprintf (file, "\tpushl\t%%ecx\n");
35237 else
35238 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35240 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35242 /* N.B. Keep the correspondence of these
35243 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35244 old-pic/new-pic/non-pic stubs; altering this will break
35245 compatibility with existing dylibs. */
35246 if (MACHOPIC_PURE)
35248 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35249 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35251 else
35252 /* 16-byte -mdynamic-no-pic stub. */
35253 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35255 fprintf (file, "%s:\n", lazy_ptr_name);
35256 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35257 fprintf (file, ASM_LONG "%s\n", binder_name);
35259 #endif /* TARGET_MACHO */
35261 /* Order the registers for register allocator. */
35263 void
35264 x86_order_regs_for_local_alloc (void)
35266 int pos = 0;
35267 int i;
35269 /* First allocate the local general purpose registers. */
35270 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35271 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35272 reg_alloc_order [pos++] = i;
35274 /* Global general purpose registers. */
35275 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35276 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35277 reg_alloc_order [pos++] = i;
35279 /* x87 registers come first in case we are doing FP math
35280 using them. */
35281 if (!TARGET_SSE_MATH)
35282 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35283 reg_alloc_order [pos++] = i;
35285 /* SSE registers. */
35286 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35287 reg_alloc_order [pos++] = i;
35288 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35289 reg_alloc_order [pos++] = i;
35291 /* Extended REX SSE registers. */
35292 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35293 reg_alloc_order [pos++] = i;
35295 /* Mask register. */
35296 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35297 reg_alloc_order [pos++] = i;
35299 /* x87 registers. */
35300 if (TARGET_SSE_MATH)
35301 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35302 reg_alloc_order [pos++] = i;
35304 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35305 reg_alloc_order [pos++] = i;
35307 /* Initialize the rest of the array, as we do not allocate some registers
35308 at all. */
35309 while (pos < FIRST_PSEUDO_REGISTER)
35310 reg_alloc_order [pos++] = 0;
35313 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35314 in struct attribute_spec handler. */
35315 static tree
35316 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35317 tree args,
35318 int flags ATTRIBUTE_UNUSED,
35319 bool *no_add_attrs)
35321 if (TREE_CODE (*node) != FUNCTION_TYPE
35322 && TREE_CODE (*node) != METHOD_TYPE
35323 && TREE_CODE (*node) != FIELD_DECL
35324 && TREE_CODE (*node) != TYPE_DECL)
35326 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35327 name);
35328 *no_add_attrs = true;
35329 return NULL_TREE;
35331 if (TARGET_64BIT)
35333 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35334 name);
35335 *no_add_attrs = true;
35336 return NULL_TREE;
35338 if (is_attribute_p ("callee_pop_aggregate_return", name))
35340 tree cst;
35342 cst = TREE_VALUE (args);
35343 if (TREE_CODE (cst) != INTEGER_CST)
35345 warning (OPT_Wattributes,
35346 "%qE attribute requires an integer constant argument",
35347 name);
35348 *no_add_attrs = true;
35350 else if (compare_tree_int (cst, 0) != 0
35351 && compare_tree_int (cst, 1) != 0)
35353 warning (OPT_Wattributes,
35354 "argument to %qE attribute is neither zero, nor one",
35355 name);
35356 *no_add_attrs = true;
35359 return NULL_TREE;
35362 return NULL_TREE;
35365 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35366 struct attribute_spec.handler. */
35367 static tree
35368 ix86_handle_abi_attribute (tree *node, tree name,
35369 tree args ATTRIBUTE_UNUSED,
35370 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35372 if (TREE_CODE (*node) != FUNCTION_TYPE
35373 && TREE_CODE (*node) != METHOD_TYPE
35374 && TREE_CODE (*node) != FIELD_DECL
35375 && TREE_CODE (*node) != TYPE_DECL)
35377 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35378 name);
35379 *no_add_attrs = true;
35380 return NULL_TREE;
35383 /* Can combine regparm with all attributes but fastcall. */
35384 if (is_attribute_p ("ms_abi", name))
35386 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35388 error ("ms_abi and sysv_abi attributes are not compatible");
35391 return NULL_TREE;
35393 else if (is_attribute_p ("sysv_abi", name))
35395 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35397 error ("ms_abi and sysv_abi attributes are not compatible");
35400 return NULL_TREE;
35403 return NULL_TREE;
35406 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35407 struct attribute_spec.handler. */
35408 static tree
35409 ix86_handle_struct_attribute (tree *node, tree name,
35410 tree args ATTRIBUTE_UNUSED,
35411 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35413 tree *type = NULL;
35414 if (DECL_P (*node))
35416 if (TREE_CODE (*node) == TYPE_DECL)
35417 type = &TREE_TYPE (*node);
35419 else
35420 type = node;
35422 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35424 warning (OPT_Wattributes, "%qE attribute ignored",
35425 name);
35426 *no_add_attrs = true;
35429 else if ((is_attribute_p ("ms_struct", name)
35430 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35431 || ((is_attribute_p ("gcc_struct", name)
35432 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35434 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35435 name);
35436 *no_add_attrs = true;
35439 return NULL_TREE;
35442 static tree
35443 ix86_handle_fndecl_attribute (tree *node, tree name,
35444 tree args ATTRIBUTE_UNUSED,
35445 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35447 if (TREE_CODE (*node) != FUNCTION_DECL)
35449 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35450 name);
35451 *no_add_attrs = true;
35453 return NULL_TREE;
35456 static bool
35457 ix86_ms_bitfield_layout_p (const_tree record_type)
35459 return ((TARGET_MS_BITFIELD_LAYOUT
35460 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35461 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35464 /* Returns an expression indicating where the this parameter is
35465 located on entry to the FUNCTION. */
35467 static rtx
35468 x86_this_parameter (tree function)
35470 tree type = TREE_TYPE (function);
35471 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35472 int nregs;
35474 if (TARGET_64BIT)
35476 const int *parm_regs;
35478 if (ix86_function_type_abi (type) == MS_ABI)
35479 parm_regs = x86_64_ms_abi_int_parameter_registers;
35480 else
35481 parm_regs = x86_64_int_parameter_registers;
35482 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35485 nregs = ix86_function_regparm (type, function);
35487 if (nregs > 0 && !stdarg_p (type))
35489 int regno;
35490 unsigned int ccvt = ix86_get_callcvt (type);
35492 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35493 regno = aggr ? DX_REG : CX_REG;
35494 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35496 regno = CX_REG;
35497 if (aggr)
35498 return gen_rtx_MEM (SImode,
35499 plus_constant (Pmode, stack_pointer_rtx, 4));
35501 else
35503 regno = AX_REG;
35504 if (aggr)
35506 regno = DX_REG;
35507 if (nregs == 1)
35508 return gen_rtx_MEM (SImode,
35509 plus_constant (Pmode,
35510 stack_pointer_rtx, 4));
35513 return gen_rtx_REG (SImode, regno);
35516 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35517 aggr ? 8 : 4));
35520 /* Determine whether x86_output_mi_thunk can succeed. */
35522 static bool
35523 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35524 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35525 HOST_WIDE_INT vcall_offset, const_tree function)
35527 /* 64-bit can handle anything. */
35528 if (TARGET_64BIT)
35529 return true;
35531 /* For 32-bit, everything's fine if we have one free register. */
35532 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35533 return true;
35535 /* Need a free register for vcall_offset. */
35536 if (vcall_offset)
35537 return false;
35539 /* Need a free register for GOT references. */
35540 if (flag_pic && !targetm.binds_local_p (function))
35541 return false;
35543 /* Otherwise ok. */
35544 return true;
35547 /* Output the assembler code for a thunk function. THUNK_DECL is the
35548 declaration for the thunk function itself, FUNCTION is the decl for
35549 the target function. DELTA is an immediate constant offset to be
35550 added to THIS. If VCALL_OFFSET is nonzero, the word at
35551 *(*this + vcall_offset) should be added to THIS. */
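/* Illustrative C-level sketch of what the emitted thunk computes (a
   paraphrase of the description above, not the generated code):

       this = (char *) this + DELTA;
       if (VCALL_OFFSET)
         this = (char *) this
                + *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
       tail-call FUNCTION with the adjusted this, other args unchanged;
*/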
35553 static void
35554 x86_output_mi_thunk (FILE *file,
35555 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35556 HOST_WIDE_INT vcall_offset, tree function)
35558 rtx this_param = x86_this_parameter (function);
35559 rtx this_reg, tmp, fnaddr;
35560 unsigned int tmp_regno;
35562 if (TARGET_64BIT)
35563 tmp_regno = R10_REG;
35564 else
35566 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35567 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35568 tmp_regno = AX_REG;
35569 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35570 tmp_regno = DX_REG;
35571 else
35572 tmp_regno = CX_REG;
35575 emit_note (NOTE_INSN_PROLOGUE_END);
35577 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35578 pull it in now and let DELTA benefit. */
35579 if (REG_P (this_param))
35580 this_reg = this_param;
35581 else if (vcall_offset)
35583 /* Put the this parameter into %eax. */
35584 this_reg = gen_rtx_REG (Pmode, AX_REG);
35585 emit_move_insn (this_reg, this_param);
35587 else
35588 this_reg = NULL_RTX;
35590 /* Adjust the this parameter by a fixed constant. */
35591 if (delta)
35593 rtx delta_rtx = GEN_INT (delta);
35594 rtx delta_dst = this_reg ? this_reg : this_param;
35596 if (TARGET_64BIT)
35598 if (!x86_64_general_operand (delta_rtx, Pmode))
35600 tmp = gen_rtx_REG (Pmode, tmp_regno);
35601 emit_move_insn (tmp, delta_rtx);
35602 delta_rtx = tmp;
35606 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35609 /* Adjust the this parameter by a value stored in the vtable. */
35610 if (vcall_offset)
35612 rtx vcall_addr, vcall_mem, this_mem;
35614 tmp = gen_rtx_REG (Pmode, tmp_regno);
35616 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35617 if (Pmode != ptr_mode)
35618 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35619 emit_move_insn (tmp, this_mem);
35621 /* Adjust the this parameter. */
35622 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35623 if (TARGET_64BIT
35624 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35626 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35627 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35628 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35631 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35632 if (Pmode != ptr_mode)
35633 emit_insn (gen_addsi_1_zext (this_reg,
35634 gen_rtx_REG (ptr_mode,
35635 REGNO (this_reg)),
35636 vcall_mem));
35637 else
35638 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35641 /* If necessary, drop THIS back to its stack slot. */
35642 if (this_reg && this_reg != this_param)
35643 emit_move_insn (this_param, this_reg);
35645 fnaddr = XEXP (DECL_RTL (function), 0);
35646 if (TARGET_64BIT)
35648 if (!flag_pic || targetm.binds_local_p (function)
35649 || TARGET_PECOFF)
35651 else
35653 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35654 tmp = gen_rtx_CONST (Pmode, tmp);
35655 fnaddr = gen_rtx_MEM (Pmode, tmp);
35658 else
35660 if (!flag_pic || targetm.binds_local_p (function))
35662 #if TARGET_MACHO
35663 else if (TARGET_MACHO)
35665 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35666 fnaddr = XEXP (fnaddr, 0);
35668 #endif /* TARGET_MACHO */
35669 else
35671 tmp = gen_rtx_REG (Pmode, CX_REG);
35672 output_set_got (tmp, NULL_RTX);
35674 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35675 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35676 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35680 /* Our sibling call patterns do not allow memories, because we have no
35681 predicate that can distinguish between frame and non-frame memory.
35682 For our purposes here, we can get away with (ab)using a jump pattern,
35683 because we're going to do no optimization. */
35684 if (MEM_P (fnaddr))
35685 emit_jump_insn (gen_indirect_jump (fnaddr));
35686 else
35688 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35689 fnaddr = legitimize_pic_address (fnaddr,
35690 gen_rtx_REG (Pmode, tmp_regno));
35692 if (!sibcall_insn_operand (fnaddr, word_mode))
35694 tmp = gen_rtx_REG (word_mode, tmp_regno);
35695 if (GET_MODE (fnaddr) != word_mode)
35696 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35697 emit_move_insn (tmp, fnaddr);
35698 fnaddr = tmp;
35701 tmp = gen_rtx_MEM (QImode, fnaddr);
35702 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35703 tmp = emit_call_insn (tmp);
35704 SIBLING_CALL_P (tmp) = 1;
35706 emit_barrier ();
35708 /* Emit just enough of rest_of_compilation to get the insns emitted.
35709 Note that use_thunk calls assemble_start_function et al. */
35710 tmp = get_insns ();
35711 shorten_branches (tmp);
35712 final_start_function (tmp, file, 1);
35713 final (tmp, file, 1);
35714 final_end_function ();
35717 static void
35718 x86_file_start (void)
35720 default_file_start ();
35721 #if TARGET_MACHO
35722 darwin_file_start ();
35723 #endif
35724 if (X86_FILE_START_VERSION_DIRECTIVE)
35725 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35726 if (X86_FILE_START_FLTUSED)
35727 fputs ("\t.global\t__fltused\n", asm_out_file);
35728 if (ix86_asm_dialect == ASM_INTEL)
35729 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35733 x86_field_alignment (tree field, int computed)
35735 enum machine_mode mode;
35736 tree type = TREE_TYPE (field);
35738 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35739 return computed;
35740 mode = TYPE_MODE (strip_array_types (type));
35741 if (mode == DFmode || mode == DCmode
35742 || GET_MODE_CLASS (mode) == MODE_INT
35743 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35744 return MIN (32, computed);
35745 return computed;
35748 /* Output assembler code to FILE to increment profiler label # LABELNO
35749 for profiling a function entry. */
35750 void
35751 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35753 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35754 : MCOUNT_NAME);
35756 if (TARGET_64BIT)
35758 #ifndef NO_PROFILE_COUNTERS
35759 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35760 #endif
35762 if (!TARGET_PECOFF && flag_pic)
35763 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35764 else
35765 fprintf (file, "\tcall\t%s\n", mcount_name);
35767 else if (flag_pic)
35769 #ifndef NO_PROFILE_COUNTERS
35770 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35771 LPREFIX, labelno);
35772 #endif
35773 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35775 else
35777 #ifndef NO_PROFILE_COUNTERS
35778 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35779 LPREFIX, labelno);
35780 #endif
35781 fprintf (file, "\tcall\t%s\n", mcount_name);
35785 /* We don't have exact information about the insn sizes, but we may assume
35786 quite safely that we are informed about all 1 byte insns and memory
35787 address sizes. This is enough to eliminate unnecessary padding in
35788 99% of cases. */
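/* Note: a direct call assembles to opcode 0xE8 followed by a 32-bit
   displacement, hence the fixed size of 5 bytes used below for calls with a
   symbolic target.  */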
35790 static int
35791 min_insn_size (rtx insn)
35793 int l = 0, len;
35795 if (!INSN_P (insn) || !active_insn_p (insn))
35796 return 0;
35798 /* Discard alignments we've emitted and jump instructions. */
35799 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35800 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35801 return 0;
35803 /* Important case - calls are always 5 bytes.
35804 It is common to have many calls in a row. */
35805 if (CALL_P (insn)
35806 && symbolic_reference_mentioned_p (PATTERN (insn))
35807 && !SIBLING_CALL_P (insn))
35808 return 5;
35809 len = get_attr_length (insn);
35810 if (len <= 1)
35811 return 1;
35813 /* For normal instructions we rely on get_attr_length being exact,
35814 with a few exceptions. */
35815 if (!JUMP_P (insn))
35817 enum attr_type type = get_attr_type (insn);
35819 switch (type)
35821 case TYPE_MULTI:
35822 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35823 || asm_noperands (PATTERN (insn)) >= 0)
35824 return 0;
35825 break;
35826 case TYPE_OTHER:
35827 case TYPE_FCMP:
35828 break;
35829 default:
35830 /* Otherwise trust get_attr_length. */
35831 return len;
35834 l = get_attr_length_address (insn);
35835 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35836 l = 4;
35838 if (l)
35839 return 1+l;
35840 else
35841 return 2;
35844 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35846 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
35847 window. */
35849 static void
35850 ix86_avoid_jump_mispredicts (void)
35852 rtx insn, start = get_insns ();
35853 int nbytes = 0, njumps = 0;
35854 int isjump = 0;
35856 /* Look for all minimal intervals of instructions containing 4 jumps.
35857 The intervals are bounded by START and INSN. NBYTES is the total
35858 size of instructions in the interval including INSN and not including
35859 START. When NBYTES is smaller than 16 bytes, it is possible
35860 that the end of START and INSN ends up in the same 16 byte page.
35862 The smallest offset in the page at which INSN can start is the case where
35863 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35864 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN). */
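/* Worked example (numbers chosen for illustration only): if the window
   holding the three previous jumps plus INSN is NBYTES == 14 and INSN
   itself is 2 bytes, the loop below emits a pad of 15 - 14 + 2 == 3 bytes,
   so that INSN cannot end up in the same 16 byte window as START.  */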
35866 for (insn = start; insn; insn = NEXT_INSN (insn))
35868 int min_size;
35870 if (LABEL_P (insn))
35872 int align = label_to_alignment (insn);
35873 int max_skip = label_to_max_skip (insn);
35875 if (max_skip > 15)
35876 max_skip = 15;
35877 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35878 already in the current 16 byte page, because otherwise
35879 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35880 bytes to reach 16 byte boundary. */
35881 if (align <= 0
35882 || (align <= 3 && max_skip != (1 << align) - 1))
35883 max_skip = 0;
35884 if (dump_file)
35885 fprintf (dump_file, "Label %i with max_skip %i\n",
35886 INSN_UID (insn), max_skip);
35887 if (max_skip)
35889 while (nbytes + max_skip >= 16)
35891 start = NEXT_INSN (start);
35892 if (JUMP_P (start) || CALL_P (start))
35893 njumps--, isjump = 1;
35894 else
35895 isjump = 0;
35896 nbytes -= min_insn_size (start);
35899 continue;
35902 min_size = min_insn_size (insn);
35903 nbytes += min_size;
35904 if (dump_file)
35905 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35906 INSN_UID (insn), min_size);
35907 if (JUMP_P (insn) || CALL_P (insn))
35908 njumps++;
35909 else
35910 continue;
35912 while (njumps > 3)
35914 start = NEXT_INSN (start);
35915 if (JUMP_P (start) || CALL_P (start))
35916 njumps--, isjump = 1;
35917 else
35918 isjump = 0;
35919 nbytes -= min_insn_size (start);
35921 gcc_assert (njumps >= 0);
35922 if (dump_file)
35923 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35924 INSN_UID (start), INSN_UID (insn), nbytes);
35926 if (njumps == 3 && isjump && nbytes < 16)
35928 int padsize = 15 - nbytes + min_insn_size (insn);
35930 if (dump_file)
35931 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35932 INSN_UID (insn), padsize);
35933 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35937 #endif
35939 /* AMD Athlon works faster
35940 when RET is not the destination of a conditional jump or directly preceded
35941 by another jump instruction. We avoid the penalty by inserting a NOP just
35942 before the RET instruction in such cases. */
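/* Concretely (illustration): a sequence like

       jne  1f
       ...
     1: ret

   has its RET rewritten via gen_simple_return_internal_long into the long
   return form (commonly assembled as "rep ret"), so the return is no longer
   the direct target or immediate successor of a jump.  */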
35943 static void
35944 ix86_pad_returns (void)
35946 edge e;
35947 edge_iterator ei;
35949 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35951 basic_block bb = e->src;
35952 rtx ret = BB_END (bb);
35953 rtx prev;
35954 bool replace = false;
35956 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35957 || optimize_bb_for_size_p (bb))
35958 continue;
35959 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35960 if (active_insn_p (prev) || LABEL_P (prev))
35961 break;
35962 if (prev && LABEL_P (prev))
35964 edge e;
35965 edge_iterator ei;
35967 FOR_EACH_EDGE (e, ei, bb->preds)
35968 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35969 && !(e->flags & EDGE_FALLTHRU))
35971 replace = true;
35972 break;
35975 if (!replace)
35977 prev = prev_active_insn (ret);
35978 if (prev
35979 && ((JUMP_P (prev) && any_condjump_p (prev))
35980 || CALL_P (prev)))
35981 replace = true;
35982 /* Empty functions get a branch mispredict even when
35983 the jump destination is not visible to us. */
35984 if (!prev && !optimize_function_for_size_p (cfun))
35985 replace = true;
35987 if (replace)
35989 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35990 delete_insn (ret);
35995 /* Count the minimum number of instructions in BB. Return 4 if the
35996 number of instructions >= 4. */
35998 static int
35999 ix86_count_insn_bb (basic_block bb)
36001 rtx insn;
36002 int insn_count = 0;
36004 /* Count number of instructions in this block. Return 4 if the number
36005 of instructions >= 4. */
36006 FOR_BB_INSNS (bb, insn)
36008 /* This only happens in exit blocks. */
36009 if (JUMP_P (insn)
36010 && ANY_RETURN_P (PATTERN (insn)))
36011 break;
36013 if (NONDEBUG_INSN_P (insn)
36014 && GET_CODE (PATTERN (insn)) != USE
36015 && GET_CODE (PATTERN (insn)) != CLOBBER)
36017 insn_count++;
36018 if (insn_count >= 4)
36019 return insn_count;
36023 return insn_count;
36027 /* Count the minimum number of instructions in code path in BB.
36028 Return 4 if the number of instructions >= 4. */
36030 static int
36031 ix86_count_insn (basic_block bb)
36033 edge e;
36034 edge_iterator ei;
36035 int min_prev_count;
36037 /* Only bother counting instructions along paths with no
36038 more than 2 basic blocks between entry and exit. Given
36039 that BB has an edge to exit, determine if a predecessor
36040 of BB has an edge from entry. If so, compute the number
36041 of instructions in the predecessor block. If there
36042 happen to be multiple such blocks, compute the minimum. */
36043 min_prev_count = 4;
36044 FOR_EACH_EDGE (e, ei, bb->preds)
36046 edge prev_e;
36047 edge_iterator prev_ei;
36049 if (e->src == ENTRY_BLOCK_PTR)
36051 min_prev_count = 0;
36052 break;
36054 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36056 if (prev_e->src == ENTRY_BLOCK_PTR)
36058 int count = ix86_count_insn_bb (e->src);
36059 if (count < min_prev_count)
36060 min_prev_count = count;
36061 break;
36066 if (min_prev_count < 4)
36067 min_prev_count += ix86_count_insn_bb (bb);
36069 return min_prev_count;
36072 /* Pad short function to 4 instructions. */
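/* Example of the arithmetic below: if the shortest entry-to-exit path
   contains a single real insn, insn_count == 1, and 2 * (4 - 1) == 6 NOPs
   are emitted before the epilogue; since two NOPs are counted as one
   instruction, this brings the path up to the 4-instruction target.
   (Illustration only.)  */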
36074 static void
36075 ix86_pad_short_function (void)
36077 edge e;
36078 edge_iterator ei;
36080 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36082 rtx ret = BB_END (e->src);
36083 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36085 int insn_count = ix86_count_insn (e->src);
36087 /* Pad short function. */
36088 if (insn_count < 4)
36090 rtx insn = ret;
36092 /* Find epilogue. */
36093 while (insn
36094 && (!NOTE_P (insn)
36095 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36096 insn = PREV_INSN (insn);
36098 if (!insn)
36099 insn = ret;
36101 /* Two NOPs count as one instruction. */
36102 insn_count = 2 * (4 - insn_count);
36103 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36109 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36110 the epilogue, the Windows system unwinder will apply epilogue logic and
36111 produce incorrect offsets. This can be avoided by adding a nop between
36112 the last insn that can throw and the first insn of the epilogue. */
36114 static void
36115 ix86_seh_fixup_eh_fallthru (void)
36117 edge e;
36118 edge_iterator ei;
36120 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36122 rtx insn, next;
36124 /* Find the beginning of the epilogue. */
36125 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36126 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36127 break;
36128 if (insn == NULL)
36129 continue;
36131 /* We only care about preceding insns that can throw. */
36132 insn = prev_active_insn (insn);
36133 if (insn == NULL || !can_throw_internal (insn))
36134 continue;
36136 /* Do not separate calls from their debug information. */
36137 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36138 if (NOTE_P (next)
36139 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36140 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36141 insn = next;
36142 else
36143 break;
36145 emit_insn_after (gen_nops (const1_rtx), insn);
36149 /* Implement machine specific optimizations. We implement padding of returns
36150 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
36151 static void
36152 ix86_reorg (void)
36154 /* We are freeing block_for_insn in the toplev to keep compatibility
36155 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36156 compute_bb_for_insn ();
36158 if (TARGET_SEH && current_function_has_exception_handlers ())
36159 ix86_seh_fixup_eh_fallthru ();
36161 if (optimize && optimize_function_for_speed_p (cfun))
36163 if (TARGET_PAD_SHORT_FUNCTION)
36164 ix86_pad_short_function ();
36165 else if (TARGET_PAD_RETURNS)
36166 ix86_pad_returns ();
36167 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36168 if (TARGET_FOUR_JUMP_LIMIT)
36169 ix86_avoid_jump_mispredicts ();
36170 #endif
36174 /* Return nonzero when a QImode register that must be represented via a REX
36175 prefix is used. */
36176 bool
36177 x86_extended_QIreg_mentioned_p (rtx insn)
36179 int i;
36180 extract_insn_cached (insn);
36181 for (i = 0; i < recog_data.n_operands; i++)
36182 if (GENERAL_REG_P (recog_data.operand[i])
36183 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36184 return true;
36185 return false;
36188 /* Return nonzero when P points to a register encoded via a REX prefix.
36189 Called via for_each_rtx. */
36190 static int
36191 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36193 unsigned int regno;
36194 if (!REG_P (*p))
36195 return 0;
36196 regno = REGNO (*p);
36197 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36200 /* Return true when INSN mentions register that must be encoded using REX
36201 prefix. */
36202 bool
36203 x86_extended_reg_mentioned_p (rtx insn)
36205 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36206 extended_reg_mentioned_1, NULL);
36209 /* If profitable, negate (without causing overflow) integer constant
36210 of mode MODE at location LOC. Return true in this case. */
36211 bool
36212 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36214 HOST_WIDE_INT val;
36216 if (!CONST_INT_P (*loc))
36217 return false;
36219 switch (mode)
36221 case DImode:
36222 /* DImode x86_64 constants must fit in 32 bits. */
36223 gcc_assert (x86_64_immediate_operand (*loc, mode));
36225 mode = SImode;
36226 break;
36228 case SImode:
36229 case HImode:
36230 case QImode:
36231 break;
36233 default:
36234 gcc_unreachable ();
36237 /* Avoid overflows. */
36238 if (mode_signbit_p (mode, *loc))
36239 return false;
36241 val = INTVAL (*loc);
36243 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36244 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
36245 if ((val < 0 && val != -128)
36246 || val == 128)
36248 *loc = GEN_INT (-val);
36249 return true;
36252 return false;
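/* Encoding example: "addl $128, %eax" needs a full 32-bit immediate, while
   the negated form "subl $-128, %eax" fits the sign-extended 8-bit immediate
   range (-128..127) and is therefore shorter; hence the special-casing of
   128 and -128 above.  */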
36255 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36256 optabs would emit if we didn't have TFmode patterns. */
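/* A minimal C sketch of the sequence expanded below (assuming a DImode
   input; illustrative only):

       double
       floatuns (unsigned long long x)
       {
         if ((long long) x >= 0)
           return (double) (long long) x;   /* plain signed conversion  */
         /* Halve the value, or-ing in the low bit so the final rounding
            is unchanged, convert, then double the result.  */
         unsigned long long half = (x >> 1) | (x & 1);
         double f = (double) (long long) half;
         return f + f;
       }
*/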
36258 void
36259 x86_emit_floatuns (rtx operands[2])
36261 rtx neglab, donelab, i0, i1, f0, in, out;
36262 enum machine_mode mode, inmode;
36264 inmode = GET_MODE (operands[1]);
36265 gcc_assert (inmode == SImode || inmode == DImode);
36267 out = operands[0];
36268 in = force_reg (inmode, operands[1]);
36269 mode = GET_MODE (out);
36270 neglab = gen_label_rtx ();
36271 donelab = gen_label_rtx ();
36272 f0 = gen_reg_rtx (mode);
36274 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36276 expand_float (out, in, 0);
36278 emit_jump_insn (gen_jump (donelab));
36279 emit_barrier ();
36281 emit_label (neglab);
36283 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36284 1, OPTAB_DIRECT);
36285 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36286 1, OPTAB_DIRECT);
36287 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36289 expand_float (f0, i0, 0);
36291 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36293 emit_label (donelab);
36296 /* AVX512F does support 64-byte integer vector operations,
36297 thus the longest vector we are faced with is V64QImode. */
36298 #define MAX_VECT_LEN 64
36300 struct expand_vec_perm_d
36302 rtx target, op0, op1;
36303 unsigned char perm[MAX_VECT_LEN];
36304 enum machine_mode vmode;
36305 unsigned char nelt;
36306 bool one_operand_p;
36307 bool testing_p;
36310 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36311 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36312 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36314 /* Get a vector mode of the same size as the original but with elements
36315 twice as wide. This is only guaranteed to apply to integral vectors. */
36317 static inline enum machine_mode
36318 get_mode_wider_vector (enum machine_mode o)
36320 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36321 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36322 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36323 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36324 return n;
36327 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36328 with all elements equal to VAR. Return true if successful. */
36330 static bool
36331 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36332 rtx target, rtx val)
36334 bool ok;
36336 switch (mode)
36338 case V2SImode:
36339 case V2SFmode:
36340 if (!mmx_ok)
36341 return false;
36342 /* FALLTHRU */
36344 case V4DFmode:
36345 case V4DImode:
36346 case V8SFmode:
36347 case V8SImode:
36348 case V2DFmode:
36349 case V2DImode:
36350 case V4SFmode:
36351 case V4SImode:
36353 rtx insn, dup;
36355 /* First attempt to recognize VAL as-is. */
36356 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36357 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36358 if (recog_memoized (insn) < 0)
36360 rtx seq;
36361 /* If that fails, force VAL into a register. */
36363 start_sequence ();
36364 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36365 seq = get_insns ();
36366 end_sequence ();
36367 if (seq)
36368 emit_insn_before (seq, insn);
36370 ok = recog_memoized (insn) >= 0;
36371 gcc_assert (ok);
36374 return true;
36376 case V4HImode:
36377 if (!mmx_ok)
36378 return false;
36379 if (TARGET_SSE || TARGET_3DNOW_A)
36381 rtx x;
36383 val = gen_lowpart (SImode, val);
36384 x = gen_rtx_TRUNCATE (HImode, val);
36385 x = gen_rtx_VEC_DUPLICATE (mode, x);
36386 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36387 return true;
36389 goto widen;
36391 case V8QImode:
36392 if (!mmx_ok)
36393 return false;
36394 goto widen;
36396 case V8HImode:
36397 if (TARGET_SSE2)
36399 struct expand_vec_perm_d dperm;
36400 rtx tmp1, tmp2;
36402 permute:
36403 memset (&dperm, 0, sizeof (dperm));
36404 dperm.target = target;
36405 dperm.vmode = mode;
36406 dperm.nelt = GET_MODE_NUNITS (mode);
36407 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36408 dperm.one_operand_p = true;
36410 /* Extend to SImode using a paradoxical SUBREG. */
36411 tmp1 = gen_reg_rtx (SImode);
36412 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36414 /* Insert the SImode value as low element of a V4SImode vector. */
36415 tmp2 = gen_lowpart (V4SImode, dperm.op0);
36416 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36418 ok = (expand_vec_perm_1 (&dperm)
36419 || expand_vec_perm_broadcast_1 (&dperm));
36420 gcc_assert (ok);
36421 return ok;
36423 goto widen;
36425 case V16QImode:
36426 if (TARGET_SSE2)
36427 goto permute;
36428 goto widen;
36430 widen:
36431 /* Replicate the value once into the next wider mode and recurse. */
36433 enum machine_mode smode, wsmode, wvmode;
36434 rtx x;
36436 smode = GET_MODE_INNER (mode);
36437 wvmode = get_mode_wider_vector (mode);
36438 wsmode = GET_MODE_INNER (wvmode);
36440 val = convert_modes (wsmode, smode, val, true);
36441 x = expand_simple_binop (wsmode, ASHIFT, val,
36442 GEN_INT (GET_MODE_BITSIZE (smode)),
36443 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36444 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36446 x = gen_lowpart (wvmode, target);
36447 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36448 gcc_assert (ok);
36449 return ok;
36452 case V16HImode:
36453 case V32QImode:
36455 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36456 rtx x = gen_reg_rtx (hvmode);
36458 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36459 gcc_assert (ok);
36461 x = gen_rtx_VEC_CONCAT (mode, x, x);
36462 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36464 return true;
36466 default:
36467 return false;
36471 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36472 whose ONE_VAR element is VAR, and other elements are zero. Return true
36473 if successful. */
36475 static bool
36476 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36477 rtx target, rtx var, int one_var)
36479 enum machine_mode vsimode;
36480 rtx new_target;
36481 rtx x, tmp;
36482 bool use_vector_set = false;
36484 switch (mode)
36486 case V2DImode:
36487 /* For SSE4.1, we normally use vector set. But if the second
36488 element is zero and inter-unit moves are OK, we use movq
36489 instead. */
36490 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36491 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36492 && one_var == 0));
36493 break;
36494 case V16QImode:
36495 case V4SImode:
36496 case V4SFmode:
36497 use_vector_set = TARGET_SSE4_1;
36498 break;
36499 case V8HImode:
36500 use_vector_set = TARGET_SSE2;
36501 break;
36502 case V4HImode:
36503 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36504 break;
36505 case V32QImode:
36506 case V16HImode:
36507 case V8SImode:
36508 case V8SFmode:
36509 case V4DFmode:
36510 use_vector_set = TARGET_AVX;
36511 break;
36512 case V4DImode:
36513 /* Use ix86_expand_vector_set in 64bit mode only. */
36514 use_vector_set = TARGET_AVX && TARGET_64BIT;
36515 break;
36516 default:
36517 break;
36520 if (use_vector_set)
36522 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36523 var = force_reg (GET_MODE_INNER (mode), var);
36524 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36525 return true;
36528 switch (mode)
36530 case V2SFmode:
36531 case V2SImode:
36532 if (!mmx_ok)
36533 return false;
36534 /* FALLTHRU */
36536 case V2DFmode:
36537 case V2DImode:
36538 if (one_var != 0)
36539 return false;
36540 var = force_reg (GET_MODE_INNER (mode), var);
36541 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36542 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36543 return true;
36545 case V4SFmode:
36546 case V4SImode:
36547 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36548 new_target = gen_reg_rtx (mode);
36549 else
36550 new_target = target;
36551 var = force_reg (GET_MODE_INNER (mode), var);
36552 x = gen_rtx_VEC_DUPLICATE (mode, var);
36553 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36554 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36555 if (one_var != 0)
36557 /* We need to shuffle the value to the correct position, so
36558 create a new pseudo to store the intermediate result. */
36560 /* With SSE2, we can use the integer shuffle insns. */
36561 if (mode != V4SFmode && TARGET_SSE2)
36563 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36564 const1_rtx,
36565 GEN_INT (one_var == 1 ? 0 : 1),
36566 GEN_INT (one_var == 2 ? 0 : 1),
36567 GEN_INT (one_var == 3 ? 0 : 1)));
36568 if (target != new_target)
36569 emit_move_insn (target, new_target);
36570 return true;
36573 /* Otherwise convert the intermediate result to V4SFmode and
36574 use the SSE1 shuffle instructions. */
36575 if (mode != V4SFmode)
36577 tmp = gen_reg_rtx (V4SFmode);
36578 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36580 else
36581 tmp = new_target;
36583 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36584 const1_rtx,
36585 GEN_INT (one_var == 1 ? 0 : 1),
36586 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36587 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36589 if (mode != V4SFmode)
36590 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36591 else if (tmp != target)
36592 emit_move_insn (target, tmp);
36594 else if (target != new_target)
36595 emit_move_insn (target, new_target);
36596 return true;
36598 case V8HImode:
36599 case V16QImode:
36600 vsimode = V4SImode;
36601 goto widen;
36602 case V4HImode:
36603 case V8QImode:
36604 if (!mmx_ok)
36605 return false;
36606 vsimode = V2SImode;
36607 goto widen;
36608 widen:
36609 if (one_var != 0)
36610 return false;
36612 /* Zero extend the variable element to SImode and recurse. */
36613 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36615 x = gen_reg_rtx (vsimode);
36616 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36617 var, one_var))
36618 gcc_unreachable ();
36620 emit_move_insn (target, gen_lowpart (mode, x));
36621 return true;
36623 default:
36624 return false;
36628 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36629 consisting of the values in VALS. It is known that all elements
36630 except ONE_VAR are constants. Return true if successful. */
36632 static bool
36633 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36634 rtx target, rtx vals, int one_var)
36636 rtx var = XVECEXP (vals, 0, one_var);
36637 enum machine_mode wmode;
36638 rtx const_vec, x;
36640 const_vec = copy_rtx (vals);
36641 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36642 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36644 switch (mode)
36646 case V2DFmode:
36647 case V2DImode:
36648 case V2SFmode:
36649 case V2SImode:
36650 /* For the two element vectors, it's just as easy to use
36651 the general case. */
36652 return false;
36654 case V4DImode:
36655 /* Use ix86_expand_vector_set in 64bit mode only. */
36656 if (!TARGET_64BIT)
36657 return false;
36658 case V4DFmode:
36659 case V8SFmode:
36660 case V8SImode:
36661 case V16HImode:
36662 case V32QImode:
36663 case V4SFmode:
36664 case V4SImode:
36665 case V8HImode:
36666 case V4HImode:
36667 break;
36669 case V16QImode:
36670 if (TARGET_SSE4_1)
36671 break;
36672 wmode = V8HImode;
36673 goto widen;
36674 case V8QImode:
36675 wmode = V4HImode;
36676 goto widen;
36677 widen:
36678 /* There's no way to set one QImode entry easily. Combine
36679 the variable value with its adjacent constant value, and
36680 promote to an HImode set. */
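/* For example (V8QImode, ONE_VAR == 3, little-endian element order):
   element 3 is the high byte of halfword 1, so the variable byte is
   zero-extended to HImode, shifted left by 8, IORed with the constant
   byte taken from element 2, and the combined halfword is inserted at
   index ONE_VAR >> 1 == 1 of the V4HImode view of the vector.  */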
36681 x = XVECEXP (vals, 0, one_var ^ 1);
36682 if (one_var & 1)
36684 var = convert_modes (HImode, QImode, var, true);
36685 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36686 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36687 x = GEN_INT (INTVAL (x) & 0xff);
36689 else
36691 var = convert_modes (HImode, QImode, var, true);
36692 x = gen_int_mode (INTVAL (x) << 8, HImode);
36694 if (x != const0_rtx)
36695 var = expand_simple_binop (HImode, IOR, var, x, var,
36696 1, OPTAB_LIB_WIDEN);
36698 x = gen_reg_rtx (wmode);
36699 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36700 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36702 emit_move_insn (target, gen_lowpart (mode, x));
36703 return true;
36705 default:
36706 return false;
36709 emit_move_insn (target, const_vec);
36710 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36711 return true;
36714 /* A subroutine of ix86_expand_vector_init_general. Use vector
36715 concatenate to handle the most general case: all values variable,
36716 and none identical. */
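/* For example, a V8SFmode vector built from eight variable SFmode
   values is assembled bottom-up: the scalars are paired into four
   V2SFmode registers, those are concatenated into two V4SFmode
   halves, and a final VEC_CONCAT produces the V8SFmode result.  */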
36718 static void
36719 ix86_expand_vector_init_concat (enum machine_mode mode,
36720 rtx target, rtx *ops, int n)
36722 enum machine_mode cmode, hmode = VOIDmode;
36723 rtx first[8], second[4];
36724 rtvec v;
36725 int i, j;
36727 switch (n)
36729 case 2:
36730 switch (mode)
36732 case V8SImode:
36733 cmode = V4SImode;
36734 break;
36735 case V8SFmode:
36736 cmode = V4SFmode;
36737 break;
36738 case V4DImode:
36739 cmode = V2DImode;
36740 break;
36741 case V4DFmode:
36742 cmode = V2DFmode;
36743 break;
36744 case V4SImode:
36745 cmode = V2SImode;
36746 break;
36747 case V4SFmode:
36748 cmode = V2SFmode;
36749 break;
36750 case V2DImode:
36751 cmode = DImode;
36752 break;
36753 case V2SImode:
36754 cmode = SImode;
36755 break;
36756 case V2DFmode:
36757 cmode = DFmode;
36758 break;
36759 case V2SFmode:
36760 cmode = SFmode;
36761 break;
36762 default:
36763 gcc_unreachable ();
36766 if (!register_operand (ops[1], cmode))
36767 ops[1] = force_reg (cmode, ops[1]);
36768 if (!register_operand (ops[0], cmode))
36769 ops[0] = force_reg (cmode, ops[0]);
36770 emit_insn (gen_rtx_SET (VOIDmode, target,
36771 gen_rtx_VEC_CONCAT (mode, ops[0],
36772 ops[1])));
36773 break;
36775 case 4:
36776 switch (mode)
36778 case V4DImode:
36779 cmode = V2DImode;
36780 break;
36781 case V4DFmode:
36782 cmode = V2DFmode;
36783 break;
36784 case V4SImode:
36785 cmode = V2SImode;
36786 break;
36787 case V4SFmode:
36788 cmode = V2SFmode;
36789 break;
36790 default:
36791 gcc_unreachable ();
36793 goto half;
36795 case 8:
36796 switch (mode)
36798 case V8SImode:
36799 cmode = V2SImode;
36800 hmode = V4SImode;
36801 break;
36802 case V8SFmode:
36803 cmode = V2SFmode;
36804 hmode = V4SFmode;
36805 break;
36806 default:
36807 gcc_unreachable ();
36809 goto half;
36811 half:
36812 /* FIXME: We process inputs backward to help RA. PR 36222. */
36813 i = n - 1;
36814 j = (n >> 1) - 1;
36815 for (; i > 0; i -= 2, j--)
36817 first[j] = gen_reg_rtx (cmode);
36818 v = gen_rtvec (2, ops[i - 1], ops[i]);
36819 ix86_expand_vector_init (false, first[j],
36820 gen_rtx_PARALLEL (cmode, v));
36823 n >>= 1;
36824 if (n > 2)
36826 gcc_assert (hmode != VOIDmode);
36827 for (i = j = 0; i < n; i += 2, j++)
36829 second[j] = gen_reg_rtx (hmode);
36830 ix86_expand_vector_init_concat (hmode, second [j],
36831 &first [i], 2);
36833 n >>= 1;
36834 ix86_expand_vector_init_concat (mode, target, second, n);
36836 else
36837 ix86_expand_vector_init_concat (mode, target, first, n);
36838 break;
36840 default:
36841 gcc_unreachable ();
36845 /* A subroutine of ix86_expand_vector_init_general. Use vector
36846 interleave to handle the most general case: all values variable,
36847 and none identical. */
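/* Sketch of the strategy for V8HImode (V16QImode is analogous, with
   one more level): each pair of scalars is packed into the low 32 bits
   of its own vector (the first element through an SImode move into
   lane 0, the second via gen_load_even into lane 1); pairs of those
   vectors are then combined with interleave-low operations, first at
   V4SImode and finally at V2DImode granularity, yielding the fully
   populated vector.  */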
36849 static void
36850 ix86_expand_vector_init_interleave (enum machine_mode mode,
36851 rtx target, rtx *ops, int n)
36853 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36854 int i, j;
36855 rtx op0, op1;
36856 rtx (*gen_load_even) (rtx, rtx, rtx);
36857 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36858 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36860 switch (mode)
36862 case V8HImode:
36863 gen_load_even = gen_vec_setv8hi;
36864 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36865 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36866 inner_mode = HImode;
36867 first_imode = V4SImode;
36868 second_imode = V2DImode;
36869 third_imode = VOIDmode;
36870 break;
36871 case V16QImode:
36872 gen_load_even = gen_vec_setv16qi;
36873 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36874 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36875 inner_mode = QImode;
36876 first_imode = V8HImode;
36877 second_imode = V4SImode;
36878 third_imode = V2DImode;
36879 break;
36880 default:
36881 gcc_unreachable ();
36884 for (i = 0; i < n; i++)
36886 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36887 op0 = gen_reg_rtx (SImode);
36888 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36890 /* Insert the SImode value as low element of a V4SImode vector. */
36891 op1 = gen_reg_rtx (V4SImode);
36892 op0 = gen_rtx_VEC_MERGE (V4SImode,
36893 gen_rtx_VEC_DUPLICATE (V4SImode,
36894 op0),
36895 CONST0_RTX (V4SImode),
36896 const1_rtx);
36897 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36899 /* Cast the V4SImode vector back to a vector in the original mode. */
36900 op0 = gen_reg_rtx (mode);
36901 emit_move_insn (op0, gen_lowpart (mode, op1));
36903 /* Load even elements into the second position. */
36904 emit_insn (gen_load_even (op0,
36905 force_reg (inner_mode,
36906 ops [i + i + 1]),
36907 const1_rtx));
36909 /* Cast vector to FIRST_IMODE vector. */
36910 ops[i] = gen_reg_rtx (first_imode);
36911 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36914 /* Interleave low FIRST_IMODE vectors. */
36915 for (i = j = 0; i < n; i += 2, j++)
36917 op0 = gen_reg_rtx (first_imode);
36918 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36920 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36921 ops[j] = gen_reg_rtx (second_imode);
36922 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36925 /* Interleave low SECOND_IMODE vectors. */
36926 switch (second_imode)
36928 case V4SImode:
36929 for (i = j = 0; i < n / 2; i += 2, j++)
36931 op0 = gen_reg_rtx (second_imode);
36932 emit_insn (gen_interleave_second_low (op0, ops[i],
36933 ops[i + 1]));
36935 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36936 vector. */
36937 ops[j] = gen_reg_rtx (third_imode);
36938 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36940 second_imode = V2DImode;
36941 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36942 /* FALLTHRU */
36944 case V2DImode:
36945 op0 = gen_reg_rtx (second_imode);
36946 emit_insn (gen_interleave_second_low (op0, ops[0],
36947 ops[1]));
36949 /* Cast the SECOND_IMODE vector back to a vector in the original
36950 mode. */
36951 emit_insn (gen_rtx_SET (VOIDmode, target,
36952 gen_lowpart (mode, op0)));
36953 break;
36955 default:
36956 gcc_unreachable ();
36960 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36961 all values variable, and none identical. */
36963 static void
36964 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36965 rtx target, rtx vals)
36967 rtx ops[32], op0, op1;
36968 enum machine_mode half_mode = VOIDmode;
36969 int n, i;
36971 switch (mode)
36973 case V2SFmode:
36974 case V2SImode:
36975 if (!mmx_ok && !TARGET_SSE)
36976 break;
36977 /* FALLTHRU */
36979 case V8SFmode:
36980 case V8SImode:
36981 case V4DFmode:
36982 case V4DImode:
36983 case V4SFmode:
36984 case V4SImode:
36985 case V2DFmode:
36986 case V2DImode:
36987 n = GET_MODE_NUNITS (mode);
36988 for (i = 0; i < n; i++)
36989 ops[i] = XVECEXP (vals, 0, i);
36990 ix86_expand_vector_init_concat (mode, target, ops, n);
36991 return;
36993 case V32QImode:
36994 half_mode = V16QImode;
36995 goto half;
36997 case V16HImode:
36998 half_mode = V8HImode;
36999 goto half;
37001 half:
37002 n = GET_MODE_NUNITS (mode);
37003 for (i = 0; i < n; i++)
37004 ops[i] = XVECEXP (vals, 0, i);
37005 op0 = gen_reg_rtx (half_mode);
37006 op1 = gen_reg_rtx (half_mode);
37007 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37008 n >> 2);
37009 ix86_expand_vector_init_interleave (half_mode, op1,
37010 &ops [n >> 1], n >> 2);
37011 emit_insn (gen_rtx_SET (VOIDmode, target,
37012 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37013 return;
37015 case V16QImode:
37016 if (!TARGET_SSE4_1)
37017 break;
37018 /* FALLTHRU */
37020 case V8HImode:
37021 if (!TARGET_SSE2)
37022 break;
37024 /* Don't use ix86_expand_vector_init_interleave if we can't
37025 move from GPR to SSE register directly. */
37026 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37027 break;
37029 n = GET_MODE_NUNITS (mode);
37030 for (i = 0; i < n; i++)
37031 ops[i] = XVECEXP (vals, 0, i);
37032 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37033 return;
37035 case V4HImode:
37036 case V8QImode:
37037 break;
37039 default:
37040 gcc_unreachable ();
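/* Fallback for the remaining MMX-sized modes (V4HImode, V8QImode):
   pack n_elt_per_word elements into each word_mode integer using
   shifts and IORs (the highest-numbered element of each group goes in
   first, so element 0 ends up in the least significant bits), then
   assemble the vector from the resulting words.  For example, on a
   32-bit target a V4HImode vector is built from two SImode words of
   two halfwords each.  */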
37044 int i, j, n_elts, n_words, n_elt_per_word;
37045 enum machine_mode inner_mode;
37046 rtx words[4], shift;
37048 inner_mode = GET_MODE_INNER (mode);
37049 n_elts = GET_MODE_NUNITS (mode);
37050 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37051 n_elt_per_word = n_elts / n_words;
37052 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37054 for (i = 0; i < n_words; ++i)
37056 rtx word = NULL_RTX;
37058 for (j = 0; j < n_elt_per_word; ++j)
37060 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37061 elt = convert_modes (word_mode, inner_mode, elt, true);
37063 if (j == 0)
37064 word = elt;
37065 else
37067 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37068 word, 1, OPTAB_LIB_WIDEN);
37069 word = expand_simple_binop (word_mode, IOR, word, elt,
37070 word, 1, OPTAB_LIB_WIDEN);
37074 words[i] = word;
37077 if (n_words == 1)
37078 emit_move_insn (target, gen_lowpart (mode, words[0]));
37079 else if (n_words == 2)
37081 rtx tmp = gen_reg_rtx (mode);
37082 emit_clobber (tmp);
37083 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37084 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37085 emit_move_insn (target, tmp);
37087 else if (n_words == 4)
37089 rtx tmp = gen_reg_rtx (V4SImode);
37090 gcc_assert (word_mode == SImode);
37091 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37092 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37093 emit_move_insn (target, gen_lowpart (mode, tmp));
37095 else
37096 gcc_unreachable ();
37100 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37101 instructions unless MMX_OK is true. */
37103 void
37104 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37106 enum machine_mode mode = GET_MODE (target);
37107 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37108 int n_elts = GET_MODE_NUNITS (mode);
37109 int n_var = 0, one_var = -1;
37110 bool all_same = true, all_const_zero = true;
37111 int i;
37112 rtx x;
37114 for (i = 0; i < n_elts; ++i)
37116 x = XVECEXP (vals, 0, i);
37117 if (!(CONST_INT_P (x)
37118 || GET_CODE (x) == CONST_DOUBLE
37119 || GET_CODE (x) == CONST_FIXED))
37120 n_var++, one_var = i;
37121 else if (x != CONST0_RTX (inner_mode))
37122 all_const_zero = false;
37123 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37124 all_same = false;
37127 /* Constants are best loaded from the constant pool. */
37128 if (n_var == 0)
37130 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37131 return;
37134 /* If all values are identical, broadcast the value. */
37135 if (all_same
37136 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37137 XVECEXP (vals, 0, 0)))
37138 return;
37140 /* Values where only one field is non-constant are best loaded from
37141 the pool and overwritten via move later. */
37142 if (n_var == 1)
37144 if (all_const_zero
37145 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37146 XVECEXP (vals, 0, one_var),
37147 one_var))
37148 return;
37150 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37151 return;
37154 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37157 void
37158 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37160 enum machine_mode mode = GET_MODE (target);
37161 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37162 enum machine_mode half_mode;
37163 bool use_vec_merge = false;
37164 rtx tmp;
37165 static rtx (*gen_extract[6][2]) (rtx, rtx)
37167 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37168 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37169 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37170 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37171 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37172 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37174 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37176 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37177 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37178 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37179 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37180 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37181 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37183 int i, j, n;
37185 switch (mode)
37187 case V2SFmode:
37188 case V2SImode:
37189 if (mmx_ok)
37191 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37192 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37193 if (elt == 0)
37194 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37195 else
37196 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37197 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37198 return;
37200 break;
37202 case V2DImode:
37203 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37204 if (use_vec_merge)
37205 break;
37207 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37208 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37209 if (elt == 0)
37210 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37211 else
37212 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37213 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37214 return;
37216 case V2DFmode:
37218 rtx op0, op1;
37220 /* For the two element vectors, we implement a VEC_CONCAT with
37221 the extraction of the other element. */
37223 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37224 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37226 if (elt == 0)
37227 op0 = val, op1 = tmp;
37228 else
37229 op0 = tmp, op1 = val;
37231 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37232 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37234 return;
37236 case V4SFmode:
37237 use_vec_merge = TARGET_SSE4_1;
37238 if (use_vec_merge)
37239 break;
37241 switch (elt)
37243 case 0:
37244 use_vec_merge = true;
37245 break;
37247 case 1:
37248 /* tmp = target = A B C D */
37249 tmp = copy_to_reg (target);
37250 /* target = A A B B */
37251 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37252 /* target = X A B B */
37253 ix86_expand_vector_set (false, target, val, 0);
37254 /* target = A X C D */
37255 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37256 const1_rtx, const0_rtx,
37257 GEN_INT (2+4), GEN_INT (3+4)));
37258 return;
37260 case 2:
37261 /* tmp = target = A B C D */
37262 tmp = copy_to_reg (target);
37263 /* tmp = X B C D */
37264 ix86_expand_vector_set (false, tmp, val, 0);
37265 /* target = A B X D */
37266 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37267 const0_rtx, const1_rtx,
37268 GEN_INT (0+4), GEN_INT (3+4)));
37269 return;
37271 case 3:
37272 /* tmp = target = A B C D */
37273 tmp = copy_to_reg (target);
37274 /* tmp = X B C D */
37275 ix86_expand_vector_set (false, tmp, val, 0);
37276 /* target = A B C X */
37277 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37278 const0_rtx, const1_rtx,
37279 GEN_INT (2+4), GEN_INT (0+4)));
37280 return;
37282 default:
37283 gcc_unreachable ();
37285 break;
37287 case V4SImode:
37288 use_vec_merge = TARGET_SSE4_1;
37289 if (use_vec_merge)
37290 break;
37292 /* Element 0 handled by vec_merge below. */
37293 if (elt == 0)
37295 use_vec_merge = true;
37296 break;
37299 if (TARGET_SSE2)
37301 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37302 store into element 0, then shuffle them back. */
37304 rtx order[4];
37306 order[0] = GEN_INT (elt);
37307 order[1] = const1_rtx;
37308 order[2] = const2_rtx;
37309 order[3] = GEN_INT (3);
37310 order[elt] = const0_rtx;
37312 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37313 order[1], order[2], order[3]));
37315 ix86_expand_vector_set (false, target, val, 0);
37317 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37318 order[1], order[2], order[3]));
37320 else
37322 /* For SSE1, we have to reuse the V4SF code. */
37323 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
37324 gen_lowpart (SFmode, val), elt);
37326 return;
37328 case V8HImode:
37329 use_vec_merge = TARGET_SSE2;
37330 break;
37331 case V4HImode:
37332 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37333 break;
37335 case V16QImode:
37336 use_vec_merge = TARGET_SSE4_1;
37337 break;
37339 case V8QImode:
37340 break;
37342 case V32QImode:
37343 half_mode = V16QImode;
37344 j = 0;
37345 n = 16;
37346 goto half;
37348 case V16HImode:
37349 half_mode = V8HImode;
37350 j = 1;
37351 n = 8;
37352 goto half;
37354 case V8SImode:
37355 half_mode = V4SImode;
37356 j = 2;
37357 n = 4;
37358 goto half;
37360 case V4DImode:
37361 half_mode = V2DImode;
37362 j = 3;
37363 n = 2;
37364 goto half;
37366 case V8SFmode:
37367 half_mode = V4SFmode;
37368 j = 4;
37369 n = 4;
37370 goto half;
37372 case V4DFmode:
37373 half_mode = V2DFmode;
37374 j = 5;
37375 n = 2;
37376 goto half;
37378 half:
37379 /* Compute offset. */
37380 i = elt / n;
37381 elt %= n;
37383 gcc_assert (i <= 1);
37385 /* Extract the half. */
37386 tmp = gen_reg_rtx (half_mode);
37387 emit_insn (gen_extract[j][i] (tmp, target));
37389 /* Put val in tmp at elt. */
37390 ix86_expand_vector_set (false, tmp, val, elt);
37392 /* Put it back. */
37393 emit_insn (gen_insert[j][i] (target, target, tmp));
37394 return;
37396 default:
37397 break;
37400 if (use_vec_merge)
37402 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37403 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37404 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37406 else
37408 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37410 emit_move_insn (mem, target);
37412 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37413 emit_move_insn (tmp, val);
37415 emit_move_insn (target, mem);
37419 void
37420 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37422 enum machine_mode mode = GET_MODE (vec);
37423 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37424 bool use_vec_extr = false;
37425 rtx tmp;
37427 switch (mode)
37429 case V2SImode:
37430 case V2SFmode:
37431 if (!mmx_ok)
37432 break;
37433 /* FALLTHRU */
37435 case V2DFmode:
37436 case V2DImode:
37437 use_vec_extr = true;
37438 break;
37440 case V4SFmode:
37441 use_vec_extr = TARGET_SSE4_1;
37442 if (use_vec_extr)
37443 break;
37445 switch (elt)
37447 case 0:
37448 tmp = vec;
37449 break;
37451 case 1:
37452 case 3:
37453 tmp = gen_reg_rtx (mode);
37454 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37455 GEN_INT (elt), GEN_INT (elt),
37456 GEN_INT (elt+4), GEN_INT (elt+4)));
37457 break;
37459 case 2:
37460 tmp = gen_reg_rtx (mode);
37461 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37462 break;
37464 default:
37465 gcc_unreachable ();
37467 vec = tmp;
37468 use_vec_extr = true;
37469 elt = 0;
37470 break;
37472 case V4SImode:
37473 use_vec_extr = TARGET_SSE4_1;
37474 if (use_vec_extr)
37475 break;
37477 if (TARGET_SSE2)
37479 switch (elt)
37481 case 0:
37482 tmp = vec;
37483 break;
37485 case 1:
37486 case 3:
37487 tmp = gen_reg_rtx (mode);
37488 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37489 GEN_INT (elt), GEN_INT (elt),
37490 GEN_INT (elt), GEN_INT (elt)));
37491 break;
37493 case 2:
37494 tmp = gen_reg_rtx (mode);
37495 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37496 break;
37498 default:
37499 gcc_unreachable ();
37501 vec = tmp;
37502 use_vec_extr = true;
37503 elt = 0;
37505 else
37507 /* For SSE1, we have to reuse the V4SF code. */
37508 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37509 gen_lowpart (V4SFmode, vec), elt);
37510 return;
37512 break;
37514 case V8HImode:
37515 use_vec_extr = TARGET_SSE2;
37516 break;
37517 case V4HImode:
37518 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37519 break;
37521 case V16QImode:
37522 use_vec_extr = TARGET_SSE4_1;
37523 break;
37525 case V8SFmode:
37526 if (TARGET_AVX)
37528 tmp = gen_reg_rtx (V4SFmode);
37529 if (elt < 4)
37530 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37531 else
37532 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37533 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37534 return;
37536 break;
37538 case V4DFmode:
37539 if (TARGET_AVX)
37541 tmp = gen_reg_rtx (V2DFmode);
37542 if (elt < 2)
37543 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37544 else
37545 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37546 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37547 return;
37549 break;
37551 case V32QImode:
37552 if (TARGET_AVX)
37554 tmp = gen_reg_rtx (V16QImode);
37555 if (elt < 16)
37556 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37557 else
37558 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37559 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37560 return;
37562 break;
37564 case V16HImode:
37565 if (TARGET_AVX)
37567 tmp = gen_reg_rtx (V8HImode);
37568 if (elt < 8)
37569 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37570 else
37571 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37572 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37573 return;
37575 break;
37577 case V8SImode:
37578 if (TARGET_AVX)
37580 tmp = gen_reg_rtx (V4SImode);
37581 if (elt < 4)
37582 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37583 else
37584 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37585 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37586 return;
37588 break;
37590 case V4DImode:
37591 if (TARGET_AVX)
37593 tmp = gen_reg_rtx (V2DImode);
37594 if (elt < 2)
37595 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37596 else
37597 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37598 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37599 return;
37601 break;
37603 case V8QImode:
37604 /* ??? Could extract the appropriate HImode element and shift. */
37605 default:
37606 break;
37609 if (use_vec_extr)
37611 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37612 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37614 /* Let the rtl optimizers know about the zero extension performed. */
37615 if (inner_mode == QImode || inner_mode == HImode)
37617 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37618 target = gen_lowpart (SImode, target);
37621 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37623 else
37625 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37627 emit_move_insn (mem, vec);
37629 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37630 emit_move_insn (target, tmp);
37634 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37635 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37636 The upper bits of DEST are undefined, though they shouldn't cause
37637 exceptions (some bits from src or all zeros are ok). */
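/* For example, reducing a V4SImode vector takes two calls: with
   I == 128 the 128-bit value is shifted right by 64 bits (elements 2
   and 3 land in lanes 0 and 1), and with I == 64 it is shifted right
   by 32 bits (element 1 lands in lane 0).  */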
37639 static void
37640 emit_reduc_half (rtx dest, rtx src, int i)
37642 rtx tem;
37643 switch (GET_MODE (src))
37645 case V4SFmode:
37646 if (i == 128)
37647 tem = gen_sse_movhlps (dest, src, src);
37648 else
37649 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37650 GEN_INT (1 + 4), GEN_INT (1 + 4));
37651 break;
37652 case V2DFmode:
37653 tem = gen_vec_interleave_highv2df (dest, src, src);
37654 break;
37655 case V16QImode:
37656 case V8HImode:
37657 case V4SImode:
37658 case V2DImode:
37659 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37660 gen_lowpart (V1TImode, src),
37661 GEN_INT (i / 2));
37662 break;
37663 case V8SFmode:
37664 if (i == 256)
37665 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37666 else
37667 tem = gen_avx_shufps256 (dest, src, src,
37668 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37669 break;
37670 case V4DFmode:
37671 if (i == 256)
37672 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37673 else
37674 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37675 break;
37676 case V32QImode:
37677 case V16HImode:
37678 case V8SImode:
37679 case V4DImode:
37680 if (i == 256)
37681 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37682 gen_lowpart (V4DImode, src),
37683 gen_lowpart (V4DImode, src),
37684 const1_rtx);
37685 else
37686 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37687 gen_lowpart (V2TImode, src),
37688 GEN_INT (i / 2));
37689 break;
37690 default:
37691 gcc_unreachable ();
37693 emit_insn (tem);
37696 /* Expand a vector reduction. FN is the binary pattern to reduce;
37697 DEST is the destination; IN is the input vector. */
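/* For a V4SImode reduction (say, a signed maximum) the loop below runs
   twice:
     i = 128:  half = upper 64 bits of vec;   tmp  = fn (half, vec)
     i = 64:   half = upper 32 bits of tmp;   dest = fn (half, tmp)
   leaving the reduced value in element 0 of DEST; the remaining lanes
   are unspecified.  */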
37699 void
37700 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37702 rtx half, dst, vec = in;
37703 enum machine_mode mode = GET_MODE (in);
37704 int i;
37706 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37707 if (TARGET_SSE4_1
37708 && mode == V8HImode
37709 && fn == gen_uminv8hi3)
37711 emit_insn (gen_sse4_1_phminposuw (dest, in));
37712 return;
37715 for (i = GET_MODE_BITSIZE (mode);
37716 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37717 i >>= 1)
37719 half = gen_reg_rtx (mode);
37720 emit_reduc_half (half, vec, i);
37721 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37722 dst = dest;
37723 else
37724 dst = gen_reg_rtx (mode);
37725 emit_insn (fn (dst, half, vec));
37726 vec = dst;
37730 /* Target hook for scalar_mode_supported_p. */
37731 static bool
37732 ix86_scalar_mode_supported_p (enum machine_mode mode)
37734 if (DECIMAL_FLOAT_MODE_P (mode))
37735 return default_decimal_float_supported_p ();
37736 else if (mode == TFmode)
37737 return true;
37738 else
37739 return default_scalar_mode_supported_p (mode);
37742 /* Implements target hook vector_mode_supported_p. */
37743 static bool
37744 ix86_vector_mode_supported_p (enum machine_mode mode)
37746 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37747 return true;
37748 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37749 return true;
37750 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37751 return true;
37752 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37753 return true;
37754 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37755 return true;
37756 return false;
37759 /* Target hook for c_mode_for_suffix. */
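/* E.g. the floating constant suffixes in "1.0q" (__float128, TFmode)
   and "1.0w" (__float80, XFmode) are routed through this hook.  */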
37760 static enum machine_mode
37761 ix86_c_mode_for_suffix (char suffix)
37763 if (suffix == 'q')
37764 return TFmode;
37765 if (suffix == 'w')
37766 return XFmode;
37768 return VOIDmode;
37771 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37773 We do this in the new i386 backend to maintain source compatibility
37774 with the old cc0-based compiler. */
37776 static tree
37777 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37778 tree inputs ATTRIBUTE_UNUSED,
37779 tree clobbers)
37781 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37782 clobbers);
37783 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37784 clobbers);
37785 return clobbers;
37788 /* Implements the target hook targetm.encode_section_info. */
37790 static void ATTRIBUTE_UNUSED
37791 ix86_encode_section_info (tree decl, rtx rtl, int first)
37793 default_encode_section_info (decl, rtl, first);
37795 if (TREE_CODE (decl) == VAR_DECL
37796 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37797 && ix86_in_large_data_p (decl))
37798 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37801 /* Worker function for REVERSE_CONDITION. */
37803 enum rtx_code
37804 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37806 return (mode != CCFPmode && mode != CCFPUmode
37807 ? reverse_condition (code)
37808 : reverse_condition_maybe_unordered (code));
37811 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37812 to OPERANDS[0]. */
37814 const char *
37815 output_387_reg_move (rtx insn, rtx *operands)
37817 if (REG_P (operands[0]))
37819 if (REG_P (operands[1])
37820 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37822 if (REGNO (operands[0]) == FIRST_STACK_REG)
37823 return output_387_ffreep (operands, 0);
37824 return "fstp\t%y0";
37826 if (STACK_TOP_P (operands[0]))
37827 return "fld%Z1\t%y1";
37828 return "fst\t%y0";
37830 else if (MEM_P (operands[0]))
37832 gcc_assert (REG_P (operands[1]));
37833 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37834 return "fstp%Z0\t%y0";
37835 else
37837 /* There is no non-popping store to memory for XFmode.
37838 So if we need one, follow the store with a load. */
37839 if (GET_MODE (operands[0]) == XFmode)
37840 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37841 else
37842 return "fst%Z0\t%y0";
37845 else
37846 gcc_unreachable ();
37849 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37850 FP status register is set. */
37852 void
37853 ix86_emit_fp_unordered_jump (rtx label)
37855 rtx reg = gen_reg_rtx (HImode);
37856 rtx temp;
37858 emit_insn (gen_x86_fnstsw_1 (reg));
37860 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37862 emit_insn (gen_x86_sahf_1 (reg));
37864 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37865 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37867 else
37869 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37871 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37872 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37875 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37876 gen_rtx_LABEL_REF (VOIDmode, label),
37877 pc_rtx);
37878 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37880 emit_jump_insn (temp);
37881 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37884 /* Output code to perform a log1p XFmode calculation. */
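/* The threshold 0.2928932... used below is 1 - sqrt(2)/2: for |op1|
   smaller than that, fyl2xp1 computes log2(1 + op1) directly (it is
   only specified for such small arguments); otherwise 1 + op1 is
   formed explicitly and fyl2x is used.  Both paths multiply by the
   fldln2 constant, so op0 = ln(2) * log2(1 + op1) = log1p(op1).  */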
37886 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37888 rtx label1 = gen_label_rtx ();
37889 rtx label2 = gen_label_rtx ();
37891 rtx tmp = gen_reg_rtx (XFmode);
37892 rtx tmp2 = gen_reg_rtx (XFmode);
37893 rtx test;
37895 emit_insn (gen_absxf2 (tmp, op1));
37896 test = gen_rtx_GE (VOIDmode, tmp,
37897 CONST_DOUBLE_FROM_REAL_VALUE (
37898 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37899 XFmode));
37900 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37902 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37903 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37904 emit_jump (label2);
37906 emit_label (label1);
37907 emit_move_insn (tmp, CONST1_RTX (XFmode));
37908 emit_insn (gen_addxf3 (tmp, op1, tmp));
37909 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37910 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37912 emit_label (label2);
37915 /* Emit code for round calculation. */
37916 void ix86_emit_i387_round (rtx op0, rtx op1)
37918 enum machine_mode inmode = GET_MODE (op1);
37919 enum machine_mode outmode = GET_MODE (op0);
37920 rtx e1, e2, res, tmp, tmp1, half;
37921 rtx scratch = gen_reg_rtx (HImode);
37922 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37923 rtx jump_label = gen_label_rtx ();
37924 rtx insn;
37925 rtx (*gen_abs) (rtx, rtx);
37926 rtx (*gen_neg) (rtx, rtx);
37928 switch (inmode)
37930 case SFmode:
37931 gen_abs = gen_abssf2;
37932 break;
37933 case DFmode:
37934 gen_abs = gen_absdf2;
37935 break;
37936 case XFmode:
37937 gen_abs = gen_absxf2;
37938 break;
37939 default:
37940 gcc_unreachable ();
37943 switch (outmode)
37945 case SFmode:
37946 gen_neg = gen_negsf2;
37947 break;
37948 case DFmode:
37949 gen_neg = gen_negdf2;
37950 break;
37951 case XFmode:
37952 gen_neg = gen_negxf2;
37953 break;
37954 case HImode:
37955 gen_neg = gen_neghi2;
37956 break;
37957 case SImode:
37958 gen_neg = gen_negsi2;
37959 break;
37960 case DImode:
37961 gen_neg = gen_negdi2;
37962 break;
37963 default:
37964 gcc_unreachable ();
37967 e1 = gen_reg_rtx (inmode);
37968 e2 = gen_reg_rtx (inmode);
37969 res = gen_reg_rtx (outmode);
37971 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37973 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
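/* Under this definition the halfway cases round away from zero,
   e.g. round (2.5) -> floor (2.5 + 0.5) = 3 and round (-2.5) -> -3,
   as C round() requires; this differs from frndint, which honours the
   current (by default round-to-nearest-even) rounding mode.  */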
37975 /* scratch = fxam(op1) */
37976 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37977 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37978 UNSPEC_FXAM)));
37979 /* e1 = fabs(op1) */
37980 emit_insn (gen_abs (e1, op1));
37982 /* e2 = e1 + 0.5 */
37983 half = force_reg (inmode, half);
37984 emit_insn (gen_rtx_SET (VOIDmode, e2,
37985 gen_rtx_PLUS (inmode, e1, half)));
37987 /* res = floor(e2) */
37988 if (inmode != XFmode)
37990 tmp1 = gen_reg_rtx (XFmode);
37992 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37993 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37995 else
37996 tmp1 = e2;
37998 switch (outmode)
38000 case SFmode:
38001 case DFmode:
38003 rtx tmp0 = gen_reg_rtx (XFmode);
38005 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38007 emit_insn (gen_rtx_SET (VOIDmode, res,
38008 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38009 UNSPEC_TRUNC_NOOP)));
38011 break;
38012 case XFmode:
38013 emit_insn (gen_frndintxf2_floor (res, tmp1));
38014 break;
38015 case HImode:
38016 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38017 break;
38018 case SImode:
38019 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38020 break;
38021 case DImode:
38022 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38023 break;
38024 default:
38025 gcc_unreachable ();
38028 /* flags = signbit(a) */
38029 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38031 /* if (flags) then res = -res */
38032 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38033 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38034 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38035 pc_rtx);
38036 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38037 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38038 JUMP_LABEL (insn) = jump_label;
38040 emit_insn (gen_neg (res, res));
38042 emit_label (jump_label);
38043 LABEL_NUSES (jump_label) = 1;
38045 emit_move_insn (op0, res);
38048 /* Output code to perform a Newton-Raphson approximation of a single precision
38049 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38051 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38053 rtx x0, x1, e0, e1;
38055 x0 = gen_reg_rtx (mode);
38056 e0 = gen_reg_rtx (mode);
38057 e1 = gen_reg_rtx (mode);
38058 x1 = gen_reg_rtx (mode);
38060 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
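/* This is one Newton-Raphson refinement of the rcpss estimate
   x0 ~ 1/b:  x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   computed below as e1 - e0, which roughly doubles the number of
   correct bits in the estimate before the final multiply by a.  */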
38062 b = force_reg (mode, b);
38064 /* x0 = rcp(b) estimate */
38065 emit_insn (gen_rtx_SET (VOIDmode, x0,
38066 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38067 UNSPEC_RCP)));
38068 /* e0 = x0 * b */
38069 emit_insn (gen_rtx_SET (VOIDmode, e0,
38070 gen_rtx_MULT (mode, x0, b)));
38072 /* e0 = x0 * e0 */
38073 emit_insn (gen_rtx_SET (VOIDmode, e0,
38074 gen_rtx_MULT (mode, x0, e0)));
38076 /* e1 = x0 + x0 */
38077 emit_insn (gen_rtx_SET (VOIDmode, e1,
38078 gen_rtx_PLUS (mode, x0, x0)));
38080 /* x1 = e1 - e0 */
38081 emit_insn (gen_rtx_SET (VOIDmode, x1,
38082 gen_rtx_MINUS (mode, e1, e0)));
38084 /* res = a * x1 */
38085 emit_insn (gen_rtx_SET (VOIDmode, res,
38086 gen_rtx_MULT (mode, a, x1)));
38089 /* Output code to perform a Newton-Raphson approximation of a
38090 single precision floating point [reciprocal] square root. */
38092 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38093 bool recip)
38095 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38096 REAL_VALUE_TYPE r;
38098 x0 = gen_reg_rtx (mode);
38099 e0 = gen_reg_rtx (mode);
38100 e1 = gen_reg_rtx (mode);
38101 e2 = gen_reg_rtx (mode);
38102 e3 = gen_reg_rtx (mode);
38104 real_from_integer (&r, VOIDmode, -3, -1, 0);
38105 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38107 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38108 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38110 if (VECTOR_MODE_P (mode))
38112 mthree = ix86_build_const_vector (mode, true, mthree);
38113 mhalf = ix86_build_const_vector (mode, true, mhalf);
38116 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38117 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
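/* Both formulas are a single Newton-Raphson step on the rsqrtss
   estimate x0 ~ 1/sqrt(a):  x1 = 0.5 * x0 * (3 - a * x0 * x0),
   written here with negated constants as -0.5 * x0 * (a * x0 * x0 - 3).
   In the sqrt case the extra factor of a turns a * x1 into an
   approximation of a / sqrt(a) = sqrt(a).  */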
38119 a = force_reg (mode, a);
38121 /* x0 = rsqrt(a) estimate */
38122 emit_insn (gen_rtx_SET (VOIDmode, x0,
38123 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38124 UNSPEC_RSQRT)));
38126 /* If a == 0.0, zero the rsqrt estimate (which would be infinity) to avoid producing a NaN for sqrt (0.0). */
38127 if (!recip)
38129 rtx zero, mask;
38131 zero = gen_reg_rtx (mode);
38132 mask = gen_reg_rtx (mode);
38134 zero = force_reg (mode, CONST0_RTX(mode));
38135 emit_insn (gen_rtx_SET (VOIDmode, mask,
38136 gen_rtx_NE (mode, zero, a)));
38138 emit_insn (gen_rtx_SET (VOIDmode, x0,
38139 gen_rtx_AND (mode, x0, mask)));
38142 /* e0 = x0 * a */
38143 emit_insn (gen_rtx_SET (VOIDmode, e0,
38144 gen_rtx_MULT (mode, x0, a)));
38145 /* e1 = e0 * x0 */
38146 emit_insn (gen_rtx_SET (VOIDmode, e1,
38147 gen_rtx_MULT (mode, e0, x0)));
38149 /* e2 = e1 - 3. */
38150 mthree = force_reg (mode, mthree);
38151 emit_insn (gen_rtx_SET (VOIDmode, e2,
38152 gen_rtx_PLUS (mode, e1, mthree)));
38154 mhalf = force_reg (mode, mhalf);
38155 if (recip)
38156 /* e3 = -.5 * x0 */
38157 emit_insn (gen_rtx_SET (VOIDmode, e3,
38158 gen_rtx_MULT (mode, x0, mhalf)));
38159 else
38160 /* e3 = -.5 * e0 */
38161 emit_insn (gen_rtx_SET (VOIDmode, e3,
38162 gen_rtx_MULT (mode, e0, mhalf)));
38163 /* ret = e2 * e3 */
38164 emit_insn (gen_rtx_SET (VOIDmode, res,
38165 gen_rtx_MULT (mode, e2, e3)));
38168 #ifdef TARGET_SOLARIS
38169 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38171 static void
38172 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38173 tree decl)
38175 /* With Binutils 2.15, the "@unwind" marker must be specified on
38176 every occurrence of the ".eh_frame" section, not just the first
38177 one. */
38178 if (TARGET_64BIT
38179 && strcmp (name, ".eh_frame") == 0)
38181 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38182 flags & SECTION_WRITE ? "aw" : "a");
38183 return;
38186 #ifndef USE_GAS
38187 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38189 solaris_elf_asm_comdat_section (name, flags, decl);
38190 return;
38192 #endif
38194 default_elf_asm_named_section (name, flags, decl);
38196 #endif /* TARGET_SOLARIS */
38198 /* Return the mangling of TYPE if it is an extended fundamental type. */
38200 static const char *
38201 ix86_mangle_type (const_tree type)
38203 type = TYPE_MAIN_VARIANT (type);
38205 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38206 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38207 return NULL;
38209 switch (TYPE_MODE (type))
38211 case TFmode:
38212 /* __float128 is "g". */
38213 return "g";
38214 case XFmode:
38215 /* "long double" or __float80 is "e". */
38216 return "e";
38217 default:
38218 return NULL;
38222 /* For 32-bit code we can save PIC register setup by using
38223 __stack_chk_fail_local hidden function instead of calling
38224 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38225 register, so it is better to call __stack_chk_fail directly. */
38227 static tree ATTRIBUTE_UNUSED
38228 ix86_stack_protect_fail (void)
38230 return TARGET_64BIT
38231 ? default_external_stack_protect_fail ()
38232 : default_hidden_stack_protect_fail ();
38235 /* Select a format to encode pointers in exception handling data. CODE
38236 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38237 true if the symbol may be affected by dynamic relocations.
38239 ??? All x86 object file formats are capable of representing this.
38240 After all, the relocation needed is the same as for the call insn.
38241 Whether or not a particular assembler allows us to enter such, I
38242 guess we'll have to see. */
38244 int asm_preferred_eh_data_format (int code, int global)
38246 if (flag_pic)
38248 int type = DW_EH_PE_sdata8;
38249 if (!TARGET_64BIT
38250 || ix86_cmodel == CM_SMALL_PIC
38251 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38252 type = DW_EH_PE_sdata4;
38253 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38255 if (ix86_cmodel == CM_SMALL
38256 || (ix86_cmodel == CM_MEDIUM && code))
38257 return DW_EH_PE_udata4;
38258 return DW_EH_PE_absptr;
38261 /* Expand copysign from SIGN to the positive value ABS_VALUE
38262 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
38263 the sign-bit. */
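/* In other words RESULT = ABS_VALUE | (SIGN & sign-bit mask); when a
   MASK is supplied it is the fabs mask (all bits except the sign bit),
   so it is inverted before being applied to SIGN.  */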
38264 static void
38265 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38267 enum machine_mode mode = GET_MODE (sign);
38268 rtx sgn = gen_reg_rtx (mode);
38269 if (mask == NULL_RTX)
38271 enum machine_mode vmode;
38273 if (mode == SFmode)
38274 vmode = V4SFmode;
38275 else if (mode == DFmode)
38276 vmode = V2DFmode;
38277 else
38278 vmode = mode;
38280 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38281 if (!VECTOR_MODE_P (mode))
38283 /* We need to generate a scalar mode mask in this case. */
38284 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38285 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38286 mask = gen_reg_rtx (mode);
38287 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38290 else
38291 mask = gen_rtx_NOT (mode, mask);
38292 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38293 gen_rtx_AND (mode, mask, sign)));
38294 emit_insn (gen_rtx_SET (VOIDmode, result,
38295 gen_rtx_IOR (mode, abs_value, sgn)));
38298 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38299 mask for masking out the sign-bit is stored in *SMASK, if that is
38300 non-null. */
38301 static rtx
38302 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38304 enum machine_mode vmode, mode = GET_MODE (op0);
38305 rtx xa, mask;
38307 xa = gen_reg_rtx (mode);
38308 if (mode == SFmode)
38309 vmode = V4SFmode;
38310 else if (mode == DFmode)
38311 vmode = V2DFmode;
38312 else
38313 vmode = mode;
38314 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38315 if (!VECTOR_MODE_P (mode))
38317 /* We need to generate a scalar mode mask in this case. */
38318 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38319 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38320 mask = gen_reg_rtx (mode);
38321 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38323 emit_insn (gen_rtx_SET (VOIDmode, xa,
38324 gen_rtx_AND (mode, op0, mask)));
38326 if (smask)
38327 *smask = mask;
38329 return xa;
38332 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38333 swapping the operands if SWAP_OPERANDS is true. The expanded
38334 code is a forward jump to a newly created label in case the
38335 comparison is true. The generated label rtx is returned. */
38336 static rtx
38337 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38338 bool swap_operands)
38340 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38341 rtx label, tmp;
38343 if (swap_operands)
38345 tmp = op0;
38346 op0 = op1;
38347 op1 = tmp;
38350 label = gen_label_rtx ();
38351 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38352 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38353 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38354 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38355 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38356 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38357 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38358 JUMP_LABEL (tmp) = label;
38360 return label;
38363 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38364 using comparison code CODE. Operands are swapped for the comparison if
38365 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38366 static rtx
38367 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38368 bool swap_operands)
38370 rtx (*insn)(rtx, rtx, rtx, rtx);
38371 enum machine_mode mode = GET_MODE (op0);
38372 rtx mask = gen_reg_rtx (mode);
38374 if (swap_operands)
38376 rtx tmp = op0;
38377 op0 = op1;
38378 op1 = tmp;
38381 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38383 emit_insn (insn (mask, op0, op1,
38384 gen_rtx_fmt_ee (code, mode, op0, op1)));
38385 return mask;
38388 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38389 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
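/* Adding and then subtracting this constant is the classic trick for
   rounding to an integer: every double with magnitude >= 2**52 (or
   float >= 2**23) is already integral, so x + TWO52 - TWO52 keeps only
   the integer part of x, rounded according to the current rounding
   mode.  */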
38390 static rtx
38391 ix86_gen_TWO52 (enum machine_mode mode)
38393 REAL_VALUE_TYPE TWO52r;
38394 rtx TWO52;
38396 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38397 TWO52 = const_double_from_real_value (TWO52r, mode);
38398 TWO52 = force_reg (mode, TWO52);
38400 return TWO52;
38403 /* Expand SSE sequence for computing lround from OP1 storing
38404 into OP0. */
38405 void
38406 ix86_expand_lround (rtx op0, rtx op1)
38408 /* C code for the stuff we're doing below:
38409 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38410 return (long)tmp;
38412 enum machine_mode mode = GET_MODE (op1);
38413 const struct real_format *fmt;
38414 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38415 rtx adj;
38417 /* load nextafter (0.5, 0.0) */
38418 fmt = REAL_MODE_FORMAT (mode);
38419 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38420 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38422 /* adj = copysign (0.5, op1) */
38423 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38424 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38426 /* adj = op1 + adj */
38427 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38429 /* op0 = (imode)adj */
38430 expand_fix (op0, adj, 0);
38433 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
38434 into OPERAND0. */
38435 void
38436 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38438 /* C code for the stuff we're doing below (for do_floor):
38439 xi = (long)op1;
38440 xi -= (double)xi > op1 ? 1 : 0;
38441 return xi;
38443 enum machine_mode fmode = GET_MODE (op1);
38444 enum machine_mode imode = GET_MODE (op0);
38445 rtx ireg, freg, label, tmp;
38447 /* reg = (long)op1 */
38448 ireg = gen_reg_rtx (imode);
38449 expand_fix (ireg, op1, 0);
38451 /* freg = (double)reg */
38452 freg = gen_reg_rtx (fmode);
38453 expand_float (freg, ireg, 0);
38455 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38456 label = ix86_expand_sse_compare_and_jump (UNLE,
38457 freg, op1, !do_floor);
38458 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38459 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38460 emit_move_insn (ireg, tmp);
38462 emit_label (label);
38463 LABEL_NUSES (label) = 1;
38465 emit_move_insn (op0, ireg);
38468 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38469 result in OPERAND0. */
38470 void
38471 ix86_expand_rint (rtx operand0, rtx operand1)
38473 /* C code for the stuff we're doing below:
38474 xa = fabs (operand1);
38475 if (!isless (xa, 2**52))
38476 return operand1;
38477 xa = xa + 2**52 - 2**52;
38478 return copysign (xa, operand1);
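/* Note the !isless (xa, 2**52) guard is an unordered comparison, so it
   also catches NaN inputs; those, like large already-integral values,
   are passed through unchanged.  */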
38480 enum machine_mode mode = GET_MODE (operand0);
38481 rtx res, xa, label, TWO52, mask;
38483 res = gen_reg_rtx (mode);
38484 emit_move_insn (res, operand1);
38486 /* xa = abs (operand1) */
38487 xa = ix86_expand_sse_fabs (res, &mask);
38489 /* if (!isless (xa, TWO52)) goto label; */
38490 TWO52 = ix86_gen_TWO52 (mode);
38491 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38493 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38494 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38496 ix86_sse_copysign_to_positive (res, xa, res, mask);
38498 emit_label (label);
38499 LABEL_NUSES (label) = 1;
38501 emit_move_insn (operand0, res);
38504 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38505 into OPERAND0. */
38506 void
38507 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38509 /* C code for the stuff we expand below.
38510 double xa = fabs (x), x2;
38511 if (!isless (xa, TWO52))
38512 return x;
38513 xa = xa + TWO52 - TWO52;
38514 x2 = copysign (xa, x);
38515 Compensate. Floor:
38516 if (x2 > x)
38517 x2 -= 1;
38518 Compensate. Ceil:
38519 if (x2 < x)
38520 x2 -= -1;
38521 return x2;
38523 enum machine_mode mode = GET_MODE (operand0);
38524 rtx xa, TWO52, tmp, label, one, res, mask;
38526 TWO52 = ix86_gen_TWO52 (mode);
38528 /* Temporary for holding the result, initialized to the input
38529 operand to ease control flow. */
38530 res = gen_reg_rtx (mode);
38531 emit_move_insn (res, operand1);
38533 /* xa = abs (operand1) */
38534 xa = ix86_expand_sse_fabs (res, &mask);
38536 /* if (!isless (xa, TWO52)) goto label; */
38537 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38539 /* xa = xa + TWO52 - TWO52; */
38540 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38541 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38543 /* xa = copysign (xa, operand1) */
38544 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38546 /* generate 1.0 or -1.0 */
38547 one = force_reg (mode,
38548 const_double_from_real_value (do_floor
38549 ? dconst1 : dconstm1, mode));
38551 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38552 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38553 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38554 gen_rtx_AND (mode, one, tmp)));
38555 /* We always need to subtract here to preserve signed zero. */
38556 tmp = expand_simple_binop (mode, MINUS,
38557 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38558 emit_move_insn (res, tmp);
38560 emit_label (label);
38561 LABEL_NUSES (label) = 1;
38563 emit_move_insn (operand0, res);
38566 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38567 into OPERAND0. */
38568 void
38569 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38571 /* C code for the stuff we expand below.
38572 double xa = fabs (x), x2;
38573 if (!isless (xa, TWO52))
38574 return x;
38575 x2 = (double)(long)x;
38576 Compensate. Floor:
38577 if (x2 > x)
38578 x2 -= 1;
38579 Compensate. Ceil:
38580 if (x2 < x)
38581 x2 += 1;
38582 if (HONOR_SIGNED_ZEROS (mode))
38583 return copysign (x2, x);
38584 return x2;
38586 enum machine_mode mode = GET_MODE (operand0);
38587 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38589 TWO52 = ix86_gen_TWO52 (mode);
38591 /* Temporary for holding the result, initialized to the input
38592 operand to ease control flow. */
38593 res = gen_reg_rtx (mode);
38594 emit_move_insn (res, operand1);
38596 /* xa = abs (operand1) */
38597 xa = ix86_expand_sse_fabs (res, &mask);
38599 /* if (!isless (xa, TWO52)) goto label; */
38600 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38602 /* xa = (double)(long)x */
38603 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38604 expand_fix (xi, res, 0);
38605 expand_float (xa, xi, 0);
38607 /* generate 1.0 */
38608 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38610 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38611 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38612 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38613 gen_rtx_AND (mode, one, tmp)));
38614 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38615 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38616 emit_move_insn (res, tmp);
38618 if (HONOR_SIGNED_ZEROS (mode))
38619 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38621 emit_label (label);
38622 LABEL_NUSES (label) = 1;
38624 emit_move_insn (operand0, res);
38627 /* Expand SSE sequence for computing round from OPERAND1 storing
38628 into OPERAND0. Sequence that works without relying on DImode truncation
38629 via cvttsd2siq that is only available on 64bit targets. */
38630 void
38631 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38633 /* C code for the stuff we expand below.
38634 double xa = fabs (x), xa2, x2;
38635 if (!isless (xa, TWO52))
38636 return x;
38637 Using the absolute value and copying back sign makes
38638 -0.0 -> -0.0 correct.
38639 xa2 = xa + TWO52 - TWO52;
38640 Compensate.
38641 dxa = xa2 - xa;
38642 if (dxa <= -0.5)
38643 xa2 += 1;
38644 else if (dxa > 0.5)
38645 xa2 -= 1;
38646 x2 = copysign (xa2, x);
38647 return x2;
38649 enum machine_mode mode = GET_MODE (operand0);
38650 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38652 TWO52 = ix86_gen_TWO52 (mode);
38654 /* Temporary for holding the result, initialized to the input
38655 operand to ease control flow. */
38656 res = gen_reg_rtx (mode);
38657 emit_move_insn (res, operand1);
38659 /* xa = abs (operand1) */
38660 xa = ix86_expand_sse_fabs (res, &mask);
38662 /* if (!isless (xa, TWO52)) goto label; */
38663 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38665 /* xa2 = xa + TWO52 - TWO52; */
38666 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38667 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38669 /* dxa = xa2 - xa; */
38670 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38672 /* generate 0.5, 1.0 and -0.5 */
38673 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38674 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38675 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38676 0, OPTAB_DIRECT);
38678 /* Compensate. */
38679 tmp = gen_reg_rtx (mode);
38680 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38681 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38682 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38683 gen_rtx_AND (mode, one, tmp)));
38684 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38685 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38686 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38687 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38688 gen_rtx_AND (mode, one, tmp)));
38689 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38691 /* res = copysign (xa2, operand1) */
38692 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38694 emit_label (label);
38695 LABEL_NUSES (label) = 1;
38697 emit_move_insn (operand0, res);
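/* Illustrative note (added commentary, not part of the expansion above):
   xa2 = xa + 2**52 - 2**52 rounds to the nearest integer with ties to
   even; the dxa correction then converts that into the round-half-away
   semantics of round ().  E.g. x = 2.5: xa2 = 2.0 (the tie went to even),
   dxa = -0.5 <= -0.5, so xa2 += 1 gives 3.0.  For x = 3.5: xa2 = 4.0,
   dxa = 0.5, neither correction fires and 4.0 is already correct.  */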
38700 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38701 into OPERAND0. */
38702 void
38703 ix86_expand_trunc (rtx operand0, rtx operand1)
38705 /* C code for SSE variant we expand below.
38706 double xa = fabs (x), x2;
38707 if (!isless (xa, TWO52))
38708 return x;
38709 x2 = (double)(long)x;
38710 if (HONOR_SIGNED_ZEROS (mode))
38711 return copysign (x2, x);
38712 return x2;
38714 enum machine_mode mode = GET_MODE (operand0);
38715 rtx xa, xi, TWO52, label, res, mask;
38717 TWO52 = ix86_gen_TWO52 (mode);
38719 /* Temporary for holding the result, initialized to the input
38720 operand to ease control flow. */
38721 res = gen_reg_rtx (mode);
38722 emit_move_insn (res, operand1);
38724 /* xa = abs (operand1) */
38725 xa = ix86_expand_sse_fabs (res, &mask);
38727 /* if (!isless (xa, TWO52)) goto label; */
38728 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38730 /* x = (double)(long)x */
38731 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38732 expand_fix (xi, res, 0);
38733 expand_float (res, xi, 0);
38735 if (HONOR_SIGNED_ZEROS (mode))
38736 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38738 emit_label (label);
38739 LABEL_NUSES (label) = 1;
38741 emit_move_insn (operand0, res);
38744 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38745 into OPERAND0. */
38746 void
38747 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38749 enum machine_mode mode = GET_MODE (operand0);
38750 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38752 /* C code for SSE variant we expand below.
38753 double xa = fabs (x), x2;
38754 if (!isless (xa, TWO52))
38755 return x;
38756 xa2 = xa + TWO52 - TWO52;
38757 Compensate:
38758 if (xa2 > xa)
38759 xa2 -= 1.0;
38760 x2 = copysign (xa2, x);
38761 return x2;
38764 TWO52 = ix86_gen_TWO52 (mode);
38766 /* Temporary for holding the result, initialized to the input
38767 operand to ease control flow. */
38768 res = gen_reg_rtx (mode);
38769 emit_move_insn (res, operand1);
38771 /* xa = abs (operand1) */
38772 xa = ix86_expand_sse_fabs (res, &smask);
38774 /* if (!isless (xa, TWO52)) goto label; */
38775 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38777 /* res = xa + TWO52 - TWO52; */
38778 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38779 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38780 emit_move_insn (res, tmp);
38782 /* generate 1.0 */
38783 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38785 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38786 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38787 emit_insn (gen_rtx_SET (VOIDmode, mask,
38788 gen_rtx_AND (mode, mask, one)));
38789 tmp = expand_simple_binop (mode, MINUS,
38790 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38791 emit_move_insn (res, tmp);
38793 /* res = copysign (res, operand1) */
38794 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38796 emit_label (label);
38797 LABEL_NUSES (label) = 1;
38799 emit_move_insn (operand0, res);
38802 /* Expand SSE sequence for computing round from OPERAND1 storing
38803 into OPERAND0. */
38804 void
38805 ix86_expand_round (rtx operand0, rtx operand1)
38807 /* C code for the stuff we're doing below:
38808 double xa = fabs (x);
38809 if (!isless (xa, TWO52))
38810 return x;
38811 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38812 return copysign (xa, x);
38814 enum machine_mode mode = GET_MODE (operand0);
38815 rtx res, TWO52, xa, label, xi, half, mask;
38816 const struct real_format *fmt;
38817 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38819 /* Temporary for holding the result, initialized to the input
38820 operand to ease control flow. */
38821 res = gen_reg_rtx (mode);
38822 emit_move_insn (res, operand1);
38824 TWO52 = ix86_gen_TWO52 (mode);
38825 xa = ix86_expand_sse_fabs (res, &mask);
38826 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38828 /* load nextafter (0.5, 0.0) */
38829 fmt = REAL_MODE_FORMAT (mode);
38830 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38831 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38833 /* xa = xa + 0.5 */
38834 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38835 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38837 /* xa = (double)(int64_t)xa */
38838 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38839 expand_fix (xi, xa, 0);
38840 expand_float (xa, xi, 0);
38842 /* res = copysign (xa, operand1) */
38843 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38845 emit_label (label);
38846 LABEL_NUSES (label) = 1;
38848 emit_move_insn (operand0, res);
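/* Illustrative note (added commentary, not part of the expansion above):
   the constant loaded here is 0.5 - 2**(-p-1), the largest double below
   0.5 (2**-54 below it for DFmode).  Adding plain 0.5 would be wrong for
   x = 0.49999999999999994: the exact sum x + 0.5 is 1 - 2**-54, halfway
   between two doubles, and rounds to 1.0, so truncation would return 1
   instead of 0.  With the predecessor of 0.5 the sum is 1 - 2**-53,
   exactly representable, and truncation returns the correct 0; for
   x = 0.5 the sum still rounds up to 1.0, preserving the
   round-half-away-from-zero result.  */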
38851 /* Expand SSE sequence for computing round
38852 from OP1 storing into OP0 using sse4 round insn. */
38853 void
38854 ix86_expand_round_sse4 (rtx op0, rtx op1)
38856 enum machine_mode mode = GET_MODE (op0);
38857 rtx e1, e2, res, half;
38858 const struct real_format *fmt;
38859 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38860 rtx (*gen_copysign) (rtx, rtx, rtx);
38861 rtx (*gen_round) (rtx, rtx, rtx);
38863 switch (mode)
38865 case SFmode:
38866 gen_copysign = gen_copysignsf3;
38867 gen_round = gen_sse4_1_roundsf2;
38868 break;
38869 case DFmode:
38870 gen_copysign = gen_copysigndf3;
38871 gen_round = gen_sse4_1_rounddf2;
38872 break;
38873 default:
38874 gcc_unreachable ();
38877 /* round (a) = trunc (a + copysign (0.5, a)) */
38879 /* load nextafter (0.5, 0.0) */
38880 fmt = REAL_MODE_FORMAT (mode);
38881 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38882 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38883 half = const_double_from_real_value (pred_half, mode);
38885 /* e1 = copysign (0.5, op1) */
38886 e1 = gen_reg_rtx (mode);
38887 emit_insn (gen_copysign (e1, half, op1));
38889 /* e2 = op1 + e1 */
38890 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38892 /* res = trunc (e2) */
38893 res = gen_reg_rtx (mode);
38894 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38896 emit_move_insn (op0, res);
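/* A minimal scalar sketch (added illustration, compiled out; the helper
   name is made up) of the identity used above, written with <math.h>
   functions; nextafter (0.5, 0.0) is the pred_half constant loaded by
   the expander.  */
#if 0
#include <math.h>

static double
model_round_sse4 (double a)
{
  double pred_half = nextafter (0.5, 0.0);	/* 0.5 - 2**-54 for double */
  /* round (a) = trunc (a + copysign (pred_half, a)),
     e.g. 2.5 -> 3.0 and -2.5 -> -3.0.  */
  return trunc (a + copysign (pred_half, a));
}
#endif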
38900 /* Table of valid machine attributes. */
38901 static const struct attribute_spec ix86_attribute_table[] =
38903 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38904 affects_type_identity } */
38905 /* Stdcall attribute says callee is responsible for popping arguments
38906 if they are not variable. */
38907 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38908 true },
38909 /* Fastcall attribute says callee is responsible for popping arguments
38910 if they are not variable. */
38911 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38912 true },
38913 /* Thiscall attribute says callee is responsible for popping arguments
38914 if they are not variable. */
38915 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38916 true },
38917 /* Cdecl attribute says the callee is a normal C declaration */
38918 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38919 true },
38920 /* Regparm attribute specifies how many integer arguments are to be
38921 passed in registers. */
38922 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38923 true },
38924 /* Sseregparm attribute says we are using x86_64 calling conventions
38925 for FP arguments. */
38926 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38927 true },
38928 /* The transactional memory builtins are implicitly regparm or fastcall
38929 depending on the ABI. Override the generic do-nothing attribute that
38930 these builtins were declared with. */
38931 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38932 true },
38933 /* force_align_arg_pointer says this function realigns the stack at entry. */
38934 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38935 false, true, true, ix86_handle_cconv_attribute, false },
38936 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38937 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38938 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38939 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38940 false },
38941 #endif
38942 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38943 false },
38944 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38945 false },
38946 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38947 SUBTARGET_ATTRIBUTE_TABLE,
38948 #endif
38949 /* ms_abi and sysv_abi calling convention function attributes. */
38950 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38951 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38952 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38953 false },
38954 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38955 ix86_handle_callee_pop_aggregate_return, true },
38956 /* End element. */
38957 { NULL, 0, 0, false, false, false, NULL, false }
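/* Usage examples (added illustration only) of source-level declarations
   that end up in the handlers above, e.g. on 32-bit targets:
     int  __attribute__((fastcall))   f (int a, int b);
     int  __attribute__((regparm(3))) g (int a, int b, int c);
     void __attribute__((ms_abi))     h (void);
     struct __attribute__((ms_struct)) s { char c; int i; };  */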
38960 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38961 static int
38962 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38963 tree vectype,
38964 int misalign ATTRIBUTE_UNUSED)
38966 unsigned elements;
38968 switch (type_of_cost)
38970 case scalar_stmt:
38971 return ix86_cost->scalar_stmt_cost;
38973 case scalar_load:
38974 return ix86_cost->scalar_load_cost;
38976 case scalar_store:
38977 return ix86_cost->scalar_store_cost;
38979 case vector_stmt:
38980 return ix86_cost->vec_stmt_cost;
38982 case vector_load:
38983 return ix86_cost->vec_align_load_cost;
38985 case vector_store:
38986 return ix86_cost->vec_store_cost;
38988 case vec_to_scalar:
38989 return ix86_cost->vec_to_scalar_cost;
38991 case scalar_to_vec:
38992 return ix86_cost->scalar_to_vec_cost;
38994 case unaligned_load:
38995 case unaligned_store:
38996 return ix86_cost->vec_unalign_load_cost;
38998 case cond_branch_taken:
38999 return ix86_cost->cond_taken_branch_cost;
39001 case cond_branch_not_taken:
39002 return ix86_cost->cond_not_taken_branch_cost;
39004 case vec_perm:
39005 case vec_promote_demote:
39006 return ix86_cost->vec_stmt_cost;
39008 case vec_construct:
39009 elements = TYPE_VECTOR_SUBPARTS (vectype);
39010 return elements / 2 + 1;
39012 default:
39013 gcc_unreachable ();
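/* Illustrative note (added commentary): vec_construct is the only case
   not read from the cost table; building a V4SF from four scalars is
   costed at 4/2 + 1 = 3, a V16QI from sixteen scalars at 16/2 + 1 = 9.  */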
39017 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39018 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39019 insn every time. */
39021 static GTY(()) rtx vselect_insn;
39023 /* Initialize vselect_insn. */
39025 static void
39026 init_vselect_insn (void)
39028 unsigned i;
39029 rtx x;
39031 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39032 for (i = 0; i < MAX_VECT_LEN; ++i)
39033 XVECEXP (x, 0, i) = const0_rtx;
39034 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39035 const0_rtx), x);
39036 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39037 start_sequence ();
39038 vselect_insn = emit_insn (x);
39039 end_sequence ();
39042 /* Construct (set target (vec_select op0 (parallel perm))) and
39043 return true if that's a valid instruction in the active ISA. */
39045 static bool
39046 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39047 unsigned nelt, bool testing_p)
39049 unsigned int i;
39050 rtx x, save_vconcat;
39051 int icode;
39053 if (vselect_insn == NULL_RTX)
39054 init_vselect_insn ();
39056 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39057 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39058 for (i = 0; i < nelt; ++i)
39059 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39060 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39061 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39062 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39063 SET_DEST (PATTERN (vselect_insn)) = target;
39064 icode = recog_memoized (vselect_insn);
39066 if (icode >= 0 && !testing_p)
39067 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39069 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39070 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39071 INSN_CODE (vselect_insn) = -1;
39073 return icode >= 0;
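/* Illustrative example (added commentary, never emitted as-is): for
   V4SFmode and perm = { 2, 3, 0, 1 } the insn handed to recog_memoized
   above is
     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))
   and the function succeeds exactly when some pattern in the machine
   description recognizes that vec_select under the active ISA.  */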
39076 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39078 static bool
39079 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39080 const unsigned char *perm, unsigned nelt,
39081 bool testing_p)
39083 enum machine_mode v2mode;
39084 rtx x;
39085 bool ok;
39087 if (vselect_insn == NULL_RTX)
39088 init_vselect_insn ();
39090 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39091 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39092 PUT_MODE (x, v2mode);
39093 XEXP (x, 0) = op0;
39094 XEXP (x, 1) = op1;
39095 ok = expand_vselect (target, x, perm, nelt, testing_p);
39096 XEXP (x, 0) = const0_rtx;
39097 XEXP (x, 1) = const0_rtx;
39098 return ok;
39101 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39102 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39104 static bool
39105 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39107 enum machine_mode vmode = d->vmode;
39108 unsigned i, mask, nelt = d->nelt;
39109 rtx target, op0, op1, x;
39110 rtx rperm[32], vperm;
39112 if (d->one_operand_p)
39113 return false;
39114 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39116 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39118 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39120 else
39121 return false;
39123 /* This is a blend, not a permute. Elements must stay in their
39124 respective lanes. */
39125 for (i = 0; i < nelt; ++i)
39127 unsigned e = d->perm[i];
39128 if (!(e == i || e == i + nelt))
39129 return false;
39132 if (d->testing_p)
39133 return true;
39135 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39136 decision should be extracted elsewhere, so that we only try that
39137 sequence once all budget==3 options have been tried. */
39138 target = d->target;
39139 op0 = d->op0;
39140 op1 = d->op1;
39141 mask = 0;
39143 switch (vmode)
39145 case V4DFmode:
39146 case V8SFmode:
39147 case V2DFmode:
39148 case V4SFmode:
39149 case V8HImode:
39150 case V8SImode:
39151 for (i = 0; i < nelt; ++i)
39152 mask |= (d->perm[i] >= nelt) << i;
39153 break;
39155 case V2DImode:
39156 for (i = 0; i < 2; ++i)
39157 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39158 vmode = V8HImode;
39159 goto do_subreg;
39161 case V4SImode:
39162 for (i = 0; i < 4; ++i)
39163 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39164 vmode = V8HImode;
39165 goto do_subreg;
39167 case V16QImode:
39168 /* See if bytes move in pairs so we can use pblendw with
39169 an immediate argument, rather than pblendvb with a vector
39170 argument. */
39171 for (i = 0; i < 16; i += 2)
39172 if (d->perm[i] + 1 != d->perm[i + 1])
39174 use_pblendvb:
39175 for (i = 0; i < nelt; ++i)
39176 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39178 finish_pblendvb:
39179 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39180 vperm = force_reg (vmode, vperm);
39182 if (GET_MODE_SIZE (vmode) == 16)
39183 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39184 else
39185 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39186 return true;
39189 for (i = 0; i < 8; ++i)
39190 mask |= (d->perm[i * 2] >= 16) << i;
39191 vmode = V8HImode;
39192 /* FALLTHRU */
39194 do_subreg:
39195 target = gen_lowpart (vmode, target);
39196 op0 = gen_lowpart (vmode, op0);
39197 op1 = gen_lowpart (vmode, op1);
39198 break;
39200 case V32QImode:
39201 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39202 for (i = 0; i < 32; i += 2)
39203 if (d->perm[i] + 1 != d->perm[i + 1])
39204 goto use_pblendvb;
39205 /* See if bytes move in quadruplets. If yes, vpblendd
39206 with immediate can be used. */
39207 for (i = 0; i < 32; i += 4)
39208 if (d->perm[i] + 2 != d->perm[i + 2])
39209 break;
39210 if (i < 32)
39212 /* See if bytes move the same in both lanes. If yes,
39213 vpblendw with immediate can be used. */
39214 for (i = 0; i < 16; i += 2)
39215 if (d->perm[i] + 16 != d->perm[i + 16])
39216 goto use_pblendvb;
39218 /* Use vpblendw. */
39219 for (i = 0; i < 16; ++i)
39220 mask |= (d->perm[i * 2] >= 32) << i;
39221 vmode = V16HImode;
39222 goto do_subreg;
39225 /* Use vpblendd. */
39226 for (i = 0; i < 8; ++i)
39227 mask |= (d->perm[i * 4] >= 32) << i;
39228 vmode = V8SImode;
39229 goto do_subreg;
39231 case V16HImode:
39232 /* See if words move in pairs. If yes, vpblendd can be used. */
39233 for (i = 0; i < 16; i += 2)
39234 if (d->perm[i] + 1 != d->perm[i + 1])
39235 break;
39236 if (i < 16)
39238 /* See if words move the same in both lanes. If not,
39239 vpblendvb must be used. */
39240 for (i = 0; i < 8; i++)
39241 if (d->perm[i] + 8 != d->perm[i + 8])
39243 /* Use vpblendvb. */
39244 for (i = 0; i < 32; ++i)
39245 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39247 vmode = V32QImode;
39248 nelt = 32;
39249 target = gen_lowpart (vmode, target);
39250 op0 = gen_lowpart (vmode, op0);
39251 op1 = gen_lowpart (vmode, op1);
39252 goto finish_pblendvb;
39255 /* Use vpblendw. */
39256 for (i = 0; i < 16; ++i)
39257 mask |= (d->perm[i] >= 16) << i;
39258 break;
39261 /* Use vpblendd. */
39262 for (i = 0; i < 8; ++i)
39263 mask |= (d->perm[i * 2] >= 16) << i;
39264 vmode = V8SImode;
39265 goto do_subreg;
39267 case V4DImode:
39268 /* Use vpblendd. */
39269 for (i = 0; i < 4; ++i)
39270 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39271 vmode = V8SImode;
39272 goto do_subreg;
39274 default:
39275 gcc_unreachable ();
39278 /* This matches five different patterns with the different modes. */
39279 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39280 x = gen_rtx_SET (VOIDmode, target, x);
39281 emit_insn (x);
39283 return true;
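/* Illustrative example (added commentary, never emitted as-is): for
   V8HImode and perm = { 0, 9, 2, 11, 4, 13, 6, 15 } every odd element
   comes from op1, so the loop above computes mask = 0xaa and the
   vec_merge matches the pblendw pattern with that immediate.  For
   V4SImode the mask is widened to V8HImode pairs, e.g.
   perm = { 0, 5, 2, 7 } gives mask = 0xcc.  */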
39286 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39287 in terms of the variable form of vpermilps.
39289 Note that we will have already failed the immediate input vpermilps,
39290 which requires that the high and low part shuffle be identical; the
39291 variable form doesn't require that. */
39293 static bool
39294 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39296 rtx rperm[8], vperm;
39297 unsigned i;
39299 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39300 return false;
39302 /* We can only permute within the 128-bit lane. */
39303 for (i = 0; i < 8; ++i)
39305 unsigned e = d->perm[i];
39306 if (i < 4 ? e >= 4 : e < 4)
39307 return false;
39310 if (d->testing_p)
39311 return true;
39313 for (i = 0; i < 8; ++i)
39315 unsigned e = d->perm[i];
39317 /* Within each 128-bit lane, the elements of op0 are numbered
39318 from 0 and the elements of op1 are numbered from 4. */
39319 if (e >= 8 + 4)
39320 e -= 8;
39321 else if (e >= 4)
39322 e -= 4;
39324 rperm[i] = GEN_INT (e);
39327 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39328 vperm = force_reg (V8SImode, vperm);
39329 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39331 return true;
39334 /* Return true if permutation D can be performed as VMODE permutation
39335 instead. */
39337 static bool
39338 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39340 unsigned int i, j, chunk;
39342 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39343 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39344 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39345 return false;
39347 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39348 return true;
39350 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39351 for (i = 0; i < d->nelt; i += chunk)
39352 if (d->perm[i] & (chunk - 1))
39353 return false;
39354 else
39355 for (j = 1; j < chunk; ++j)
39356 if (d->perm[i] + j != d->perm[i + j])
39357 return false;
39359 return true;
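/* Illustrative example (added commentary): a V16QImode permutation such
   as { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 } only moves
   aligned 4-byte chunks, so it is also valid as the V4SImode permutation
   { 1, 0, 3, 2 }; a permutation starting { 1, 2, 3, 4, ... } splits a
   chunk and fails the check above.  */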
39362 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39363 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39365 static bool
39366 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39368 unsigned i, nelt, eltsz, mask;
39369 unsigned char perm[32];
39370 enum machine_mode vmode = V16QImode;
39371 rtx rperm[32], vperm, target, op0, op1;
39373 nelt = d->nelt;
39375 if (!d->one_operand_p)
39377 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39379 if (TARGET_AVX2
39380 && valid_perm_using_mode_p (V2TImode, d))
39382 if (d->testing_p)
39383 return true;
39385 /* Use vperm2i128 insn. The pattern uses
39386 V4DImode instead of V2TImode. */
39387 target = gen_lowpart (V4DImode, d->target);
39388 op0 = gen_lowpart (V4DImode, d->op0);
39389 op1 = gen_lowpart (V4DImode, d->op1);
39390 rperm[0]
39391 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
39392 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
39393 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39394 return true;
39396 return false;
39399 else
39401 if (GET_MODE_SIZE (d->vmode) == 16)
39403 if (!TARGET_SSSE3)
39404 return false;
39406 else if (GET_MODE_SIZE (d->vmode) == 32)
39408 if (!TARGET_AVX2)
39409 return false;
39411 /* V4DImode should be already handled through
39412 expand_vselect by vpermq instruction. */
39413 gcc_assert (d->vmode != V4DImode);
39415 vmode = V32QImode;
39416 if (d->vmode == V8SImode
39417 || d->vmode == V16HImode
39418 || d->vmode == V32QImode)
39420 /* First see if vpermq can be used for
39421 V8SImode/V16HImode/V32QImode. */
39422 if (valid_perm_using_mode_p (V4DImode, d))
39424 for (i = 0; i < 4; i++)
39425 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39426 if (d->testing_p)
39427 return true;
39428 return expand_vselect (gen_lowpart (V4DImode, d->target),
39429 gen_lowpart (V4DImode, d->op0),
39430 perm, 4, false);
39433 /* Next see if vpermd can be used. */
39434 if (valid_perm_using_mode_p (V8SImode, d))
39435 vmode = V8SImode;
39437 /* Or if vpermps can be used. */
39438 else if (d->vmode == V8SFmode)
39439 vmode = V8SImode;
39441 if (vmode == V32QImode)
39443 /* vpshufb only works intra lanes; it is not
39444 possible to shuffle bytes in between the lanes. */
39445 for (i = 0; i < nelt; ++i)
39446 if ((d->perm[i] ^ i) & (nelt / 2))
39447 return false;
39450 else
39451 return false;
39454 if (d->testing_p)
39455 return true;
39457 if (vmode == V8SImode)
39458 for (i = 0; i < 8; ++i)
39459 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39460 else
39462 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39463 if (!d->one_operand_p)
39464 mask = 2 * nelt - 1;
39465 else if (vmode == V16QImode)
39466 mask = nelt - 1;
39467 else
39468 mask = nelt / 2 - 1;
39470 for (i = 0; i < nelt; ++i)
39472 unsigned j, e = d->perm[i] & mask;
39473 for (j = 0; j < eltsz; ++j)
39474 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39478 vperm = gen_rtx_CONST_VECTOR (vmode,
39479 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39480 vperm = force_reg (vmode, vperm);
39482 target = gen_lowpart (vmode, d->target);
39483 op0 = gen_lowpart (vmode, d->op0);
39484 if (d->one_operand_p)
39486 if (vmode == V16QImode)
39487 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39488 else if (vmode == V32QImode)
39489 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39490 else if (vmode == V8SFmode)
39491 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39492 else
39493 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39495 else
39497 op1 = gen_lowpart (vmode, d->op1);
39498 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39501 return true;
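/* Illustrative note (added commentary): pshufb computes, per 128-bit
   lane, result[i] = src[ctrl[i] & 15] for each byte, or 0 when bit 7 of
   ctrl[i] is set.  The rperm loop above therefore expands each element
   index into eltsz consecutive byte indexes, e.g. a V8HImode element
   index 3 becomes the byte pair { 6, 7 }.  */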
39504 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39505 in a single instruction. */
39507 static bool
39508 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39510 unsigned i, nelt = d->nelt;
39511 unsigned char perm2[MAX_VECT_LEN];
39513 /* Check plain VEC_SELECT first, because AVX has instructions that could
39514 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39515 input where SEL+CONCAT may not. */
39516 if (d->one_operand_p)
39518 int mask = nelt - 1;
39519 bool identity_perm = true;
39520 bool broadcast_perm = true;
39522 for (i = 0; i < nelt; i++)
39524 perm2[i] = d->perm[i] & mask;
39525 if (perm2[i] != i)
39526 identity_perm = false;
39527 if (perm2[i])
39528 broadcast_perm = false;
39531 if (identity_perm)
39533 if (!d->testing_p)
39534 emit_move_insn (d->target, d->op0);
39535 return true;
39537 else if (broadcast_perm && TARGET_AVX2)
39539 /* Use vpbroadcast{b,w,d}. */
39540 rtx (*gen) (rtx, rtx) = NULL;
39541 switch (d->vmode)
39543 case V32QImode:
39544 gen = gen_avx2_pbroadcastv32qi_1;
39545 break;
39546 case V16HImode:
39547 gen = gen_avx2_pbroadcastv16hi_1;
39548 break;
39549 case V8SImode:
39550 gen = gen_avx2_pbroadcastv8si_1;
39551 break;
39552 case V16QImode:
39553 gen = gen_avx2_pbroadcastv16qi;
39554 break;
39555 case V8HImode:
39556 gen = gen_avx2_pbroadcastv8hi;
39557 break;
39558 case V8SFmode:
39559 gen = gen_avx2_vec_dupv8sf_1;
39560 break;
39561 /* For other modes prefer other shuffles this function creates. */
39562 default: break;
39564 if (gen != NULL)
39566 if (!d->testing_p)
39567 emit_insn (gen (d->target, d->op0));
39568 return true;
39572 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39573 return true;
39575 /* There are plenty of patterns in sse.md that are written for
39576 SEL+CONCAT and are not replicated for a single op. Perhaps
39577 that should be changed, to avoid the nastiness here. */
39579 /* Recognize interleave style patterns, which means incrementing
39580 every other permutation operand. */
39581 for (i = 0; i < nelt; i += 2)
39583 perm2[i] = d->perm[i] & mask;
39584 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39586 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39587 d->testing_p))
39588 return true;
39590 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39591 if (nelt >= 4)
39593 for (i = 0; i < nelt; i += 4)
39595 perm2[i + 0] = d->perm[i + 0] & mask;
39596 perm2[i + 1] = d->perm[i + 1] & mask;
39597 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39598 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39601 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39602 d->testing_p))
39603 return true;
39607 /* Finally, try the fully general two operand permute. */
39608 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39609 d->testing_p))
39610 return true;
39612 /* Recognize interleave style patterns with reversed operands. */
39613 if (!d->one_operand_p)
39615 for (i = 0; i < nelt; ++i)
39617 unsigned e = d->perm[i];
39618 if (e >= nelt)
39619 e -= nelt;
39620 else
39621 e += nelt;
39622 perm2[i] = e;
39625 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39626 d->testing_p))
39627 return true;
39630 /* Try the SSE4.1 blend variable merge instructions. */
39631 if (expand_vec_perm_blend (d))
39632 return true;
39634 /* Try one of the AVX vpermil variable permutations. */
39635 if (expand_vec_perm_vpermil (d))
39636 return true;
39638 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39639 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39640 if (expand_vec_perm_pshufb (d))
39641 return true;
39643 return false;
39646 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39647 in terms of a pair of pshuflw + pshufhw instructions. */
39649 static bool
39650 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39652 unsigned char perm2[MAX_VECT_LEN];
39653 unsigned i;
39654 bool ok;
39656 if (d->vmode != V8HImode || !d->one_operand_p)
39657 return false;
39659 /* The two permutations only operate in 64-bit lanes. */
39660 for (i = 0; i < 4; ++i)
39661 if (d->perm[i] >= 4)
39662 return false;
39663 for (i = 4; i < 8; ++i)
39664 if (d->perm[i] < 4)
39665 return false;
39667 if (d->testing_p)
39668 return true;
39670 /* Emit the pshuflw. */
39671 memcpy (perm2, d->perm, 4);
39672 for (i = 4; i < 8; ++i)
39673 perm2[i] = i;
39674 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39675 gcc_assert (ok);
39677 /* Emit the pshufhw. */
39678 memcpy (perm2 + 4, d->perm + 4, 4);
39679 for (i = 0; i < 4; ++i)
39680 perm2[i] = i;
39681 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39682 gcc_assert (ok);
39684 return true;
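/* Illustrative example (added commentary): for the V8HImode permutation
   { 3, 1, 2, 0, 7, 5, 6, 4 } the pshuflw step uses
   { 3, 1, 2, 0, 4, 5, 6, 7 } (high half untouched) and the pshufhw step
   then applies { 0, 1, 2, 3, 7, 5, 6, 4 } to its result, giving the
   requested shuffle in two insns.  */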
39687 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39688 the permutation using the SSSE3 palignr instruction. This succeeds
39689 when all of the elements in PERM fit within one vector and we merely
39690 need to shift them down so that a single vector permutation has a
39691 chance to succeed. */
39693 static bool
39694 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39696 unsigned i, nelt = d->nelt;
39697 unsigned min, max;
39698 bool in_order, ok;
39699 rtx shift;
39701 /* Even with AVX, palignr only operates on 128-bit vectors. */
39702 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39703 return false;
39705 min = nelt, max = 0;
39706 for (i = 0; i < nelt; ++i)
39708 unsigned e = d->perm[i];
39709 if (e < min)
39710 min = e;
39711 if (e > max)
39712 max = e;
39714 if (min == 0 || max - min >= nelt)
39715 return false;
39717 /* Given that we have SSSE3, we know we'll be able to implement the
39718 single operand permutation after the palignr with pshufb. */
39719 if (d->testing_p)
39720 return true;
39722 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39723 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39724 gen_lowpart (TImode, d->op1),
39725 gen_lowpart (TImode, d->op0), shift));
39727 d->op0 = d->op1 = d->target;
39728 d->one_operand_p = true;
39730 in_order = true;
39731 for (i = 0; i < nelt; ++i)
39733 unsigned e = d->perm[i] - min;
39734 if (e != i)
39735 in_order = false;
39736 d->perm[i] = e;
39739 /* Test for the degenerate case where the alignment by itself
39740 produces the desired permutation. */
39741 if (in_order)
39742 return true;
39744 ok = expand_vec_perm_1 (d);
39745 gcc_assert (ok);
39747 return ok;
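/* Illustrative example (added commentary): for V8HImode and
   perm = { 2, 3, 4, 5, 6, 7, 8, 9 } we get min = 2, so the palignr
   shifts the concatenated operands down by two elements (the 32-bit
   count computed above); the leftover permutation { 0, 1, ..., 7 } is
   the identity, and the in_order short-cut returns without needing a
   second shuffle.  */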
39750 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39752 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39753 a two vector permutation into a single vector permutation by using
39754 an interleave operation to merge the vectors. */
39756 static bool
39757 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39759 struct expand_vec_perm_d dremap, dfinal;
39760 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39761 unsigned HOST_WIDE_INT contents;
39762 unsigned char remap[2 * MAX_VECT_LEN];
39763 rtx seq;
39764 bool ok, same_halves = false;
39766 if (GET_MODE_SIZE (d->vmode) == 16)
39768 if (d->one_operand_p)
39769 return false;
39771 else if (GET_MODE_SIZE (d->vmode) == 32)
39773 if (!TARGET_AVX)
39774 return false;
39775 /* For 32-byte modes allow even d->one_operand_p.
39776 The lack of cross-lane shuffling in some instructions
39777 might prevent a single insn shuffle. */
39778 dfinal = *d;
39779 dfinal.testing_p = true;
39780 /* If expand_vec_perm_interleave3 can expand this into
39781 a 3 insn sequence, give up and let it be expanded as
39782 3 insn sequence. While that is one insn longer,
39783 it doesn't need a memory operand and in the common
39784 case that both interleave low and high permutations
39785 with the same operands are adjacent needs 4 insns
39786 for both after CSE. */
39787 if (expand_vec_perm_interleave3 (&dfinal))
39788 return false;
39790 else
39791 return false;
39793 /* Examine from whence the elements come. */
39794 contents = 0;
39795 for (i = 0; i < nelt; ++i)
39796 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39798 memset (remap, 0xff, sizeof (remap));
39799 dremap = *d;
39801 if (GET_MODE_SIZE (d->vmode) == 16)
39803 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39805 /* Split the two input vectors into 4 halves. */
39806 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39807 h2 = h1 << nelt2;
39808 h3 = h2 << nelt2;
39809 h4 = h3 << nelt2;
39811 /* If the elements are from the low halves, use interleave low; similarly
39812 for interleave high. If the elements are from mis-matched halves, we
39813 can use shufps for V4SF/V4SI or do a DImode shuffle. */
39814 if ((contents & (h1 | h3)) == contents)
39816 /* punpckl* */
39817 for (i = 0; i < nelt2; ++i)
39819 remap[i] = i * 2;
39820 remap[i + nelt] = i * 2 + 1;
39821 dremap.perm[i * 2] = i;
39822 dremap.perm[i * 2 + 1] = i + nelt;
39824 if (!TARGET_SSE2 && d->vmode == V4SImode)
39825 dremap.vmode = V4SFmode;
39827 else if ((contents & (h2 | h4)) == contents)
39829 /* punpckh* */
39830 for (i = 0; i < nelt2; ++i)
39832 remap[i + nelt2] = i * 2;
39833 remap[i + nelt + nelt2] = i * 2 + 1;
39834 dremap.perm[i * 2] = i + nelt2;
39835 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39837 if (!TARGET_SSE2 && d->vmode == V4SImode)
39838 dremap.vmode = V4SFmode;
39840 else if ((contents & (h1 | h4)) == contents)
39842 /* shufps */
39843 for (i = 0; i < nelt2; ++i)
39845 remap[i] = i;
39846 remap[i + nelt + nelt2] = i + nelt2;
39847 dremap.perm[i] = i;
39848 dremap.perm[i + nelt2] = i + nelt + nelt2;
39850 if (nelt != 4)
39852 /* shufpd */
39853 dremap.vmode = V2DImode;
39854 dremap.nelt = 2;
39855 dremap.perm[0] = 0;
39856 dremap.perm[1] = 3;
39859 else if ((contents & (h2 | h3)) == contents)
39861 /* shufps */
39862 for (i = 0; i < nelt2; ++i)
39864 remap[i + nelt2] = i;
39865 remap[i + nelt] = i + nelt2;
39866 dremap.perm[i] = i + nelt2;
39867 dremap.perm[i + nelt2] = i + nelt;
39869 if (nelt != 4)
39871 /* shufpd */
39872 dremap.vmode = V2DImode;
39873 dremap.nelt = 2;
39874 dremap.perm[0] = 1;
39875 dremap.perm[1] = 2;
39878 else
39879 return false;
39881 else
39883 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39884 unsigned HOST_WIDE_INT q[8];
39885 unsigned int nonzero_halves[4];
39887 /* Split the two input vectors into 8 quarters. */
39888 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39889 for (i = 1; i < 8; ++i)
39890 q[i] = q[0] << (nelt4 * i);
39891 for (i = 0; i < 4; ++i)
39892 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39894 nonzero_halves[nzcnt] = i;
39895 ++nzcnt;
39898 if (nzcnt == 1)
39900 gcc_assert (d->one_operand_p);
39901 nonzero_halves[1] = nonzero_halves[0];
39902 same_halves = true;
39904 else if (d->one_operand_p)
39906 gcc_assert (nonzero_halves[0] == 0);
39907 gcc_assert (nonzero_halves[1] == 1);
39910 if (nzcnt <= 2)
39912 if (d->perm[0] / nelt2 == nonzero_halves[1])
39914 /* Attempt to increase the likelihood that dfinal
39915 shuffle will be intra-lane. */
39916 char tmph = nonzero_halves[0];
39917 nonzero_halves[0] = nonzero_halves[1];
39918 nonzero_halves[1] = tmph;
39921 /* vperm2f128 or vperm2i128. */
39922 for (i = 0; i < nelt2; ++i)
39924 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39925 remap[i + nonzero_halves[0] * nelt2] = i;
39926 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39927 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39930 if (d->vmode != V8SFmode
39931 && d->vmode != V4DFmode
39932 && d->vmode != V8SImode)
39934 dremap.vmode = V8SImode;
39935 dremap.nelt = 8;
39936 for (i = 0; i < 4; ++i)
39938 dremap.perm[i] = i + nonzero_halves[0] * 4;
39939 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39943 else if (d->one_operand_p)
39944 return false;
39945 else if (TARGET_AVX2
39946 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39948 /* vpunpckl* */
39949 for (i = 0; i < nelt4; ++i)
39951 remap[i] = i * 2;
39952 remap[i + nelt] = i * 2 + 1;
39953 remap[i + nelt2] = i * 2 + nelt2;
39954 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39955 dremap.perm[i * 2] = i;
39956 dremap.perm[i * 2 + 1] = i + nelt;
39957 dremap.perm[i * 2 + nelt2] = i + nelt2;
39958 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39961 else if (TARGET_AVX2
39962 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39964 /* vpunpckh* */
39965 for (i = 0; i < nelt4; ++i)
39967 remap[i + nelt4] = i * 2;
39968 remap[i + nelt + nelt4] = i * 2 + 1;
39969 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39970 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39971 dremap.perm[i * 2] = i + nelt4;
39972 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39973 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39974 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39977 else
39978 return false;
39981 /* Use the remapping array set up above to move the elements from their
39982 swizzled locations into their final destinations. */
39983 dfinal = *d;
39984 for (i = 0; i < nelt; ++i)
39986 unsigned e = remap[d->perm[i]];
39987 gcc_assert (e < nelt);
39988 /* If same_halves is true, both halves of the remapped vector are the
39989 same. Avoid cross-lane accesses if possible. */
39990 if (same_halves && i >= nelt2)
39992 gcc_assert (e < nelt2);
39993 dfinal.perm[i] = e + nelt2;
39995 else
39996 dfinal.perm[i] = e;
39998 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39999 dfinal.op1 = dfinal.op0;
40000 dfinal.one_operand_p = true;
40001 dremap.target = dfinal.op0;
40003 /* Test if the final remap can be done with a single insn. For V4SFmode or
40004 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40005 start_sequence ();
40006 ok = expand_vec_perm_1 (&dfinal);
40007 seq = get_insns ();
40008 end_sequence ();
40010 if (!ok)
40011 return false;
40013 if (d->testing_p)
40014 return true;
40016 if (dremap.vmode != dfinal.vmode)
40018 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
40019 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40020 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40023 ok = expand_vec_perm_1 (&dremap);
40024 gcc_assert (ok);
40026 emit_insn (seq);
40027 return true;
40030 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40031 a single vector cross-lane permutation into vpermq followed
40032 by any of the single insn permutations. */
40034 static bool
40035 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40037 struct expand_vec_perm_d dremap, dfinal;
40038 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40039 unsigned contents[2];
40040 bool ok;
40042 if (!(TARGET_AVX2
40043 && (d->vmode == V32QImode || d->vmode == V16HImode)
40044 && d->one_operand_p))
40045 return false;
40047 contents[0] = 0;
40048 contents[1] = 0;
40049 for (i = 0; i < nelt2; ++i)
40051 contents[0] |= 1u << (d->perm[i] / nelt4);
40052 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40055 for (i = 0; i < 2; ++i)
40057 unsigned int cnt = 0;
40058 for (j = 0; j < 4; ++j)
40059 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40060 return false;
40063 if (d->testing_p)
40064 return true;
40066 dremap = *d;
40067 dremap.vmode = V4DImode;
40068 dremap.nelt = 4;
40069 dremap.target = gen_reg_rtx (V4DImode);
40070 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40071 dremap.op1 = dremap.op0;
40072 dremap.one_operand_p = true;
40073 for (i = 0; i < 2; ++i)
40075 unsigned int cnt = 0;
40076 for (j = 0; j < 4; ++j)
40077 if ((contents[i] & (1u << j)) != 0)
40078 dremap.perm[2 * i + cnt++] = j;
40079 for (; cnt < 2; ++cnt)
40080 dremap.perm[2 * i + cnt] = 0;
40083 dfinal = *d;
40084 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40085 dfinal.op1 = dfinal.op0;
40086 dfinal.one_operand_p = true;
40087 for (i = 0, j = 0; i < nelt; ++i)
40089 if (i == nelt2)
40090 j = 2;
40091 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40092 if ((d->perm[i] / nelt4) == dremap.perm[j])
40094 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40095 dfinal.perm[i] |= nelt4;
40096 else
40097 gcc_unreachable ();
40100 ok = expand_vec_perm_1 (&dremap);
40101 gcc_assert (ok);
40103 ok = expand_vec_perm_1 (&dfinal);
40104 gcc_assert (ok);
40106 return true;
40109 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40110 a vector permutation using two instructions, vperm2f128 resp.
40111 vperm2i128 followed by any single in-lane permutation. */
40113 static bool
40114 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40116 struct expand_vec_perm_d dfirst, dsecond;
40117 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40118 bool ok;
40120 if (!TARGET_AVX
40121 || GET_MODE_SIZE (d->vmode) != 32
40122 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40123 return false;
40125 dsecond = *d;
40126 dsecond.one_operand_p = false;
40127 dsecond.testing_p = true;
40129 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40130 immediate. For perm < 16 the second permutation uses
40131 d->op0 as first operand, for perm >= 16 it uses d->op1
40132 as first operand. The second operand is the result of
40133 vperm2[fi]128. */
40134 for (perm = 0; perm < 32; perm++)
40136 /* Ignore permutations which do not move anything cross-lane. */
40137 if (perm < 16)
40139 /* The second shuffle for e.g. V4DFmode has
40140 0123 and ABCD operands.
40141 Ignore AB23, as 23 is already in the second lane
40142 of the first operand. */
40143 if ((perm & 0xc) == (1 << 2)) continue;
40144 /* And 01CD, as 01 is in the first lane of the first
40145 operand. */
40146 if ((perm & 3) == 0) continue;
40147 /* And 4567, as then the vperm2[fi]128 doesn't change
40148 anything on the original 4567 second operand. */
40149 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40151 else
40153 /* The second shuffle for e.g. V4DFmode has
40154 4567 and ABCD operands.
40155 Ignore AB67, as 67 is already in the second lane
40156 of the first operand. */
40157 if ((perm & 0xc) == (3 << 2)) continue;
40158 /* And 45CD, as 45 is in the first lane of the first
40159 operand. */
40160 if ((perm & 3) == 2) continue;
40161 /* And 0123, as then the vperm2[fi]128 doesn't change
40162 anything on the original 0123 first operand. */
40163 if ((perm & 0xf) == (1 << 2)) continue;
40166 for (i = 0; i < nelt; i++)
40168 j = d->perm[i] / nelt2;
40169 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40170 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40171 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40172 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40173 else
40174 break;
40177 if (i == nelt)
40179 start_sequence ();
40180 ok = expand_vec_perm_1 (&dsecond);
40181 end_sequence ();
40183 else
40184 ok = false;
40186 if (ok)
40188 if (d->testing_p)
40189 return true;
40191 /* Found a usable second shuffle. dfirst will be
40192 vperm2f128 on d->op0 and d->op1. */
40193 dsecond.testing_p = false;
40194 dfirst = *d;
40195 dfirst.target = gen_reg_rtx (d->vmode);
40196 for (i = 0; i < nelt; i++)
40197 dfirst.perm[i] = (i & (nelt2 - 1))
40198 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40200 ok = expand_vec_perm_1 (&dfirst);
40201 gcc_assert (ok);
40203 /* And dsecond is some single insn shuffle, taking
40204 d->op0 and result of vperm2f128 (if perm < 16) or
40205 d->op1 and result of vperm2f128 (otherwise). */
40206 dsecond.op1 = dfirst.target;
40207 if (perm >= 16)
40208 dsecond.op0 = dfirst.op1;
40210 ok = expand_vec_perm_1 (&dsecond);
40211 gcc_assert (ok);
40213 return true;
40216 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40217 if (d->one_operand_p)
40218 return false;
40221 return false;
40224 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40225 a two vector permutation using 2 intra-lane interleave insns
40226 and cross-lane shuffle for 32-byte vectors. */
40228 static bool
40229 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40231 unsigned i, nelt;
40232 rtx (*gen) (rtx, rtx, rtx);
40234 if (d->one_operand_p)
40235 return false;
40236 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40238 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40240 else
40241 return false;
40243 nelt = d->nelt;
40244 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40245 return false;
40246 for (i = 0; i < nelt; i += 2)
40247 if (d->perm[i] != d->perm[0] + i / 2
40248 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40249 return false;
40251 if (d->testing_p)
40252 return true;
40254 switch (d->vmode)
40256 case V32QImode:
40257 if (d->perm[0])
40258 gen = gen_vec_interleave_highv32qi;
40259 else
40260 gen = gen_vec_interleave_lowv32qi;
40261 break;
40262 case V16HImode:
40263 if (d->perm[0])
40264 gen = gen_vec_interleave_highv16hi;
40265 else
40266 gen = gen_vec_interleave_lowv16hi;
40267 break;
40268 case V8SImode:
40269 if (d->perm[0])
40270 gen = gen_vec_interleave_highv8si;
40271 else
40272 gen = gen_vec_interleave_lowv8si;
40273 break;
40274 case V4DImode:
40275 if (d->perm[0])
40276 gen = gen_vec_interleave_highv4di;
40277 else
40278 gen = gen_vec_interleave_lowv4di;
40279 break;
40280 case V8SFmode:
40281 if (d->perm[0])
40282 gen = gen_vec_interleave_highv8sf;
40283 else
40284 gen = gen_vec_interleave_lowv8sf;
40285 break;
40286 case V4DFmode:
40287 if (d->perm[0])
40288 gen = gen_vec_interleave_highv4df;
40289 else
40290 gen = gen_vec_interleave_lowv4df;
40291 break;
40292 default:
40293 gcc_unreachable ();
40296 emit_insn (gen (d->target, d->op0, d->op1));
40297 return true;
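/* Illustrative note (added commentary): the guard above only accepts
   full interleaves, i.e. for V8SImode either { 0, 8, 1, 9, 2, 10, 3, 11 }
   (perm[0] == 0, interleave low) or { 4, 12, 5, 13, 6, 14, 7, 15 }
   (perm[0] == nelt/2, interleave high); the vec_interleave_* expanders
   then produce the two in-lane unpacks plus the cross-lane shuffle
   mentioned in the comment above.  */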
40300 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40301 a single vector permutation using a single intra-lane vector
40302 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40303 the non-swapped and swapped vectors together. */
40305 static bool
40306 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40308 struct expand_vec_perm_d dfirst, dsecond;
40309 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40310 rtx seq;
40311 bool ok;
40312 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40314 if (!TARGET_AVX
40315 || TARGET_AVX2
40316 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40317 || !d->one_operand_p)
40318 return false;
40320 dfirst = *d;
40321 for (i = 0; i < nelt; i++)
40322 dfirst.perm[i] = 0xff;
40323 for (i = 0, msk = 0; i < nelt; i++)
40325 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40326 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40327 return false;
40328 dfirst.perm[j] = d->perm[i];
40329 if (j != i)
40330 msk |= (1 << i);
40332 for (i = 0; i < nelt; i++)
40333 if (dfirst.perm[i] == 0xff)
40334 dfirst.perm[i] = i;
40336 if (!d->testing_p)
40337 dfirst.target = gen_reg_rtx (dfirst.vmode);
40339 start_sequence ();
40340 ok = expand_vec_perm_1 (&dfirst);
40341 seq = get_insns ();
40342 end_sequence ();
40344 if (!ok)
40345 return false;
40347 if (d->testing_p)
40348 return true;
40350 emit_insn (seq);
40352 dsecond = *d;
40353 dsecond.op0 = dfirst.target;
40354 dsecond.op1 = dfirst.target;
40355 dsecond.one_operand_p = true;
40356 dsecond.target = gen_reg_rtx (dsecond.vmode);
40357 for (i = 0; i < nelt; i++)
40358 dsecond.perm[i] = i ^ nelt2;
40360 ok = expand_vec_perm_1 (&dsecond);
40361 gcc_assert (ok);
40363 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40364 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40365 return true;
40368 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40369 permutation using two vperm2f128, followed by a vshufpd insn blending
40370 the two vectors together. */
40372 static bool
40373 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40375 struct expand_vec_perm_d dfirst, dsecond, dthird;
40376 bool ok;
40378 if (!TARGET_AVX || (d->vmode != V4DFmode))
40379 return false;
40381 if (d->testing_p)
40382 return true;
40384 dfirst = *d;
40385 dsecond = *d;
40386 dthird = *d;
40388 dfirst.perm[0] = (d->perm[0] & ~1);
40389 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40390 dfirst.perm[2] = (d->perm[2] & ~1);
40391 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40392 dsecond.perm[0] = (d->perm[1] & ~1);
40393 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40394 dsecond.perm[2] = (d->perm[3] & ~1);
40395 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40396 dthird.perm[0] = (d->perm[0] % 2);
40397 dthird.perm[1] = (d->perm[1] % 2) + 4;
40398 dthird.perm[2] = (d->perm[2] % 2) + 2;
40399 dthird.perm[3] = (d->perm[3] % 2) + 6;
40401 dfirst.target = gen_reg_rtx (dfirst.vmode);
40402 dsecond.target = gen_reg_rtx (dsecond.vmode);
40403 dthird.op0 = dfirst.target;
40404 dthird.op1 = dsecond.target;
40405 dthird.one_operand_p = false;
40407 canonicalize_perm (&dfirst);
40408 canonicalize_perm (&dsecond);
40410 ok = expand_vec_perm_1 (&dfirst)
40411 && expand_vec_perm_1 (&dsecond)
40412 && expand_vec_perm_1 (&dthird);
40414 gcc_assert (ok);
40416 return true;
40419 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40420 permutation with two pshufb insns and an ior. We should have already
40421 failed all two instruction sequences. */
40423 static bool
40424 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40426 rtx rperm[2][16], vperm, l, h, op, m128;
40427 unsigned int i, nelt, eltsz;
40429 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40430 return false;
40431 gcc_assert (!d->one_operand_p);
40433 nelt = d->nelt;
40434 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40436 /* Generate two permutation masks. If the required element is within
40437 the given vector it is shuffled into the proper lane. If the required
40438 element is in the other vector, force a zero into the lane by setting
40439 bit 7 in the permutation mask. */
40440 m128 = GEN_INT (-128);
40441 for (i = 0; i < nelt; ++i)
40443 unsigned j, e = d->perm[i];
40444 unsigned which = (e >= nelt);
40445 if (e >= nelt)
40446 e -= nelt;
40448 for (j = 0; j < eltsz; ++j)
40450 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40451 rperm[1-which][i*eltsz + j] = m128;
40455 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40456 vperm = force_reg (V16QImode, vperm);
40458 l = gen_reg_rtx (V16QImode);
40459 op = gen_lowpart (V16QImode, d->op0);
40460 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40462 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40463 vperm = force_reg (V16QImode, vperm);
40465 h = gen_reg_rtx (V16QImode);
40466 op = gen_lowpart (V16QImode, d->op1);
40467 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40469 op = gen_lowpart (V16QImode, d->target);
40470 emit_insn (gen_iorv16qi3 (op, l, h));
40472 return true;
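/* Illustrative example (added commentary): for the V8HImode extract-even
   permutation { 0, 2, 4, 6, 8, 10, 12, 14 } the mask applied to op0 is
   the byte vector { 0, 1, 4, 5, 8, 9, 12, 13, then eight -128s } and the
   mask applied to op1 is { eight -128s, then 0, 1, 4, 5, 8, 9, 12, 13 };
   each pshufb zeroes the lanes owned by the other operand and the final
   ior merges the two results.  */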
40475 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
40476 with two vpshufb insns, vpermq and vpor. We should have already failed
40477 all two or three instruction sequences. */
40479 static bool
40480 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40482 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40483 unsigned int i, nelt, eltsz;
40485 if (!TARGET_AVX2
40486 || !d->one_operand_p
40487 || (d->vmode != V32QImode && d->vmode != V16HImode))
40488 return false;
40490 if (d->testing_p)
40491 return true;
40493 nelt = d->nelt;
40494 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40496 /* Generate two permutation masks. If the required element is within
40497 the same lane, it is shuffled in. If the required element is from the
40498 other lane, force a zero by setting bit 7 in the permutation mask.
40499 The other mask has non-negative elements when the element is
40500 requested from the other lane, but also moved to the other lane,
40501 so that the result of vpshufb can have the two V2TImode halves
40502 swapped. */
40503 m128 = GEN_INT (-128);
40504 for (i = 0; i < nelt; ++i)
40506 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40507 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40509 for (j = 0; j < eltsz; ++j)
40511 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40512 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40516 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40517 vperm = force_reg (V32QImode, vperm);
40519 h = gen_reg_rtx (V32QImode);
40520 op = gen_lowpart (V32QImode, d->op0);
40521 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40523 /* Swap the 128-bit lanes of h into hp. */
40524 hp = gen_reg_rtx (V4DImode);
40525 op = gen_lowpart (V4DImode, h);
40526 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40527 const1_rtx));
40529 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40530 vperm = force_reg (V32QImode, vperm);
40532 l = gen_reg_rtx (V32QImode);
40533 op = gen_lowpart (V32QImode, d->op0);
40534 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40536 op = gen_lowpart (V32QImode, d->target);
40537 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40539 return true;
40542 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40543 and extract-odd permutations of two V32QImode and V16QImode operand
40544 with two vpshufb insns, vpor and vpermq. We should have already
40545 failed all two or three instruction sequences. */
40547 static bool
40548 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40550 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40551 unsigned int i, nelt, eltsz;
40553 if (!TARGET_AVX2
40554 || d->one_operand_p
40555 || (d->vmode != V32QImode && d->vmode != V16HImode))
40556 return false;
40558 for (i = 0; i < d->nelt; ++i)
40559 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40560 return false;
40562 if (d->testing_p)
40563 return true;
40565 nelt = d->nelt;
40566 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40568 /* Generate two permutation masks. In the first permutation mask
40569 the first quarter will contain indexes for the first half
40570 of the op0, the second quarter will contain bit 7 set, third quarter
40571 will contain indexes for the second half of the op0 and the
40572 last quarter bit 7 set. In the second permutation mask
40573 the first quarter will contain bit 7 set, the second quarter
40574 indexes for the first half of the op1, the third quarter bit 7 set
40575 and last quarter indexes for the second half of the op1.
40576 I.e. the first mask e.g. for V32QImode extract even will be:
40577 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40578 (all values masked with 0xf except for -128) and second mask
40579 for extract even will be
40580 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40581 m128 = GEN_INT (-128);
40582 for (i = 0; i < nelt; ++i)
40584 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40585 unsigned which = d->perm[i] >= nelt;
40586 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40588 for (j = 0; j < eltsz; ++j)
40590 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40591 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40595 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40596 vperm = force_reg (V32QImode, vperm);
40598 l = gen_reg_rtx (V32QImode);
40599 op = gen_lowpart (V32QImode, d->op0);
40600 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40602 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40603 vperm = force_reg (V32QImode, vperm);
40605 h = gen_reg_rtx (V32QImode);
40606 op = gen_lowpart (V32QImode, d->op1);
40607 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40609 ior = gen_reg_rtx (V32QImode);
40610 emit_insn (gen_iorv32qi3 (ior, l, h));
40612 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40613 op = gen_lowpart (V4DImode, d->target);
40614 ior = gen_lowpart (V4DImode, ior);
40615 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40616 const1_rtx, GEN_INT (3)));
40618 return true;
40621 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40622 and extract-odd permutations. */
40624 static bool
40625 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40627 rtx t1, t2, t3;
40629 switch (d->vmode)
40631 case V4DFmode:
40632 t1 = gen_reg_rtx (V4DFmode);
40633 t2 = gen_reg_rtx (V4DFmode);
40635 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40636 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40637 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40639 /* Now an unpck[lh]pd will produce the result required. */
40640 if (odd)
40641 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40642 else
40643 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40644 emit_insn (t3);
40645 break;
40647 case V8SFmode:
40649 int mask = odd ? 0xdd : 0x88;
40651 t1 = gen_reg_rtx (V8SFmode);
40652 t2 = gen_reg_rtx (V8SFmode);
40653 t3 = gen_reg_rtx (V8SFmode);
40655 /* Shuffle within the 128-bit lanes to produce:
40656 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40657 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40658 GEN_INT (mask)));
40660 /* Shuffle the lanes around to produce:
40661 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40662 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40663 GEN_INT (0x3)));
40665 /* Shuffle within the 128-bit lanes to produce:
40666 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40667 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40669 /* Shuffle within the 128-bit lanes to produce:
40670 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40671 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40673 /* Shuffle the lanes around to produce:
40674 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40675 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40676 GEN_INT (0x20)));
40678 break;
40680 case V2DFmode:
40681 case V4SFmode:
40682 case V2DImode:
40683 case V4SImode:
40684 /* These are always directly implementable by expand_vec_perm_1. */
40685 gcc_unreachable ();
40687 case V8HImode:
40688 if (TARGET_SSSE3)
40689 return expand_vec_perm_pshufb2 (d);
40690 else
40692 /* We need 2*log2(N)-1 operations to achieve odd/even
40693 with interleave. */
40694 t1 = gen_reg_rtx (V8HImode);
40695 t2 = gen_reg_rtx (V8HImode);
40696 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40697 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40698 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40699 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40700 if (odd)
40701 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40702 else
40703 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40704 emit_insn (t3);
40706 break;
40708 case V16QImode:
40709 if (TARGET_SSSE3)
40710 return expand_vec_perm_pshufb2 (d);
40711 else
40713 t1 = gen_reg_rtx (V16QImode);
40714 t2 = gen_reg_rtx (V16QImode);
40715 t3 = gen_reg_rtx (V16QImode);
40716 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40717 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40718 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40719 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40720 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40721 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40722 if (odd)
40723 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40724 else
40725 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40726 emit_insn (t3);
40728 break;
40730 case V16HImode:
40731 case V32QImode:
40732 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40734 case V4DImode:
40735 if (!TARGET_AVX2)
40737 struct expand_vec_perm_d d_copy = *d;
40738 d_copy.vmode = V4DFmode;
40739 d_copy.target = gen_lowpart (V4DFmode, d->target);
40740 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40741 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40742 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40745 t1 = gen_reg_rtx (V4DImode);
40746 t2 = gen_reg_rtx (V4DImode);
40748 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40749 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40750 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40752 /* Now an vpunpck[lh]qdq will produce the result required. */
40753 if (odd)
40754 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40755 else
40756 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40757 emit_insn (t3);
40758 break;
40760 case V8SImode:
40761 if (!TARGET_AVX2)
40763 struct expand_vec_perm_d d_copy = *d;
40764 d_copy.vmode = V8SFmode;
40765 d_copy.target = gen_lowpart (V8SFmode, d->target);
40766 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40767 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40768 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40771 t1 = gen_reg_rtx (V8SImode);
40772 t2 = gen_reg_rtx (V8SImode);
40774 /* Shuffle the lanes around into
40775 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40776 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40777 gen_lowpart (V4DImode, d->op0),
40778 gen_lowpart (V4DImode, d->op1),
40779 GEN_INT (0x20)));
40780 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40781 gen_lowpart (V4DImode, d->op0),
40782 gen_lowpart (V4DImode, d->op1),
40783 GEN_INT (0x31)));
40785 /* Swap the 2nd and 3rd position in each lane into
40786 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40787 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40788 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40789 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40790 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40792 /* Now an vpunpck[lh]qdq will produce
40793 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40794 if (odd)
40795 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40796 gen_lowpart (V4DImode, t1),
40797 gen_lowpart (V4DImode, t2));
40798 else
40799 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40800 gen_lowpart (V4DImode, t1),
40801 gen_lowpart (V4DImode, t2));
40802 emit_insn (t3);
40803 break;
40805 default:
40806 gcc_unreachable ();
40809 return true;
40812 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40813 extract-even and extract-odd permutations. */
40815 static bool
40816 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40818 unsigned i, odd, nelt = d->nelt;
40820 odd = d->perm[0];
40821 if (odd != 0 && odd != 1)
40822 return false;
40824 for (i = 1; i < nelt; ++i)
40825 if (d->perm[i] != 2 * i + odd)
40826 return false;
40828 return expand_vec_perm_even_odd_1 (d, odd);
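/* For instance, an extract-odd of two V8SImode operands uses the selector
   { 1, 3, 5, 7, 9, 11, 13, 15 }: d->perm[0] is 1 and every later element
   equals 2 * i + 1, so the check above succeeds with odd == 1.  */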
40831 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40832 permutations. We assume that expand_vec_perm_1 has already failed. */
40834 static bool
40835 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40837 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40838 enum machine_mode vmode = d->vmode;
40839 unsigned char perm2[4];
40840 rtx op0 = d->op0;
40841 bool ok;
40843 switch (vmode)
40845 case V4DFmode:
40846 case V8SFmode:
40847 /* These are special-cased in sse.md so that we can optionally
40848 use the vbroadcast instruction. They expand to two insns
40849 if the input happens to be in a register. */
40850 gcc_unreachable ();
40852 case V2DFmode:
40853 case V2DImode:
40854 case V4SFmode:
40855 case V4SImode:
40856 /* These are always implementable using standard shuffle patterns. */
40857 gcc_unreachable ();
40859 case V8HImode:
40860 case V16QImode:
40861 /* These can be implemented via interleave. We save one insn by
40862 stopping once we have promoted to V4SImode and then using pshufd. */
40865 rtx dest;
40866 rtx (*gen) (rtx, rtx, rtx)
40867 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40868 : gen_vec_interleave_lowv8hi;
40870 if (elt >= nelt2)
40872 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40873 : gen_vec_interleave_highv8hi;
40874 elt -= nelt2;
40876 nelt2 /= 2;
40878 dest = gen_reg_rtx (vmode);
40879 emit_insn (gen (dest, op0, op0));
40880 vmode = get_mode_wider_vector (vmode);
40881 op0 = gen_lowpart (vmode, dest);
40883 while (vmode != V4SImode);
40885 memset (perm2, elt, 4);
40886 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40887 d->testing_p);
40888 gcc_assert (ok);
40889 return true;
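/* As an illustration, broadcasting byte 5 of a V16QImode vector takes two
   trips through the loop above: a low byte interleave leaves the value in
   word 5 of the V8HImode intermediate, a high word interleave then leaves
   it in dword 1 of the V4SImode intermediate, and the final pshufd uses
   the { 1, 1, 1, 1 } selector.  */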
40891 case V32QImode:
40892 case V16HImode:
40893 case V8SImode:
40894 case V4DImode:
40895 /* For AVX2 broadcasts of the first element vpbroadcast* or
40896 vpermq should be used by expand_vec_perm_1. */
40897 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40898 return false;
40900 default:
40901 gcc_unreachable ();
40905 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40906 broadcast permutations. */
40908 static bool
40909 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40911 unsigned i, elt, nelt = d->nelt;
40913 if (!d->one_operand_p)
40914 return false;
40916 elt = d->perm[0];
40917 for (i = 1; i < nelt; ++i)
40918 if (d->perm[i] != elt)
40919 return false;
40921 return expand_vec_perm_broadcast_1 (d);
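/* E.g. the selector { 3, 3, 3, 3, 3, 3, 3, 3 } on a single V8HImode
   operand is matched here as a broadcast of element 3.  */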
40924 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40925 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40926 all the shorter instruction sequences. */
40928 static bool
40929 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40931 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40932 unsigned int i, nelt, eltsz;
40933 bool used[4];
40935 if (!TARGET_AVX2
40936 || d->one_operand_p
40937 || (d->vmode != V32QImode && d->vmode != V16HImode))
40938 return false;
40940 if (d->testing_p)
40941 return true;
40943 nelt = d->nelt;
40944 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40946 /* Generate 4 permutation masks. If the required element is within
40947 the same lane, it is shuffled in.  If the required element is from the
40948 other lane, force a zero by setting bit 7 in the permutation mask.
40949 The cross-lane masks have non-negative elements where the element is
40950 requested from the other lane, but also moved to the other lane,
40951 so that the result of vpshufb can have the two V2TImode halves
40952 swapped. */
40953 m128 = GEN_INT (-128);
40954 for (i = 0; i < 32; ++i)
40956 rperm[0][i] = m128;
40957 rperm[1][i] = m128;
40958 rperm[2][i] = m128;
40959 rperm[3][i] = m128;
40961 used[0] = false;
40962 used[1] = false;
40963 used[2] = false;
40964 used[3] = false;
40965 for (i = 0; i < nelt; ++i)
40967 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40968 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40969 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40971 for (j = 0; j < eltsz; ++j)
40972 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40973 used[which] = true;
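/* At this point every element has been routed into one of the four masks
   according to which = (from op1 ? 2 : 0) + (crosses a 128-bit lane ? 1 : 0):
   masks 0 and 2 feed the in-lane vpshufb of op0 resp. op1 below, while
   masks 1 and 3 feed the vpshufb whose result subsequently has its lanes
   swapped by vpermq.  */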
40976 for (i = 0; i < 2; ++i)
40978 if (!used[2 * i + 1])
40980 h[i] = NULL_RTX;
40981 continue;
40983 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40984 gen_rtvec_v (32, rperm[2 * i + 1]));
40985 vperm = force_reg (V32QImode, vperm);
40986 h[i] = gen_reg_rtx (V32QImode);
40987 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40988 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40991 /* Swap the 128-bit lanes of h[X]. */
40992 for (i = 0; i < 2; ++i)
40994 if (h[i] == NULL_RTX)
40995 continue;
40996 op = gen_reg_rtx (V4DImode);
40997 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40998 const2_rtx, GEN_INT (3), const0_rtx,
40999 const1_rtx));
41000 h[i] = gen_lowpart (V32QImode, op);
41003 for (i = 0; i < 2; ++i)
41005 if (!used[2 * i])
41007 l[i] = NULL_RTX;
41008 continue;
41010 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41011 vperm = force_reg (V32QImode, vperm);
41012 l[i] = gen_reg_rtx (V32QImode);
41013 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41014 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41017 for (i = 0; i < 2; ++i)
41019 if (h[i] && l[i])
41021 op = gen_reg_rtx (V32QImode);
41022 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41023 l[i] = op;
41025 else if (h[i])
41026 l[i] = h[i];
41029 gcc_assert (l[0] && l[1]);
41030 op = gen_lowpart (V32QImode, d->target);
41031 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41032 return true;
41035 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41036 With all of the interface bits taken care of, perform the expansion
41037 in D and return true on success. */
41039 static bool
41040 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41042 /* Try a single instruction expansion. */
41043 if (expand_vec_perm_1 (d))
41044 return true;
41046 /* Try sequences of two instructions. */
41048 if (expand_vec_perm_pshuflw_pshufhw (d))
41049 return true;
41051 if (expand_vec_perm_palignr (d))
41052 return true;
41054 if (expand_vec_perm_interleave2 (d))
41055 return true;
41057 if (expand_vec_perm_broadcast (d))
41058 return true;
41060 if (expand_vec_perm_vpermq_perm_1 (d))
41061 return true;
41063 if (expand_vec_perm_vperm2f128 (d))
41064 return true;
41066 /* Try sequences of three instructions. */
41068 if (expand_vec_perm_2vperm2f128_vshuf (d))
41069 return true;
41071 if (expand_vec_perm_pshufb2 (d))
41072 return true;
41074 if (expand_vec_perm_interleave3 (d))
41075 return true;
41077 if (expand_vec_perm_vperm2f128_vblend (d))
41078 return true;
41080 /* Try sequences of four instructions. */
41082 if (expand_vec_perm_vpshufb2_vpermq (d))
41083 return true;
41085 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41086 return true;
41088 /* ??? Look for narrow permutations whose element orderings would
41089 allow the promotion to a wider mode. */
41091 /* ??? Look for sequences of interleave or a wider permute that place
41092 the data into the correct lanes for a half-vector shuffle like
41093 pshuf[lh]w or vpermilps. */
41095 /* ??? Look for sequences of interleave that produce the desired results.
41096 The combinatorics of punpck[lh] get pretty ugly... */
41098 if (expand_vec_perm_even_odd (d))
41099 return true;
41101 /* Even longer sequences. */
41102 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41103 return true;
41105 return false;
41108 /* If a permutation only uses one operand, make it clear. Returns true
41109 if the permutation references both operands. */
41111 static bool
41112 canonicalize_perm (struct expand_vec_perm_d *d)
41114 int i, which, nelt = d->nelt;
41116 for (i = which = 0; i < nelt; ++i)
41117 which |= (d->perm[i] < nelt ? 1 : 2);
41119 d->one_operand_p = true;
41120 switch (which)
41122 default:
41123 gcc_unreachable();
41125 case 3:
41126 if (!rtx_equal_p (d->op0, d->op1))
41128 d->one_operand_p = false;
41129 break;
41131 /* The elements of PERM do not suggest that only the first operand
41132 is used, but both operands are identical. Allow easier matching
41133 of the permutation by folding the permutation into the single
41134 input vector. */
41135 /* FALLTHRU */
41137 case 2:
41138 for (i = 0; i < nelt; ++i)
41139 d->perm[i] &= nelt - 1;
41140 d->op0 = d->op1;
41141 break;
41143 case 1:
41144 d->op1 = d->op0;
41145 break;
41148 return (which == 3);
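/* Example: with nelt == 2 the selector { 2, 3 } references only the second
   operand, so case 2 above rewrites it to { 0, 1 } and copies op1 into op0;
   the function then returns false since a single input suffices.  */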
41151 bool
41152 ix86_expand_vec_perm_const (rtx operands[4])
41154 struct expand_vec_perm_d d;
41155 unsigned char perm[MAX_VECT_LEN];
41156 int i, nelt;
41157 bool two_args;
41158 rtx sel;
41160 d.target = operands[0];
41161 d.op0 = operands[1];
41162 d.op1 = operands[2];
41163 sel = operands[3];
41165 d.vmode = GET_MODE (d.target);
41166 gcc_assert (VECTOR_MODE_P (d.vmode));
41167 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41168 d.testing_p = false;
41170 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41171 gcc_assert (XVECLEN (sel, 0) == nelt);
41172 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41174 for (i = 0; i < nelt; ++i)
41176 rtx e = XVECEXP (sel, 0, i);
41177 int ei = INTVAL (e) & (2 * nelt - 1);
41178 d.perm[i] = ei;
41179 perm[i] = ei;
41182 two_args = canonicalize_perm (&d);
41184 if (ix86_expand_vec_perm_const_1 (&d))
41185 return true;
41187 /* If the selector says both arguments are needed, but the operands are the
41188 same, the above tried to expand with one_operand_p and flattened selector.
41189 If that didn't work, retry without one_operand_p; we succeeded with that
41190 during testing. */
41191 if (two_args && d.one_operand_p)
41193 d.one_operand_p = false;
41194 memcpy (d.perm, perm, sizeof (perm));
41195 return ix86_expand_vec_perm_const_1 (&d);
41198 return false;
41201 /* Implement targetm.vectorize.vec_perm_const_ok. */
41203 static bool
41204 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41205 const unsigned char *sel)
41207 struct expand_vec_perm_d d;
41208 unsigned int i, nelt, which;
41209 bool ret;
41211 d.vmode = vmode;
41212 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41213 d.testing_p = true;
41215 /* Given sufficient ISA support we can just return true here
41216 for selected vector modes. */
41217 if (GET_MODE_SIZE (d.vmode) == 16)
41219 /* All implementable with a single vpperm insn. */
41220 if (TARGET_XOP)
41221 return true;
41222 /* All implementable with 2 pshufb + 1 ior. */
41223 if (TARGET_SSSE3)
41224 return true;
41225 /* All implementable with shufpd or unpck[lh]pd. */
41226 if (d.nelt == 2)
41227 return true;
41230 /* Extract the values from the vector CST into the permutation
41231 array in D. */
41232 memcpy (d.perm, sel, nelt);
41233 for (i = which = 0; i < nelt; ++i)
41235 unsigned char e = d.perm[i];
41236 gcc_assert (e < 2 * nelt);
41237 which |= (e < nelt ? 1 : 2);
41240 /* If all elements are from the second vector, fold them into the first. */
41241 if (which == 2)
41242 for (i = 0; i < nelt; ++i)
41243 d.perm[i] -= nelt;
41245 /* Check whether the mask can be applied to the vector type. */
41246 d.one_operand_p = (which != 3);
41248 /* Implementable with shufps or pshufd. */
41249 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41250 return true;
41252 /* Otherwise we have to go through the motions and see if we can
41253 figure out how to generate the requested permutation. */
41254 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41255 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41256 if (!d.one_operand_p)
41257 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41259 start_sequence ();
41260 ret = ix86_expand_vec_perm_const_1 (&d);
41261 end_sequence ();
41263 return ret;
41266 void
41267 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41269 struct expand_vec_perm_d d;
41270 unsigned i, nelt;
41272 d.target = targ;
41273 d.op0 = op0;
41274 d.op1 = op1;
41275 d.vmode = GET_MODE (targ);
41276 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41277 d.one_operand_p = false;
41278 d.testing_p = false;
41280 for (i = 0; i < nelt; ++i)
41281 d.perm[i] = i * 2 + odd;
41283 /* We'll either be able to implement the permutation directly... */
41284 if (expand_vec_perm_1 (&d))
41285 return;
41287 /* ... or we use the special-case patterns. */
41288 expand_vec_perm_even_odd_1 (&d, odd);
41291 static void
41292 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41294 struct expand_vec_perm_d d;
41295 unsigned i, nelt, base;
41296 bool ok;
41298 d.target = targ;
41299 d.op0 = op0;
41300 d.op1 = op1;
41301 d.vmode = GET_MODE (targ);
41302 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41303 d.one_operand_p = false;
41304 d.testing_p = false;
41306 base = high_p ? nelt / 2 : 0;
41307 for (i = 0; i < nelt / 2; ++i)
41309 d.perm[i * 2] = i + base;
41310 d.perm[i * 2 + 1] = i + base + nelt;
41313 /* Note that for AVX this isn't one instruction. */
41314 ok = ix86_expand_vec_perm_const_1 (&d);
41315 gcc_assert (ok);
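/* For a V4SImode target with high_p set, the loop above builds the
   selector { 2, 6, 3, 7 }, i.e. the high halves of the two inputs
   interleaved.  */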
41319 /* Expand a vector operation CODE for a V*QImode in terms of the
41320 same operation on V*HImode. */
41322 void
41323 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41325 enum machine_mode qimode = GET_MODE (dest);
41326 enum machine_mode himode;
41327 rtx (*gen_il) (rtx, rtx, rtx);
41328 rtx (*gen_ih) (rtx, rtx, rtx);
41329 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41330 struct expand_vec_perm_d d;
41331 bool ok, full_interleave;
41332 bool uns_p = false;
41333 int i;
41335 switch (qimode)
41337 case V16QImode:
41338 himode = V8HImode;
41339 gen_il = gen_vec_interleave_lowv16qi;
41340 gen_ih = gen_vec_interleave_highv16qi;
41341 break;
41342 case V32QImode:
41343 himode = V16HImode;
41344 gen_il = gen_avx2_interleave_lowv32qi;
41345 gen_ih = gen_avx2_interleave_highv32qi;
41346 break;
41347 default:
41348 gcc_unreachable ();
41351 op2_l = op2_h = op2;
41352 switch (code)
41354 case MULT:
41355 /* Unpack data such that we've got a source byte in each low byte of
41356 each word. We don't care what goes into the high byte of each word.
41357 Rather than trying to get zero in there, most convenient is to let
41358 it be a copy of the low byte. */
41359 op2_l = gen_reg_rtx (qimode);
41360 op2_h = gen_reg_rtx (qimode);
41361 emit_insn (gen_il (op2_l, op2, op2));
41362 emit_insn (gen_ih (op2_h, op2, op2));
41363 /* FALLTHRU */
41365 op1_l = gen_reg_rtx (qimode);
41366 op1_h = gen_reg_rtx (qimode);
41367 emit_insn (gen_il (op1_l, op1, op1));
41368 emit_insn (gen_ih (op1_h, op1, op1));
41369 full_interleave = qimode == V16QImode;
41370 break;
41372 case ASHIFT:
41373 case LSHIFTRT:
41374 uns_p = true;
41375 /* FALLTHRU */
41376 case ASHIFTRT:
41377 op1_l = gen_reg_rtx (himode);
41378 op1_h = gen_reg_rtx (himode);
41379 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41380 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41381 full_interleave = true;
41382 break;
41383 default:
41384 gcc_unreachable ();
41387 /* Perform the operation. */
41388 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41389 1, OPTAB_DIRECT);
41390 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41391 1, OPTAB_DIRECT);
41392 gcc_assert (res_l && res_h);
41394 /* Merge the data back into the right place. */
41395 d.target = dest;
41396 d.op0 = gen_lowpart (qimode, res_l);
41397 d.op1 = gen_lowpart (qimode, res_h);
41398 d.vmode = qimode;
41399 d.nelt = GET_MODE_NUNITS (qimode);
41400 d.one_operand_p = false;
41401 d.testing_p = false;
41403 if (full_interleave)
41405 /* For SSE2, we used a full interleave, so the desired
41406 results are in the even elements. */
41407 for (i = 0; i < 32; ++i)
41408 d.perm[i] = i * 2;
41410 else
41412 /* For AVX, the interleave used above was not cross-lane. So the
41413 extraction is of the even elements, but with the second and third quarters swapped.
41414 Happily, that is even one insn shorter than even extraction. */
41415 for (i = 0; i < 32; ++i)
41416 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
41419 ok = ix86_expand_vec_perm_const_1 (&d);
41420 gcc_assert (ok);
41422 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41423 gen_rtx_fmt_ee (code, qimode, op1, op2));
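/* For the AVX2 V32QImode case the selector built above is
   { 0, 2, ..., 14, 32, 34, ..., 46, 16, 18, ..., 30, 48, 50, ..., 62 }:
   the even bytes of the low lane of res_l, then of the low lane of res_h,
   then of the high lane of res_l, then of the high lane of res_h.  */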
41426 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41427 if op is CONST_VECTOR with all odd elements equal to their
41428 preceding element. */
41430 static bool
41431 const_vector_equal_evenodd_p (rtx op)
41433 enum machine_mode mode = GET_MODE (op);
41434 int i, nunits = GET_MODE_NUNITS (mode);
41435 if (GET_CODE (op) != CONST_VECTOR
41436 || nunits != CONST_VECTOR_NUNITS (op))
41437 return false;
41438 for (i = 0; i < nunits; i += 2)
41439 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41440 return false;
41441 return true;
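/* E.g. { 7, 7, 9, 9 } satisfies this test while { 7, 8, 9, 9 } does not.  */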
41444 void
41445 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41446 bool uns_p, bool odd_p)
41448 enum machine_mode mode = GET_MODE (op1);
41449 enum machine_mode wmode = GET_MODE (dest);
41450 rtx x;
41451 rtx orig_op1 = op1, orig_op2 = op2;
41453 if (!nonimmediate_operand (op1, mode))
41454 op1 = force_reg (mode, op1);
41455 if (!nonimmediate_operand (op2, mode))
41456 op2 = force_reg (mode, op2);
41458 /* We only play even/odd games with vectors of SImode. */
41459 gcc_assert (mode == V4SImode || mode == V8SImode);
41461 /* If we're looking for the odd results, shift those members down to
41462 the even slots. For some cpus this is faster than a PSHUFD. */
41463 if (odd_p)
41465 /* For XOP use vpmacsdqh, but only for smult, as it is only
41466 signed. */
41467 if (TARGET_XOP && mode == V4SImode && !uns_p)
41469 x = force_reg (wmode, CONST0_RTX (wmode));
41470 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41471 return;
41474 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41475 if (!const_vector_equal_evenodd_p (orig_op1))
41476 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41477 x, NULL, 1, OPTAB_DIRECT);
41478 if (!const_vector_equal_evenodd_p (orig_op2))
41479 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41480 x, NULL, 1, OPTAB_DIRECT);
41481 op1 = gen_lowpart (mode, op1);
41482 op2 = gen_lowpart (mode, op2);
41485 if (mode == V8SImode)
41487 if (uns_p)
41488 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41489 else
41490 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41492 else if (uns_p)
41493 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41494 else if (TARGET_SSE4_1)
41495 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41496 else
41498 rtx s1, s2, t0, t1, t2;
41500 /* The easiest way to implement this without PMULDQ is to go through
41501 the motions as if we are performing a full 64-bit multiply, except
41502 that we need to do less shuffling of the elements. */
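/* Written out, the computation below uses the identity (mod 2^64, where
   a and b denote the unsigned 32-bit lane values and "a neg" means the
   lane is negative when viewed as signed)
     signed(a) * signed(b) = a * b - (((a neg ? b : 0) + (b neg ? a : 0)) << 32).
   s1 and s2 are the all-ones masks for "a neg" resp. "b neg", and
   multiplying by an all-ones mask and then shifting left by 32 negates
   the term mod 2^64, so the additions below implement the subtraction.  */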
41504 /* Compute the sign-extension, aka highparts, of the two operands. */
41505 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41506 op1, pc_rtx, pc_rtx);
41507 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41508 op2, pc_rtx, pc_rtx);
41510 /* Multiply LO(A) * HI(B), and vice-versa. */
41511 t1 = gen_reg_rtx (wmode);
41512 t2 = gen_reg_rtx (wmode);
41513 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41514 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41516 /* Multiply LO(A) * LO(B). */
41517 t0 = gen_reg_rtx (wmode);
41518 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41520 /* Combine and shift the highparts into place. */
41521 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41522 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41523 1, OPTAB_DIRECT);
41525 /* Combine high and low parts. */
41526 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41527 return;
41529 emit_insn (x);
41532 void
41533 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41534 bool uns_p, bool high_p)
41536 enum machine_mode wmode = GET_MODE (dest);
41537 enum machine_mode mode = GET_MODE (op1);
41538 rtx t1, t2, t3, t4, mask;
41540 switch (mode)
41542 case V4SImode:
41543 t1 = gen_reg_rtx (mode);
41544 t2 = gen_reg_rtx (mode);
41545 if (TARGET_XOP && !uns_p)
41547 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41548 shuffle the elements once so that all elements are in the right
41549 place for immediate use: { A C B D }. */
41550 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41551 const1_rtx, GEN_INT (3)));
41552 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41553 const1_rtx, GEN_INT (3)));
41555 else
41557 /* Put the elements into place for the multiply. */
41558 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41559 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41560 high_p = false;
41562 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41563 break;
41565 case V8SImode:
41566 /* Shuffle the elements between the lanes. After this we
41567 have { A B E F | C D G H } for each operand. */
41568 t1 = gen_reg_rtx (V4DImode);
41569 t2 = gen_reg_rtx (V4DImode);
41570 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41571 const0_rtx, const2_rtx,
41572 const1_rtx, GEN_INT (3)));
41573 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41574 const0_rtx, const2_rtx,
41575 const1_rtx, GEN_INT (3)));
41577 /* Shuffle the elements within the lanes. After this we
41578 have { A A B B | C C D D } or { E E F F | G G H H }. */
41579 t3 = gen_reg_rtx (V8SImode);
41580 t4 = gen_reg_rtx (V8SImode);
41581 mask = GEN_INT (high_p
41582 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41583 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
41584 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41585 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41587 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41588 break;
41590 case V8HImode:
41591 case V16HImode:
41592 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41593 uns_p, OPTAB_DIRECT);
41594 t2 = expand_binop (mode,
41595 uns_p ? umul_highpart_optab : smul_highpart_optab,
41596 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41597 gcc_assert (t1 && t2);
41599 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41600 break;
41602 case V16QImode:
41603 case V32QImode:
41604 t1 = gen_reg_rtx (wmode);
41605 t2 = gen_reg_rtx (wmode);
41606 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41607 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41609 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41610 break;
41612 default:
41613 gcc_unreachable ();
41617 void
41618 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41620 rtx res_1, res_2;
41622 res_1 = gen_reg_rtx (V4SImode);
41623 res_2 = gen_reg_rtx (V4SImode);
41624 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41625 op1, op2, true, false);
41626 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41627 op1, op2, true, true);
41629 /* Move the results in element 2 down to element 1; we don't care
41630 what goes in elements 2 and 3. Then we can merge the parts
41631 back together with an interleave.
41633 Note that two other sequences were tried:
41634 (1) Use interleaves at the start instead of psrldq, which allows
41635 us to use a single shufps to merge things back at the end.
41636 (2) Use shufps here to combine the two vectors, then pshufd to
41637 put the elements in the correct order.
41638 In both cases the cost of the reformatting stall was too high
41639 and the overall sequence slower. */
41641 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41642 const0_rtx, const0_rtx));
41643 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41644 const0_rtx, const0_rtx));
41645 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41647 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
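/* Concretely: res_1 holds the 64-bit products of elements 0 and 2 and
   res_2 those of elements 1 and 3.  Viewed as V4SImode, the low halves of
   those products sit in elements 0 and 2, so the pshufd with selector
   { 0, 2, 0, 0 } packs them into elements 0 and 1, and the low interleave
   then yields { p0, p1, p2, p3 }.  */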
41650 void
41651 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41653 enum machine_mode mode = GET_MODE (op0);
41654 rtx t1, t2, t3, t4, t5, t6;
41656 if (TARGET_XOP && mode == V2DImode)
41658 /* op1: A,B,C,D, op2: E,F,G,H */
41659 op1 = gen_lowpart (V4SImode, op1);
41660 op2 = gen_lowpart (V4SImode, op2);
41662 t1 = gen_reg_rtx (V4SImode);
41663 t2 = gen_reg_rtx (V4SImode);
41664 t3 = gen_reg_rtx (V2DImode);
41665 t4 = gen_reg_rtx (V2DImode);
41667 /* t1: B,A,D,C */
41668 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41669 GEN_INT (1),
41670 GEN_INT (0),
41671 GEN_INT (3),
41672 GEN_INT (2)));
41674 /* t2: (B*E),(A*F),(D*G),(C*H) */
41675 emit_insn (gen_mulv4si3 (t2, t1, op2));
41677 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41678 emit_insn (gen_xop_phadddq (t3, t2));
41680 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41681 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41683 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41684 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41686 else
41688 enum machine_mode nmode;
41689 rtx (*umul) (rtx, rtx, rtx);
41691 if (mode == V2DImode)
41693 umul = gen_vec_widen_umult_even_v4si;
41694 nmode = V4SImode;
41696 else if (mode == V4DImode)
41698 umul = gen_vec_widen_umult_even_v8si;
41699 nmode = V8SImode;
41701 else
41702 gcc_unreachable ();
41705 /* Multiply low parts. */
41706 t1 = gen_reg_rtx (mode);
41707 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41709 /* Shift input vectors right 32 bits so we can multiply high parts. */
41710 t6 = GEN_INT (32);
41711 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41712 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41714 /* Multiply high parts by low parts. */
41715 t4 = gen_reg_rtx (mode);
41716 t5 = gen_reg_rtx (mode);
41717 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41718 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41720 /* Combine and shift the highparts back. */
41721 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41722 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41724 /* Combine high and low parts. */
41725 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41728 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41729 gen_rtx_MULT (mode, op1, op2));
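/* The non-XOP path above is the schoolbook split: with
   a = (ah << 32) + al and b = (bh << 32) + bl,
     a * b mod 2^64 = al * bl + ((ah * bl + al * bh) << 32),
   i.e. t1 plus the shifted sum of t4 and t5.  */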
41732 /* Expand an insert into a vector register through pinsr insn.
41733 Return true if successful. */
41735 bool
41736 ix86_expand_pinsr (rtx *operands)
41738 rtx dst = operands[0];
41739 rtx src = operands[3];
41741 unsigned int size = INTVAL (operands[1]);
41742 unsigned int pos = INTVAL (operands[2]);
41744 if (GET_CODE (dst) == SUBREG)
41746 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41747 dst = SUBREG_REG (dst);
41750 if (GET_CODE (src) == SUBREG)
41751 src = SUBREG_REG (src);
41753 switch (GET_MODE (dst))
41755 case V16QImode:
41756 case V8HImode:
41757 case V4SImode:
41758 case V2DImode:
41760 enum machine_mode srcmode, dstmode;
41761 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41763 srcmode = mode_for_size (size, MODE_INT, 0);
41765 switch (srcmode)
41767 case QImode:
41768 if (!TARGET_SSE4_1)
41769 return false;
41770 dstmode = V16QImode;
41771 pinsr = gen_sse4_1_pinsrb;
41772 break;
41774 case HImode:
41775 if (!TARGET_SSE2)
41776 return false;
41777 dstmode = V8HImode;
41778 pinsr = gen_sse2_pinsrw;
41779 break;
41781 case SImode:
41782 if (!TARGET_SSE4_1)
41783 return false;
41784 dstmode = V4SImode;
41785 pinsr = gen_sse4_1_pinsrd;
41786 break;
41788 case DImode:
41789 gcc_assert (TARGET_64BIT);
41790 if (!TARGET_SSE4_1)
41791 return false;
41792 dstmode = V2DImode;
41793 pinsr = gen_sse4_1_pinsrq;
41794 break;
41796 default:
41797 return false;
41800 dst = gen_lowpart (dstmode, dst);
41801 src = gen_lowpart (srcmode, src);
41803 pos /= size;
41805 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41806 return true;
41809 default:
41810 return false;
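/* For example, inserting a 16-bit value at bit position 48 of a V2DImode
   destination (assuming no SUBREG adjustment) takes the HImode path:
   dst is retargeted to V8HImode, pos becomes 48 / 16 == 3, and the insn
   emitted is pinsrw with selector GEN_INT (1 << 3).  */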
41814 /* This function returns the calling abi specific va_list type node.
41815 It returns the FNDECL specific va_list type. */
41817 static tree
41818 ix86_fn_abi_va_list (tree fndecl)
41820 if (!TARGET_64BIT)
41821 return va_list_type_node;
41822 gcc_assert (fndecl != NULL_TREE);
41824 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41825 return ms_va_list_type_node;
41826 else
41827 return sysv_va_list_type_node;
41830 /* Returns the canonical va_list type specified by TYPE. If there
41831 is no valid TYPE provided, it returns NULL_TREE. */
41833 static tree
41834 ix86_canonical_va_list_type (tree type)
41836 tree wtype, htype;
41838 /* Resolve references and pointers to va_list type. */
41839 if (TREE_CODE (type) == MEM_REF)
41840 type = TREE_TYPE (type);
41841 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41842 type = TREE_TYPE (type);
41843 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41844 type = TREE_TYPE (type);
41846 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41848 wtype = va_list_type_node;
41849 gcc_assert (wtype != NULL_TREE);
41850 htype = type;
41851 if (TREE_CODE (wtype) == ARRAY_TYPE)
41853 /* If va_list is an array type, the argument may have decayed
41854 to a pointer type, e.g. by being passed to another function.
41855 In that case, unwrap both types so that we can compare the
41856 underlying records. */
41857 if (TREE_CODE (htype) == ARRAY_TYPE
41858 || POINTER_TYPE_P (htype))
41860 wtype = TREE_TYPE (wtype);
41861 htype = TREE_TYPE (htype);
41864 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41865 return va_list_type_node;
41866 wtype = sysv_va_list_type_node;
41867 gcc_assert (wtype != NULL_TREE);
41868 htype = type;
41869 if (TREE_CODE (wtype) == ARRAY_TYPE)
41871 /* If va_list is an array type, the argument may have decayed
41872 to a pointer type, e.g. by being passed to another function.
41873 In that case, unwrap both types so that we can compare the
41874 underlying records. */
41875 if (TREE_CODE (htype) == ARRAY_TYPE
41876 || POINTER_TYPE_P (htype))
41878 wtype = TREE_TYPE (wtype);
41879 htype = TREE_TYPE (htype);
41882 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41883 return sysv_va_list_type_node;
41884 wtype = ms_va_list_type_node;
41885 gcc_assert (wtype != NULL_TREE);
41886 htype = type;
41887 if (TREE_CODE (wtype) == ARRAY_TYPE)
41889 /* If va_list is an array type, the argument may have decayed
41890 to a pointer type, e.g. by being passed to another function.
41891 In that case, unwrap both types so that we can compare the
41892 underlying records. */
41893 if (TREE_CODE (htype) == ARRAY_TYPE
41894 || POINTER_TYPE_P (htype))
41896 wtype = TREE_TYPE (wtype);
41897 htype = TREE_TYPE (htype);
41900 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41901 return ms_va_list_type_node;
41902 return NULL_TREE;
41904 return std_canonical_va_list_type (type);
41907 /* Iterate through the target-specific builtin types for va_list.
41908 IDX denotes the iterator, *PTREE is set to the result type of
41909 the va_list builtin, and *PNAME to its internal type.
41910 Returns zero if there is no element for this index, otherwise
41911 IDX should be increased upon the next call.
41912 Note, do not iterate a base builtin's name like __builtin_va_list.
41913 Used from c_common_nodes_and_builtins. */
41915 static int
41916 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41918 if (TARGET_64BIT)
41920 switch (idx)
41922 default:
41923 break;
41925 case 0:
41926 *ptree = ms_va_list_type_node;
41927 *pname = "__builtin_ms_va_list";
41928 return 1;
41930 case 1:
41931 *ptree = sysv_va_list_type_node;
41932 *pname = "__builtin_sysv_va_list";
41933 return 1;
41937 return 0;
41940 #undef TARGET_SCHED_DISPATCH
41941 #define TARGET_SCHED_DISPATCH has_dispatch
41942 #undef TARGET_SCHED_DISPATCH_DO
41943 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41944 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41945 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41946 #undef TARGET_SCHED_REORDER
41947 #define TARGET_SCHED_REORDER ix86_sched_reorder
41948 #undef TARGET_SCHED_ADJUST_PRIORITY
41949 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41950 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41951 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41952 ix86_dependencies_evaluation_hook
41954 /* The size of the dispatch window is the total number of bytes of
41955 object code allowed in a window. */
41956 #define DISPATCH_WINDOW_SIZE 16
41958 /* Number of dispatch windows considered for scheduling. */
41959 #define MAX_DISPATCH_WINDOWS 3
41961 /* Maximum number of instructions in a window. */
41962 #define MAX_INSN 4
41964 /* Maximum number of immediate operands in a window. */
41965 #define MAX_IMM 4
41967 /* Maximum number of immediate bits allowed in a window. */
41968 #define MAX_IMM_SIZE 128
41970 /* Maximum number of 32 bit immediates allowed in a window. */
41971 #define MAX_IMM_32 4
41973 /* Maximum number of 64 bit immediates allowed in a window. */
41974 #define MAX_IMM_64 2
41976 /* Maximum total of loads or prefetches allowed in a window. */
41977 #define MAX_LOAD 2
41979 /* Maximum total of stores allowed in a window. */
41980 #define MAX_STORE 1
41982 #undef BIG
41983 #define BIG 100
41986 /* Dispatch groups.  Instructions that affect the mix in a dispatch window. */
41987 enum dispatch_group {
41988 disp_no_group = 0,
41989 disp_load,
41990 disp_store,
41991 disp_load_store,
41992 disp_prefetch,
41993 disp_imm,
41994 disp_imm_32,
41995 disp_imm_64,
41996 disp_branch,
41997 disp_cmp,
41998 disp_jcc,
41999 disp_last
42002 /* Number of allowable groups in a dispatch window. It is an array
42003 indexed by dispatch_group enum. 100 is used as a big number,
42004 because the number of these kinds of operations does not have any
42005 effect in the dispatch window, but we need them for other reasons in
42006 the table. */
42007 static unsigned int num_allowable_groups[disp_last] = {
42008 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42011 char group_name[disp_last + 1][16] = {
42012 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42013 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42014 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42017 /* Instruction path. */
42018 enum insn_path {
42019 no_path = 0,
42020 path_single, /* Single micro op. */
42021 path_double, /* Double micro op. */
42022 path_multi, /* Instructions with more than 2 micro ops. */
42023 last_path
42026 /* sched_insn_info defines a window to the instructions scheduled in
42027 the basic block. It contains a pointer to the insn_info table and
42028 the instruction scheduled.
42030 Windows are allocated for each basic block and are linked
42031 together. */
42032 typedef struct sched_insn_info_s {
42033 rtx insn;
42034 enum dispatch_group group;
42035 enum insn_path path;
42036 int byte_len;
42037 int imm_bytes;
42038 } sched_insn_info;
42040 /* Linked list of dispatch windows. This is a two way list of
42041 dispatch windows of a basic block. It contains information about
42042 the number of uops in the window and the total number of
42043 instructions and of bytes in the object code for this dispatch
42044 window. */
42045 typedef struct dispatch_windows_s {
42046 int num_insn; /* Number of insns in the window. */
42047 int num_uops; /* Number of uops in the window. */
42048 int window_size; /* Number of bytes in the window. */
42049 int window_num; /* Window number, 0 or 1. */
42050 int num_imm; /* Number of immediates in the window. */
42051 int num_imm_32; /* Number of 32 bit immediates in the window. */
42052 int num_imm_64; /* Number of 64 bit immediates in the window. */
42053 int imm_size; /* Total immediates in the window. */
42054 int num_loads; /* Total memory loads in the window. */
42055 int num_stores; /* Total memory stores in the window. */
42056 int violation; /* Violation exists in window. */
42057 sched_insn_info *window; /* Pointer to the window. */
42058 struct dispatch_windows_s *next;
42059 struct dispatch_windows_s *prev;
42060 } dispatch_windows;
42062 /* Immediate values used in an insn. */
42063 typedef struct imm_info_s
42065 int imm;
42066 int imm32;
42067 int imm64;
42068 } imm_info;
42070 static dispatch_windows *dispatch_window_list;
42071 static dispatch_windows *dispatch_window_list1;
42073 /* Get dispatch group of insn. */
42075 static enum dispatch_group
42076 get_mem_group (rtx insn)
42078 enum attr_memory memory;
42080 if (INSN_CODE (insn) < 0)
42081 return disp_no_group;
42082 memory = get_attr_memory (insn);
42083 if (memory == MEMORY_STORE)
42084 return disp_store;
42086 if (memory == MEMORY_LOAD)
42087 return disp_load;
42089 if (memory == MEMORY_BOTH)
42090 return disp_load_store;
42092 return disp_no_group;
42095 /* Return true if insn is a compare instruction. */
42097 static bool
42098 is_cmp (rtx insn)
42100 enum attr_type type;
42102 type = get_attr_type (insn);
42103 return (type == TYPE_TEST
42104 || type == TYPE_ICMP
42105 || type == TYPE_FCMP
42106 || GET_CODE (PATTERN (insn)) == COMPARE);
42109 /* Return true if a dispatch violation was encountered. */
42111 static bool
42112 dispatch_violation (void)
42114 if (dispatch_window_list->next)
42115 return dispatch_window_list->next->violation;
42116 return dispatch_window_list->violation;
42119 /* Return true if insn is a branch instruction. */
42121 static bool
42122 is_branch (rtx insn)
42124 return (CALL_P (insn) || JUMP_P (insn));
42127 /* Return true if insn is a prefetch instruction. */
42129 static bool
42130 is_prefetch (rtx insn)
42132 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42135 /* This function initializes a dispatch window and the list container holding a
42136 pointer to the window. */
42138 static void
42139 init_window (int window_num)
42141 int i;
42142 dispatch_windows *new_list;
42144 if (window_num == 0)
42145 new_list = dispatch_window_list;
42146 else
42147 new_list = dispatch_window_list1;
42149 new_list->num_insn = 0;
42150 new_list->num_uops = 0;
42151 new_list->window_size = 0;
42152 new_list->next = NULL;
42153 new_list->prev = NULL;
42154 new_list->window_num = window_num;
42155 new_list->num_imm = 0;
42156 new_list->num_imm_32 = 0;
42157 new_list->num_imm_64 = 0;
42158 new_list->imm_size = 0;
42159 new_list->num_loads = 0;
42160 new_list->num_stores = 0;
42161 new_list->violation = false;
42163 for (i = 0; i < MAX_INSN; i++)
42165 new_list->window[i].insn = NULL;
42166 new_list->window[i].group = disp_no_group;
42167 new_list->window[i].path = no_path;
42168 new_list->window[i].byte_len = 0;
42169 new_list->window[i].imm_bytes = 0;
42171 return;
42174 /* This function allocates and initializes a dispatch window and the
42175 list container holding a pointer to the window. */
42177 static dispatch_windows *
42178 allocate_window (void)
42180 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42181 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42183 return new_list;
42186 /* This routine initializes the dispatch scheduling information. It
42187 initiates building dispatch scheduler tables and constructs the
42188 first dispatch window. */
42190 static void
42191 init_dispatch_sched (void)
42193 /* Allocate a dispatch list and a window. */
42194 dispatch_window_list = allocate_window ();
42195 dispatch_window_list1 = allocate_window ();
42196 init_window (0);
42197 init_window (1);
42200 /* This function returns true if a branch is detected. End of a basic block
42201 does not have to be a branch, but here we assume only branches end a
42202 window. */
42204 static bool
42205 is_end_basic_block (enum dispatch_group group)
42207 return group == disp_branch;
42210 /* This function is called when the end of a window processing is reached. */
42212 static void
42213 process_end_window (void)
42215 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42216 if (dispatch_window_list->next)
42218 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42219 gcc_assert (dispatch_window_list->window_size
42220 + dispatch_window_list1->window_size <= 48);
42221 init_window (1);
42223 init_window (0);
42226 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42227 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42228 for 48 bytes of instructions.  Note that these windows are not dispatch
42229 windows of size DISPATCH_WINDOW_SIZE. */
42231 static dispatch_windows *
42232 allocate_next_window (int window_num)
42234 if (window_num == 0)
42236 if (dispatch_window_list->next)
42237 init_window (1);
42238 init_window (0);
42239 return dispatch_window_list;
42242 dispatch_window_list->next = dispatch_window_list1;
42243 dispatch_window_list1->prev = dispatch_window_list;
42245 return dispatch_window_list1;
42248 /* Increment the number of immediate operands of an instruction. */
42250 static int
42251 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42253 if (*in_rtx == 0)
42254 return 0;
42256 switch ( GET_CODE (*in_rtx))
42258 case CONST:
42259 case SYMBOL_REF:
42260 case CONST_INT:
42261 (imm_values->imm)++;
42262 if (x86_64_immediate_operand (*in_rtx, SImode))
42263 (imm_values->imm32)++;
42264 else
42265 (imm_values->imm64)++;
42266 break;
42268 case CONST_DOUBLE:
42269 (imm_values->imm)++;
42270 (imm_values->imm64)++;
42271 break;
42273 case CODE_LABEL:
42274 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42276 (imm_values->imm)++;
42277 (imm_values->imm32)++;
42279 break;
42281 default:
42282 break;
42285 return 0;
42288 /* Compute number of immediate operands of an instruction. */
42290 static void
42291 find_constant (rtx in_rtx, imm_info *imm_values)
42293 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42294 (rtx_function) find_constant_1, (void *) imm_values);
42297 /* Return total size of immediate operands of an instruction along with number
42298 of corresponding immediate-operands. It initializes its parameters to zero
42299 before calling FIND_CONSTANT.
42300 INSN is the input instruction. IMM is the total of immediates.
42301 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42302 bit immediates. */
42304 static int
42305 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42307 imm_info imm_values = {0, 0, 0};
42309 find_constant (insn, &imm_values);
42310 *imm = imm_values.imm;
42311 *imm32 = imm_values.imm32;
42312 *imm64 = imm_values.imm64;
42313 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
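/* E.g. an insn with one 32-bit and one 64-bit immediate yields *imm == 2,
   *imm32 == 1, *imm64 == 1 and a return value of 4 + 8 == 12 bytes.  */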
42316 /* Return true if the instruction has at least one immediate
42317 operand. */
42319 static bool
42320 has_immediate (rtx insn)
42322 int num_imm_operand;
42323 int num_imm32_operand;
42324 int num_imm64_operand;
42326 if (insn)
42327 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42328 &num_imm64_operand);
42329 return false;
42332 /* Return single or double path for instructions. */
42334 static enum insn_path
42335 get_insn_path (rtx insn)
42337 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42339 if ((int)path == 0)
42340 return path_single;
42342 if ((int)path == 1)
42343 return path_double;
42345 return path_multi;
42348 /* Return insn dispatch group. */
42350 static enum dispatch_group
42351 get_insn_group (rtx insn)
42353 enum dispatch_group group = get_mem_group (insn);
42354 if (group)
42355 return group;
42357 if (is_branch (insn))
42358 return disp_branch;
42360 if (is_cmp (insn))
42361 return disp_cmp;
42363 if (has_immediate (insn))
42364 return disp_imm;
42366 if (is_prefetch (insn))
42367 return disp_prefetch;
42369 return disp_no_group;
42372 /* Count number of GROUP restricted instructions in a dispatch
42373 window WINDOW_LIST. */
42375 static int
42376 count_num_restricted (rtx insn, dispatch_windows *window_list)
42378 enum dispatch_group group = get_insn_group (insn);
42379 int imm_size;
42380 int num_imm_operand;
42381 int num_imm32_operand;
42382 int num_imm64_operand;
42384 if (group == disp_no_group)
42385 return 0;
42387 if (group == disp_imm)
42389 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42390 &num_imm64_operand);
42391 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42392 || num_imm_operand + window_list->num_imm > MAX_IMM
42393 || (num_imm32_operand > 0
42394 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42395 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42396 || (num_imm64_operand > 0
42397 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42398 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42399 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42400 && num_imm64_operand > 0
42401 && ((window_list->num_imm_64 > 0
42402 && window_list->num_insn >= 2)
42403 || window_list->num_insn >= 3)))
42404 return BIG;
42406 return 1;
42409 if ((group == disp_load_store
42410 && (window_list->num_loads >= MAX_LOAD
42411 || window_list->num_stores >= MAX_STORE))
42412 || ((group == disp_load
42413 || group == disp_prefetch)
42414 && window_list->num_loads >= MAX_LOAD)
42415 || (group == disp_store
42416 && window_list->num_stores >= MAX_STORE))
42417 return BIG;
42419 return 1;
42422 /* This function returns true if insn satisfies dispatch rules on the
42423 last window scheduled. */
42425 static bool
42426 fits_dispatch_window (rtx insn)
42428 dispatch_windows *window_list = dispatch_window_list;
42429 dispatch_windows *window_list_next = dispatch_window_list->next;
42430 unsigned int num_restrict;
42431 enum dispatch_group group = get_insn_group (insn);
42432 enum insn_path path = get_insn_path (insn);
42433 int sum;
42435 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
42436 instructions should be given the lowest priority in the
42437 scheduling process in Haifa scheduler to make sure they will be
42438 scheduled in the same dispatch window as the reference to them. */
42439 if (group == disp_jcc || group == disp_cmp)
42440 return false;
42442 /* Check nonrestricted. */
42443 if (group == disp_no_group || group == disp_branch)
42444 return true;
42446 /* Get last dispatch window. */
42447 if (window_list_next)
42448 window_list = window_list_next;
42450 if (window_list->window_num == 1)
42452 sum = window_list->prev->window_size + window_list->window_size;
42454 if (sum == 32
42455 || (min_insn_size (insn) + sum) >= 48)
42456 /* Window 1 is full. Go for next window. */
42457 return true;
42460 num_restrict = count_num_restricted (insn, window_list);
42462 if (num_restrict > num_allowable_groups[group])
42463 return false;
42465 /* See if it fits in the first window. */
42466 if (window_list->window_num == 0)
42468 /* The first window should have only single and double path
42469 uops. */
42470 if (path == path_double
42471 && (window_list->num_uops + 2) > MAX_INSN)
42472 return false;
42473 else if (path != path_single)
42474 return false;
42476 return true;
42479 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42480 dispatch window WINDOW_LIST. */
42482 static void
42483 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42485 int byte_len = min_insn_size (insn);
42486 int num_insn = window_list->num_insn;
42487 int imm_size;
42488 sched_insn_info *window = window_list->window;
42489 enum dispatch_group group = get_insn_group (insn);
42490 enum insn_path path = get_insn_path (insn);
42491 int num_imm_operand;
42492 int num_imm32_operand;
42493 int num_imm64_operand;
42495 if (!window_list->violation && group != disp_cmp
42496 && !fits_dispatch_window (insn))
42497 window_list->violation = true;
42499 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42500 &num_imm64_operand);
42502 /* Initialize window with new instruction. */
42503 window[num_insn].insn = insn;
42504 window[num_insn].byte_len = byte_len;
42505 window[num_insn].group = group;
42506 window[num_insn].path = path;
42507 window[num_insn].imm_bytes = imm_size;
42509 window_list->window_size += byte_len;
42510 window_list->num_insn = num_insn + 1;
42511 window_list->num_uops = window_list->num_uops + num_uops;
42512 window_list->imm_size += imm_size;
42513 window_list->num_imm += num_imm_operand;
42514 window_list->num_imm_32 += num_imm32_operand;
42515 window_list->num_imm_64 += num_imm64_operand;
42517 if (group == disp_store)
42518 window_list->num_stores += 1;
42519 else if (group == disp_load
42520 || group == disp_prefetch)
42521 window_list->num_loads += 1;
42522 else if (group == disp_load_store)
42524 window_list->num_stores += 1;
42525 window_list->num_loads += 1;
42529 /* Adds a scheduled instruction, INSN, to the current dispatch window.
42530 If the total bytes of instructions or the number of instructions in
42531 the window exceed the allowable limits, it allocates a new window. */
42533 static void
42534 add_to_dispatch_window (rtx insn)
42536 int byte_len;
42537 dispatch_windows *window_list;
42538 dispatch_windows *next_list;
42539 dispatch_windows *window0_list;
42540 enum insn_path path;
42541 enum dispatch_group insn_group;
42542 bool insn_fits;
42543 int num_insn;
42544 int num_uops;
42545 int window_num;
42546 int insn_num_uops;
42547 int sum;
42549 if (INSN_CODE (insn) < 0)
42550 return;
42552 byte_len = min_insn_size (insn);
42553 window_list = dispatch_window_list;
42554 next_list = window_list->next;
42555 path = get_insn_path (insn);
42556 insn_group = get_insn_group (insn);
42558 /* Get the last dispatch window. */
42559 if (next_list)
42560 window_list = dispatch_window_list->next;
42562 if (path == path_single)
42563 insn_num_uops = 1;
42564 else if (path == path_double)
42565 insn_num_uops = 2;
42566 else
42567 insn_num_uops = (int) path;
42569 /* If the current window is full, get a new window.
42570 Window number zero is full if MAX_INSN uops are scheduled in it.
42571 Window number one is full if window zero's bytes plus window
42572 one's bytes equal 32, if adding the new instruction's bytes
42573 brings the total to 48 or more, or if it already holds MAX_INSN
42574 instructions. */
42575 num_insn = window_list->num_insn;
42576 num_uops = window_list->num_uops;
42577 window_num = window_list->window_num;
42578 insn_fits = fits_dispatch_window (insn);
42580 if (num_insn >= MAX_INSN
42581 || num_uops + insn_num_uops > MAX_INSN
42582 || !(insn_fits))
42584 window_num = ~window_num & 1;
42585 window_list = allocate_next_window (window_num);
42588 if (window_num == 0)
42590 add_insn_window (insn, window_list, insn_num_uops);
42591 if (window_list->num_insn >= MAX_INSN
42592 && insn_group == disp_branch)
42594 process_end_window ();
42595 return;
42598 else if (window_num == 1)
42600 window0_list = window_list->prev;
42601 sum = window0_list->window_size + window_list->window_size;
42602 if (sum == 32
42603 || (byte_len + sum) >= 48)
42605 process_end_window ();
42606 window_list = dispatch_window_list;
42609 add_insn_window (insn, window_list, insn_num_uops);
42611 else
42612 gcc_unreachable ();
42614 if (is_end_basic_block (insn_group))
42616 /* End of basic block reached; do the end-of-basic-block processing. */
42617 process_end_window ();
42618 return;
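/* A minimal sketch of the window toggle used above: with only two
   dispatch windows, "window_num = ~window_num & 1" simply alternates
   between window 0 and window 1, e.g.

     int w = 0;
     w = ~w & 1;   /+ now w == 1 +/
     w = ~w & 1;   /+ back to w == 0 +/

   (the inner comment markers are written with '+' only so this example
   can live inside this block comment).  */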
42622 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42624 DEBUG_FUNCTION static void
42625 debug_dispatch_window_file (FILE *file, int window_num)
42627 dispatch_windows *list;
42628 int i;
42630 if (window_num == 0)
42631 list = dispatch_window_list;
42632 else
42633 list = dispatch_window_list1;
42635 fprintf (file, "Window #%d:\n", list->window_num);
42636 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42637 list->num_insn, list->num_uops, list->window_size);
42638 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42639 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42641 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42642 list->num_stores);
42643 fprintf (file, " insn info:\n");
42645 for (i = 0; i < MAX_INSN; i++)
42647 if (!list->window[i].insn)
42648 break;
42649 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42650 i, group_name[list->window[i].group],
42651 i, (void *)list->window[i].insn,
42652 i, list->window[i].path,
42653 i, list->window[i].byte_len,
42654 i, list->window[i].imm_bytes);
42658 /* Print a dispatch window to stdout. */
42660 DEBUG_FUNCTION void
42661 debug_dispatch_window (int window_num)
42663 debug_dispatch_window_file (stdout, window_num);
42666 /* Print INSN dispatch information to FILE. */
42668 DEBUG_FUNCTION static void
42669 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42671 int byte_len;
42672 enum insn_path path;
42673 enum dispatch_group group;
42674 int imm_size;
42675 int num_imm_operand;
42676 int num_imm32_operand;
42677 int num_imm64_operand;
42679 if (INSN_CODE (insn) < 0)
42680 return;
42682 byte_len = min_insn_size (insn);
42683 path = get_insn_path (insn);
42684 group = get_insn_group (insn);
42685 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42686 &num_imm64_operand);
42688 fprintf (file, " insn info:\n");
42689 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42690 group_name[group], path, byte_len);
42691 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42692 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42695 /* Print to STDOUT the status of the ready list with respect to
42696 dispatch windows. */
42698 DEBUG_FUNCTION void
42699 debug_ready_dispatch (void)
42701 int i;
42702 int no_ready = number_in_ready ();
42704 fprintf (stdout, "Number of ready: %d\n", no_ready);
42706 for (i = 0; i < no_ready; i++)
42707 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
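/* The DEBUG_FUNCTION helpers above are intended to be called by hand
   from a debugger while stepping through the scheduler; an illustrative
   gdb session (not part of this file) would look like:

     (gdb) call debug_ready_dispatch ()
     (gdb) call debug_dispatch_window (0)
     (gdb) call debug_dispatch_window (1)
*/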
42710 /* This routine is the driver of the dispatch scheduler. */
42712 static void
42713 do_dispatch (rtx insn, int mode)
42715 if (mode == DISPATCH_INIT)
42716 init_dispatch_sched ();
42717 else if (mode == ADD_TO_DISPATCH_WINDOW)
42718 add_to_dispatch_window (insn);
42721 /* Answer the dispatch-scheduling query ACTION for INSN; return FALSE when dispatch scheduling is not enabled. */
42723 static bool
42724 has_dispatch (rtx insn, int action)
42726 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42727 && flag_dispatch_scheduler)
42728 switch (action)
42730 default:
42731 return false;
42733 case IS_DISPATCH_ON:
42734 return true;
42737 case IS_CMP:
42738 return is_cmp (insn);
42740 case DISPATCH_VIOLATION:
42741 return dispatch_violation ();
42743 case FITS_DISPATCH_WINDOW:
42744 return fits_dispatch_window (insn);
42747 return false;
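/* Illustrative only: per the check above, dispatch scheduling is active
   when compiling for a Bulldozer core with the dispatch-scheduler flag
   enabled, e.g. something like

     gcc -O2 -march=bdver2 -mdispatch-scheduler file.c

   (this assumes flag_dispatch_scheduler is bound to the
   -mdispatch-scheduler option in i386.opt).  */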
42750 /* Implementation of the reassociation_width target hook, used by
42751 the reassoc phase to identify the level of parallelism in a
42752 reassociated tree. The statement's tree_code is passed in OPC and
42753 the type of the arguments is passed in MODE.
42755 Currently parallel reassociation is enabled only for Atom
42756 processors, and the reassociation width is set to 2 because Atom
42757 may issue up to 2 instructions per cycle.
42759 The return value should be revisited if parallel reassociation is
42760 enabled for other processors. */
42762 static int
42763 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42764 enum machine_mode mode)
42766 int res = 1;
42768 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42769 res = 2;
42770 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42771 res = 2;
42773 return res;
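/* A reassociation width of 2 lets the reassoc pass split a long
   dependence chain into two independent chains, e.g. (sketch):

     ((a + b) + c) + d        three dependent additions in sequence
     (a + b) + (c + d)        the two inner additions can issue in parallel

   which matches the two-issue capability mentioned above.  */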
42776 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42777 place emms and femms instructions. */
42779 static enum machine_mode
42780 ix86_preferred_simd_mode (enum machine_mode mode)
42782 if (!TARGET_SSE)
42783 return word_mode;
42785 switch (mode)
42787 case QImode:
42788 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42789 case HImode:
42790 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42791 case SImode:
42792 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42793 case DImode:
42794 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42796 case SFmode:
42797 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42798 return V8SFmode;
42799 else
42800 return V4SFmode;
42802 case DFmode:
42803 if (!TARGET_VECTORIZE_DOUBLE)
42804 return word_mode;
42805 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42806 return V4DFmode;
42807 else if (TARGET_SSE2)
42808 return V2DFmode;
42809 /* FALLTHRU */
42811 default:
42812 return word_mode;
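/* For example (a sketch of the mapping above): with -mavx and without
   -mprefer-avx128 (or an equivalent tuning default), a float loop is
   vectorized in V8SFmode (eight floats per vector); with only SSE it
   falls back to V4SFmode, and DFmode follows the same pattern with
   V4DFmode or V2DFmode.  */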
42816 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42817 vectors. */
42819 static unsigned int
42820 ix86_autovectorize_vector_sizes (void)
42822 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
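/* The return value is a bit mask of vector sizes in bytes: with AVX
   enabled (and 256-bit vectors not disfavoured) it is 32 | 16 == 48, so
   the vectorizer tries 32-byte vectors first and then 16-byte vectors;
   a return value of 0 means only the preferred SIMD mode is tried.  */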
42827 /* Return the class of registers which could be used to spill a pseudo
42828 of MODE and class RCLASS instead of spilling to memory. Return
42829 NO_REGS if this is not possible or not profitable. */
42830 static reg_class_t
42831 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42833 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42834 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42835 && INTEGER_CLASS_P (rclass))
42836 return ALL_SSE_REGS;
42837 return NO_REGS;
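/* Usage sketch: under the conditions tested above,

     ix86_spill_class (GENERAL_REGS, SImode) == ALL_SSE_REGS

   so an integer pseudo may be "spilled" into an SSE register, while any
   non-integer class or unsupported mode yields NO_REGS and the pseudo
   is spilled to memory as usual.  */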
42840 /* Implement targetm.vectorize.init_cost. */
42842 static void *
42843 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42845 unsigned *cost = XNEWVEC (unsigned, 3);
42846 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42847 return cost;
42850 /* Implement targetm.vectorize.add_stmt_cost. */
42852 static unsigned
42853 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42854 struct _stmt_vec_info *stmt_info, int misalign,
42855 enum vect_cost_model_location where)
42857 unsigned *cost = (unsigned *) data;
42858 unsigned retval = 0;
42860 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42861 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42863 /* Statements in an inner loop relative to the loop being
42864 vectorized are weighted more heavily. The value here is
42865 arbitrary and could potentially be improved with analysis. */
42866 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42867 count *= 50; /* FIXME. */
42869 retval = (unsigned) (count * stmt_cost);
42870 cost[where] += retval;
42872 return retval;
42875 /* Implement targetm.vectorize.finish_cost. */
42877 static void
42878 ix86_finish_cost (void *data, unsigned *prologue_cost,
42879 unsigned *body_cost, unsigned *epilogue_cost)
42881 unsigned *cost = (unsigned *) data;
42882 *prologue_cost = cost[vect_prologue];
42883 *body_cost = cost[vect_body];
42884 *epilogue_cost = cost[vect_epilogue];
42887 /* Implement targetm.vectorize.destroy_cost_data. */
42889 static void
42890 ix86_destroy_cost_data (void *data)
42892 free (data);
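/* The four hooks above implement the vectorizer's cost-model protocol.
   A minimal sketch of how the middle end drives them (hypothetical call
   sequence, not code from this file):

     void *data = targetm.vectorize.init_cost (loop);
     targetm.vectorize.add_stmt_cost (data, 1, vector_stmt, stmt_info,
                                      0, vect_body);
     unsigned prologue, body, epilogue;
     targetm.vectorize.finish_cost (data, &prologue, &body, &epilogue);
     targetm.vectorize.destroy_cost_data (data);
*/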
42895 /* Validate target specific memory model bits in VAL. */
42897 static unsigned HOST_WIDE_INT
42898 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42900 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42901 bool strong;
42903 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42904 |MEMMODEL_MASK)
42905 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42907 warning (OPT_Winvalid_memory_model,
42908 "unknown architecture specific memory model");
42909 return MEMMODEL_SEQ_CST;
42911 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42912 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42914 warning (OPT_Winvalid_memory_model,
42915 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42916 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42918 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42920 warning (OPT_Winvalid_memory_model,
42921 "HLE_RELEASE not used with RELEASE or stronger memory model");
42922 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42924 return val;
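/* Illustrative user-level use of the HLE bits validated above (the
   __ATOMIC_HLE_* macros are provided by the x86 front end when HLE is
   available):

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     __atomic_store_n (&lock, 0,
                       __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Combining __ATOMIC_HLE_ACQUIRE with a model weaker than ACQUIRE (or
   HLE_RELEASE with one weaker than RELEASE) triggers the warnings
   above and falls back to MEMMODEL_SEQ_CST.  */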
42927 /* Initialize the GCC target structure. */
42928 #undef TARGET_RETURN_IN_MEMORY
42929 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42931 #undef TARGET_LEGITIMIZE_ADDRESS
42932 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42934 #undef TARGET_ATTRIBUTE_TABLE
42935 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42936 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
42937 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
42938 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42939 # undef TARGET_MERGE_DECL_ATTRIBUTES
42940 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42941 #endif
42943 #undef TARGET_COMP_TYPE_ATTRIBUTES
42944 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42946 #undef TARGET_INIT_BUILTINS
42947 #define TARGET_INIT_BUILTINS ix86_init_builtins
42948 #undef TARGET_BUILTIN_DECL
42949 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42950 #undef TARGET_EXPAND_BUILTIN
42951 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42953 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42954 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42955 ix86_builtin_vectorized_function
42957 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42958 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42960 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42961 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42963 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42964 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42966 #undef TARGET_BUILTIN_RECIPROCAL
42967 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42969 #undef TARGET_ASM_FUNCTION_EPILOGUE
42970 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42972 #undef TARGET_ENCODE_SECTION_INFO
42973 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42974 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42975 #else
42976 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42977 #endif
42979 #undef TARGET_ASM_OPEN_PAREN
42980 #define TARGET_ASM_OPEN_PAREN ""
42981 #undef TARGET_ASM_CLOSE_PAREN
42982 #define TARGET_ASM_CLOSE_PAREN ""
42984 #undef TARGET_ASM_BYTE_OP
42985 #define TARGET_ASM_BYTE_OP ASM_BYTE
42987 #undef TARGET_ASM_ALIGNED_HI_OP
42988 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42989 #undef TARGET_ASM_ALIGNED_SI_OP
42990 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42991 #ifdef ASM_QUAD
42992 #undef TARGET_ASM_ALIGNED_DI_OP
42993 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42994 #endif
42996 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42997 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42999 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43000 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43002 #undef TARGET_ASM_UNALIGNED_HI_OP
43003 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43004 #undef TARGET_ASM_UNALIGNED_SI_OP
43005 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43006 #undef TARGET_ASM_UNALIGNED_DI_OP
43007 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43009 #undef TARGET_PRINT_OPERAND
43010 #define TARGET_PRINT_OPERAND ix86_print_operand
43011 #undef TARGET_PRINT_OPERAND_ADDRESS
43012 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43013 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43014 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43015 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43016 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43018 #undef TARGET_SCHED_INIT_GLOBAL
43019 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43020 #undef TARGET_SCHED_ADJUST_COST
43021 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43022 #undef TARGET_SCHED_ISSUE_RATE
43023 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43024 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43025 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43026 ia32_multipass_dfa_lookahead
43028 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43029 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43031 #undef TARGET_MEMMODEL_CHECK
43032 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43034 #ifdef HAVE_AS_TLS
43035 #undef TARGET_HAVE_TLS
43036 #define TARGET_HAVE_TLS true
43037 #endif
43038 #undef TARGET_CANNOT_FORCE_CONST_MEM
43039 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43040 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43041 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43043 #undef TARGET_DELEGITIMIZE_ADDRESS
43044 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43046 #undef TARGET_MS_BITFIELD_LAYOUT_P
43047 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43049 #if TARGET_MACHO
43050 #undef TARGET_BINDS_LOCAL_P
43051 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43052 #endif
43053 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43054 #undef TARGET_BINDS_LOCAL_P
43055 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43056 #endif
43058 #undef TARGET_ASM_OUTPUT_MI_THUNK
43059 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43060 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43061 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43063 #undef TARGET_ASM_FILE_START
43064 #define TARGET_ASM_FILE_START x86_file_start
43066 #undef TARGET_OPTION_OVERRIDE
43067 #define TARGET_OPTION_OVERRIDE ix86_option_override
43069 #undef TARGET_REGISTER_MOVE_COST
43070 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43071 #undef TARGET_MEMORY_MOVE_COST
43072 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43073 #undef TARGET_RTX_COSTS
43074 #define TARGET_RTX_COSTS ix86_rtx_costs
43075 #undef TARGET_ADDRESS_COST
43076 #define TARGET_ADDRESS_COST ix86_address_cost
43078 #undef TARGET_FIXED_CONDITION_CODE_REGS
43079 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43080 #undef TARGET_CC_MODES_COMPATIBLE
43081 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43083 #undef TARGET_MACHINE_DEPENDENT_REORG
43084 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43086 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43087 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43089 #undef TARGET_BUILD_BUILTIN_VA_LIST
43090 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43092 #undef TARGET_FOLD_BUILTIN
43093 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43095 #undef TARGET_COMPARE_VERSION_PRIORITY
43096 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43098 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43099 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43100 ix86_generate_version_dispatcher_body
43102 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43103 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43104 ix86_get_function_versions_dispatcher
43106 #undef TARGET_ENUM_VA_LIST_P
43107 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43109 #undef TARGET_FN_ABI_VA_LIST
43110 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43112 #undef TARGET_CANONICAL_VA_LIST_TYPE
43113 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43115 #undef TARGET_EXPAND_BUILTIN_VA_START
43116 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43118 #undef TARGET_MD_ASM_CLOBBERS
43119 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43121 #undef TARGET_PROMOTE_PROTOTYPES
43122 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43123 #undef TARGET_STRUCT_VALUE_RTX
43124 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
43125 #undef TARGET_SETUP_INCOMING_VARARGS
43126 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
43127 #undef TARGET_MUST_PASS_IN_STACK
43128 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
43129 #undef TARGET_FUNCTION_ARG_ADVANCE
43130 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
43131 #undef TARGET_FUNCTION_ARG
43132 #define TARGET_FUNCTION_ARG ix86_function_arg
43133 #undef TARGET_FUNCTION_ARG_BOUNDARY
43134 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
43135 #undef TARGET_PASS_BY_REFERENCE
43136 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
43137 #undef TARGET_INTERNAL_ARG_POINTER
43138 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
43139 #undef TARGET_UPDATE_STACK_BOUNDARY
43140 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
43141 #undef TARGET_GET_DRAP_RTX
43142 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
43143 #undef TARGET_STRICT_ARGUMENT_NAMING
43144 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
43145 #undef TARGET_STATIC_CHAIN
43146 #define TARGET_STATIC_CHAIN ix86_static_chain
43147 #undef TARGET_TRAMPOLINE_INIT
43148 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
43149 #undef TARGET_RETURN_POPS_ARGS
43150 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
43152 #undef TARGET_LEGITIMATE_COMBINED_INSN
43153 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
43155 #undef TARGET_ASAN_SHADOW_OFFSET
43156 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
43158 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
43159 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
43161 #undef TARGET_SCALAR_MODE_SUPPORTED_P
43162 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
43164 #undef TARGET_VECTOR_MODE_SUPPORTED_P
43165 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
43167 #undef TARGET_C_MODE_FOR_SUFFIX
43168 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
43170 #ifdef HAVE_AS_TLS
43171 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
43172 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
43173 #endif
43175 #ifdef SUBTARGET_INSERT_ATTRIBUTES
43176 #undef TARGET_INSERT_ATTRIBUTES
43177 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
43178 #endif
43180 #undef TARGET_MANGLE_TYPE
43181 #define TARGET_MANGLE_TYPE ix86_mangle_type
43183 #if !TARGET_MACHO
43184 #undef TARGET_STACK_PROTECT_FAIL
43185 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
43186 #endif
43188 #undef TARGET_FUNCTION_VALUE
43189 #define TARGET_FUNCTION_VALUE ix86_function_value
43191 #undef TARGET_FUNCTION_VALUE_REGNO_P
43192 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
43194 #undef TARGET_PROMOTE_FUNCTION_MODE
43195 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
43197 #undef TARGET_MEMBER_TYPE_FORCES_BLK
43198 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
43200 #undef TARGET_INSTANTIATE_DECLS
43201 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
43203 #undef TARGET_SECONDARY_RELOAD
43204 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
43206 #undef TARGET_CLASS_MAX_NREGS
43207 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
43209 #undef TARGET_PREFERRED_RELOAD_CLASS
43210 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
43211 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
43212 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
43213 #undef TARGET_CLASS_LIKELY_SPILLED_P
43214 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
43216 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
43217 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
43218 ix86_builtin_vectorization_cost
43219 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
43220 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
43221 ix86_vectorize_vec_perm_const_ok
43222 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
43223 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
43224 ix86_preferred_simd_mode
43225 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
43226 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
43227 ix86_autovectorize_vector_sizes
43228 #undef TARGET_VECTORIZE_INIT_COST
43229 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
43230 #undef TARGET_VECTORIZE_ADD_STMT_COST
43231 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
43232 #undef TARGET_VECTORIZE_FINISH_COST
43233 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
43234 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
43235 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
43237 #undef TARGET_SET_CURRENT_FUNCTION
43238 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
43240 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
43241 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
43243 #undef TARGET_OPTION_SAVE
43244 #define TARGET_OPTION_SAVE ix86_function_specific_save
43246 #undef TARGET_OPTION_RESTORE
43247 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
43249 #undef TARGET_OPTION_PRINT
43250 #define TARGET_OPTION_PRINT ix86_function_specific_print
43252 #undef TARGET_OPTION_FUNCTION_VERSIONS
43253 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
43255 #undef TARGET_CAN_INLINE_P
43256 #define TARGET_CAN_INLINE_P ix86_can_inline_p
43258 #undef TARGET_EXPAND_TO_RTL_HOOK
43259 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
43261 #undef TARGET_LEGITIMATE_ADDRESS_P
43262 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
43264 #undef TARGET_LRA_P
43265 #define TARGET_LRA_P hook_bool_void_true
43267 #undef TARGET_REGISTER_PRIORITY
43268 #define TARGET_REGISTER_PRIORITY ix86_register_priority
43270 #undef TARGET_REGISTER_USAGE_LEVELING_P
43271 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
43273 #undef TARGET_LEGITIMATE_CONSTANT_P
43274 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
43276 #undef TARGET_FRAME_POINTER_REQUIRED
43277 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
43279 #undef TARGET_CAN_ELIMINATE
43280 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
43282 #undef TARGET_EXTRA_LIVE_ON_ENTRY
43283 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
43285 #undef TARGET_ASM_CODE_END
43286 #define TARGET_ASM_CODE_END ix86_code_end
43288 #undef TARGET_CONDITIONAL_REGISTER_USAGE
43289 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
43291 #if TARGET_MACHO
43292 #undef TARGET_INIT_LIBFUNCS
43293 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
43294 #endif
43296 #undef TARGET_SPILL_CLASS
43297 #define TARGET_SPILL_CLASS ix86_spill_class
43299 struct gcc_target targetm = TARGET_INITIALIZER;
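/* Illustrative only: each #undef/#define pair above overrides a default
   hook from target-def.h, and TARGET_INITIALIZER collects them into the
   targetm vector defined above, through which the rest of the compiler
   calls the i386 back end, e.g.

     targetm.target_option.override ();                 (ix86_option_override)
     targetm.vectorize.preferred_simd_mode (SFmode);    (ix86_preferred_simd_mode)
*/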
43301 #include "gt-i386.h"