gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
64 enum upper_128bits_state
65 {
66   unknown = 0,
67   unused,
68   used
69 };
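/* Illustrative summary of how the pass below uses these values (a reading of
   that code, not normative): they form the dataflow lattice of the vzeroupper
   pass.  A block's state starts out "unknown", flips to "used" once
   check_avx256_stores sees an insn that stores to, or copies from, a 256bit
   AVX register, and goes back to "unused" right after a vzeroall/vzeroupper,
   which clears the upper 128 bits.  */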
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
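/* Illustrative use, taken from move_or_delete_vzeroupper below: the pass
   calls alloc_aux_for_blocks (sizeof (struct block_info_def)) so that every
   basic block's ->aux field points at a block_info_def, then reads and
   updates the per-block dataflow facts through this macro, e.g.

     BLOCK_INFO (bb)->state = state;
     if (BLOCK_INFO (bb)->processed)
       ...;

   and releases the records again with free_aux_for_blocks ().  */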
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
96   /* Callee doesn't return nor pass 256bit AVX register, or no
97      256bit AVX register in function return.  */
98   call_no_avx256,
99   /* vzeroupper intrinsic.  */
100   vzeroupper_intrinsic
101 };
103 /* Check if a 256bit AVX register is referenced in stores. */
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108   if ((REG_P (dest)
109        && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110       || (GET_CODE (set) == SET
111           && REG_P (SET_SRC (set))
112           && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113     {
114       enum upper_128bits_state *state
115         = (enum upper_128bits_state *) data;
116       *state = used;
117     }
118 }
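/* Illustrative call pattern, as used by move_or_delete_vzeroupper_2 below:

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);

   After the call, STATE has been set to "used" if some SET or CLOBBER in the
   pattern has a 256bit AVX register as its destination, or as the source of
   a SET; note_stores invokes the callback once per destination, passing the
   containing expression and the DATA pointer.  */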
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
124 STATE is state of the upper 128bits of AVX registers at entry. */
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
136 if (BLOCK_INFO (bb)->unchanged)
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
142 BLOCK_INFO (bb)->state = state;
143 return;
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
154 BLOCK_INFO (bb)->prev = state;
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
160 unchanged = true;
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
167 insn = NEXT_INSN (insn);
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
175 if (!vzeroupper_insn)
176 continue;
178 if (PREV_INSN (insn) != vzeroupper_insn)
180 if (dump_file)
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
190 vzeroupper_insn = NULL_RTX;
191 continue;
194 pat = PATTERN (insn);
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 if (dump_file)
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
207 else
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 state = unused;
215 unchanged = false;
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
224 else if (state != used)
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
230 continue;
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236 if (state == unused)
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
243 state = used;
244 unchanged = false;
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
254 delete_insn (insn);
256 else
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
273 delete_insn (insn);
275 else
277 vzeroupper_insn = insn;
278 unchanged = false;
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
296 state is changed. */
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
310 if (BLOCK_INFO (block)->processed)
311 return false;
313 state = unused;
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
334 if (seen_unknown)
335 state = unknown;
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
353 else
354 return false;
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
361 static void
362 move_or_delete_vzeroupper (void)
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
418 while (!fibheap_empty (pending))
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
427 sbitmap_zero (visited);
429 cfun->machine->rescan_vzeroupper_p = 0;
431 while (!fibheap_empty (worklist))
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
438 edge_iterator ei;
440 SET_BIT (visited, bb->index);
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
449 if (TEST_BIT (visited, e->dest->index))
451 if (!TEST_BIT (in_pending, e->dest->index))
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
460 else if (!TEST_BIT (in_worklist, e->dest->index))
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
488 free_aux_for_blocks ();
491 static rtx legitimize_dllimport_symbol (rtx, bool);
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
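/* Illustrative use (the field names mult_init and divide are assumptions
   inferred from the cost tables below, not definitions made here):

     cost->mult_init[MODE_INDEX (SImode)]   -- SImode multiply startup cost
     cost->divide[MODE_INDEX (DImode)]      -- DImode divide/mod cost

   Any mode other than QI/HI/SI/DImode maps to index 4, the "other" slot.  */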
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
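/* Worked example of the scaling assumed above: with COSTS_N_INSNS (N)
   defined as (N) * 4, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), i.e.
   when tuning for size a two-byte instruction is costed like one "average"
   instruction is when tuning for speed.  */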
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
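/* Sketch of how to read the memcpy/memset strategy entries in the tables
   below (assuming the usual stringop_algs layout: an algorithm for blocks of
   unknown size, then {max_size, algorithm} pairs tried in order, with -1 as
   the catch-all).  For instance

     {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
      DUMMY_STRINGOP_ALGS},

   reads: unknown size -> libcall, blocks up to 256 bytes -> 4-byte rep
   prefix, anything larger -> libcall.  DUMMY_STRINGOP_ALGS is the
   placeholder entry that simply falls back to a libcall.  */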
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
848 (we ensure the alignment). For small blocks inline loop is still a
849 noticeable win, for bigger blocks either rep movsl or rep movsb is the
850 way to go. Rep movsb has apparently more expensive startup time in CPU,
851 but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
1148 do nontemporal accesses and beat inline considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
1235 do nontemporal accesses and beat inline considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
1322 can do nontemporal accesses and beat inline considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1408 very small blocks it is better to use loop. For large blocks, libcall
1409 can do nontemporal accesses and beat inline considerably. */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1490 very small blocks it is better to use loop. For large blocks, libcall can
1491 do nontemporal accesses and beat inline considerably. */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar_load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar_load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration lea is 2 cycles or more.  With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar_load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912 /* Generic instruction choice should be a common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff.  We can't enable it for 32bit generic because it does not
1925 work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results.  But after P4 was made, no performance benefit
1939 was observed with branch hints.  They also increase the code size.
1940 As a result, icc never generates branch hints. */
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well.  However,
1955 in the current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences.  Because this option
1958 pays back little on PPro based chips and conflicts with the partial reg
1959 dependencies used by Athlon/P4 based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1999 /* X86_TUNE_QIMODE_MATH */
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls were more effective. */
2006 ~m_PPRO,
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2026 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips that treat 128bit
2039 SSE registers as single units and K8 based chips that divide SSE
2040 registers into two 64bit halves.  This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units.  Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving the
2060 upper part undefined. */
2061 m_ATHLON_K8,
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiply, but the 386 and 486 do HImode multiply faster. */
2120 ~(m_386 | m_486),
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2139 operand that cannot be represented using a modRM byte.  The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3.  For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
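/* Editorial note (not part of the original source): the masks above are
   per-feature bitsets indexed by PROCESSOR_*, so a feature is enabled by
   testing the bit of the selected tuning/architecture.  A rough sketch of
   how the two feature arrays are expected to be filled during option
   overriding (simplified; the actual initialization happens later in this
   file):  */
#if 0
static void
sketch_init_feature_arrays (void)
{
  unsigned int tune_mask = 1u << ix86_tune;
  unsigned int arch_mask = 1u << ix86_arch;
  int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
  for (i = 0; i < X86_ARCH_LAST; i++)
    ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & arch_mask);
}
#endif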
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2216 /* In case the average insn count for single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2256 /* The "default" register map used in 32bit mode. */
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2269 /* The "default" register map used in 64bit mode. */
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2347 /* Define parameter passing and return registers. */
2349 static int const x86_64_int_parameter_registers[6] =
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2356 CX_REG, DX_REG, R8_REG, R9_REG
2359 static int const x86_64_int_return_registers[4] =
2361 AX_REG, DX_REG, DI_REG, SI_REG
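/* Editorial note (not part of the original source): a minimal sketch of how
   the two parameter-register tables above differ in use.  The SysV ABI
   passes the first six integer arguments in rdi/rsi/rdx/rcx/r8/r9, while
   the Microsoft ABI passes only the first four in rcx/rdx/r8/r9.  The
   helper name below is hypothetical.  */
#if 0
static int
sketch_int_parm_reg (enum calling_abi abi, int slot)
{
  if (abi == MS_ABI)
    return slot < 4 ? x86_64_ms_abi_int_parameter_registers[slot] : -1;
  return slot < 6 ? x86_64_int_parameter_registers[slot] : -1;
}
#endif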
2364 /* Define the structure for the machine field in struct function. */
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2380 saved static chain if ix86_static_chain_on_stack
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2394 [frame] |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2399 struct ix86_frame
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2421 /* Which CPU we are scheduling for. */
2422 enum attr_cpu ix86_schedule;
2424 /* Which CPU we are optimizing for. */
2425 enum processor_type ix86_tune;
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
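/* Editorial note (not part of the original source): these generator hooks
   let mode-independent code emit the matching SImode or DImode pattern.  A
   rough sketch of how they are expected to be set up during option
   processing (only two representative assignments shown; the exact
   generator names for the other hooks may differ):  */
#if 0
static void
sketch_init_gen_hooks (void)
{
  if (TARGET_64BIT)
    {
      ix86_gen_add3 = gen_adddi3;
      ix86_gen_sub3 = gen_subdi3;
    }
  else
    {
      ix86_gen_add3 = gen_addsi3;
      ix86_gen_sub3 = gen_subsi3;
    }
}
#endif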
2450 /* Preferred alignment for stack boundary in bits. */
2451 unsigned int ix86_preferred_stack_boundary;
2453 /* Alignment for incoming stack boundary in bits specified at
2454 command line. */
2455 static unsigned int ix86_user_incoming_stack_boundary;
2457 /* Default alignment for incoming stack boundary in bits. */
2458 static unsigned int ix86_default_incoming_stack_boundary;
2460 /* Alignment for incoming stack boundary in bits. */
2461 unsigned int ix86_incoming_stack_boundary;
2463 /* Calling abi specific va_list type nodes. */
2464 static GTY(()) tree sysv_va_list_type_node;
2465 static GTY(()) tree ms_va_list_type_node;
2467 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2468 char internal_label_prefix[16];
2469 int internal_label_prefix_len;
2471 /* Fence to use after loop using movnt. */
2472 tree x86_mfence;
2474 /* Register class used for passing a given 64bit part of the argument.
2475 These represent classes as documented by the psABI, with the exception
2476 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2477 uses an SFmode or DFmode move instead of a DImode move to avoid reformatting penalties.
2479 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2480 whenever possible (when the upper half of the 64bit part is only padding). */
2481 enum x86_64_reg_class
2483 X86_64_NO_CLASS,
2484 X86_64_INTEGER_CLASS,
2485 X86_64_INTEGERSI_CLASS,
2486 X86_64_SSE_CLASS,
2487 X86_64_SSESF_CLASS,
2488 X86_64_SSEDF_CLASS,
2489 X86_64_SSEUP_CLASS,
2490 X86_64_X87_CLASS,
2491 X86_64_X87UP_CLASS,
2492 X86_64_COMPLEX_X87_CLASS,
2493 X86_64_MEMORY_CLASS
2496 #define MAX_CLASSES 4
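/* Editorial note (not part of the original source): an illustrative
   classification under the scheme above, assuming the usual psABI rules.  */
#if 0
/* For
     struct s { double d; int i; };
   the two 64bit parts would classify roughly as below, so D ends up in an
   SSE register and I in a general register.  */
enum x86_64_reg_class sketch_classes[MAX_CLASSES] =
{
  X86_64_SSEDF_CLASS,	  /* d: a full double, moved as DFmode.  */
  X86_64_INTEGERSI_CLASS  /* i: low 32 bits used, upper half is padding.  */
};
#endif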
2498 /* Table of constants used by fldpi, fldln2, etc.... */
2499 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2500 static bool ext_80387_constants_init = 0;
2503 static struct machine_function * ix86_init_machine_status (void);
2504 static rtx ix86_function_value (const_tree, const_tree, bool);
2505 static bool ix86_function_value_regno_p (const unsigned int);
2506 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2507 const_tree);
2508 static rtx ix86_static_chain (const_tree, bool);
2509 static int ix86_function_regparm (const_tree, const_tree);
2510 static void ix86_compute_frame_layout (struct ix86_frame *);
2511 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2512 rtx, rtx, int);
2513 static void ix86_add_new_builtins (HOST_WIDE_INT);
2514 static tree ix86_canonical_va_list_type (tree);
2515 static void predict_jump (int);
2516 static unsigned int split_stack_prologue_scratch_regno (void);
2517 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2519 enum ix86_function_specific_strings
2521 IX86_FUNCTION_SPECIFIC_ARCH,
2522 IX86_FUNCTION_SPECIFIC_TUNE,
2523 IX86_FUNCTION_SPECIFIC_MAX
2526 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2527 const char *, enum fpmath_unit, bool);
2528 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2529 static void ix86_function_specific_save (struct cl_target_option *);
2530 static void ix86_function_specific_restore (struct cl_target_option *);
2531 static void ix86_function_specific_print (FILE *, int,
2532 struct cl_target_option *);
2533 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2534 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2535 struct gcc_options *);
2536 static bool ix86_can_inline_p (tree, tree);
2537 static void ix86_set_current_function (tree);
2538 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2540 static enum calling_abi ix86_function_abi (const_tree);
2543 #ifndef SUBTARGET32_DEFAULT_CPU
2544 #define SUBTARGET32_DEFAULT_CPU "i386"
2545 #endif
2547 /* The svr4 ABI for the i386 says that records and unions are returned
2548 in memory. */
2549 #ifndef DEFAULT_PCC_STRUCT_RETURN
2550 #define DEFAULT_PCC_STRUCT_RETURN 1
2551 #endif
2553 /* Whether -mtune= or -march= were specified */
2554 static int ix86_tune_defaulted;
2555 static int ix86_arch_specified;
2557 /* Vectorization library interface and handlers. */
2558 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2560 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2561 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2563 /* Processor target table, indexed by processor number */
2564 struct ptt
2566 const struct processor_costs *cost; /* Processor costs */
2567 const int align_loop; /* Default alignments. */
2568 const int align_loop_max_skip;
2569 const int align_jump;
2570 const int align_jump_max_skip;
2571 const int align_func;
2574 static const struct ptt processor_target_table[PROCESSOR_max] =
2576 {&i386_cost, 4, 3, 4, 3, 4},
2577 {&i486_cost, 16, 15, 16, 15, 16},
2578 {&pentium_cost, 16, 7, 16, 7, 16},
2579 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2580 {&geode_cost, 0, 0, 0, 0, 0},
2581 {&k6_cost, 32, 7, 32, 7, 32},
2582 {&athlon_cost, 16, 7, 16, 7, 16},
2583 {&pentium4_cost, 0, 0, 0, 0, 0},
2584 {&k8_cost, 16, 7, 16, 7, 16},
2585 {&nocona_cost, 0, 0, 0, 0, 0},
2586 /* Core 2 32-bit. */
2587 {&generic32_cost, 16, 10, 16, 10, 16},
2588 /* Core 2 64-bit. */
2589 {&generic64_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 32-bit. */
2591 {&generic32_cost, 16, 10, 16, 10, 16},
2592 /* Core i7 64-bit. */
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&generic32_cost, 16, 7, 16, 7, 16},
2595 {&generic64_cost, 16, 10, 16, 10, 16},
2596 {&amdfam10_cost, 32, 24, 32, 7, 32},
2597 {&bdver1_cost, 32, 24, 32, 7, 32},
2598 {&bdver2_cost, 32, 24, 32, 7, 32},
2599 {&btver1_cost, 32, 24, 32, 7, 32},
2600 {&atom_cost, 16, 15, 16, 7, 16}
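/* Editorial note (not part of the original source): each row above pairs a
   cost table with default alignment values for the corresponding
   PROCESSOR_* entry.  A rough sketch of how the row for the selected tuning
   is expected to be applied, assuming the usual convention that an
   alignment of 0 means the user gave no explicit -falign-* option:  */
#if 0
static void
sketch_apply_target_row (void)
{
  ix86_cost = processor_target_table[ix86_tune].cost;
  if (align_loops == 0)
    align_loops = processor_target_table[ix86_tune].align_loop;
  if (align_jumps == 0)
    align_jumps = processor_target_table[ix86_tune].align_jump;
  if (align_functions == 0)
    align_functions = processor_target_table[ix86_tune].align_func;
}
#endif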
2603 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2605 "generic",
2606 "i386",
2607 "i486",
2608 "pentium",
2609 "pentium-mmx",
2610 "pentiumpro",
2611 "pentium2",
2612 "pentium3",
2613 "pentium4",
2614 "pentium-m",
2615 "prescott",
2616 "nocona",
2617 "core2",
2618 "corei7",
2619 "atom",
2620 "geode",
2621 "k6",
2622 "k6-2",
2623 "k6-3",
2624 "athlon",
2625 "athlon-4",
2626 "k8",
2627 "amdfam10",
2628 "bdver1",
2629 "bdver2",
2630 "btver1"
2633 /* Return true if a red-zone is in use. */
2635 static inline bool
2636 ix86_using_red_zone (void)
2638 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2641 /* Return a string that documents the current -m options. The caller is
2642 responsible for freeing the string. */
2644 static char *
2645 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2646 const char *tune, enum fpmath_unit fpmath,
2647 bool add_nl_p)
2649 struct ix86_target_opts
2651 const char *option; /* option string */
2652 HOST_WIDE_INT mask; /* isa mask options */
2655 /* This table is ordered so that options like -msse4.2, which imply the
2656 preceding options, are matched first. */
2657 static struct ix86_target_opts isa_opts[] =
2659 { "-m64", OPTION_MASK_ISA_64BIT },
2660 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2661 { "-mfma", OPTION_MASK_ISA_FMA },
2662 { "-mxop", OPTION_MASK_ISA_XOP },
2663 { "-mlwp", OPTION_MASK_ISA_LWP },
2664 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2665 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2666 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2667 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2668 { "-msse3", OPTION_MASK_ISA_SSE3 },
2669 { "-msse2", OPTION_MASK_ISA_SSE2 },
2670 { "-msse", OPTION_MASK_ISA_SSE },
2671 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2672 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2673 { "-mmmx", OPTION_MASK_ISA_MMX },
2674 { "-mabm", OPTION_MASK_ISA_ABM },
2675 { "-mbmi", OPTION_MASK_ISA_BMI },
2676 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2677 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2678 { "-mtbm", OPTION_MASK_ISA_TBM },
2679 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2680 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2681 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2682 { "-maes", OPTION_MASK_ISA_AES },
2683 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2684 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2685 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2686 { "-mf16c", OPTION_MASK_ISA_F16C },
2687 { "-mrtm", OPTION_MASK_ISA_RTM },
2690 /* Flag options. */
2691 static struct ix86_target_opts flag_opts[] =
2693 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2694 { "-m80387", MASK_80387 },
2695 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2696 { "-malign-double", MASK_ALIGN_DOUBLE },
2697 { "-mcld", MASK_CLD },
2698 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2699 { "-mieee-fp", MASK_IEEE_FP },
2700 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2701 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2702 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2703 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2704 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2705 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2706 { "-mno-red-zone", MASK_NO_RED_ZONE },
2707 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2708 { "-mrecip", MASK_RECIP },
2709 { "-mrtd", MASK_RTD },
2710 { "-msseregparm", MASK_SSEREGPARM },
2711 { "-mstack-arg-probe", MASK_STACK_PROBE },
2712 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2713 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2714 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2715 { "-mvzeroupper", MASK_VZEROUPPER },
2716 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2717 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2718 { "-mprefer-avx128", MASK_PREFER_AVX128},
2721 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2723 char isa_other[40];
2724 char target_other[40];
2725 unsigned num = 0;
2726 unsigned i, j;
2727 char *ret;
2728 char *ptr;
2729 size_t len;
2730 size_t line_len;
2731 size_t sep_len;
2733 memset (opts, '\0', sizeof (opts));
2735 /* Add -march= option. */
2736 if (arch)
2738 opts[num][0] = "-march=";
2739 opts[num++][1] = arch;
2742 /* Add -mtune= option. */
2743 if (tune)
2745 opts[num][0] = "-mtune=";
2746 opts[num++][1] = tune;
2749 /* Pick out the options in isa options. */
2750 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2752 if ((isa & isa_opts[i].mask) != 0)
2754 opts[num++][0] = isa_opts[i].option;
2755 isa &= ~ isa_opts[i].mask;
2759 if (isa && add_nl_p)
2761 opts[num++][0] = isa_other;
2762 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2763 isa);
2766 /* Add flag options. */
2767 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2769 if ((flags & flag_opts[i].mask) != 0)
2771 opts[num++][0] = flag_opts[i].option;
2772 flags &= ~ flag_opts[i].mask;
2776 if (flags && add_nl_p)
2778 opts[num++][0] = target_other;
2779 sprintf (target_other, "(other flags: %#x)", flags);
2782 /* Add -fpmath= option. */
2783 if (fpmath)
2785 opts[num][0] = "-mfpmath=";
2786 switch ((int) fpmath)
2788 case FPMATH_387:
2789 opts[num++][1] = "387";
2790 break;
2792 case FPMATH_SSE:
2793 opts[num++][1] = "sse";
2794 break;
2796 case FPMATH_387 | FPMATH_SSE:
2797 opts[num++][1] = "sse+387";
2798 break;
2800 default:
2801 gcc_unreachable ();
2805 /* Any options? */
2806 if (num == 0)
2807 return NULL;
2809 gcc_assert (num < ARRAY_SIZE (opts));
2811 /* Size the string. */
2812 len = 0;
2813 sep_len = (add_nl_p) ? 3 : 1;
2814 for (i = 0; i < num; i++)
2816 len += sep_len;
2817 for (j = 0; j < 2; j++)
2818 if (opts[i][j])
2819 len += strlen (opts[i][j]);
2822 /* Build the string. */
2823 ret = ptr = (char *) xmalloc (len);
2824 line_len = 0;
2826 for (i = 0; i < num; i++)
2828 size_t len2[2];
2830 for (j = 0; j < 2; j++)
2831 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2833 if (i != 0)
2835 *ptr++ = ' ';
2836 line_len++;
2838 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2840 *ptr++ = '\\';
2841 *ptr++ = '\n';
2842 line_len = 0;
2846 for (j = 0; j < 2; j++)
2847 if (opts[i][j])
2849 memcpy (ptr, opts[i][j], len2[j]);
2850 ptr += len2[j];
2851 line_len += len2[j];
2855 *ptr = '\0';
2856 gcc_assert (ret + len >= ptr);
2858 return ret;
2861 /* Return true if profiling code should be emitted before
2862 the prologue; otherwise return false.
2863 Note: for x86 with "hotfix", a sorry () diagnostic is issued. */
2864 static bool
2865 ix86_profile_before_prologue (void)
2867 return flag_fentry != 0;
2870 /* Function that is callable from the debugger to print the current
2871 options. */
2872 void
2873 ix86_debug_options (void)
2875 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2876 ix86_arch_string, ix86_tune_string,
2877 ix86_fpmath, true);
2879 if (opts)
2881 fprintf (stderr, "%s\n\n", opts);
2882 free (opts);
2884 else
2885 fputs ("<no options>\n\n", stderr);
2887 return;
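/* Editorial note (not part of the original source): as the comment above
   says, this function is meant for interactive use, e.g. from gdb:
     (gdb) call ix86_debug_options ()
   which prints the reconstructed -march/-mtune/ISA/flag options to stderr.  */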
2890 /* Override various settings based on options. If MAIN_ARGS_P, the
2891 options are from the command line, otherwise they are from
2892 attributes. */
2894 static void
2895 ix86_option_override_internal (bool main_args_p)
2897 int i;
2898 unsigned int ix86_arch_mask, ix86_tune_mask;
2899 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2900 const char *prefix;
2901 const char *suffix;
2902 const char *sw;
2904 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2905 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2906 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2907 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2908 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2909 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2910 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2911 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2912 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2913 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2914 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2915 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2916 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2917 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2918 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2919 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2920 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2921 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2922 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2923 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2924 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2925 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2926 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2927 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2928 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2929 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2930 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2931 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2932 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2933 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2934 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2935 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2936 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2937 /* if this reaches 64, need to widen struct pta flags below */
2939 static struct pta
2941 const char *const name; /* processor name or nickname. */
2942 const enum processor_type processor;
2943 const enum attr_cpu schedule;
2944 const unsigned HOST_WIDE_INT flags;
2946 const processor_alias_table[] =
2948 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2949 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2950 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2951 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2952 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2953 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2954 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2955 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2956 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2957 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2958 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2959 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2960 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE},
2962 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 PTA_MMX | PTA_SSE},
2964 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2},
2968 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2969 PTA_MMX | PTA_SSE | PTA_SSE2},
2970 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2971 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2972 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2974 | PTA_CX16 | PTA_NO_SAHF},
2975 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2976 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2977 | PTA_SSSE3 | PTA_CX16},
2978 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2979 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2980 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2981 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2985 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2987 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2988 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2989 | PTA_RDRND | PTA_F16C},
2990 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2991 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2992 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2993 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2994 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2995 | PTA_FMA | PTA_MOVBE | PTA_RTM},
2996 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2997 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2998 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2999 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3000 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3001 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3002 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3003 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3004 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3006 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3011 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014 {"x86-64", PROCESSOR_K8, CPU_K8,
3015 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3016 {"k8", PROCESSOR_K8, CPU_K8,
3017 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3018 | PTA_SSE2 | PTA_NO_SAHF},
3019 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3020 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3021 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3022 {"opteron", PROCESSOR_K8, CPU_K8,
3023 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3024 | PTA_SSE2 | PTA_NO_SAHF},
3025 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3026 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3027 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3028 {"athlon64", PROCESSOR_K8, CPU_K8,
3029 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3030 | PTA_SSE2 | PTA_NO_SAHF},
3031 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3032 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3033 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3034 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3035 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3036 | PTA_SSE2 | PTA_NO_SAHF},
3037 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3038 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3039 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3040 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3041 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3042 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3043 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3044 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3045 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3046 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3047 | PTA_XOP | PTA_LWP},
3048 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3049 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3050 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3051 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3052 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3053 | PTA_FMA},
3054 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3055 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3056 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3057 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3058 0 /* flags are only used for -march switch. */ },
3059 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3060 PTA_64BIT /* flags are only used for -march switch. */ },
3063 /* -mrecip options. */
3064 static struct
3066 const char *string; /* option name */
3067 unsigned int mask; /* mask bits to set */
3069 const recip_options[] =
3071 { "all", RECIP_MASK_ALL },
3072 { "none", RECIP_MASK_NONE },
3073 { "div", RECIP_MASK_DIV },
3074 { "sqrt", RECIP_MASK_SQRT },
3075 { "vec-div", RECIP_MASK_VEC_DIV },
3076 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3079 int const pta_size = ARRAY_SIZE (processor_alias_table);
3081 /* Set up prefix/suffix so the error messages refer to either the command
3082 line argument, or the attribute(target). */
3083 if (main_args_p)
3085 prefix = "-m";
3086 suffix = "";
3087 sw = "switch";
3089 else
3091 prefix = "option(\"";
3092 suffix = "\")";
3093 sw = "attribute";
3096 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3097 SUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3100 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3101 SUBSUBTARGET_OVERRIDE_OPTIONS;
3102 #endif
3104 if (TARGET_X32)
3105 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3107 /* -fPIC is the default for x86_64. */
3108 if (TARGET_MACHO && TARGET_64BIT)
3109 flag_pic = 2;
3111 /* Need to check -mtune=generic first. */
3112 if (ix86_tune_string)
3114 if (!strcmp (ix86_tune_string, "generic")
3115 || !strcmp (ix86_tune_string, "i686")
3116 /* As special support for cross compilers we read -mtune=native
3117 as -mtune=generic. With native compilers we won't see the
3118 -mtune=native, as it was changed by the driver. */
3119 || !strcmp (ix86_tune_string, "native"))
3121 if (TARGET_64BIT)
3122 ix86_tune_string = "generic64";
3123 else
3124 ix86_tune_string = "generic32";
3126 /* If this call is for setting the option attribute, allow the
3127 generic32/generic64 that was previously set. */
3128 else if (!main_args_p
3129 && (!strcmp (ix86_tune_string, "generic32")
3130 || !strcmp (ix86_tune_string, "generic64")))
3132 else if (!strncmp (ix86_tune_string, "generic", 7))
3133 error ("bad value (%s) for %stune=%s %s",
3134 ix86_tune_string, prefix, suffix, sw);
3135 else if (!strcmp (ix86_tune_string, "x86-64"))
3136 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3137 "%stune=k8%s or %stune=generic%s instead as appropriate",
3138 prefix, suffix, prefix, suffix, prefix, suffix);
3140 else
3142 if (ix86_arch_string)
3143 ix86_tune_string = ix86_arch_string;
3144 if (!ix86_tune_string)
3146 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3147 ix86_tune_defaulted = 1;
3150 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3151 need to use a sensible tune option. */
3152 if (!strcmp (ix86_tune_string, "generic")
3153 || !strcmp (ix86_tune_string, "x86-64")
3154 || !strcmp (ix86_tune_string, "i686"))
3156 if (TARGET_64BIT)
3157 ix86_tune_string = "generic64";
3158 else
3159 ix86_tune_string = "generic32";
3163 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3165 /* rep; movq isn't available in 32-bit code. */
3166 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3167 ix86_stringop_alg = no_stringop;
3170 if (!ix86_arch_string)
3171 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3172 else
3173 ix86_arch_specified = 1;
3175 if (global_options_set.x_ix86_pmode)
3177 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3178 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3179 error ("address mode %qs not supported in the %s bit mode",
3180 TARGET_64BIT ? "short" : "long",
3181 TARGET_64BIT ? "64" : "32");
3183 else
3184 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3186 if (!global_options_set.x_ix86_abi)
3187 ix86_abi = DEFAULT_ABI;
3189 if (global_options_set.x_ix86_cmodel)
3191 switch (ix86_cmodel)
3193 case CM_SMALL:
3194 case CM_SMALL_PIC:
3195 if (flag_pic)
3196 ix86_cmodel = CM_SMALL_PIC;
3197 if (!TARGET_64BIT)
3198 error ("code model %qs not supported in the %s bit mode",
3199 "small", "32");
3200 break;
3202 case CM_MEDIUM:
3203 case CM_MEDIUM_PIC:
3204 if (flag_pic)
3205 ix86_cmodel = CM_MEDIUM_PIC;
3206 if (!TARGET_64BIT)
3207 error ("code model %qs not supported in the %s bit mode",
3208 "medium", "32");
3209 else if (TARGET_X32)
3210 error ("code model %qs not supported in x32 mode",
3211 "medium");
3212 break;
3214 case CM_LARGE:
3215 case CM_LARGE_PIC:
3216 if (flag_pic)
3217 ix86_cmodel = CM_LARGE_PIC;
3218 if (!TARGET_64BIT)
3219 error ("code model %qs not supported in the %s bit mode",
3220 "large", "32");
3221 else if (TARGET_X32)
3222 error ("code model %qs not supported in x32 mode",
3223 "medium");
3224 break;
3226 case CM_32:
3227 if (flag_pic)
3228 error ("code model %s does not support PIC mode", "32");
3229 if (TARGET_64BIT)
3230 error ("code model %qs not supported in the %s bit mode",
3231 "32", "64");
3232 break;
3234 case CM_KERNEL:
3235 if (flag_pic)
3237 error ("code model %s does not support PIC mode", "kernel");
3238 ix86_cmodel = CM_32;
3240 if (!TARGET_64BIT)
3241 error ("code model %qs not supported in the %s bit mode",
3242 "kernel", "32");
3243 break;
3245 default:
3246 gcc_unreachable ();
3249 else
3251 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3252 use of rip-relative addressing. This eliminates fixups that
3253 would otherwise be needed if this object is to be placed in a
3254 DLL, and is essentially just as efficient as direct addressing. */
3255 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3256 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3257 else if (TARGET_64BIT)
3258 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3259 else
3260 ix86_cmodel = CM_32;
3262 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3264 error ("-masm=intel not supported in this configuration");
3265 ix86_asm_dialect = ASM_ATT;
3267 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3268 sorry ("%i-bit mode not compiled in",
3269 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3271 for (i = 0; i < pta_size; i++)
3272 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3274 ix86_schedule = processor_alias_table[i].schedule;
3275 ix86_arch = processor_alias_table[i].processor;
3276 /* Default cpu tuning to the architecture. */
3277 ix86_tune = ix86_arch;
3279 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3280 error ("CPU you selected does not support x86-64 "
3281 "instruction set");
3283 if (processor_alias_table[i].flags & PTA_MMX
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3285 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3286 if (processor_alias_table[i].flags & PTA_3DNOW
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3288 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3289 if (processor_alias_table[i].flags & PTA_3DNOW_A
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3291 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3292 if (processor_alias_table[i].flags & PTA_SSE
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3295 if (processor_alias_table[i].flags & PTA_SSE2
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3297 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3298 if (processor_alias_table[i].flags & PTA_SSE3
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3300 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3301 if (processor_alias_table[i].flags & PTA_SSSE3
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3303 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3304 if (processor_alias_table[i].flags & PTA_SSE4_1
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3307 if (processor_alias_table[i].flags & PTA_SSE4_2
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3309 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3310 if (processor_alias_table[i].flags & PTA_AVX
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3312 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3313 if (processor_alias_table[i].flags & PTA_AVX2
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3315 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3316 if (processor_alias_table[i].flags & PTA_FMA
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3318 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3319 if (processor_alias_table[i].flags & PTA_SSE4A
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3321 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3322 if (processor_alias_table[i].flags & PTA_FMA4
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3324 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3325 if (processor_alias_table[i].flags & PTA_XOP
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3327 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3328 if (processor_alias_table[i].flags & PTA_LWP
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3330 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3331 if (processor_alias_table[i].flags & PTA_ABM
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3333 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3334 if (processor_alias_table[i].flags & PTA_BMI
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3336 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3337 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3339 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3340 if (processor_alias_table[i].flags & PTA_TBM
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3342 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3343 if (processor_alias_table[i].flags & PTA_BMI2
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3345 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3346 if (processor_alias_table[i].flags & PTA_CX16
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3348 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3349 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3351 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3352 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3354 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3355 if (processor_alias_table[i].flags & PTA_MOVBE
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3357 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3358 if (processor_alias_table[i].flags & PTA_AES
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3360 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3361 if (processor_alias_table[i].flags & PTA_PCLMUL
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3363 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3364 if (processor_alias_table[i].flags & PTA_FSGSBASE
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3366 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3367 if (processor_alias_table[i].flags & PTA_RDRND
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3369 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3370 if (processor_alias_table[i].flags & PTA_F16C
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3372 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3373 if (processor_alias_table[i].flags & PTA_RTM
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3375 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3376 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3377 x86_prefetch_sse = true;
3379 break;
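/* Note, illustratively, that a few PTA_* bits imply more than one ISA flag
   in the loop above: PTA_ABM enables both OPTION_MASK_ISA_LZCNT and
   OPTION_MASK_ISA_POPCNT (via the (PTA_LZCNT | PTA_ABM) and
   (PTA_POPCNT | PTA_ABM) tests), so an -march value whose table entry
   lists only PTA_ABM still gets lzcnt and popcnt.  */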
3382 if (!strcmp (ix86_arch_string, "generic"))
3383 error ("generic CPU can be used only for %stune=%s %s",
3384 prefix, suffix, sw);
3385 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3386 error ("bad value (%s) for %sarch=%s %s",
3387 ix86_arch_string, prefix, suffix, sw);
3389 ix86_arch_mask = 1u << ix86_arch;
3390 for (i = 0; i < X86_ARCH_LAST; ++i)
3391 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3393 for (i = 0; i < pta_size; i++)
3394 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3396 ix86_schedule = processor_alias_table[i].schedule;
3397 ix86_tune = processor_alias_table[i].processor;
3398 if (TARGET_64BIT)
3400 if (!(processor_alias_table[i].flags & PTA_64BIT))
3402 if (ix86_tune_defaulted)
3404 ix86_tune_string = "x86-64";
3405 for (i = 0; i < pta_size; i++)
3406 if (! strcmp (ix86_tune_string,
3407 processor_alias_table[i].name))
3408 break;
3409 ix86_schedule = processor_alias_table[i].schedule;
3410 ix86_tune = processor_alias_table[i].processor;
3412 else
3413 error ("CPU you selected does not support x86-64 "
3414 "instruction set");
3417 else
3419 /* Adjust tuning when compiling for 32-bit ABI. */
3420 switch (ix86_tune)
3422 case PROCESSOR_GENERIC64:
3423 ix86_tune = PROCESSOR_GENERIC32;
3424 ix86_schedule = CPU_PENTIUMPRO;
3425 break;
3427 case PROCESSOR_CORE2_64:
3428 ix86_tune = PROCESSOR_CORE2_32;
3429 break;
3431 case PROCESSOR_COREI7_64:
3432 ix86_tune = PROCESSOR_COREI7_32;
3433 break;
3435 default:
3436 break;
3439 /* Intel CPUs have always interpreted SSE prefetch instructions as
3440 NOPs; so, we can enable SSE prefetch instructions even when
3441 -mtune (rather than -march) points us to a processor that has them.
3442 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3443 higher processors. */
3444 if (TARGET_CMOVE
3445 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3446 x86_prefetch_sse = true;
3447 break;
3450 if (ix86_tune_specified && i == pta_size)
3451 error ("bad value (%s) for %stune=%s %s",
3452 ix86_tune_string, prefix, suffix, sw);
3454 ix86_tune_mask = 1u << ix86_tune;
3455 for (i = 0; i < X86_TUNE_LAST; ++i)
3456 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3458 #ifndef USE_IX86_FRAME_POINTER
3459 #define USE_IX86_FRAME_POINTER 0
3460 #endif
3462 #ifndef USE_X86_64_FRAME_POINTER
3463 #define USE_X86_64_FRAME_POINTER 0
3464 #endif
3466 /* Set the default values for switches whose default depends on TARGET_64BIT
3467 in case they weren't overwritten by command line options. */
3468 if (TARGET_64BIT)
3470 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3471 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3472 if (flag_asynchronous_unwind_tables == 2)
3473 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3474 if (flag_pcc_struct_return == 2)
3475 flag_pcc_struct_return = 0;
3477 else
3479 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3480 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3481 if (flag_asynchronous_unwind_tables == 2)
3482 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3483 if (flag_pcc_struct_return == 2)
3484 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3487 if (optimize_size)
3488 ix86_cost = &ix86_size_cost;
3489 else
3490 ix86_cost = processor_target_table[ix86_tune].cost;
3492 /* Arrange to set up i386_stack_locals for all functions. */
3493 init_machine_status = ix86_init_machine_status;
3495 /* Validate -mregparm= value. */
3496 if (global_options_set.x_ix86_regparm)
3498 if (TARGET_64BIT)
3499 warning (0, "-mregparm is ignored in 64-bit mode");
3500 if (ix86_regparm > REGPARM_MAX)
3502 error ("-mregparm=%d is not between 0 and %d",
3503 ix86_regparm, REGPARM_MAX);
3504 ix86_regparm = 0;
3507 if (TARGET_64BIT)
3508 ix86_regparm = REGPARM_MAX;
3510 /* Default align_* from the processor table. */
3511 if (align_loops == 0)
3513 align_loops = processor_target_table[ix86_tune].align_loop;
3514 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3516 if (align_jumps == 0)
3518 align_jumps = processor_target_table[ix86_tune].align_jump;
3519 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3521 if (align_functions == 0)
3523 align_functions = processor_target_table[ix86_tune].align_func;
3526 /* Provide default for -mbranch-cost= value. */
3527 if (!global_options_set.x_ix86_branch_cost)
3528 ix86_branch_cost = ix86_cost->branch_cost;
3530 if (TARGET_64BIT)
3532 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3534 /* Enable by default the SSE and MMX builtins. Do allow the user to
3535 explicitly disable any of these. In particular, disabling SSE and
3536 MMX for kernel code is extremely useful. */
3537 if (!ix86_arch_specified)
3538 ix86_isa_flags
3539 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3540 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3542 if (TARGET_RTD)
3543 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3545 else
3547 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3549 if (!ix86_arch_specified)
3550 ix86_isa_flags
3551 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3553 /* The i386 ABI does not specify a red zone. It still makes sense to use
3554 one when the programmer takes care to keep the stack from being destroyed. */
3555 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3556 target_flags |= MASK_NO_RED_ZONE;
3559 /* Keep nonleaf frame pointers. */
3560 if (flag_omit_frame_pointer)
3561 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3562 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3563 flag_omit_frame_pointer = 1;
3565 /* If we're doing fast math, we don't care about comparison order
3566 wrt NaNs. This lets us use a shorter comparison sequence. */
3567 if (flag_finite_math_only)
3568 target_flags &= ~MASK_IEEE_FP;
3570 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3571 since the insns won't need emulation. */
3572 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3573 target_flags &= ~MASK_NO_FANCY_MATH_387;
3575 /* Likewise, if the target doesn't have a 387, or we've specified
3576 software floating point, don't use 387 inline intrinsics. */
3577 if (!TARGET_80387)
3578 target_flags |= MASK_NO_FANCY_MATH_387;
3580 /* Turn on MMX builtins for -msse. */
3581 if (TARGET_SSE)
3583 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3584 x86_prefetch_sse = true;
3587 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3588 if (TARGET_SSE4_2 || TARGET_ABM)
3589 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3591 /* Turn on lzcnt instruction for -mabm. */
3592 if (TARGET_ABM)
3593 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3595 /* Validate -mpreferred-stack-boundary= value or default it to
3596 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3597 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3598 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3600 int min = (TARGET_64BIT ? 4 : 2);
3601 int max = (TARGET_SEH ? 4 : 12);
3603 if (ix86_preferred_stack_boundary_arg < min
3604 || ix86_preferred_stack_boundary_arg > max)
3606 if (min == max)
3607 error ("-mpreferred-stack-boundary is not supported "
3608 "for this target");
3609 else
3610 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3611 ix86_preferred_stack_boundary_arg, min, max);
3613 else
3614 ix86_preferred_stack_boundary
3615 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
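/* Worked example: -mpreferred-stack-boundary=N is an exponent, so the
   boundary becomes (1 << N) * BITS_PER_UNIT bits.  The 64-bit minimum
   N = 4 gives (1 << 4) * 8 = 128 bits, i.e. the usual 16-byte stack
   alignment; the 32-bit minimum N = 2 gives 32 bits (4 bytes).  */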
3618 /* Set the default value for -mstackrealign. */
3619 if (ix86_force_align_arg_pointer == -1)
3620 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3622 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3624 /* Validate -mincoming-stack-boundary= value or default it to
3625 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3626 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3627 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3629 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3630 || ix86_incoming_stack_boundary_arg > 12)
3631 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3632 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3633 else
3635 ix86_user_incoming_stack_boundary
3636 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3637 ix86_incoming_stack_boundary
3638 = ix86_user_incoming_stack_boundary;
3642 /* Accept -msseregparm only if at least SSE support is enabled. */
3643 if (TARGET_SSEREGPARM
3644 && ! TARGET_SSE)
3645 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3647 if (global_options_set.x_ix86_fpmath)
3649 if (ix86_fpmath & FPMATH_SSE)
3651 if (!TARGET_SSE)
3653 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3654 ix86_fpmath = FPMATH_387;
3656 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3658 warning (0, "387 instruction set disabled, using SSE arithmetics");
3659 ix86_fpmath = FPMATH_SSE;
3663 else
3664 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3666 /* If the i387 is disabled, then do not return values in it. */
3667 if (!TARGET_80387)
3668 target_flags &= ~MASK_FLOAT_RETURNS;
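/* Illustrative interaction of the checks above: 32-bit code compiled with
   -mfpmath=sse but without SSE enabled falls back to 387 arithmetic with a
   warning, -mfpmath=387 without an 80387 falls back to SSE, and once the
   i387 is disabled MASK_FLOAT_RETURNS is cleared so floating-point values
   are no longer returned in x87 registers.  */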
3670 /* Use external vectorized library in vectorizing intrinsics. */
3671 if (global_options_set.x_ix86_veclibabi_type)
3672 switch (ix86_veclibabi_type)
3674 case ix86_veclibabi_type_svml:
3675 ix86_veclib_handler = ix86_veclibabi_svml;
3676 break;
3678 case ix86_veclibabi_type_acml:
3679 ix86_veclib_handler = ix86_veclibabi_acml;
3680 break;
3682 default:
3683 gcc_unreachable ();
3686 if ((!USE_IX86_FRAME_POINTER
3687 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3688 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3689 && !optimize_size)
3690 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3692 /* ??? Unwind info is not correct around the CFG unless either a frame
3693 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3694 unwind info generation to be aware of the CFG and propagating states
3695 around edges. */
3696 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3697 || flag_exceptions || flag_non_call_exceptions)
3698 && flag_omit_frame_pointer
3699 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3701 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3702 warning (0, "unwind tables currently require either a frame pointer "
3703 "or %saccumulate-outgoing-args%s for correctness",
3704 prefix, suffix);
3705 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3708 /* If stack probes are required, the space used for large function
3709 arguments on the stack must also be probed, so enable
3710 -maccumulate-outgoing-args so this happens in the prologue. */
3711 if (TARGET_STACK_PROBE
3712 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3714 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3715 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3716 "for correctness", prefix, suffix);
3717 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3720 /* For sane SSE instruction set generation we need the fcomi instruction.
3721 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3722 expands to a sequence that includes a conditional move. */
3723 if (TARGET_SSE || TARGET_RDRND)
3724 TARGET_CMOVE = 1;
3726 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3728 char *p;
3729 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3730 p = strchr (internal_label_prefix, 'X');
3731 internal_label_prefix_len = p - internal_label_prefix;
3732 *p = '\0';
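/* This probes ASM_GENERATE_INTERNAL_LABEL with the dummy name "LX", finds
   the 'X' in the result and truncates there, so internal_label_prefix ends
   up holding just the target-defined prefix placed before internal label
   names (and internal_label_prefix_len its length); the exact prefix string
   depends on the target's label syntax.  */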
3735 /* When a scheduling description is not available, disable the scheduler pass
3736 so it won't slow down the compilation and make x87 code slower. */
3737 if (!TARGET_SCHEDULE)
3738 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3740 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3741 ix86_cost->simultaneous_prefetches,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3745 global_options.x_param_values,
3746 global_options_set.x_param_values);
3747 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3748 global_options.x_param_values,
3749 global_options_set.x_param_values);
3750 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3751 global_options.x_param_values,
3752 global_options_set.x_param_values);
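/* The maybe_set_param_value calls above only seed the corresponding
   --param values (simultaneous-prefetches, l1-cache-line-size,
   l1-cache-size, l2-cache-size) from the selected cost table; a value the
   user gave explicitly with --param is recorded in global_options_set and
   is left untouched.  */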
3754 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3755 if (flag_prefetch_loop_arrays < 0
3756 && HAVE_prefetch
3757 && optimize >= 3
3758 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3759 flag_prefetch_loop_arrays = 1;
3761 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3762 can be optimized to ap = __builtin_next_arg (0). */
3763 if (!TARGET_64BIT && !flag_split_stack)
3764 targetm.expand_builtin_va_start = NULL;
3766 if (TARGET_64BIT)
3768 ix86_gen_leave = gen_leave_rex64;
3769 if (Pmode == DImode)
3771 ix86_gen_monitor = gen_sse3_monitor64_di;
3772 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3773 ix86_gen_tls_local_dynamic_base_64
3774 = gen_tls_local_dynamic_base_64_di;
3776 else
3778 ix86_gen_monitor = gen_sse3_monitor64_si;
3779 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3780 ix86_gen_tls_local_dynamic_base_64
3781 = gen_tls_local_dynamic_base_64_si;
3784 else
3786 ix86_gen_leave = gen_leave;
3787 ix86_gen_monitor = gen_sse3_monitor;
3790 if (Pmode == DImode)
3792 ix86_gen_add3 = gen_adddi3;
3793 ix86_gen_sub3 = gen_subdi3;
3794 ix86_gen_sub3_carry = gen_subdi3_carry;
3795 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3796 ix86_gen_andsp = gen_anddi3;
3797 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3798 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3799 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3801 else
3803 ix86_gen_add3 = gen_addsi3;
3804 ix86_gen_sub3 = gen_subsi3;
3805 ix86_gen_sub3_carry = gen_subsi3_carry;
3806 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3807 ix86_gen_andsp = gen_andsi3;
3808 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3809 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3810 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3813 #ifdef USE_IX86_CLD
3814 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3815 if (!TARGET_64BIT)
3816 target_flags |= MASK_CLD & ~target_flags_explicit;
3817 #endif
3819 if (!TARGET_64BIT && flag_pic)
3821 if (flag_fentry > 0)
3822 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3823 "with -fpic");
3824 flag_fentry = 0;
3826 else if (TARGET_SEH)
3828 if (flag_fentry == 0)
3829 sorry ("-mno-fentry isn%'t compatible with SEH");
3830 flag_fentry = 1;
3832 else if (flag_fentry < 0)
3834 #if defined(PROFILE_BEFORE_PROLOGUE)
3835 flag_fentry = 1;
3836 #else
3837 flag_fentry = 0;
3838 #endif
3841 if (TARGET_AVX)
3843 /* When not optimizing for size, enable the vzeroupper optimization for
3844 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3845 AVX unaligned loads/stores. */
3846 if (!optimize_size)
3848 if (flag_expensive_optimizations
3849 && !(target_flags_explicit & MASK_VZEROUPPER))
3850 target_flags |= MASK_VZEROUPPER;
3851 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3852 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3853 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3854 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3855 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3856 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3857 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3858 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3859 target_flags |= MASK_PREFER_AVX128;
3862 else
3864 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3865 target_flags &= ~MASK_VZEROUPPER;
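/* MASK_VZEROUPPER controls the vzeroupper insertion machinery near the top
   of this file (move_or_delete_vzeroupper and its helpers), whose point is
   to clear the upper halves of the YMM registers and so avoid the penalty
   of mixing 256-bit AVX code with legacy SSE code; it is therefore forced
   off whenever TARGET_AVX itself is disabled.  */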
3868 if (ix86_recip_name)
3870 char *p = ASTRDUP (ix86_recip_name);
3871 char *q;
3872 unsigned int mask, i;
3873 bool invert;
3875 while ((q = strtok (p, ",")) != NULL)
3877 p = NULL;
3878 if (*q == '!')
3880 invert = true;
3881 q++;
3883 else
3884 invert = false;
3886 if (!strcmp (q, "default"))
3887 mask = RECIP_MASK_ALL;
3888 else
3890 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3891 if (!strcmp (q, recip_options[i].string))
3893 mask = recip_options[i].mask;
3894 break;
3897 if (i == ARRAY_SIZE (recip_options))
3899 error ("unknown option for -mrecip=%s", q);
3900 invert = false;
3901 mask = RECIP_MASK_NONE;
3905 recip_mask_explicit |= mask;
3906 if (invert)
3907 recip_mask &= ~mask;
3908 else
3909 recip_mask |= mask;
3913 if (TARGET_RECIP)
3914 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3915 else if (target_flags_explicit & MASK_RECIP)
3916 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
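/* Usage example for the parsing above: the -mrecip= argument is a
   comma-separated list and a leading '!' inverts an entry, so
   -mrecip=all,!sqrt sets every RECIP_MASK_* bit and then clears
   RECIP_MASK_SQRT, while plain -mrecip (TARGET_RECIP) enables every bit
   not explicitly configured through -mrecip=.  */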
3918 /* Save the initial options in case the user does function specific
3919 options. */
3920 if (main_args_p)
3921 target_option_default_node = target_option_current_node
3922 = build_target_option_node ();
3925 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3927 static bool
3928 function_pass_avx256_p (const_rtx val)
3930 if (!val)
3931 return false;
3933 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3934 return true;
3936 if (GET_CODE (val) == PARALLEL)
3938 int i;
3939 rtx r;
3941 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3943 r = XVECEXP (val, 0, i);
3944 if (GET_CODE (r) == EXPR_LIST
3945 && XEXP (r, 0)
3946 && REG_P (XEXP (r, 0))
3947 && (GET_MODE (XEXP (r, 0)) == OImode
3948 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3949 return true;
3953 return false;
3956 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3958 static void
3959 ix86_option_override (void)
3961 ix86_option_override_internal (true);
3964 /* Update register usage after having seen the compiler flags. */
3966 static void
3967 ix86_conditional_register_usage (void)
3969 int i;
3970 unsigned int j;
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3974 if (fixed_regs[i] > 1)
3975 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3976 if (call_used_regs[i] > 1)
3977 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3980 /* The PIC register, if it exists, is fixed. */
3981 j = PIC_OFFSET_TABLE_REGNUM;
3982 if (j != INVALID_REGNUM)
3983 fixed_regs[j] = call_used_regs[j] = 1;
3985 /* The 64-bit MS_ABI changes the set of call-used registers. */
3986 if (TARGET_64BIT_MS_ABI)
3988 call_used_regs[SI_REG] = 0;
3989 call_used_regs[DI_REG] = 0;
3990 call_used_regs[XMM6_REG] = 0;
3991 call_used_regs[XMM7_REG] = 0;
3992 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3993 call_used_regs[i] = 0;
3996 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3997 other call-clobbered regs for 64-bit. */
3998 if (TARGET_64BIT)
4000 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4002 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4003 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4004 && call_used_regs[i])
4005 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4008 /* If MMX is disabled, squash the registers. */
4009 if (! TARGET_MMX)
4010 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4011 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4012 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4014 /* If SSE is disabled, squash the registers. */
4015 if (! TARGET_SSE)
4016 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4017 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4018 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4020 /* If the FPU is disabled, squash the registers. */
4021 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4022 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4023 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4024 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4026 /* If 32-bit, squash the 64-bit registers. */
4027 if (! TARGET_64BIT)
4029 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4030 reg_names[i] = "";
4031 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4032 reg_names[i] = "";
4037 /* Save the current options */
4039 static void
4040 ix86_function_specific_save (struct cl_target_option *ptr)
4042 ptr->arch = ix86_arch;
4043 ptr->schedule = ix86_schedule;
4044 ptr->tune = ix86_tune;
4045 ptr->branch_cost = ix86_branch_cost;
4046 ptr->tune_defaulted = ix86_tune_defaulted;
4047 ptr->arch_specified = ix86_arch_specified;
4048 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4049 ptr->ix86_target_flags_explicit = target_flags_explicit;
4050 ptr->x_recip_mask_explicit = recip_mask_explicit;
4052 /* The fields are char but the variables are not; make sure the
4053 values fit in the fields. */
4054 gcc_assert (ptr->arch == ix86_arch);
4055 gcc_assert (ptr->schedule == ix86_schedule);
4056 gcc_assert (ptr->tune == ix86_tune);
4057 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4060 /* Restore the current options */
4062 static void
4063 ix86_function_specific_restore (struct cl_target_option *ptr)
4065 enum processor_type old_tune = ix86_tune;
4066 enum processor_type old_arch = ix86_arch;
4067 unsigned int ix86_arch_mask, ix86_tune_mask;
4068 int i;
4070 ix86_arch = (enum processor_type) ptr->arch;
4071 ix86_schedule = (enum attr_cpu) ptr->schedule;
4072 ix86_tune = (enum processor_type) ptr->tune;
4073 ix86_branch_cost = ptr->branch_cost;
4074 ix86_tune_defaulted = ptr->tune_defaulted;
4075 ix86_arch_specified = ptr->arch_specified;
4076 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4077 target_flags_explicit = ptr->ix86_target_flags_explicit;
4078 recip_mask_explicit = ptr->x_recip_mask_explicit;
4080 /* Recreate the arch feature tests if the arch changed */
4081 if (old_arch != ix86_arch)
4083 ix86_arch_mask = 1u << ix86_arch;
4084 for (i = 0; i < X86_ARCH_LAST; ++i)
4085 ix86_arch_features[i]
4086 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4089 /* Recreate the tune optimization tests */
4090 if (old_tune != ix86_tune)
4092 ix86_tune_mask = 1u << ix86_tune;
4093 for (i = 0; i < X86_TUNE_LAST; ++i)
4094 ix86_tune_features[i]
4095 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4099 /* Print the current options */
4101 static void
4102 ix86_function_specific_print (FILE *file, int indent,
4103 struct cl_target_option *ptr)
4105 char *target_string
4106 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4107 NULL, NULL, ptr->x_ix86_fpmath, false);
4109 fprintf (file, "%*sarch = %d (%s)\n",
4110 indent, "",
4111 ptr->arch,
4112 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4113 ? cpu_names[ptr->arch]
4114 : "<unknown>"));
4116 fprintf (file, "%*stune = %d (%s)\n",
4117 indent, "",
4118 ptr->tune,
4119 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4120 ? cpu_names[ptr->tune]
4121 : "<unknown>"));
4123 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4125 if (target_string)
4127 fprintf (file, "%*s%s\n", indent, "", target_string);
4128 free (target_string);
4133 /* Inner function to process the attribute((target(...))): take an argument and
4134 set the current options from the argument. If we have a list, recursively go
4135 over the list. */
4137 static bool
4138 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4139 struct gcc_options *enum_opts_set)
4141 char *next_optstr;
4142 bool ret = true;
4144 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4145 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4146 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4147 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4148 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4150 enum ix86_opt_type
4152 ix86_opt_unknown,
4153 ix86_opt_yes,
4154 ix86_opt_no,
4155 ix86_opt_str,
4156 ix86_opt_enum,
4157 ix86_opt_isa
4160 static const struct
4162 const char *string;
4163 size_t len;
4164 enum ix86_opt_type type;
4165 int opt;
4166 int mask;
4167 } attrs[] = {
4168 /* isa options */
4169 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4170 IX86_ATTR_ISA ("abm", OPT_mabm),
4171 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4172 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4173 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4174 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4175 IX86_ATTR_ISA ("aes", OPT_maes),
4176 IX86_ATTR_ISA ("avx", OPT_mavx),
4177 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4178 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4179 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4180 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4181 IX86_ATTR_ISA ("sse", OPT_msse),
4182 IX86_ATTR_ISA ("sse2", OPT_msse2),
4183 IX86_ATTR_ISA ("sse3", OPT_msse3),
4184 IX86_ATTR_ISA ("sse4", OPT_msse4),
4185 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4186 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4187 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4188 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4189 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4190 IX86_ATTR_ISA ("fma", OPT_mfma),
4191 IX86_ATTR_ISA ("xop", OPT_mxop),
4192 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4193 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4194 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4195 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4196 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4198 /* enum options */
4199 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4201 /* string options */
4202 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4203 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4205 /* flag options */
4206 IX86_ATTR_YES ("cld",
4207 OPT_mcld,
4208 MASK_CLD),
4210 IX86_ATTR_NO ("fancy-math-387",
4211 OPT_mfancy_math_387,
4212 MASK_NO_FANCY_MATH_387),
4214 IX86_ATTR_YES ("ieee-fp",
4215 OPT_mieee_fp,
4216 MASK_IEEE_FP),
4218 IX86_ATTR_YES ("inline-all-stringops",
4219 OPT_minline_all_stringops,
4220 MASK_INLINE_ALL_STRINGOPS),
4222 IX86_ATTR_YES ("inline-stringops-dynamically",
4223 OPT_minline_stringops_dynamically,
4224 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4226 IX86_ATTR_NO ("align-stringops",
4227 OPT_mno_align_stringops,
4228 MASK_NO_ALIGN_STRINGOPS),
4230 IX86_ATTR_YES ("recip",
4231 OPT_mrecip,
4232 MASK_RECIP),
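/* Usage example for the strings accepted above, parsed by the loop that
   follows ("core2" is just an example architecture name):
       __attribute__((target("sse4.2,no-avx,arch=core2")))
   "sse4.2" maps to OPT_msse4_2, the "no-" prefix turns an ISA option off,
   "fpmath=" is handled as an enum, and "arch="/"tune=" carry a string
   argument that is saved for ix86_option_override_internal.  */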
4236 /* If this is a list, recurse to get the options. */
4237 if (TREE_CODE (args) == TREE_LIST)
4239 bool ret = true;
4241 for (; args; args = TREE_CHAIN (args))
4242 if (TREE_VALUE (args)
4243 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4244 p_strings, enum_opts_set))
4245 ret = false;
4247 return ret;
4250 else if (TREE_CODE (args) != STRING_CST)
4251 gcc_unreachable ();
4253 /* Handle multiple arguments separated by commas. */
4254 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4256 while (next_optstr && *next_optstr != '\0')
4258 char *p = next_optstr;
4259 char *orig_p = p;
4260 char *comma = strchr (next_optstr, ',');
4261 const char *opt_string;
4262 size_t len, opt_len;
4263 int opt;
4264 bool opt_set_p;
4265 char ch;
4266 unsigned i;
4267 enum ix86_opt_type type = ix86_opt_unknown;
4268 int mask = 0;
4270 if (comma)
4272 *comma = '\0';
4273 len = comma - next_optstr;
4274 next_optstr = comma + 1;
4276 else
4278 len = strlen (p);
4279 next_optstr = NULL;
4282 /* Recognize no-xxx. */
4283 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4285 opt_set_p = false;
4286 p += 3;
4287 len -= 3;
4289 else
4290 opt_set_p = true;
4292 /* Find the option. */
4293 ch = *p;
4294 opt = N_OPTS;
4295 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4297 type = attrs[i].type;
4298 opt_len = attrs[i].len;
4299 if (ch == attrs[i].string[0]
4300 && ((type != ix86_opt_str && type != ix86_opt_enum)
4301 ? len == opt_len
4302 : len > opt_len)
4303 && memcmp (p, attrs[i].string, opt_len) == 0)
4305 opt = attrs[i].opt;
4306 mask = attrs[i].mask;
4307 opt_string = attrs[i].string;
4308 break;
4312 /* Process the option. */
4313 if (opt == N_OPTS)
4315 error ("attribute(target(\"%s\")) is unknown", orig_p);
4316 ret = false;
4319 else if (type == ix86_opt_isa)
4321 struct cl_decoded_option decoded;
4323 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4324 ix86_handle_option (&global_options, &global_options_set,
4325 &decoded, input_location);
4328 else if (type == ix86_opt_yes || type == ix86_opt_no)
4330 if (type == ix86_opt_no)
4331 opt_set_p = !opt_set_p;
4333 if (opt_set_p)
4334 target_flags |= mask;
4335 else
4336 target_flags &= ~mask;
4339 else if (type == ix86_opt_str)
4341 if (p_strings[opt])
4343 error ("option(\"%s\") was already specified", opt_string);
4344 ret = false;
4346 else
4347 p_strings[opt] = xstrdup (p + opt_len);
4350 else if (type == ix86_opt_enum)
4352 bool arg_ok;
4353 int value;
4355 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4356 if (arg_ok)
4357 set_option (&global_options, enum_opts_set, opt, value,
4358 p + opt_len, DK_UNSPECIFIED, input_location,
4359 global_dc);
4360 else
4362 error ("attribute(target(\"%s\")) is unknown", orig_p);
4363 ret = false;
4367 else
4368 gcc_unreachable ();
4371 return ret;
4374 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4376 tree
4377 ix86_valid_target_attribute_tree (tree args)
4379 const char *orig_arch_string = ix86_arch_string;
4380 const char *orig_tune_string = ix86_tune_string;
4381 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4382 int orig_tune_defaulted = ix86_tune_defaulted;
4383 int orig_arch_specified = ix86_arch_specified;
4384 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4385 tree t = NULL_TREE;
4386 int i;
4387 struct cl_target_option *def
4388 = TREE_TARGET_OPTION (target_option_default_node);
4389 struct gcc_options enum_opts_set;
4391 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4393 /* Process each of the options on the chain. */
4394 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4395 &enum_opts_set))
4396 return NULL_TREE;
4398 /* If the changed options are different from the default, rerun
4399 ix86_option_override_internal, and then save the options away.
4400 The string options are attribute options, and will be undone
4401 when we copy the save structure. */
4402 if (ix86_isa_flags != def->x_ix86_isa_flags
4403 || target_flags != def->x_target_flags
4404 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4405 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4406 || enum_opts_set.x_ix86_fpmath)
4408 /* If we are using the default tune= or arch=, undo the string assigned,
4409 and use the default. */
4410 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4411 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4412 else if (!orig_arch_specified)
4413 ix86_arch_string = NULL;
4415 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4416 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4417 else if (orig_tune_defaulted)
4418 ix86_tune_string = NULL;
4420 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4421 if (enum_opts_set.x_ix86_fpmath)
4422 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4423 else if (!TARGET_64BIT && TARGET_SSE)
4425 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4426 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4429 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4430 ix86_option_override_internal (false);
4432 /* Add any builtin functions with the new isa if any. */
4433 ix86_add_new_builtins (ix86_isa_flags);
4435 /* Save the current options unless we are validating options for
4436 #pragma. */
4437 t = build_target_option_node ();
4439 ix86_arch_string = orig_arch_string;
4440 ix86_tune_string = orig_tune_string;
4441 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4443 /* Free up memory allocated to hold the strings */
4444 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4445 free (option_strings[i]);
4448 return t;
4451 /* Hook to validate attribute((target("string"))). */
4453 static bool
4454 ix86_valid_target_attribute_p (tree fndecl,
4455 tree ARG_UNUSED (name),
4456 tree args,
4457 int ARG_UNUSED (flags))
4459 struct cl_target_option cur_target;
4460 bool ret = true;
4461 tree old_optimize = build_optimization_node ();
4462 tree new_target, new_optimize;
4463 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4465 /* If the function changed the optimization levels as well as setting target
4466 options, start with the optimizations specified. */
4467 if (func_optimize && func_optimize != old_optimize)
4468 cl_optimization_restore (&global_options,
4469 TREE_OPTIMIZATION (func_optimize));
4471 /* The target attributes may also change some optimization flags, so update
4472 the optimization options if necessary. */
4473 cl_target_option_save (&cur_target, &global_options);
4474 new_target = ix86_valid_target_attribute_tree (args);
4475 new_optimize = build_optimization_node ();
4477 if (!new_target)
4478 ret = false;
4480 else if (fndecl)
4482 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4484 if (old_optimize != new_optimize)
4485 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4488 cl_target_option_restore (&global_options, &cur_target);
4490 if (old_optimize != new_optimize)
4491 cl_optimization_restore (&global_options,
4492 TREE_OPTIMIZATION (old_optimize));
4494 return ret;
4498 /* Hook to determine if one function can safely inline another. */
4500 static bool
4501 ix86_can_inline_p (tree caller, tree callee)
4503 bool ret = false;
4504 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4505 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4507 /* If callee has no option attributes, then it is ok to inline. */
4508 if (!callee_tree)
4509 ret = true;
4511 /* If caller has no option attributes, but callee does then it is not ok to
4512 inline. */
4513 else if (!caller_tree)
4514 ret = false;
4516 else
4518 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4519 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4521 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4522 function can inline an SSE2 function but an SSE2 function can't inline
4523 an SSE4 function. */
4524 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4525 != callee_opts->x_ix86_isa_flags)
4526 ret = false;
4528 /* See if we have the same non-isa options. */
4529 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4530 ret = false;
4532 /* See if arch, tune, etc. are the same. */
4533 else if (caller_opts->arch != callee_opts->arch)
4534 ret = false;
4536 else if (caller_opts->tune != callee_opts->tune)
4537 ret = false;
4539 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4540 ret = false;
4542 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4543 ret = false;
4545 else
4546 ret = true;
4549 return ret;
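/* Illustrative consequence of the checks above: a caller compiled with
   plain -msse2 cannot inline a callee declared
   __attribute__((target("sse4.2"))), because the callee's ISA flags are
   not a subset of the caller's; inlining in the other direction is allowed
   provided the remaining target flags, arch, tune, fpmath and branch cost
   also agree.  */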
4553 /* Remember the last target of ix86_set_current_function. */
4554 static GTY(()) tree ix86_previous_fndecl;
4556 /* Establish appropriate back-end context for processing the function
4557 FNDECL. The argument might be NULL to indicate processing at top
4558 level, outside of any function scope. */
4559 static void
4560 ix86_set_current_function (tree fndecl)
4562 /* Only change the context if the function changes. This hook is called
4563 several times in the course of compiling a function, and we don't want to
4564 slow things down too much or call target_reinit when it isn't safe. */
4565 if (fndecl && fndecl != ix86_previous_fndecl)
4567 tree old_tree = (ix86_previous_fndecl
4568 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4569 : NULL_TREE);
4571 tree new_tree = (fndecl
4572 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4573 : NULL_TREE);
4575 ix86_previous_fndecl = fndecl;
4576 if (old_tree == new_tree)
4579 else if (new_tree)
4581 cl_target_option_restore (&global_options,
4582 TREE_TARGET_OPTION (new_tree));
4583 target_reinit ();
4586 else if (old_tree)
4588 struct cl_target_option *def
4589 = TREE_TARGET_OPTION (target_option_current_node);
4591 cl_target_option_restore (&global_options, def);
4592 target_reinit ();
4598 /* Return true if this goes in large data/bss. */
4600 static bool
4601 ix86_in_large_data_p (tree exp)
4603 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4604 return false;
4606 /* Functions are never large data. */
4607 if (TREE_CODE (exp) == FUNCTION_DECL)
4608 return false;
4610 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4612 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4613 if (strcmp (section, ".ldata") == 0
4614 || strcmp (section, ".lbss") == 0)
4615 return true;
4616 return false;
4618 else
4620 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4622 /* If this is an incomplete type with size 0, then we can't put it
4623 in data because it might be too big when completed. */
4624 if (!size || size > ix86_section_threshold)
4625 return true;
4628 return false;
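/* In other words: under -mcmodel=medium (or medium PIC), an object counts
   as large data if it is placed explicitly in ".ldata"/".lbss", or if its
   size is unknown or exceeds ix86_section_threshold (the
   -mlarge-data-threshold value); the section hooks below then route it to
   the .l* sections.  */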
4631 /* Switch to the appropriate section for output of DECL.
4632 DECL is either a `VAR_DECL' node or a constant of some sort.
4633 RELOC indicates whether forming the initial value of DECL requires
4634 link-time relocations. */
4636 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4637 ATTRIBUTE_UNUSED;
4639 static section *
4640 x86_64_elf_select_section (tree decl, int reloc,
4641 unsigned HOST_WIDE_INT align)
4643 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4644 && ix86_in_large_data_p (decl))
4646 const char *sname = NULL;
4647 unsigned int flags = SECTION_WRITE;
4648 switch (categorize_decl_for_section (decl, reloc))
4650 case SECCAT_DATA:
4651 sname = ".ldata";
4652 break;
4653 case SECCAT_DATA_REL:
4654 sname = ".ldata.rel";
4655 break;
4656 case SECCAT_DATA_REL_LOCAL:
4657 sname = ".ldata.rel.local";
4658 break;
4659 case SECCAT_DATA_REL_RO:
4660 sname = ".ldata.rel.ro";
4661 break;
4662 case SECCAT_DATA_REL_RO_LOCAL:
4663 sname = ".ldata.rel.ro.local";
4664 break;
4665 case SECCAT_BSS:
4666 sname = ".lbss";
4667 flags |= SECTION_BSS;
4668 break;
4669 case SECCAT_RODATA:
4670 case SECCAT_RODATA_MERGE_STR:
4671 case SECCAT_RODATA_MERGE_STR_INIT:
4672 case SECCAT_RODATA_MERGE_CONST:
4673 sname = ".lrodata";
4674 flags = 0;
4675 break;
4676 case SECCAT_SRODATA:
4677 case SECCAT_SDATA:
4678 case SECCAT_SBSS:
4679 gcc_unreachable ();
4680 case SECCAT_TEXT:
4681 case SECCAT_TDATA:
4682 case SECCAT_TBSS:
4683 /* We don't split these for the medium model. Place them into
4684 default sections and hope for the best. */
4685 break;
4687 if (sname)
4689 /* We might get called with string constants, but get_named_section
4690 doesn't like them as they are not DECLs. Also, we need to set
4691 flags in that case. */
4692 if (!DECL_P (decl))
4693 return get_section (sname, flags, NULL);
4694 return get_named_section (decl, sname, reloc);
4697 return default_elf_select_section (decl, reloc, align);
4700 /* Build up a unique section name, expressed as a
4701 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4702 RELOC indicates whether the initial value of EXP requires
4703 link-time relocations. */
4705 static void ATTRIBUTE_UNUSED
4706 x86_64_elf_unique_section (tree decl, int reloc)
4708 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4709 && ix86_in_large_data_p (decl))
4711 const char *prefix = NULL;
4712 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4713 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4715 switch (categorize_decl_for_section (decl, reloc))
4717 case SECCAT_DATA:
4718 case SECCAT_DATA_REL:
4719 case SECCAT_DATA_REL_LOCAL:
4720 case SECCAT_DATA_REL_RO:
4721 case SECCAT_DATA_REL_RO_LOCAL:
4722 prefix = one_only ? ".ld" : ".ldata";
4723 break;
4724 case SECCAT_BSS:
4725 prefix = one_only ? ".lb" : ".lbss";
4726 break;
4727 case SECCAT_RODATA:
4728 case SECCAT_RODATA_MERGE_STR:
4729 case SECCAT_RODATA_MERGE_STR_INIT:
4730 case SECCAT_RODATA_MERGE_CONST:
4731 prefix = one_only ? ".lr" : ".lrodata";
4732 break;
4733 case SECCAT_SRODATA:
4734 case SECCAT_SDATA:
4735 case SECCAT_SBSS:
4736 gcc_unreachable ();
4737 case SECCAT_TEXT:
4738 case SECCAT_TDATA:
4739 case SECCAT_TBSS:
4740 /* We don't split these for the medium model. Place them into
4741 default sections and hope for the best. */
4742 break;
4744 if (prefix)
4746 const char *name, *linkonce;
4747 char *string;
4749 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4750 name = targetm.strip_name_encoding (name);
4752 /* If we're using one_only, then there needs to be a .gnu.linkonce
4753 prefix to the section name. */
4754 linkonce = one_only ? ".gnu.linkonce" : "";
4756 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4758 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4759 return;
4762 default_unique_section (decl, reloc);
4765 #ifdef COMMON_ASM_OP
4766 /* This says how to output assembler code to declare an
4767 uninitialized external linkage data object.
4769 For medium model x86-64 we need to use the .largecomm directive for
4770 large objects. */
4771 void
4772 x86_elf_aligned_common (FILE *file,
4773 const char *name, unsigned HOST_WIDE_INT size,
4774 int align)
4776 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4777 && size > (unsigned int)ix86_section_threshold)
4778 fputs (".largecomm\t", file);
4779 else
4780 fputs (COMMON_ASM_OP, file);
4781 assemble_name (file, name);
4782 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4783 size, align / BITS_PER_UNIT);
4785 #endif
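/* Illustrative output of x86_elf_aligned_common above: for a 16-byte
   aligned object of 5242880 bytes under -mcmodel=medium it emits roughly
       .largecomm  big_array,5242880,16
   ("big_array" is only an example name), while smaller objects use the
   ordinary COMMON_ASM_OP form.  */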
4787 /* Utility function for targets to use in implementing
4788 ASM_OUTPUT_ALIGNED_BSS. */
4790 void
4791 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4792 const char *name, unsigned HOST_WIDE_INT size,
4793 int align)
4795 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4796 && size > (unsigned int)ix86_section_threshold)
4797 switch_to_section (get_named_section (decl, ".lbss", 0));
4798 else
4799 switch_to_section (bss_section);
4800 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4801 #ifdef ASM_DECLARE_OBJECT_NAME
4802 last_assemble_variable_decl = decl;
4803 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4804 #else
4805 /* The standard thing is just to output a label for the object. */
4806 ASM_OUTPUT_LABEL (file, name);
4807 #endif /* ASM_DECLARE_OBJECT_NAME */
4808 ASM_OUTPUT_SKIP (file, size ? size : 1);
4811 /* Decide whether we must probe the stack before any space allocation
4812 on this target. It's essentially TARGET_STACK_PROBE except when
4813 -fstack-check causes the stack to be already probed differently. */
4815 bool
4816 ix86_target_stack_probe (void)
4818 /* Do not probe the stack twice if static stack checking is enabled. */
4819 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4820 return false;
4822 return TARGET_STACK_PROBE;
4825 /* Decide whether we can make a sibling call to a function. DECL is the
4826 declaration of the function being targeted by the call and EXP is the
4827 CALL_EXPR representing the call. */
4829 static bool
4830 ix86_function_ok_for_sibcall (tree decl, tree exp)
4832 tree type, decl_or_type;
4833 rtx a, b;
4835 /* If we are generating position-independent code, we cannot sibcall
4836 optimize any indirect call, or a direct call to a global function,
4837 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4838 if (!TARGET_MACHO
4839 && !TARGET_64BIT
4840 && flag_pic
4841 && (!decl || !targetm.binds_local_p (decl)))
4842 return false;
4844 /* If we need to align the outgoing stack, then sibcalling would
4845 unalign the stack, which may break the called function. */
4846 if (ix86_minimum_incoming_stack_boundary (true)
4847 < PREFERRED_STACK_BOUNDARY)
4848 return false;
4850 if (decl)
4852 decl_or_type = decl;
4853 type = TREE_TYPE (decl);
4855 else
4857 /* We're looking at the CALL_EXPR, we need the type of the function. */
4858 type = CALL_EXPR_FN (exp); /* pointer expression */
4859 type = TREE_TYPE (type); /* pointer type */
4860 type = TREE_TYPE (type); /* function type */
4861 decl_or_type = type;
4864 /* Check that the return value locations are the same. For example,
4865 if we are returning floats on the 80387 register stack, we cannot
4866 make a sibcall from a function that doesn't return a float to a
4867 function that does or, conversely, from a function that does return
4868 a float to a function that doesn't; the necessary stack adjustment
4869 would not be executed. This is also the place we notice
4870 differences in the return value ABI. Note that it is ok for one
4871 of the functions to have void return type as long as the return
4872 value of the other is passed in a register. */
4873 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4874 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4875 cfun->decl, false);
4876 if (STACK_REG_P (a) || STACK_REG_P (b))
4878 if (!rtx_equal_p (a, b))
4879 return false;
4881 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4883 /* Disable sibcall if we need to generate vzeroupper after
4884 callee returns. */
4885 if (TARGET_VZEROUPPER
4886 && cfun->machine->callee_return_avx256_p
4887 && !cfun->machine->caller_return_avx256_p)
4888 return false;
4890 else if (!rtx_equal_p (a, b))
4891 return false;
4893 if (TARGET_64BIT)
4895 /* The SYSV ABI has more call-clobbered registers;
4896 disallow sibcalls from MS to SYSV. */
4897 if (cfun->machine->call_abi == MS_ABI
4898 && ix86_function_type_abi (type) == SYSV_ABI)
4899 return false;
4901 else
4903 /* If this call is indirect, we'll need to be able to use a
4904 call-clobbered register for the address of the target function.
4905 Make sure that all such registers are not used for passing
4906 parameters. Note that DLLIMPORT functions are indirect. */
4907 if (!decl
4908 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4910 if (ix86_function_regparm (type, NULL) >= 3)
4912 /* ??? Need to count the actual number of registers to be used,
4913 not the possible number of registers. Fix later. */
4914 return false;
4919 /* Otherwise okay. That also includes certain types of indirect calls. */
4920 return true;
4923 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4924 and "sseregparm" calling convention attributes;
4925 arguments as in struct attribute_spec.handler. */
4927 static tree
4928 ix86_handle_cconv_attribute (tree *node, tree name,
4929 tree args,
4930 int flags ATTRIBUTE_UNUSED,
4931 bool *no_add_attrs)
4933 if (TREE_CODE (*node) != FUNCTION_TYPE
4934 && TREE_CODE (*node) != METHOD_TYPE
4935 && TREE_CODE (*node) != FIELD_DECL
4936 && TREE_CODE (*node) != TYPE_DECL)
4938 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4939 name);
4940 *no_add_attrs = true;
4941 return NULL_TREE;
4944 /* Can combine regparm with all attributes but fastcall and thiscall. */
4945 if (is_attribute_p ("regparm", name))
4947 tree cst;
4949 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4951 error ("fastcall and regparm attributes are not compatible");
4954 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4956 error ("regparam and thiscall attributes are not compatible");
4959 cst = TREE_VALUE (args);
4960 if (TREE_CODE (cst) != INTEGER_CST)
4962 warning (OPT_Wattributes,
4963 "%qE attribute requires an integer constant argument",
4964 name);
4965 *no_add_attrs = true;
4967 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4969 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4970 name, REGPARM_MAX);
4971 *no_add_attrs = true;
4974 return NULL_TREE;
4977 if (TARGET_64BIT)
4979 /* Do not warn when emulating the MS ABI. */
4980 if ((TREE_CODE (*node) != FUNCTION_TYPE
4981 && TREE_CODE (*node) != METHOD_TYPE)
4982 || ix86_function_type_abi (*node) != MS_ABI)
4983 warning (OPT_Wattributes, "%qE attribute ignored",
4984 name);
4985 *no_add_attrs = true;
4986 return NULL_TREE;
4989 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4990 if (is_attribute_p ("fastcall", name))
4992 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4994 error ("fastcall and cdecl attributes are not compatible");
4996 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4998 error ("fastcall and stdcall attributes are not compatible");
5000 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5002 error ("fastcall and regparm attributes are not compatible");
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 error ("fastcall and thiscall attributes are not compatible");
5010 /* Can combine stdcall with regparm and
5011 sseregparm. */
5012 else if (is_attribute_p ("stdcall", name))
5014 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5016 error ("stdcall and cdecl attributes are not compatible");
5018 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5020 error ("stdcall and fastcall attributes are not compatible");
5022 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5024 error ("stdcall and thiscall attributes are not compatible");
5028 /* Can combine cdecl with regparm and sseregparm. */
5029 else if (is_attribute_p ("cdecl", name))
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 error ("stdcall and cdecl attributes are not compatible");
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 error ("fastcall and cdecl attributes are not compatible");
5039 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5041 error ("cdecl and thiscall attributes are not compatible");
5044 else if (is_attribute_p ("thiscall", name))
5046 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5047 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5048 name);
5049 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5051 error ("stdcall and thiscall attributes are not compatible");
5053 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5055 error ("fastcall and thiscall attributes are not compatible");
5057 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5059 error ("cdecl and thiscall attributes are not compatible");
5063 /* Can combine sseregparm with all attributes. */
5065 return NULL_TREE;
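/* Editor's note: an illustrative, user-level sketch -- not part of the
   original i386.c -- of combinations the handler above accepts and
   rejects for 32-bit x86.  The declarations are hypothetical.  */
extern int __attribute__ ((stdcall, regparm (2))) ok1 (int, int);
extern int __attribute__ ((sseregparm, regparm (3))) ok2 (double);
/* Rejected with the errors above, e.g.
   "fastcall and regparm attributes are not compatible":
     extern int __attribute__ ((fastcall, regparm (2))) bad (int, int);  */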
5068 /* The transactional memory builtins are implicitly regparm or fastcall
5069 depending on the ABI. Override the generic do-nothing attribute that
5070 these builtins were declared with, and replace it with one of the two
5071 attributes that we expect elsewhere. */
5073 static tree
5074 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5075 tree args ATTRIBUTE_UNUSED,
5076 int flags ATTRIBUTE_UNUSED,
5077 bool *no_add_attrs)
5079 tree alt;
5081 /* In no case do we want to add the placeholder attribute. */
5082 *no_add_attrs = true;
5084 /* The 64-bit ABI is unchanged for transactional memory. */
5085 if (TARGET_64BIT)
5086 return NULL_TREE;
5088 /* ??? Is there a better way to validate 32-bit windows? We have
5089 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5090 if (CHECK_STACK_LIMIT > 0)
5091 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5092 else
5094 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5095 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5097 decl_attributes (node, alt, flags);
5099 return NULL_TREE;
5102 /* This function determines from TYPE the calling-convention. */
5104 unsigned int
5105 ix86_get_callcvt (const_tree type)
5107 unsigned int ret = 0;
5108 bool is_stdarg;
5109 tree attrs;
5111 if (TARGET_64BIT)
5112 return IX86_CALLCVT_CDECL;
5114 attrs = TYPE_ATTRIBUTES (type);
5115 if (attrs != NULL_TREE)
5117 if (lookup_attribute ("cdecl", attrs))
5118 ret |= IX86_CALLCVT_CDECL;
5119 else if (lookup_attribute ("stdcall", attrs))
5120 ret |= IX86_CALLCVT_STDCALL;
5121 else if (lookup_attribute ("fastcall", attrs))
5122 ret |= IX86_CALLCVT_FASTCALL;
5123 else if (lookup_attribute ("thiscall", attrs))
5124 ret |= IX86_CALLCVT_THISCALL;
5126 /* Regparm isn't allowed for thiscall and fastcall. */
5127 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5129 if (lookup_attribute ("regparm", attrs))
5130 ret |= IX86_CALLCVT_REGPARM;
5131 if (lookup_attribute ("sseregparm", attrs))
5132 ret |= IX86_CALLCVT_SSEREGPARM;
5135 if (IX86_BASE_CALLCVT(ret) != 0)
5136 return ret;
5139 is_stdarg = stdarg_p (type);
5140 if (TARGET_RTD && !is_stdarg)
5141 return IX86_CALLCVT_STDCALL | ret;
5143 if (ret != 0
5144 || is_stdarg
5145 || TREE_CODE (type) != METHOD_TYPE
5146 || ix86_function_type_abi (type) != MS_ABI)
5147 return IX86_CALLCVT_CDECL | ret;
5149 return IX86_CALLCVT_THISCALL;
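/* Editor's note: a hedged sketch, not from the original source, of the
   values ix86_get_callcvt computes for a few hypothetical 32-bit
   declarations, assuming -mrtd is not in effect.  */
void f (int);                                /* IX86_CALLCVT_CDECL       */
void __attribute__ ((stdcall)) g (int);      /* IX86_CALLCVT_STDCALL     */
void __attribute__ ((fastcall)) h (int);     /* IX86_CALLCVT_FASTCALL    */
void __attribute__ ((regparm (3))) k (int);  /* IX86_CALLCVT_CDECL
                                                | IX86_CALLCVT_REGPARM   */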
5152 /* Return 0 if the attributes for two types are incompatible, 1 if they
5153 are compatible, and 2 if they are nearly compatible (which causes a
5154 warning to be generated). */
5156 static int
5157 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5159 unsigned int ccvt1, ccvt2;
5161 if (TREE_CODE (type1) != FUNCTION_TYPE
5162 && TREE_CODE (type1) != METHOD_TYPE)
5163 return 1;
5165 ccvt1 = ix86_get_callcvt (type1);
5166 ccvt2 = ix86_get_callcvt (type2);
5167 if (ccvt1 != ccvt2)
5168 return 0;
5169 if (ix86_function_regparm (type1, NULL)
5170 != ix86_function_regparm (type2, NULL))
5171 return 0;
5173 return 1;
5176 /* Return the regparm value for a function with the indicated TYPE and DECL.
5177 DECL may be NULL when calling function indirectly
5178 or considering a libcall. */
5180 static int
5181 ix86_function_regparm (const_tree type, const_tree decl)
5183 tree attr;
5184 int regparm;
5185 unsigned int ccvt;
5187 if (TARGET_64BIT)
5188 return (ix86_function_type_abi (type) == SYSV_ABI
5189 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5190 ccvt = ix86_get_callcvt (type);
5191 regparm = ix86_regparm;
5193 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5195 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5196 if (attr)
5198 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5199 return regparm;
5202 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5203 return 2;
5204 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5205 return 1;
5207 /* Use register calling convention for local functions when possible. */
5208 if (decl
5209 && TREE_CODE (decl) == FUNCTION_DECL
5210 && optimize
5211 && !(profile_flag && !flag_fentry))
5213 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5214 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5215 if (i && i->local && i->can_change_signature)
5217 int local_regparm, globals = 0, regno;
5219 /* Make sure no regparm register is taken by a
5220 fixed register variable. */
5221 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5222 if (fixed_regs[local_regparm])
5223 break;
5225 /* We don't want to use regparm(3) for nested functions as
5226 these use a static chain pointer in the third argument. */
5227 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5228 local_regparm = 2;
5230 /* In 32-bit mode save a register for the split stack. */
5231 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5232 local_regparm = 2;
5234 /* Each fixed register usage increases register pressure,
5235 so fewer registers should be used for argument passing.
5236 This functionality can be overridden by an explicit
5237 regparm value. */
5238 for (regno = 0; regno <= DI_REG; regno++)
5239 if (fixed_regs[regno])
5240 globals++;
5242 local_regparm
5243 = globals < local_regparm ? local_regparm - globals : 0;
5245 if (local_regparm > regparm)
5246 regparm = local_regparm;
5250 return regparm;
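/* Editor's note: illustrative declarations, not part of the original
   file, showing the 32-bit register assignment implied by the regparm
   values returned above.  The names are hypothetical.  */
int __attribute__ ((regparm (3))) f3 (int a, int b, int c);
                                  /* a in %eax, b in %edx, c in %ecx  */
int __attribute__ ((fastcall)) fc (int a, int b);
                                  /* a in %ecx, b in %edx             */
int __attribute__ ((thiscall)) tc (void *obj, int v);
                                  /* obj in %ecx, v on the stack      */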
5253 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5254 DFmode (2) arguments in SSE registers for a function with the
5255 indicated TYPE and DECL. DECL may be NULL when calling function
5256 indirectly or considering a libcall. Otherwise return 0. */
5258 static int
5259 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5261 gcc_assert (!TARGET_64BIT);
5263 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5264 by the sseregparm attribute. */
5265 if (TARGET_SSEREGPARM
5266 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5268 if (!TARGET_SSE)
5270 if (warn)
5272 if (decl)
5273 error ("calling %qD with attribute sseregparm without "
5274 "SSE/SSE2 enabled", decl);
5275 else
5276 error ("calling %qT with attribute sseregparm without "
5277 "SSE/SSE2 enabled", type);
5279 return 0;
5282 return 2;
5285 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5286 (and DFmode for SSE2) arguments in SSE registers. */
5287 if (decl && TARGET_SSE_MATH && optimize
5288 && !(profile_flag && !flag_fentry))
5290 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5291 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5292 if (i && i->local && i->can_change_signature)
5293 return TARGET_SSE2 ? 2 : 1;
5296 return 0;
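/* Editor's note: an illustrative case, not from the original source.
   With -m32 -msse2 the attribute handled above routes scalar FP
   arguments through SSE registers; without SSE it is diagnosed.  */
double __attribute__ ((sseregparm)) scale (double x, double y);
/* -m32 -msse2:   x in %xmm0, y in %xmm1.
   -m32 -mno-sse: "calling ... with attribute sseregparm without
                   SSE/SSE2 enabled".  */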
5299 /* Return true if EAX is live at the start of the function. Used by
5300 ix86_expand_prologue to determine if we need special help before
5301 calling allocate_stack_worker. */
5303 static bool
5304 ix86_eax_live_at_start_p (void)
5306 /* Cheat. Don't bother working forward from ix86_function_regparm
5307 to the function type to whether an actual argument is located in
5308 eax. Instead just look at cfg info, which is still close enough
5309 to correct at this point. This gives false positives for broken
5310 functions that might use uninitialized data that happens to be
5311 allocated in eax, but who cares? */
5312 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5315 static bool
5316 ix86_keep_aggregate_return_pointer (tree fntype)
5318 tree attr;
5320 if (!TARGET_64BIT)
5322 attr = lookup_attribute ("callee_pop_aggregate_return",
5323 TYPE_ATTRIBUTES (fntype));
5324 if (attr)
5325 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5327 /* For 32-bit MS-ABI the default is to keep aggregate
5328 return pointer. */
5329 if (ix86_function_type_abi (fntype) == MS_ABI)
5330 return true;
5332 return KEEP_AGGREGATE_RETURN_POINTER != 0;
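/* Editor's note: a hedged, hypothetical 32-bit example -- not part of
   the original source -- of the attribute checked above.  */
struct big { int a[4]; };
struct big __attribute__ ((callee_pop_aggregate_return (1))) make (void);
/* Argument 1: the hidden aggregate-return pointer is not kept, so the
   callee pops it (returns with "ret $4" under the default convention).
   Argument 0: the pointer is kept and the caller removes it.  */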
5335 /* Value is the number of bytes of arguments automatically
5336 popped when returning from a subroutine call.
5337 FUNDECL is the declaration node of the function (as a tree),
5338 FUNTYPE is the data type of the function (as a tree),
5339 or for a library call it is an identifier node for the subroutine name.
5340 SIZE is the number of bytes of arguments passed on the stack.
5342 On the 80386, the RTD insn may be used to pop them if the number
5343 of args is fixed, but if the number is variable then the caller
5344 must pop them all. RTD can't be used for library calls now
5345 because the library is compiled with the Unix compiler.
5346 Use of RTD is a selectable option, since it is incompatible with
5347 standard Unix calling sequences. If the option is not selected,
5348 the caller must always pop the args.
5350 The attribute stdcall is equivalent to RTD on a per module basis. */
5352 static int
5353 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5355 unsigned int ccvt;
5357 /* None of the 64-bit ABIs pop arguments. */
5358 if (TARGET_64BIT)
5359 return 0;
5361 ccvt = ix86_get_callcvt (funtype);
5363 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5364 | IX86_CALLCVT_THISCALL)) != 0
5365 && ! stdarg_p (funtype))
5366 return size;
5368 /* Lose any fake structure return argument if it is passed on the stack. */
5369 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5370 && !ix86_keep_aggregate_return_pointer (funtype))
5372 int nregs = ix86_function_regparm (funtype, fundecl);
5373 if (nregs == 0)
5374 return GET_MODE_SIZE (Pmode);
5377 return 0;
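/* Editor's note: illustrative 32-bit examples, not part of the original
   file, of the byte counts returned above.  */
void __attribute__ ((stdcall)) fixed_args (int a, int b); /* callee pops 8: ret $8 */
void __attribute__ ((stdcall)) var_args (int a, ...);     /* stdarg: pops nothing  */
void plain_cdecl (int a, int b);                          /* caller pops           */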
5380 /* Argument support functions. */
5382 /* Return true when register may be used to pass function parameters. */
5383 bool
5384 ix86_function_arg_regno_p (int regno)
5386 int i;
5387 const int *parm_regs;
5389 if (!TARGET_64BIT)
5391 if (TARGET_MACHO)
5392 return (regno < REGPARM_MAX
5393 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5394 else
5395 return (regno < REGPARM_MAX
5396 || (TARGET_MMX && MMX_REGNO_P (regno)
5397 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5398 || (TARGET_SSE && SSE_REGNO_P (regno)
5399 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5402 if (TARGET_MACHO)
5404 if (SSE_REGNO_P (regno) && TARGET_SSE)
5405 return true;
5407 else
5409 if (TARGET_SSE && SSE_REGNO_P (regno)
5410 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5411 return true;
5414 /* TODO: The function should depend on current function ABI but
5415 builtins.c would need updating then. Therefore we use the
5416 default ABI. */
5418 /* RAX is used as hidden argument to va_arg functions. */
5419 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5420 return true;
5422 if (ix86_abi == MS_ABI)
5423 parm_regs = x86_64_ms_abi_int_parameter_registers;
5424 else
5425 parm_regs = x86_64_int_parameter_registers;
5426 for (i = 0; i < (ix86_abi == MS_ABI
5427 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5428 if (regno == parm_regs[i])
5429 return true;
5430 return false;
5433 /* Return true if we do not know how to pass TYPE solely in registers. */
5435 static bool
5436 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5438 if (must_pass_in_stack_var_size_or_pad (mode, type))
5439 return true;
5441 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5442 The layout_type routine is crafty and tries to trick us into passing
5443 currently unsupported vector types on the stack by using TImode. */
5444 return (!TARGET_64BIT && mode == TImode
5445 && type && TREE_CODE (type) != VECTOR_TYPE);
5448 /* Return the size, in bytes, of the area reserved for arguments passed
5449 in registers for the function represented by FNDECL, depending on the
5450 ABI format used. */
5452 ix86_reg_parm_stack_space (const_tree fndecl)
5454 enum calling_abi call_abi = SYSV_ABI;
5455 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5456 call_abi = ix86_function_abi (fndecl);
5457 else
5458 call_abi = ix86_function_type_abi (fndecl);
5459 if (TARGET_64BIT && call_abi == MS_ABI)
5460 return 32;
5461 return 0;
5464 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5465 call ABI used. */
5466 enum calling_abi
5467 ix86_function_type_abi (const_tree fntype)
5469 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5471 enum calling_abi abi = ix86_abi;
5472 if (abi == SYSV_ABI)
5474 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5475 abi = MS_ABI;
5477 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5478 abi = SYSV_ABI;
5479 return abi;
5481 return ix86_abi;
5484 static bool
5485 ix86_function_ms_hook_prologue (const_tree fn)
5487 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5489 if (decl_function_context (fn) != NULL_TREE)
5490 error_at (DECL_SOURCE_LOCATION (fn),
5491 "ms_hook_prologue is not compatible with nested function");
5492 else
5493 return true;
5495 return false;
5498 static enum calling_abi
5499 ix86_function_abi (const_tree fndecl)
5501 if (! fndecl)
5502 return ix86_abi;
5503 return ix86_function_type_abi (TREE_TYPE (fndecl));
5506 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5507 call ABI used. */
5508 enum calling_abi
5509 ix86_cfun_abi (void)
5511 if (! cfun)
5512 return ix86_abi;
5513 return cfun->machine->call_abi;
5516 /* Write the extra assembler code needed to declare a function properly. */
5518 void
5519 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5520 tree decl)
5522 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5524 if (is_ms_hook)
5526 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5527 unsigned int filler_cc = 0xcccccccc;
5529 for (i = 0; i < filler_count; i += 4)
5530 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5533 #ifdef SUBTARGET_ASM_UNWIND_INIT
5534 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5535 #endif
5537 ASM_OUTPUT_LABEL (asm_out_file, fname);
5539 /* Output magic byte marker, if hot-patch attribute is set. */
5540 if (is_ms_hook)
5542 if (TARGET_64BIT)
5544 /* leaq [%rsp + 0], %rsp */
5545 asm_fprintf (asm_out_file, ASM_BYTE
5546 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5548 else
5550 /* movl.s %edi, %edi
5551 push %ebp
5552 movl.s %esp, %ebp */
5553 asm_fprintf (asm_out_file, ASM_BYTE
5554 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5559 /* regclass.c */
5560 extern void init_regs (void);
5562 /* Implementation of the call ABI switching target hook. Sets up the
5563 call register sets specific to FNDECL. See also
5564 ix86_conditional_register_usage for more details. */
5565 void
5566 ix86_call_abi_override (const_tree fndecl)
5568 if (fndecl == NULL_TREE)
5569 cfun->machine->call_abi = ix86_abi;
5570 else
5571 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5574 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5575 expensive re-initialization of init_regs each time we switch function context
5576 since this is needed only during RTL expansion. */
5577 static void
5578 ix86_maybe_switch_abi (void)
5580 if (TARGET_64BIT &&
5581 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5582 reinit_regs ();
5585 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5586 for a call to a function whose data type is FNTYPE.
5587 For a library call, FNTYPE is 0. */
5589 void
5590 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5591 tree fntype, /* tree ptr for function decl */
5592 rtx libname, /* SYMBOL_REF of library name or 0 */
5593 tree fndecl,
5594 int caller)
5596 struct cgraph_local_info *i;
5597 tree fnret_type;
5599 memset (cum, 0, sizeof (*cum));
5601 /* Initialize for the current callee. */
5602 if (caller)
5604 cfun->machine->callee_pass_avx256_p = false;
5605 cfun->machine->callee_return_avx256_p = false;
5608 if (fndecl)
5610 i = cgraph_local_info (fndecl);
5611 cum->call_abi = ix86_function_abi (fndecl);
5612 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5614 else
5616 i = NULL;
5617 cum->call_abi = ix86_function_type_abi (fntype);
5618 if (fntype)
5619 fnret_type = TREE_TYPE (fntype);
5620 else
5621 fnret_type = NULL;
5624 if (TARGET_VZEROUPPER && fnret_type)
5626 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5627 false);
5628 if (function_pass_avx256_p (fnret_value))
5630 /* The return value of this function uses 256bit AVX modes. */
5631 if (caller)
5632 cfun->machine->callee_return_avx256_p = true;
5633 else
5634 cfun->machine->caller_return_avx256_p = true;
5638 cum->caller = caller;
5640 /* Set up the number of registers to use for passing arguments. */
5642 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5643 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5644 "or subtarget optimization implying it");
5645 cum->nregs = ix86_regparm;
5646 if (TARGET_64BIT)
5648 cum->nregs = (cum->call_abi == SYSV_ABI
5649 ? X86_64_REGPARM_MAX
5650 : X86_64_MS_REGPARM_MAX);
5652 if (TARGET_SSE)
5654 cum->sse_nregs = SSE_REGPARM_MAX;
5655 if (TARGET_64BIT)
5657 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5658 ? X86_64_SSE_REGPARM_MAX
5659 : X86_64_MS_SSE_REGPARM_MAX);
5662 if (TARGET_MMX)
5663 cum->mmx_nregs = MMX_REGPARM_MAX;
5664 cum->warn_avx = true;
5665 cum->warn_sse = true;
5666 cum->warn_mmx = true;
5668 /* Because the type might mismatch between caller and callee, we need to
5669 use actual type of function for local calls.
5670 FIXME: cgraph_analyze can be told to actually record if function uses
5671 va_start so for local functions maybe_vaarg can be made aggressive
5672 helping K&R code.
5673 FIXME: once the type system is fixed, we won't need this code anymore. */
5674 if (i && i->local && i->can_change_signature)
5675 fntype = TREE_TYPE (fndecl);
5676 cum->maybe_vaarg = (fntype
5677 ? (!prototype_p (fntype) || stdarg_p (fntype))
5678 : !libname);
5680 if (!TARGET_64BIT)
5682 /* If there are variable arguments, then we won't pass anything
5683 in registers in 32-bit mode. */
5684 if (stdarg_p (fntype))
5686 cum->nregs = 0;
5687 cum->sse_nregs = 0;
5688 cum->mmx_nregs = 0;
5689 cum->warn_avx = 0;
5690 cum->warn_sse = 0;
5691 cum->warn_mmx = 0;
5692 return;
5695 /* Use ecx and edx registers if function has fastcall attribute,
5696 else look for regparm information. */
5697 if (fntype)
5699 unsigned int ccvt = ix86_get_callcvt (fntype);
5700 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5702 cum->nregs = 1;
5703 cum->fastcall = 1; /* Same first register as in fastcall. */
5705 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5707 cum->nregs = 2;
5708 cum->fastcall = 1;
5710 else
5711 cum->nregs = ix86_function_regparm (fntype, fndecl);
5714 /* Set up the number of SSE registers used for passing SFmode
5715 and DFmode arguments. Warn for mismatching ABI. */
5716 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5720 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5721 But in the case of vector types, it is some vector mode.
5723 When we have only some of our vector isa extensions enabled, then there
5724 are some modes for which vector_mode_supported_p is false. For these
5725 modes, the generic vector support in gcc will choose some non-vector mode
5726 in order to implement the type. By computing the natural mode, we'll
5727 select the proper ABI location for the operand and not depend on whatever
5728 the middle-end decides to do with these vector types.
5730 The middle-end can't deal with vector types > 16 bytes. In this
5731 case, we return the original mode and warn ABI change if CUM isn't
5732 NULL. */
5734 static enum machine_mode
5735 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5737 enum machine_mode mode = TYPE_MODE (type);
5739 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5741 HOST_WIDE_INT size = int_size_in_bytes (type);
5742 if ((size == 8 || size == 16 || size == 32)
5743 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5744 && TYPE_VECTOR_SUBPARTS (type) > 1)
5746 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5748 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5749 mode = MIN_MODE_VECTOR_FLOAT;
5750 else
5751 mode = MIN_MODE_VECTOR_INT;
5753 /* Get the mode which has this inner mode and number of units. */
5754 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5755 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5756 && GET_MODE_INNER (mode) == innermode)
5758 if (size == 32 && !TARGET_AVX)
5760 static bool warnedavx;
5762 if (cum
5763 && !warnedavx
5764 && cum->warn_avx)
5766 warnedavx = true;
5767 warning (0, "AVX vector argument without AVX "
5768 "enabled changes the ABI");
5770 return TYPE_MODE (type);
5772 else
5773 return mode;
5776 gcc_unreachable ();
5780 return mode;
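/* Editor's note: an illustrative case for the 32-byte vector handling
   above; not part of the original source.  */
typedef float v8sf __attribute__ ((vector_size (32)));
extern void take_v8sf (v8sf);
/* With -mavx the natural mode is V8SFmode; without -mavx the original
   mode is kept and GCC warns "AVX vector argument without AVX enabled
   changes the ABI".  */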
5783 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5784 this may not agree with the mode that the type system has chosen for the
5785 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5786 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5788 static rtx
5789 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5790 unsigned int regno)
5792 rtx tmp;
5794 if (orig_mode != BLKmode)
5795 tmp = gen_rtx_REG (orig_mode, regno);
5796 else
5798 tmp = gen_rtx_REG (mode, regno);
5799 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5800 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5803 return tmp;
5806 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5807 of this code is to classify each 8bytes of incoming argument by the register
5808 class and assign registers accordingly. */
5810 /* Return the union class of CLASS1 and CLASS2.
5811 See the x86-64 PS ABI for details. */
5813 static enum x86_64_reg_class
5814 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5816 /* Rule #1: If both classes are equal, this is the resulting class. */
5817 if (class1 == class2)
5818 return class1;
5820 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5821 the other class. */
5822 if (class1 == X86_64_NO_CLASS)
5823 return class2;
5824 if (class2 == X86_64_NO_CLASS)
5825 return class1;
5827 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5828 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5829 return X86_64_MEMORY_CLASS;
5831 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5832 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5833 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5834 return X86_64_INTEGERSI_CLASS;
5835 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5836 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5837 return X86_64_INTEGER_CLASS;
5839 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5840 MEMORY is used. */
5841 if (class1 == X86_64_X87_CLASS
5842 || class1 == X86_64_X87UP_CLASS
5843 || class1 == X86_64_COMPLEX_X87_CLASS
5844 || class2 == X86_64_X87_CLASS
5845 || class2 == X86_64_X87UP_CLASS
5846 || class2 == X86_64_COMPLEX_X87_CLASS)
5847 return X86_64_MEMORY_CLASS;
5849 /* Rule #6: Otherwise class SSE is used. */
5850 return X86_64_SSE_CLASS;
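/* Editor's note: a worked example of the merge rules above for a single
   eightbyte; editorial, not part of the original file.  */
struct mixed { int i; float f; };  /* 8 bytes, one eightbyte */
/* The int classifies as INTEGERSI and the float (at byte offset 4) as
   SSE; rule #4 merges them to INTEGER, so the whole struct is passed in
   one general-purpose register (%rdi when it is the first argument).  */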
5853 /* Classify the argument of type TYPE and mode MODE.
5854 CLASSES will be filled by the register class used to pass each word
5855 of the operand. The number of words is returned. In case the parameter
5856 should be passed in memory, 0 is returned. As a special case for zero
5857 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5859 BIT_OFFSET is used internally for handling records; it specifies the
5860 offset in bits modulo 256 to avoid overflow cases.
5862 See the x86-64 PS ABI for details.
5865 static int
5866 classify_argument (enum machine_mode mode, const_tree type,
5867 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5869 HOST_WIDE_INT bytes =
5870 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5871 int words
5872 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5874 /* Variable sized entities are always passed/returned in memory. */
5875 if (bytes < 0)
5876 return 0;
5878 if (mode != VOIDmode
5879 && targetm.calls.must_pass_in_stack (mode, type))
5880 return 0;
5882 if (type && AGGREGATE_TYPE_P (type))
5884 int i;
5885 tree field;
5886 enum x86_64_reg_class subclasses[MAX_CLASSES];
5888 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5889 if (bytes > 32)
5890 return 0;
5892 for (i = 0; i < words; i++)
5893 classes[i] = X86_64_NO_CLASS;
5895 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5896 signal the memory class, so handle this as a special case. */
5897 if (!words)
5899 classes[0] = X86_64_NO_CLASS;
5900 return 1;
5903 /* Classify each field of record and merge classes. */
5904 switch (TREE_CODE (type))
5906 case RECORD_TYPE:
5907 /* And now merge the fields of structure. */
5908 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5910 if (TREE_CODE (field) == FIELD_DECL)
5912 int num;
5914 if (TREE_TYPE (field) == error_mark_node)
5915 continue;
5917 /* Bitfields are always classified as integer. Handle them
5918 early, since later code would consider them to be
5919 misaligned integers. */
5920 if (DECL_BIT_FIELD (field))
5922 for (i = (int_bit_position (field)
5923 + (bit_offset % 64)) / 8 / 8;
5924 i < ((int_bit_position (field) + (bit_offset % 64))
5925 + tree_low_cst (DECL_SIZE (field), 0)
5926 + 63) / 8 / 8; i++)
5927 classes[i] =
5928 merge_classes (X86_64_INTEGER_CLASS,
5929 classes[i]);
5931 else
5933 int pos;
5935 type = TREE_TYPE (field);
5937 /* Flexible array member is ignored. */
5938 if (TYPE_MODE (type) == BLKmode
5939 && TREE_CODE (type) == ARRAY_TYPE
5940 && TYPE_SIZE (type) == NULL_TREE
5941 && TYPE_DOMAIN (type) != NULL_TREE
5942 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5943 == NULL_TREE))
5945 static bool warned;
5947 if (!warned && warn_psabi)
5949 warned = true;
5950 inform (input_location,
5951 "the ABI of passing struct with"
5952 " a flexible array member has"
5953 " changed in GCC 4.4");
5955 continue;
5957 num = classify_argument (TYPE_MODE (type), type,
5958 subclasses,
5959 (int_bit_position (field)
5960 + bit_offset) % 256);
5961 if (!num)
5962 return 0;
5963 pos = (int_bit_position (field)
5964 + (bit_offset % 64)) / 8 / 8;
5965 for (i = 0; i < num && (i + pos) < words; i++)
5966 classes[i + pos] =
5967 merge_classes (subclasses[i], classes[i + pos]);
5971 break;
5973 case ARRAY_TYPE:
5974 /* Arrays are handled as small records. */
5976 int num;
5977 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5978 TREE_TYPE (type), subclasses, bit_offset);
5979 if (!num)
5980 return 0;
5982 /* The partial classes are now full classes. */
5983 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5984 subclasses[0] = X86_64_SSE_CLASS;
5985 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5986 && !((bit_offset % 64) == 0 && bytes == 4))
5987 subclasses[0] = X86_64_INTEGER_CLASS;
5989 for (i = 0; i < words; i++)
5990 classes[i] = subclasses[i % num];
5992 break;
5994 case UNION_TYPE:
5995 case QUAL_UNION_TYPE:
5996 /* Unions are similar to RECORD_TYPE but offset is always 0.
5998 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6000 if (TREE_CODE (field) == FIELD_DECL)
6002 int num;
6004 if (TREE_TYPE (field) == error_mark_node)
6005 continue;
6007 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6008 TREE_TYPE (field), subclasses,
6009 bit_offset);
6010 if (!num)
6011 return 0;
6012 for (i = 0; i < num; i++)
6013 classes[i] = merge_classes (subclasses[i], classes[i]);
6016 break;
6018 default:
6019 gcc_unreachable ();
6022 if (words > 2)
6024 /* When size > 16 bytes, if the first eightbyte isn't
6025 X86_64_SSE_CLASS or any of the others isn't
6026 X86_64_SSEUP_CLASS, everything should be passed in
6027 memory. */
6028 if (classes[0] != X86_64_SSE_CLASS)
6029 return 0;
6031 for (i = 1; i < words; i++)
6032 if (classes[i] != X86_64_SSEUP_CLASS)
6033 return 0;
6036 /* Final merger cleanup. */
6037 for (i = 0; i < words; i++)
6039 /* If one class is MEMORY, everything should be passed in
6040 memory. */
6041 if (classes[i] == X86_64_MEMORY_CLASS)
6042 return 0;
6044 /* The X86_64_SSEUP_CLASS should always be preceded by
6045 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6046 if (classes[i] == X86_64_SSEUP_CLASS
6047 && classes[i - 1] != X86_64_SSE_CLASS
6048 && classes[i - 1] != X86_64_SSEUP_CLASS)
6050 /* The first one should never be X86_64_SSEUP_CLASS. */
6051 gcc_assert (i != 0);
6052 classes[i] = X86_64_SSE_CLASS;
6055 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6056 everything should be passed in memory. */
6057 if (classes[i] == X86_64_X87UP_CLASS
6058 && (classes[i - 1] != X86_64_X87_CLASS))
6060 static bool warned;
6062 /* The first one should never be X86_64_X87UP_CLASS. */
6063 gcc_assert (i != 0);
6064 if (!warned && warn_psabi)
6066 warned = true;
6067 inform (input_location,
6068 "the ABI of passing union with long double"
6069 " has changed in GCC 4.4");
6071 return 0;
6074 return words;
6077 /* Compute alignment needed. We align all types to natural boundaries with
6078 the exception of XFmode, which is aligned to 64 bits. */
6079 if (mode != VOIDmode && mode != BLKmode)
6081 int mode_alignment = GET_MODE_BITSIZE (mode);
6083 if (mode == XFmode)
6084 mode_alignment = 128;
6085 else if (mode == XCmode)
6086 mode_alignment = 256;
6087 if (COMPLEX_MODE_P (mode))
6088 mode_alignment /= 2;
6089 /* Misaligned fields are always returned in memory. */
6090 if (bit_offset % mode_alignment)
6091 return 0;
6094 /* for V1xx modes, just use the base mode */
6095 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6096 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6097 mode = GET_MODE_INNER (mode);
6099 /* Classification of atomic types. */
6100 switch (mode)
6102 case SDmode:
6103 case DDmode:
6104 classes[0] = X86_64_SSE_CLASS;
6105 return 1;
6106 case TDmode:
6107 classes[0] = X86_64_SSE_CLASS;
6108 classes[1] = X86_64_SSEUP_CLASS;
6109 return 2;
6110 case DImode:
6111 case SImode:
6112 case HImode:
6113 case QImode:
6114 case CSImode:
6115 case CHImode:
6116 case CQImode:
6118 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6120 if (size <= 32)
6122 classes[0] = X86_64_INTEGERSI_CLASS;
6123 return 1;
6125 else if (size <= 64)
6127 classes[0] = X86_64_INTEGER_CLASS;
6128 return 1;
6130 else if (size <= 64+32)
6132 classes[0] = X86_64_INTEGER_CLASS;
6133 classes[1] = X86_64_INTEGERSI_CLASS;
6134 return 2;
6136 else if (size <= 64+64)
6138 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6139 return 2;
6141 else
6142 gcc_unreachable ();
6144 case CDImode:
6145 case TImode:
6146 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6147 return 2;
6148 case COImode:
6149 case OImode:
6150 /* OImode shouldn't be used directly. */
6151 gcc_unreachable ();
6152 case CTImode:
6153 return 0;
6154 case SFmode:
6155 if (!(bit_offset % 64))
6156 classes[0] = X86_64_SSESF_CLASS;
6157 else
6158 classes[0] = X86_64_SSE_CLASS;
6159 return 1;
6160 case DFmode:
6161 classes[0] = X86_64_SSEDF_CLASS;
6162 return 1;
6163 case XFmode:
6164 classes[0] = X86_64_X87_CLASS;
6165 classes[1] = X86_64_X87UP_CLASS;
6166 return 2;
6167 case TFmode:
6168 classes[0] = X86_64_SSE_CLASS;
6169 classes[1] = X86_64_SSEUP_CLASS;
6170 return 2;
6171 case SCmode:
6172 classes[0] = X86_64_SSE_CLASS;
6173 if (!(bit_offset % 64))
6174 return 1;
6175 else
6177 static bool warned;
6179 if (!warned && warn_psabi)
6181 warned = true;
6182 inform (input_location,
6183 "the ABI of passing structure with complex float"
6184 " member has changed in GCC 4.4");
6186 classes[1] = X86_64_SSESF_CLASS;
6187 return 2;
6189 case DCmode:
6190 classes[0] = X86_64_SSEDF_CLASS;
6191 classes[1] = X86_64_SSEDF_CLASS;
6192 return 2;
6193 case XCmode:
6194 classes[0] = X86_64_COMPLEX_X87_CLASS;
6195 return 1;
6196 case TCmode:
6197 /* This mode is larger than 16 bytes. */
6198 return 0;
6199 case V8SFmode:
6200 case V8SImode:
6201 case V32QImode:
6202 case V16HImode:
6203 case V4DFmode:
6204 case V4DImode:
6205 classes[0] = X86_64_SSE_CLASS;
6206 classes[1] = X86_64_SSEUP_CLASS;
6207 classes[2] = X86_64_SSEUP_CLASS;
6208 classes[3] = X86_64_SSEUP_CLASS;
6209 return 4;
6210 case V4SFmode:
6211 case V4SImode:
6212 case V16QImode:
6213 case V8HImode:
6214 case V2DFmode:
6215 case V2DImode:
6216 classes[0] = X86_64_SSE_CLASS;
6217 classes[1] = X86_64_SSEUP_CLASS;
6218 return 2;
6219 case V1TImode:
6220 case V1DImode:
6221 case V2SFmode:
6222 case V2SImode:
6223 case V4HImode:
6224 case V8QImode:
6225 classes[0] = X86_64_SSE_CLASS;
6226 return 1;
6227 case BLKmode:
6228 case VOIDmode:
6229 return 0;
6230 default:
6231 gcc_assert (VECTOR_MODE_P (mode));
6233 if (bytes > 16)
6234 return 0;
6236 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6238 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6239 classes[0] = X86_64_INTEGERSI_CLASS;
6240 else
6241 classes[0] = X86_64_INTEGER_CLASS;
6242 classes[1] = X86_64_INTEGER_CLASS;
6243 return 1 + (bytes > 8);
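/* Editor's note: editorial examples of the classification computed
   above under the x86-64 psABI; not part of the original source.  */
struct point { double x; double y; };  /* { SSEDF, SSEDF }: two SSE regs
                                          (%xmm0, %xmm1 as first arg)   */
struct pair  { long l; double d;   };  /* { INTEGER, SSEDF }: GPR + SSE */
struct large { char buf[40];       };  /* > 32 bytes: passed in memory  */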
6247 /* Examine the argument and return set number of register required in each
6248 class. Return 0 iff parameter should be passed in memory. */
6249 static int
6250 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6251 int *int_nregs, int *sse_nregs)
6253 enum x86_64_reg_class regclass[MAX_CLASSES];
6254 int n = classify_argument (mode, type, regclass, 0);
6256 *int_nregs = 0;
6257 *sse_nregs = 0;
6258 if (!n)
6259 return 0;
6260 for (n--; n >= 0; n--)
6261 switch (regclass[n])
6263 case X86_64_INTEGER_CLASS:
6264 case X86_64_INTEGERSI_CLASS:
6265 (*int_nregs)++;
6266 break;
6267 case X86_64_SSE_CLASS:
6268 case X86_64_SSESF_CLASS:
6269 case X86_64_SSEDF_CLASS:
6270 (*sse_nregs)++;
6271 break;
6272 case X86_64_NO_CLASS:
6273 case X86_64_SSEUP_CLASS:
6274 break;
6275 case X86_64_X87_CLASS:
6276 case X86_64_X87UP_CLASS:
6277 if (!in_return)
6278 return 0;
6279 break;
6280 case X86_64_COMPLEX_X87_CLASS:
6281 return in_return ? 2 : 0;
6282 case X86_64_MEMORY_CLASS:
6283 gcc_unreachable ();
6285 return 1;
6288 /* Construct container for the argument used by GCC interface. See
6289 FUNCTION_ARG for the detailed description. */
6291 static rtx
6292 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6293 const_tree type, int in_return, int nintregs, int nsseregs,
6294 const int *intreg, int sse_regno)
6296 /* The following variables hold the static issued_error state. */
6297 static bool issued_sse_arg_error;
6298 static bool issued_sse_ret_error;
6299 static bool issued_x87_ret_error;
6301 enum machine_mode tmpmode;
6302 int bytes =
6303 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6304 enum x86_64_reg_class regclass[MAX_CLASSES];
6305 int n;
6306 int i;
6307 int nexps = 0;
6308 int needed_sseregs, needed_intregs;
6309 rtx exp[MAX_CLASSES];
6310 rtx ret;
6312 n = classify_argument (mode, type, regclass, 0);
6313 if (!n)
6314 return NULL;
6315 if (!examine_argument (mode, type, in_return, &needed_intregs,
6316 &needed_sseregs))
6317 return NULL;
6318 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6319 return NULL;
6321 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6322 some less clueful developer tries to use floating-point anyway. */
6323 if (needed_sseregs && !TARGET_SSE)
6325 if (in_return)
6327 if (!issued_sse_ret_error)
6329 error ("SSE register return with SSE disabled");
6330 issued_sse_ret_error = true;
6333 else if (!issued_sse_arg_error)
6335 error ("SSE register argument with SSE disabled");
6336 issued_sse_arg_error = true;
6338 return NULL;
6341 /* Likewise, error if the ABI requires us to return values in the
6342 x87 registers and the user specified -mno-80387. */
6343 if (!TARGET_80387 && in_return)
6344 for (i = 0; i < n; i++)
6345 if (regclass[i] == X86_64_X87_CLASS
6346 || regclass[i] == X86_64_X87UP_CLASS
6347 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6349 if (!issued_x87_ret_error)
6351 error ("x87 register return with x87 disabled");
6352 issued_x87_ret_error = true;
6354 return NULL;
6357 /* First construct simple cases. Avoid SCmode, since we want to use
6358 a single register to pass this type. */
6359 if (n == 1 && mode != SCmode)
6360 switch (regclass[0])
6362 case X86_64_INTEGER_CLASS:
6363 case X86_64_INTEGERSI_CLASS:
6364 return gen_rtx_REG (mode, intreg[0]);
6365 case X86_64_SSE_CLASS:
6366 case X86_64_SSESF_CLASS:
6367 case X86_64_SSEDF_CLASS:
6368 if (mode != BLKmode)
6369 return gen_reg_or_parallel (mode, orig_mode,
6370 SSE_REGNO (sse_regno));
6371 break;
6372 case X86_64_X87_CLASS:
6373 case X86_64_COMPLEX_X87_CLASS:
6374 return gen_rtx_REG (mode, FIRST_STACK_REG);
6375 case X86_64_NO_CLASS:
6376 /* Zero sized array, struct or class. */
6377 return NULL;
6378 default:
6379 gcc_unreachable ();
6381 if (n == 2
6382 && regclass[0] == X86_64_SSE_CLASS
6383 && regclass[1] == X86_64_SSEUP_CLASS
6384 && mode != BLKmode)
6385 return gen_reg_or_parallel (mode, orig_mode,
6386 SSE_REGNO (sse_regno));
6387 if (n == 4
6388 && regclass[0] == X86_64_SSE_CLASS
6389 && regclass[1] == X86_64_SSEUP_CLASS
6390 && regclass[2] == X86_64_SSEUP_CLASS
6391 && regclass[3] == X86_64_SSEUP_CLASS
6392 && mode != BLKmode)
6393 return gen_reg_or_parallel (mode, orig_mode,
6394 SSE_REGNO (sse_regno));
6395 if (n == 2
6396 && regclass[0] == X86_64_X87_CLASS
6397 && regclass[1] == X86_64_X87UP_CLASS)
6398 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6400 if (n == 2
6401 && regclass[0] == X86_64_INTEGER_CLASS
6402 && regclass[1] == X86_64_INTEGER_CLASS
6403 && (mode == CDImode || mode == TImode || mode == TFmode)
6404 && intreg[0] + 1 == intreg[1])
6405 return gen_rtx_REG (mode, intreg[0]);
6407 /* Otherwise figure out the entries of the PARALLEL. */
6408 for (i = 0; i < n; i++)
6410 int pos;
6412 switch (regclass[i])
6414 case X86_64_NO_CLASS:
6415 break;
6416 case X86_64_INTEGER_CLASS:
6417 case X86_64_INTEGERSI_CLASS:
6418 /* Merge TImodes on aligned occasions here too. */
6419 if (i * 8 + 8 > bytes)
6420 tmpmode
6421 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6422 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6423 tmpmode = SImode;
6424 else
6425 tmpmode = DImode;
6426 /* We've requested 24 bytes for which we
6427 don't have a mode. Use DImode. */
6428 if (tmpmode == BLKmode)
6429 tmpmode = DImode;
6430 exp [nexps++]
6431 = gen_rtx_EXPR_LIST (VOIDmode,
6432 gen_rtx_REG (tmpmode, *intreg),
6433 GEN_INT (i*8));
6434 intreg++;
6435 break;
6436 case X86_64_SSESF_CLASS:
6437 exp [nexps++]
6438 = gen_rtx_EXPR_LIST (VOIDmode,
6439 gen_rtx_REG (SFmode,
6440 SSE_REGNO (sse_regno)),
6441 GEN_INT (i*8));
6442 sse_regno++;
6443 break;
6444 case X86_64_SSEDF_CLASS:
6445 exp [nexps++]
6446 = gen_rtx_EXPR_LIST (VOIDmode,
6447 gen_rtx_REG (DFmode,
6448 SSE_REGNO (sse_regno)),
6449 GEN_INT (i*8));
6450 sse_regno++;
6451 break;
6452 case X86_64_SSE_CLASS:
6453 pos = i;
6454 switch (n)
6456 case 1:
6457 tmpmode = DImode;
6458 break;
6459 case 2:
6460 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6462 tmpmode = TImode;
6463 i++;
6465 else
6466 tmpmode = DImode;
6467 break;
6468 case 4:
6469 gcc_assert (i == 0
6470 && regclass[1] == X86_64_SSEUP_CLASS
6471 && regclass[2] == X86_64_SSEUP_CLASS
6472 && regclass[3] == X86_64_SSEUP_CLASS);
6473 tmpmode = OImode;
6474 i += 3;
6475 break;
6476 default:
6477 gcc_unreachable ();
6479 exp [nexps++]
6480 = gen_rtx_EXPR_LIST (VOIDmode,
6481 gen_rtx_REG (tmpmode,
6482 SSE_REGNO (sse_regno)),
6483 GEN_INT (pos*8));
6484 sse_regno++;
6485 break;
6486 default:
6487 gcc_unreachable ();
6491 /* Empty aligned struct, union or class. */
6492 if (nexps == 0)
6493 return NULL;
6495 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6496 for (i = 0; i < nexps; i++)
6497 XVECEXP (ret, 0, i) = exp [i];
6498 return ret;
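/* Editor's note: a rough, hedged sketch -- not from the original file --
   of the PARALLEL built above for
     struct pair { long l; double d; };
   when it is the first named argument of a SysV x86-64 call:
     (parallel [(expr_list (reg:DI di)   (const_int 0))
                (expr_list (reg:DF xmm0) (const_int 8))])  */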
6501 /* Update the data in CUM to advance over an argument of mode MODE
6502 and data type TYPE. (TYPE is null for libcalls where that information
6503 may not be available.) */
6505 static void
6506 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6507 const_tree type, HOST_WIDE_INT bytes,
6508 HOST_WIDE_INT words)
6510 switch (mode)
6512 default:
6513 break;
6515 case BLKmode:
6516 if (bytes < 0)
6517 break;
6518 /* FALLTHRU */
6520 case DImode:
6521 case SImode:
6522 case HImode:
6523 case QImode:
6524 cum->words += words;
6525 cum->nregs -= words;
6526 cum->regno += words;
6528 if (cum->nregs <= 0)
6530 cum->nregs = 0;
6531 cum->regno = 0;
6533 break;
6535 case OImode:
6536 /* OImode shouldn't be used directly. */
6537 gcc_unreachable ();
6539 case DFmode:
6540 if (cum->float_in_sse < 2)
6541 break;
6542 case SFmode:
6543 if (cum->float_in_sse < 1)
6544 break;
6545 /* FALLTHRU */
6547 case V8SFmode:
6548 case V8SImode:
6549 case V32QImode:
6550 case V16HImode:
6551 case V4DFmode:
6552 case V4DImode:
6553 case TImode:
6554 case V16QImode:
6555 case V8HImode:
6556 case V4SImode:
6557 case V2DImode:
6558 case V4SFmode:
6559 case V2DFmode:
6560 if (!type || !AGGREGATE_TYPE_P (type))
6562 cum->sse_words += words;
6563 cum->sse_nregs -= 1;
6564 cum->sse_regno += 1;
6565 if (cum->sse_nregs <= 0)
6567 cum->sse_nregs = 0;
6568 cum->sse_regno = 0;
6571 break;
6573 case V8QImode:
6574 case V4HImode:
6575 case V2SImode:
6576 case V2SFmode:
6577 case V1TImode:
6578 case V1DImode:
6579 if (!type || !AGGREGATE_TYPE_P (type))
6581 cum->mmx_words += words;
6582 cum->mmx_nregs -= 1;
6583 cum->mmx_regno += 1;
6584 if (cum->mmx_nregs <= 0)
6586 cum->mmx_nregs = 0;
6587 cum->mmx_regno = 0;
6590 break;
6594 static void
6595 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6596 const_tree type, HOST_WIDE_INT words, bool named)
6598 int int_nregs, sse_nregs;
6600 /* Unnamed 256bit vector mode parameters are passed on stack. */
6601 if (!named && VALID_AVX256_REG_MODE (mode))
6602 return;
6604 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6605 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6607 cum->nregs -= int_nregs;
6608 cum->sse_nregs -= sse_nregs;
6609 cum->regno += int_nregs;
6610 cum->sse_regno += sse_nregs;
6612 else
6614 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6615 cum->words = (cum->words + align - 1) & ~(align - 1);
6616 cum->words += words;
6620 static void
6621 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6622 HOST_WIDE_INT words)
6624 /* Otherwise, this should be passed indirectly. */
6625 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6627 cum->words += words;
6628 if (cum->nregs > 0)
6630 cum->nregs -= 1;
6631 cum->regno += 1;
6635 /* Update the data in CUM to advance over an argument of mode MODE and
6636 data type TYPE. (TYPE is null for libcalls where that information
6637 may not be available.) */
6639 static void
6640 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6641 const_tree type, bool named)
6643 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6644 HOST_WIDE_INT bytes, words;
6646 if (mode == BLKmode)
6647 bytes = int_size_in_bytes (type);
6648 else
6649 bytes = GET_MODE_SIZE (mode);
6650 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6652 if (type)
6653 mode = type_natural_mode (type, NULL);
6655 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6656 function_arg_advance_ms_64 (cum, bytes, words);
6657 else if (TARGET_64BIT)
6658 function_arg_advance_64 (cum, mode, type, words, named);
6659 else
6660 function_arg_advance_32 (cum, mode, type, bytes, words);
6663 /* Define where to put the arguments to a function.
6664 Value is zero to push the argument on the stack,
6665 or a hard register in which to store the argument.
6667 MODE is the argument's machine mode.
6668 TYPE is the data type of the argument (as a tree).
6669 This is null for libcalls where that information may
6670 not be available.
6671 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6672 the preceding args and about the function being called.
6673 NAMED is nonzero if this argument is a named parameter
6674 (otherwise it is an extra parameter matching an ellipsis). */
6676 static rtx
6677 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6678 enum machine_mode orig_mode, const_tree type,
6679 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6681 static bool warnedsse, warnedmmx;
6683 /* Avoid the AL settings for the Unix64 ABI. */
6684 if (mode == VOIDmode)
6685 return constm1_rtx;
6687 switch (mode)
6689 default:
6690 break;
6692 case BLKmode:
6693 if (bytes < 0)
6694 break;
6695 /* FALLTHRU */
6696 case DImode:
6697 case SImode:
6698 case HImode:
6699 case QImode:
6700 if (words <= cum->nregs)
6702 int regno = cum->regno;
6704 /* Fastcall allocates the first two DWORD (SImode) or
6705 smaller arguments to ECX and EDX if it isn't an
6706 aggregate type. */
6707 if (cum->fastcall)
6709 if (mode == BLKmode
6710 || mode == DImode
6711 || (type && AGGREGATE_TYPE_P (type)))
6712 break;
6714 /* ECX, not EAX, is the first allocated register. */
6715 if (regno == AX_REG)
6716 regno = CX_REG;
6718 return gen_rtx_REG (mode, regno);
6720 break;
6722 case DFmode:
6723 if (cum->float_in_sse < 2)
6724 break;
6725 case SFmode:
6726 if (cum->float_in_sse < 1)
6727 break;
6728 /* FALLTHRU */
6729 case TImode:
6730 /* In 32bit, we pass TImode in xmm registers. */
6731 case V16QImode:
6732 case V8HImode:
6733 case V4SImode:
6734 case V2DImode:
6735 case V4SFmode:
6736 case V2DFmode:
6737 if (!type || !AGGREGATE_TYPE_P (type))
6739 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6741 warnedsse = true;
6742 warning (0, "SSE vector argument without SSE enabled "
6743 "changes the ABI");
6745 if (cum->sse_nregs)
6746 return gen_reg_or_parallel (mode, orig_mode,
6747 cum->sse_regno + FIRST_SSE_REG);
6749 break;
6751 case OImode:
6752 /* OImode shouldn't be used directly. */
6753 gcc_unreachable ();
6755 case V8SFmode:
6756 case V8SImode:
6757 case V32QImode:
6758 case V16HImode:
6759 case V4DFmode:
6760 case V4DImode:
6761 if (!type || !AGGREGATE_TYPE_P (type))
6763 if (cum->sse_nregs)
6764 return gen_reg_or_parallel (mode, orig_mode,
6765 cum->sse_regno + FIRST_SSE_REG);
6767 break;
6769 case V8QImode:
6770 case V4HImode:
6771 case V2SImode:
6772 case V2SFmode:
6773 case V1TImode:
6774 case V1DImode:
6775 if (!type || !AGGREGATE_TYPE_P (type))
6777 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6779 warnedmmx = true;
6780 warning (0, "MMX vector argument without MMX enabled "
6781 "changes the ABI");
6783 if (cum->mmx_nregs)
6784 return gen_reg_or_parallel (mode, orig_mode,
6785 cum->mmx_regno + FIRST_MMX_REG);
6787 break;
6790 return NULL_RTX;
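/* Editor's note: illustrative, hypothetical declarations -- not part of
   the original source -- that trigger the 32-bit ABI warnings above
   when the matching ISA is disabled.  */
typedef int   v4si __attribute__ ((vector_size (16)));
typedef short v4hi __attribute__ ((vector_size (8)));
extern void take_sse (v4si);  /* -m32 -mno-sse: "SSE vector argument
                                 without SSE enabled changes the ABI" */
extern void take_mmx (v4hi);  /* -m32 -mno-mmx: "MMX vector argument
                                 without MMX enabled changes the ABI" */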
6793 static rtx
6794 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6795 enum machine_mode orig_mode, const_tree type, bool named)
6797 /* Handle a hidden AL argument containing number of registers
6798 for varargs x86-64 functions. */
6799 if (mode == VOIDmode)
6800 return GEN_INT (cum->maybe_vaarg
6801 ? (cum->sse_nregs < 0
6802 ? X86_64_SSE_REGPARM_MAX
6803 : cum->sse_regno)
6804 : -1);
6806 switch (mode)
6808 default:
6809 break;
6811 case V8SFmode:
6812 case V8SImode:
6813 case V32QImode:
6814 case V16HImode:
6815 case V4DFmode:
6816 case V4DImode:
6817 /* Unnamed 256bit vector mode parameters are passed on stack. */
6818 if (!named)
6819 return NULL;
6820 break;
6823 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6824 cum->sse_nregs,
6825 &x86_64_int_parameter_registers [cum->regno],
6826 cum->sse_regno);
6829 static rtx
6830 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6831 enum machine_mode orig_mode, bool named,
6832 HOST_WIDE_INT bytes)
6834 unsigned int regno;
6836 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6837 We use the value -2 to specify that the current function call is MSABI. */
6838 if (mode == VOIDmode)
6839 return GEN_INT (-2);
6841 /* If we've run out of registers, it goes on the stack. */
6842 if (cum->nregs == 0)
6843 return NULL_RTX;
6845 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6847 /* Only floating point modes are passed in anything but integer regs. */
6848 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6850 if (named)
6851 regno = cum->regno + FIRST_SSE_REG;
6852 else
6854 rtx t1, t2;
6856 /* Unnamed floating parameters are passed in both the
6857 SSE and integer registers. */
6858 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6859 t2 = gen_rtx_REG (mode, regno);
6860 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6861 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6862 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6865 /* Handle aggregate types passed in registers. */
6866 if (orig_mode == BLKmode)
6868 if (bytes > 0 && bytes <= 8)
6869 mode = (bytes > 4 ? DImode : SImode);
6870 if (mode == BLKmode)
6871 mode = DImode;
6874 return gen_reg_or_parallel (mode, orig_mode, regno);
6877 /* Return where to put the arguments to a function.
6878 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6880 MODE is the argument's machine mode. TYPE is the data type of the
6881 argument. It is null for libcalls where that information may not be
6882 available. CUM gives information about the preceding args and about
6883 the function being called. NAMED is nonzero if this argument is a
6884 named parameter (otherwise it is an extra parameter matching an
6885 ellipsis). */
6887 static rtx
6888 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6889 const_tree type, bool named)
6891 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6892 enum machine_mode mode = omode;
6893 HOST_WIDE_INT bytes, words;
6894 rtx arg;
6896 if (mode == BLKmode)
6897 bytes = int_size_in_bytes (type);
6898 else
6899 bytes = GET_MODE_SIZE (mode);
6900 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6902 /* To simplify the code below, represent vector types with a vector mode
6903 even if MMX/SSE are not active. */
6904 if (type && TREE_CODE (type) == VECTOR_TYPE)
6905 mode = type_natural_mode (type, cum);
6907 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6908 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6909 else if (TARGET_64BIT)
6910 arg = function_arg_64 (cum, mode, omode, type, named);
6911 else
6912 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6914 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6916 /* This argument uses 256bit AVX modes. */
6917 if (cum->caller)
6918 cfun->machine->callee_pass_avx256_p = true;
6919 else
6920 cfun->machine->caller_pass_avx256_p = true;
6923 return arg;
6926 /* A C expression that indicates when an argument must be passed by
6927 reference. If nonzero for an argument, a copy of that argument is
6928 made in memory and a pointer to the argument is passed instead of
6929 the argument itself. The pointer is passed in whatever way is
6930 appropriate for passing a pointer to that type. */
6932 static bool
6933 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6934 enum machine_mode mode ATTRIBUTE_UNUSED,
6935 const_tree type, bool named ATTRIBUTE_UNUSED)
6937 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6939 /* See Windows x64 Software Convention. */
6940 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6942 int msize = (int) GET_MODE_SIZE (mode);
6943 if (type)
6945 /* Arrays are passed by reference. */
6946 if (TREE_CODE (type) == ARRAY_TYPE)
6947 return true;
6949 if (AGGREGATE_TYPE_P (type))
6951 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6952 are passed by reference. */
6953 msize = int_size_in_bytes (type);
6957 /* __m128 is passed by reference. */
6958 switch (msize) {
6959 case 1: case 2: case 4: case 8:
6960 break;
6961 default:
6962 return true;
6965 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6966 return 1;
6968 return 0;
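/* Editor's note: hedged, hypothetical examples -- not part of the
   original file -- of the Windows x64 rule applied above: only 1-, 2-,
   4- and 8-byte objects are passed by value, everything else by
   reference.  */
struct s8  { char c[8];  };  /* 8 bytes: passed by value in a GPR       */
struct s16 { char c[16]; };  /* 16 bytes: a pointer is passed instead   */
extern void ms_abi_callee (struct s8, struct s16);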
6971 /* Return true when TYPE should be 128bit aligned for 32bit argument
6972 passing ABI. XXX: This function is obsolete and is only used for
6973 checking psABI compatibility with previous versions of GCC. */
6975 static bool
6976 ix86_compat_aligned_value_p (const_tree type)
6978 enum machine_mode mode = TYPE_MODE (type);
6979 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6980 || mode == TDmode
6981 || mode == TFmode
6982 || mode == TCmode)
6983 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6984 return true;
6985 if (TYPE_ALIGN (type) < 128)
6986 return false;
6988 if (AGGREGATE_TYPE_P (type))
6990 /* Walk the aggregates recursively. */
6991 switch (TREE_CODE (type))
6993 case RECORD_TYPE:
6994 case UNION_TYPE:
6995 case QUAL_UNION_TYPE:
6997 tree field;
6999 /* Walk all the structure fields. */
7000 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7002 if (TREE_CODE (field) == FIELD_DECL
7003 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7004 return true;
7006 break;
7009 case ARRAY_TYPE:
7010 /* Just for use if some language passes arrays by value. */
7011 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7012 return true;
7013 break;
7015 default:
7016 gcc_unreachable ();
7019 return false;
7022 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7023 XXX: This function is obsolete and is only used for checking psABI
7024 compatibility with previous versions of GCC. */
7026 static unsigned int
7027 ix86_compat_function_arg_boundary (enum machine_mode mode,
7028 const_tree type, unsigned int align)
7030 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7031 natural boundaries. */
7032 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7034 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7035 make an exception for SSE modes since these require 128bit
7036 alignment.
7038 The handling here differs from field_alignment. ICC aligns MMX
7039 arguments to 4 byte boundaries, while structure fields are aligned
7040 to 8 byte boundaries. */
7041 if (!type)
7043 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7044 align = PARM_BOUNDARY;
7046 else
7048 if (!ix86_compat_aligned_value_p (type))
7049 align = PARM_BOUNDARY;
7052 if (align > BIGGEST_ALIGNMENT)
7053 align = BIGGEST_ALIGNMENT;
7054 return align;
7057 /* Return true when TYPE should be 128bit aligned for 32bit argument
7058 passing ABI. */
7060 static bool
7061 ix86_contains_aligned_value_p (const_tree type)
7063 enum machine_mode mode = TYPE_MODE (type);
7065 if (mode == XFmode || mode == XCmode)
7066 return false;
7068 if (TYPE_ALIGN (type) < 128)
7069 return false;
7071 if (AGGREGATE_TYPE_P (type))
7073 /* Walk the aggregates recursively. */
7074 switch (TREE_CODE (type))
7076 case RECORD_TYPE:
7077 case UNION_TYPE:
7078 case QUAL_UNION_TYPE:
7080 tree field;
7082 /* Walk all the structure fields. */
7083 for (field = TYPE_FIELDS (type);
7084 field;
7085 field = DECL_CHAIN (field))
7087 if (TREE_CODE (field) == FIELD_DECL
7088 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7089 return true;
7091 break;
7094 case ARRAY_TYPE:
7095 /* Just for use if some languages pass arrays by value. */
7096 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7097 return true;
7098 break;
7100 default:
7101 gcc_unreachable ();
7104 else
7105 return TYPE_ALIGN (type) >= 128;
7107 return false;
7110 /* Gives the alignment boundary, in bits, of an argument with the
7111 specified mode and type. */
7113 static unsigned int
7114 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7116 unsigned int align;
7117 if (type)
7119 /* Since the main variant type is used for the call, convert the
7120 type to its main variant. */
7121 type = TYPE_MAIN_VARIANT (type);
7122 align = TYPE_ALIGN (type);
7124 else
7125 align = GET_MODE_ALIGNMENT (mode);
7126 if (align < PARM_BOUNDARY)
7127 align = PARM_BOUNDARY;
7128 else
7130 static bool warned;
7131 unsigned int saved_align = align;
7133 if (!TARGET_64BIT)
7135 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7136 if (!type)
7138 if (mode == XFmode || mode == XCmode)
7139 align = PARM_BOUNDARY;
7141 else if (!ix86_contains_aligned_value_p (type))
7142 align = PARM_BOUNDARY;
7144 if (align < 128)
7145 align = PARM_BOUNDARY;
7148 if (warn_psabi
7149 && !warned
7150 && align != ix86_compat_function_arg_boundary (mode, type,
7151 saved_align))
7153 warned = true;
7154 inform (input_location,
7155 "The ABI for passing parameters with %d-byte"
7156 " alignment has changed in GCC 4.6",
7157 align / BITS_PER_UNIT);
7161 return align;
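/* Worked example for the computation above (illustrative): on a 32-bit
   target an int argument ends up with 32-bit alignment while a __m128
   argument keeps 128 bits; whenever the result differs from the
   pre-GCC 4.6 value computed by ix86_compat_function_arg_boundary, the
   inform above fires once per translation unit.  */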
7164 /* Return true if N is a possible register number of function value. */
7166 static bool
7167 ix86_function_value_regno_p (const unsigned int regno)
7169 switch (regno)
7171 case AX_REG:
7172 return true;
7174 case FIRST_FLOAT_REG:
7175 /* TODO: The function should depend on current function ABI but
7176 builtins.c would need updating then. Therefore we use the
7177 default ABI. */
7178 if (TARGET_64BIT && ix86_abi == MS_ABI)
7179 return false;
7180 return TARGET_FLOAT_RETURNS_IN_80387;
7182 case FIRST_SSE_REG:
7183 return TARGET_SSE;
7185 case FIRST_MMX_REG:
7186 if (TARGET_MACHO || TARGET_64BIT)
7187 return false;
7188 return TARGET_MMX;
7191 return false;
7194 /* Define how to find the value returned by a function.
7195 VALTYPE is the data type of the value (as a tree).
7196 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7197 otherwise, FUNC is 0. */
7199 static rtx
7200 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7201 const_tree fntype, const_tree fn)
7203 unsigned int regno;
7205 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7206 we normally prevent this case when mmx is not available. However,
7207 some ABIs may require the result to be returned as DImode. */
7208 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7209 regno = FIRST_MMX_REG;
7211 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7212 we prevent this case when sse is not available. However, some ABIs
7213 may require the result to be returned as integer TImode. */
7214 else if (mode == TImode
7215 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7216 regno = FIRST_SSE_REG;
7218 /* 32-byte vector modes in %ymm0. */
7219 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7220 regno = FIRST_SSE_REG;
7222 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7223 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7224 regno = FIRST_FLOAT_REG;
7225 else
7226 /* Most things go in %eax. */
7227 regno = AX_REG;
7229 /* Override FP return register with %xmm0 for local functions when
7230 SSE math is enabled or for functions with sseregparm attribute. */
7231 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7233 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7234 if ((sse_level >= 1 && mode == SFmode)
7235 || (sse_level == 2 && mode == DFmode))
7236 regno = FIRST_SSE_REG;
7239 /* OImode shouldn't be used directly. */
7240 gcc_assert (mode != OImode);
7242 return gen_rtx_REG (orig_mode, regno);
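/* Rough register mapping implied by the 32-bit code above (illustrative,
   not part of the original source):
     int    f (void);    value in %eax  (AX_REG)
     double g (void);    value in %st(0) with -mfp-ret-in-387
     __m64  h (void);    value in %mm0  (FIRST_MMX_REG)
     __m128 i (void);    value in %xmm0 (FIRST_SSE_REG)
   A local function using SSE math, or one with the sseregparm
   attribute, may return float/double in %xmm0 instead.  */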
7245 static rtx
7246 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7247 const_tree valtype)
7249 rtx ret;
7251 /* Handle libcalls, which don't provide a type node. */
7252 if (valtype == NULL)
7254 unsigned int regno;
7256 switch (mode)
7258 case SFmode:
7259 case SCmode:
7260 case DFmode:
7261 case DCmode:
7262 case TFmode:
7263 case SDmode:
7264 case DDmode:
7265 case TDmode:
7266 regno = FIRST_SSE_REG;
7267 break;
7268 case XFmode:
7269 case XCmode:
7270 regno = FIRST_FLOAT_REG;
7271 break;
7272 case TCmode:
7273 return NULL;
7274 default:
7275 regno = AX_REG;
7278 return gen_rtx_REG (mode, regno);
7280 else if (POINTER_TYPE_P (valtype))
7282 /* Pointers are always returned in word_mode. */
7283 mode = word_mode;
7286 ret = construct_container (mode, orig_mode, valtype, 1,
7287 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7288 x86_64_int_return_registers, 0);
7290 /* For zero-sized structures, construct_container returns NULL, but we
7291 need to keep the rest of the compiler happy by returning a meaningful value. */
7292 if (!ret)
7293 ret = gen_rtx_REG (orig_mode, AX_REG);
7295 return ret;
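/* Example of the 64-bit SysV handling above (illustrative): a libcall
   returning DFmode gets %xmm0, one returning XFmode gets %st(0), and a
   TCmode libcall value has no register, so NULL is returned.  With a
   real type node, construct_container decides; e.g. a struct of two
   ints would typically come back in %rax.  */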
7298 static rtx
7299 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7301 unsigned int regno = AX_REG;
7303 if (TARGET_SSE)
7305 switch (GET_MODE_SIZE (mode))
7307 case 16:
7308 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7309 && !COMPLEX_MODE_P (mode))
7310 regno = FIRST_SSE_REG;
7311 break;
7312 case 8:
7313 case 4:
7314 if (mode == SFmode || mode == DFmode)
7315 regno = FIRST_SSE_REG;
7316 break;
7317 default:
7318 break;
7321 return gen_rtx_REG (orig_mode, regno);
7324 static rtx
7325 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7326 enum machine_mode orig_mode, enum machine_mode mode)
7328 const_tree fn, fntype;
7330 fn = NULL_TREE;
7331 if (fntype_or_decl && DECL_P (fntype_or_decl))
7332 fn = fntype_or_decl;
7333 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7335 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7336 return function_value_ms_64 (orig_mode, mode);
7337 else if (TARGET_64BIT)
7338 return function_value_64 (orig_mode, mode, valtype);
7339 else
7340 return function_value_32 (orig_mode, mode, fntype, fn);
7343 static rtx
7344 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7345 bool outgoing ATTRIBUTE_UNUSED)
7347 enum machine_mode mode, orig_mode;
7349 orig_mode = TYPE_MODE (valtype);
7350 mode = type_natural_mode (valtype, NULL);
7351 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7354 /* Pointer function arguments and return values are promoted to
7355 word_mode. */
7357 static enum machine_mode
7358 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7359 int *punsignedp, const_tree fntype,
7360 int for_return)
7362 if (type != NULL_TREE && POINTER_TYPE_P (type))
7364 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7365 return word_mode;
7367 return default_promote_function_mode (type, mode, punsignedp, fntype,
7368 for_return);
7371 static rtx
7372 ix86_libcall_value (enum machine_mode mode)
7374 return ix86_function_value_1 (NULL, NULL, mode, mode);
7377 /* Return true iff type is returned in memory. */
7379 static bool ATTRIBUTE_UNUSED
7380 return_in_memory_32 (const_tree type, enum machine_mode mode)
7382 HOST_WIDE_INT size;
7384 if (mode == BLKmode)
7385 return true;
7387 size = int_size_in_bytes (type);
7389 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7390 return false;
7392 if (VECTOR_MODE_P (mode) || mode == TImode)
7394 /* User-created vectors small enough to fit in EAX. */
7395 if (size < 8)
7396 return false;
7398 /* MMX/3dNow values are returned in MM0,
7399 except when it doesn't exist or the ABI prescribes otherwise. */
7400 if (size == 8)
7401 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7403 /* SSE values are returned in XMM0, except when it doesn't exist. */
7404 if (size == 16)
7405 return !TARGET_SSE;
7407 /* AVX values are returned in YMM0, except when it doesn't exist. */
7408 if (size == 32)
7409 return !TARGET_AVX;
7412 if (mode == XFmode)
7413 return false;
7415 if (size > 12)
7416 return true;
7418 /* OImode shouldn't be used directly. */
7419 gcc_assert (mode != OImode);
7421 return false;
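/* Illustrative consequences of the 32-bit rules above: a BLKmode
   aggregate goes to memory; an 8-byte vector stays in %mm0 only when
   MMX is enabled and TARGET_VECT8_RETURNS is not set; a 16-byte vector
   needs SSE and a 32-byte vector needs AVX to stay in a register; and
   long double (XFmode) is never forced to memory here.  */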
7424 static bool ATTRIBUTE_UNUSED
7425 return_in_memory_64 (const_tree type, enum machine_mode mode)
7427 int needed_intregs, needed_sseregs;
7428 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7431 static bool ATTRIBUTE_UNUSED
7432 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7434 HOST_WIDE_INT size = int_size_in_bytes (type);
7436 /* __m128 is returned in xmm0. */
7437 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7438 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7439 return false;
7441 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7442 return size != 1 && size != 2 && size != 4 && size != 8;
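/* Example of the MS 64-bit rule above (illustrative): a __m128 value
   comes back in %xmm0 and an 8-byte struct in %rax, while e.g. a
   12-byte struct fails the size test and is returned in memory.  */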
7445 static bool
7446 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7448 #ifdef SUBTARGET_RETURN_IN_MEMORY
7449 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7450 #else
7451 const enum machine_mode mode = type_natural_mode (type, NULL);
7453 if (TARGET_64BIT)
7455 if (ix86_function_type_abi (fntype) == MS_ABI)
7456 return return_in_memory_ms_64 (type, mode);
7457 else
7458 return return_in_memory_64 (type, mode);
7460 else
7461 return return_in_memory_32 (type, mode);
7462 #endif
7465 /* When returning SSE vector types, we have a choice of either
7466 (1) being abi incompatible with a -march switch, or
7467 (2) generating an error.
7468 Given no good solution, I think the safest thing is one warning.
7469 The user won't be able to use -Werror, but....
7471 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7472 called in response to actually generating a caller or callee that
7473 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7474 via aggregate_value_p for general type probing from tree-ssa. */
7476 static rtx
7477 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7479 static bool warnedsse, warnedmmx;
7481 if (!TARGET_64BIT && type)
7483 /* Look at the return type of the function, not the function type. */
7484 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7486 if (!TARGET_SSE && !warnedsse)
7488 if (mode == TImode
7489 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7491 warnedsse = true;
7492 warning (0, "SSE vector return without SSE enabled "
7493 "changes the ABI");
7497 if (!TARGET_MMX && !warnedmmx)
7499 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7501 warnedmmx = true;
7502 warning (0, "MMX vector return without MMX enabled "
7503 "changes the ABI");
7508 return NULL;
7512 /* Create the va_list data type. */
7514 /* Returns the calling convention specific va_list data type.
7515 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7517 static tree
7518 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7520 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7522 /* For i386 we use a plain pointer to the argument area. */
7523 if (!TARGET_64BIT || abi == MS_ABI)
7524 return build_pointer_type (char_type_node);
7526 record = lang_hooks.types.make_type (RECORD_TYPE);
7527 type_decl = build_decl (BUILTINS_LOCATION,
7528 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7530 f_gpr = build_decl (BUILTINS_LOCATION,
7531 FIELD_DECL, get_identifier ("gp_offset"),
7532 unsigned_type_node);
7533 f_fpr = build_decl (BUILTINS_LOCATION,
7534 FIELD_DECL, get_identifier ("fp_offset"),
7535 unsigned_type_node);
7536 f_ovf = build_decl (BUILTINS_LOCATION,
7537 FIELD_DECL, get_identifier ("overflow_arg_area"),
7538 ptr_type_node);
7539 f_sav = build_decl (BUILTINS_LOCATION,
7540 FIELD_DECL, get_identifier ("reg_save_area"),
7541 ptr_type_node);
7543 va_list_gpr_counter_field = f_gpr;
7544 va_list_fpr_counter_field = f_fpr;
7546 DECL_FIELD_CONTEXT (f_gpr) = record;
7547 DECL_FIELD_CONTEXT (f_fpr) = record;
7548 DECL_FIELD_CONTEXT (f_ovf) = record;
7549 DECL_FIELD_CONTEXT (f_sav) = record;
7551 TYPE_STUB_DECL (record) = type_decl;
7552 TYPE_NAME (record) = type_decl;
7553 TYPE_FIELDS (record) = f_gpr;
7554 DECL_CHAIN (f_gpr) = f_fpr;
7555 DECL_CHAIN (f_fpr) = f_ovf;
7556 DECL_CHAIN (f_ovf) = f_sav;
7558 layout_type (record);
7560 /* The correct type is an array type of one element. */
7561 return build_array_type (record, build_index_type (size_zero_node));
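/* For reference, the record built above corresponds to the familiar
   SysV AMD64 layout; this C equivalent is illustrative, not generated:
     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
   On 32-bit targets and for the MS ABI, va_list is just a char *.  */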
7564 /* Set up the builtin va_list data type and, for 64-bit, the additional
7565 calling convention specific va_list data types. */
7567 static tree
7568 ix86_build_builtin_va_list (void)
7570 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7572 /* Initialize abi specific va_list builtin types. */
7573 if (TARGET_64BIT)
7575 tree t;
7576 if (ix86_abi == MS_ABI)
7578 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7579 if (TREE_CODE (t) != RECORD_TYPE)
7580 t = build_variant_type_copy (t);
7581 sysv_va_list_type_node = t;
7583 else
7585 t = ret;
7586 if (TREE_CODE (t) != RECORD_TYPE)
7587 t = build_variant_type_copy (t);
7588 sysv_va_list_type_node = t;
7590 if (ix86_abi != MS_ABI)
7592 t = ix86_build_builtin_va_list_abi (MS_ABI);
7593 if (TREE_CODE (t) != RECORD_TYPE)
7594 t = build_variant_type_copy (t);
7595 ms_va_list_type_node = t;
7597 else
7599 t = ret;
7600 if (TREE_CODE (t) != RECORD_TYPE)
7601 t = build_variant_type_copy (t);
7602 ms_va_list_type_node = t;
7606 return ret;
7609 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7611 static void
7612 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7614 rtx save_area, mem;
7615 alias_set_type set;
7616 int i, max;
7618 /* GPR size of varargs save area. */
7619 if (cfun->va_list_gpr_size)
7620 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7621 else
7622 ix86_varargs_gpr_size = 0;
7624 /* FPR size of varargs save area. We don't need it if we don't pass
7625 anything in SSE registers. */
7626 if (TARGET_SSE && cfun->va_list_fpr_size)
7627 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7628 else
7629 ix86_varargs_fpr_size = 0;
7631 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7632 return;
7634 save_area = frame_pointer_rtx;
7635 set = get_varargs_alias_set ();
7637 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7638 if (max > X86_64_REGPARM_MAX)
7639 max = X86_64_REGPARM_MAX;
7641 for (i = cum->regno; i < max; i++)
7643 mem = gen_rtx_MEM (word_mode,
7644 plus_constant (save_area, i * UNITS_PER_WORD));
7645 MEM_NOTRAP_P (mem) = 1;
7646 set_mem_alias_set (mem, set);
7647 emit_move_insn (mem,
7648 gen_rtx_REG (word_mode,
7649 x86_64_int_parameter_registers[i]));
7652 if (ix86_varargs_fpr_size)
7654 enum machine_mode smode;
7655 rtx label, test;
7657 /* Now emit code to save SSE registers. The AX parameter contains number
7658 of SSE parameter registers used to call this function, though all we
7659 actually check here is the zero/non-zero status. */
7661 label = gen_label_rtx ();
7662 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7663 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7664 label));
7666 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7667 we used movdqa (i.e. TImode) instead? Perhaps even better would
7668 be if we could determine the real mode of the data, via a hook
7669 into pass_stdarg. Ignore all that for now. */
7670 smode = V4SFmode;
7671 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7672 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7674 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7675 if (max > X86_64_SSE_REGPARM_MAX)
7676 max = X86_64_SSE_REGPARM_MAX;
7678 for (i = cum->sse_regno; i < max; ++i)
7680 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7681 mem = gen_rtx_MEM (smode, mem);
7682 MEM_NOTRAP_P (mem) = 1;
7683 set_mem_alias_set (mem, set);
7684 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7686 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7689 emit_label (label);
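/* Sketch of the register save area laid out above (illustrative,
   assuming the usual X86_64_REGPARM_MAX of 6 and X86_64_SSE_REGPARM_MAX
   of 8): bytes 0..47 hold the six integer argument registers, one word
   each, and the following 16-byte slots hold the XMM argument
   registers, which are saved only when %al was non-zero at entry.  */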
7693 static void
7694 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7696 alias_set_type set = get_varargs_alias_set ();
7697 int i;
7699 /* Reset to zero, as there might have been a sysv va_arg used
7700 before. */
7701 ix86_varargs_gpr_size = 0;
7702 ix86_varargs_fpr_size = 0;
7704 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7706 rtx reg, mem;
7708 mem = gen_rtx_MEM (Pmode,
7709 plus_constant (virtual_incoming_args_rtx,
7710 i * UNITS_PER_WORD));
7711 MEM_NOTRAP_P (mem) = 1;
7712 set_mem_alias_set (mem, set);
7714 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7715 emit_move_insn (mem, reg);
7719 static void
7720 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7721 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7722 int no_rtl)
7724 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7725 CUMULATIVE_ARGS next_cum;
7726 tree fntype;
7728 /* This argument doesn't appear to be used anymore. Which is good,
7729 because the old code here didn't suppress rtl generation. */
7730 gcc_assert (!no_rtl);
7732 if (!TARGET_64BIT)
7733 return;
7735 fntype = TREE_TYPE (current_function_decl);
7737 /* For varargs, we do not want to skip the dummy va_dcl argument.
7738 For stdargs, we do want to skip the last named argument. */
7739 next_cum = *cum;
7740 if (stdarg_p (fntype))
7741 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7742 true);
7744 if (cum->call_abi == MS_ABI)
7745 setup_incoming_varargs_ms_64 (&next_cum);
7746 else
7747 setup_incoming_varargs_64 (&next_cum);
7752 /* Check whether TYPE is a va_list of the plain char * kind. */
7752 static bool
7753 is_va_list_char_pointer (tree type)
7755 tree canonic;
7757 /* For 32-bit it is always true. */
7758 if (!TARGET_64BIT)
7759 return true;
7760 canonic = ix86_canonical_va_list_type (type);
7761 return (canonic == ms_va_list_type_node
7762 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7765 /* Implement va_start. */
7767 static void
7768 ix86_va_start (tree valist, rtx nextarg)
7770 HOST_WIDE_INT words, n_gpr, n_fpr;
7771 tree f_gpr, f_fpr, f_ovf, f_sav;
7772 tree gpr, fpr, ovf, sav, t;
7773 tree type;
7774 rtx ovf_rtx;
7776 if (flag_split_stack
7777 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7779 unsigned int scratch_regno;
7781 /* When we are splitting the stack, we can't refer to the stack
7782 arguments using internal_arg_pointer, because they may be on
7783 the old stack. The split stack prologue will arrange to
7784 leave a pointer to the old stack arguments in a scratch
7785 register, which we here copy to a pseudo-register. The split
7786 stack prologue can't set the pseudo-register directly because
7787 it (the prologue) runs before any registers have been saved. */
7789 scratch_regno = split_stack_prologue_scratch_regno ();
7790 if (scratch_regno != INVALID_REGNUM)
7792 rtx reg, seq;
7794 reg = gen_reg_rtx (Pmode);
7795 cfun->machine->split_stack_varargs_pointer = reg;
7797 start_sequence ();
7798 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7799 seq = get_insns ();
7800 end_sequence ();
7802 push_topmost_sequence ();
7803 emit_insn_after (seq, entry_of_function ());
7804 pop_topmost_sequence ();
7808 /* Only the 64-bit target needs something special. */
7809 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7811 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7812 std_expand_builtin_va_start (valist, nextarg);
7813 else
7815 rtx va_r, next;
7817 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7818 next = expand_binop (ptr_mode, add_optab,
7819 cfun->machine->split_stack_varargs_pointer,
7820 crtl->args.arg_offset_rtx,
7821 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7822 convert_move (va_r, next, 0);
7824 return;
7827 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7828 f_fpr = DECL_CHAIN (f_gpr);
7829 f_ovf = DECL_CHAIN (f_fpr);
7830 f_sav = DECL_CHAIN (f_ovf);
7832 valist = build_simple_mem_ref (valist);
7833 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7834 /* The following should be folded into the MEM_REF offset. */
7835 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7836 f_gpr, NULL_TREE);
7837 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7838 f_fpr, NULL_TREE);
7839 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7840 f_ovf, NULL_TREE);
7841 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7842 f_sav, NULL_TREE);
7844 /* Count number of gp and fp argument registers used. */
7845 words = crtl->args.info.words;
7846 n_gpr = crtl->args.info.regno;
7847 n_fpr = crtl->args.info.sse_regno;
7849 if (cfun->va_list_gpr_size)
7851 type = TREE_TYPE (gpr);
7852 t = build2 (MODIFY_EXPR, type,
7853 gpr, build_int_cst (type, n_gpr * 8));
7854 TREE_SIDE_EFFECTS (t) = 1;
7855 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7858 if (TARGET_SSE && cfun->va_list_fpr_size)
7860 type = TREE_TYPE (fpr);
7861 t = build2 (MODIFY_EXPR, type, fpr,
7862 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7863 TREE_SIDE_EFFECTS (t) = 1;
7864 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7867 /* Find the overflow area. */
7868 type = TREE_TYPE (ovf);
7869 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7870 ovf_rtx = crtl->args.internal_arg_pointer;
7871 else
7872 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7873 t = make_tree (type, ovf_rtx);
7874 if (words != 0)
7875 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7876 t = build2 (MODIFY_EXPR, type, ovf, t);
7877 TREE_SIDE_EFFECTS (t) = 1;
7878 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7880 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7882 /* Find the register save area.
7883 The function prologue saves it right above the stack frame. */
7884 type = TREE_TYPE (sav);
7885 t = make_tree (type, frame_pointer_rtx);
7886 if (!ix86_varargs_gpr_size)
7887 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7888 t = build2 (MODIFY_EXPR, type, sav, t);
7889 TREE_SIDE_EFFECTS (t) = 1;
7890 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
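/* Worked example of the va_start expansion above (illustrative,
   assuming six GPR argument registers): for
     int f (const char *fmt, ...)
   one integer register is consumed by FMT, so gp_offset starts at 8,
   fp_offset starts at 48 (8 * X86_64_REGPARM_MAX) when no SSE argument
   registers were used, overflow_arg_area points at the first stack
   argument, and reg_save_area points at the area saved by
   setup_incoming_varargs_64 in the prologue.  */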
7894 /* Implement va_arg. */
7896 static tree
7897 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7898 gimple_seq *post_p)
7900 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7901 tree f_gpr, f_fpr, f_ovf, f_sav;
7902 tree gpr, fpr, ovf, sav, t;
7903 int size, rsize;
7904 tree lab_false, lab_over = NULL_TREE;
7905 tree addr, t2;
7906 rtx container;
7907 int indirect_p = 0;
7908 tree ptrtype;
7909 enum machine_mode nat_mode;
7910 unsigned int arg_boundary;
7912 /* Only the 64-bit target needs something special. */
7913 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7914 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7916 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7917 f_fpr = DECL_CHAIN (f_gpr);
7918 f_ovf = DECL_CHAIN (f_fpr);
7919 f_sav = DECL_CHAIN (f_ovf);
7921 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7922 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7923 valist = build_va_arg_indirect_ref (valist);
7924 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7925 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7926 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7928 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7929 if (indirect_p)
7930 type = build_pointer_type (type);
7931 size = int_size_in_bytes (type);
7932 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7934 nat_mode = type_natural_mode (type, NULL);
7935 switch (nat_mode)
7937 case V8SFmode:
7938 case V8SImode:
7939 case V32QImode:
7940 case V16HImode:
7941 case V4DFmode:
7942 case V4DImode:
7943 /* Unnamed 256bit vector mode parameters are passed on stack. */
7944 if (!TARGET_64BIT_MS_ABI)
7946 container = NULL;
7947 break;
7950 default:
7951 container = construct_container (nat_mode, TYPE_MODE (type),
7952 type, 0, X86_64_REGPARM_MAX,
7953 X86_64_SSE_REGPARM_MAX, intreg,
7954 0);
7955 break;
7958 /* Pull the value out of the saved registers. */
7960 addr = create_tmp_var (ptr_type_node, "addr");
7962 if (container)
7964 int needed_intregs, needed_sseregs;
7965 bool need_temp;
7966 tree int_addr, sse_addr;
7968 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7969 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7971 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7973 need_temp = (!REG_P (container)
7974 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7975 || TYPE_ALIGN (type) > 128));
7977 /* In case we are passing a structure, verify that it is a consecutive block
7978 in the register save area. If not, we need to do moves. */
7979 if (!need_temp && !REG_P (container))
7981 /* Verify that all registers are strictly consecutive */
7982 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7984 int i;
7986 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7988 rtx slot = XVECEXP (container, 0, i);
7989 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7990 || INTVAL (XEXP (slot, 1)) != i * 16)
7991 need_temp = 1;
7994 else
7996 int i;
7998 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8000 rtx slot = XVECEXP (container, 0, i);
8001 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8002 || INTVAL (XEXP (slot, 1)) != i * 8)
8003 need_temp = 1;
8007 if (!need_temp)
8009 int_addr = addr;
8010 sse_addr = addr;
8012 else
8014 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8015 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8018 /* First ensure that we fit completely in registers. */
8019 if (needed_intregs)
8021 t = build_int_cst (TREE_TYPE (gpr),
8022 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8023 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8024 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8025 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8026 gimplify_and_add (t, pre_p);
8028 if (needed_sseregs)
8030 t = build_int_cst (TREE_TYPE (fpr),
8031 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8032 + X86_64_REGPARM_MAX * 8);
8033 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8034 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8035 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8036 gimplify_and_add (t, pre_p);
8039 /* Compute index to start of area used for integer regs. */
8040 if (needed_intregs)
8042 /* int_addr = gpr + sav; */
8043 t = fold_build_pointer_plus (sav, gpr);
8044 gimplify_assign (int_addr, t, pre_p);
8046 if (needed_sseregs)
8048 /* sse_addr = fpr + sav; */
8049 t = fold_build_pointer_plus (sav, fpr);
8050 gimplify_assign (sse_addr, t, pre_p);
8052 if (need_temp)
8054 int i, prev_size = 0;
8055 tree temp = create_tmp_var (type, "va_arg_tmp");
8057 /* addr = &temp; */
8058 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8059 gimplify_assign (addr, t, pre_p);
8061 for (i = 0; i < XVECLEN (container, 0); i++)
8063 rtx slot = XVECEXP (container, 0, i);
8064 rtx reg = XEXP (slot, 0);
8065 enum machine_mode mode = GET_MODE (reg);
8066 tree piece_type;
8067 tree addr_type;
8068 tree daddr_type;
8069 tree src_addr, src;
8070 int src_offset;
8071 tree dest_addr, dest;
8072 int cur_size = GET_MODE_SIZE (mode);
8074 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8075 prev_size = INTVAL (XEXP (slot, 1));
8076 if (prev_size + cur_size > size)
8078 cur_size = size - prev_size;
8079 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8080 if (mode == BLKmode)
8081 mode = QImode;
8083 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8084 if (mode == GET_MODE (reg))
8085 addr_type = build_pointer_type (piece_type);
8086 else
8087 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8088 true);
8089 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8090 true);
8092 if (SSE_REGNO_P (REGNO (reg)))
8094 src_addr = sse_addr;
8095 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8097 else
8099 src_addr = int_addr;
8100 src_offset = REGNO (reg) * 8;
8102 src_addr = fold_convert (addr_type, src_addr);
8103 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8105 dest_addr = fold_convert (daddr_type, addr);
8106 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8107 if (cur_size == GET_MODE_SIZE (mode))
8109 src = build_va_arg_indirect_ref (src_addr);
8110 dest = build_va_arg_indirect_ref (dest_addr);
8112 gimplify_assign (dest, src, pre_p);
8114 else
8116 tree copy
8117 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8118 3, dest_addr, src_addr,
8119 size_int (cur_size));
8120 gimplify_and_add (copy, pre_p);
8122 prev_size += cur_size;
8126 if (needed_intregs)
8128 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8129 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8130 gimplify_assign (gpr, t, pre_p);
8133 if (needed_sseregs)
8135 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8136 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8137 gimplify_assign (fpr, t, pre_p);
8140 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8142 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8145 /* ... otherwise out of the overflow area. */
8147 /* When we align a parameter on the stack for the caller, if the parameter
8148 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8149 aligned to MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8150 here with the caller. */
8151 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8152 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8153 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8155 /* Care for on-stack alignment if needed. */
8156 if (arg_boundary <= 64 || size == 0)
8157 t = ovf;
8158 else
8160 HOST_WIDE_INT align = arg_boundary / 8;
8161 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8162 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8163 build_int_cst (TREE_TYPE (t), -align));
8166 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8167 gimplify_assign (addr, t, pre_p);
8169 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8170 gimplify_assign (unshare_expr (ovf), t, pre_p);
8172 if (container)
8173 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8175 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8176 addr = fold_convert (ptrtype, addr);
8178 if (indirect_p)
8179 addr = build_va_arg_indirect_ref (addr);
8180 return build_va_arg_indirect_ref (addr);
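/* Conceptual summary of the expansion above (illustrative, assuming six
   GPR argument registers): for  va_arg (ap, int)  the generated gimple
   tests ap->gp_offset against 48; if it is below the limit the value is
   loaded from ap->reg_save_area + gp_offset and gp_offset is bumped by
   8, otherwise it is loaded from ap->overflow_arg_area, which is bumped
   by the word-rounded argument size.  Arguments passed by reference are
   fetched through one extra indirection at the end.  */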
8183 /* Return true if OPNUM's MEM should be matched
8184 in movabs* patterns. */
8186 bool
8187 ix86_check_movabs (rtx insn, int opnum)
8189 rtx set, mem;
8191 set = PATTERN (insn);
8192 if (GET_CODE (set) == PARALLEL)
8193 set = XVECEXP (set, 0, 0);
8194 gcc_assert (GET_CODE (set) == SET);
8195 mem = XEXP (set, opnum);
8196 while (GET_CODE (mem) == SUBREG)
8197 mem = SUBREG_REG (mem);
8198 gcc_assert (MEM_P (mem));
8199 return volatile_ok || !MEM_VOLATILE_P (mem);
8202 /* Initialize the table of extra 80387 mathematical constants. */
8204 static void
8205 init_ext_80387_constants (void)
8207 static const char * cst[5] =
8209 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8210 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8211 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8212 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8213 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8215 int i;
8217 for (i = 0; i < 5; i++)
8219 real_from_string (&ext_80387_constants_table[i], cst[i]);
8220 /* Ensure each constant is rounded to XFmode precision. */
8221 real_convert (&ext_80387_constants_table[i],
8222 XFmode, &ext_80387_constants_table[i]);
8225 ext_80387_constants_init = 1;
8228 /* Return non-zero if the constant is something that
8229 can be loaded with a special instruction. */
8231 int
8232 standard_80387_constant_p (rtx x)
8234 enum machine_mode mode = GET_MODE (x);
8236 REAL_VALUE_TYPE r;
8238 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8239 return -1;
8241 if (x == CONST0_RTX (mode))
8242 return 1;
8243 if (x == CONST1_RTX (mode))
8244 return 2;
8246 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8248 /* For XFmode constants, try to find a special 80387 instruction when
8249 optimizing for size or on those CPUs that benefit from them. */
8250 if (mode == XFmode
8251 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8253 int i;
8255 if (! ext_80387_constants_init)
8256 init_ext_80387_constants ();
8258 for (i = 0; i < 5; i++)
8259 if (real_identical (&r, &ext_80387_constants_table[i]))
8260 return i + 3;
8263 /* A load of the constant -0.0 or -1.0 will be split into an
8264 fldz;fchs or fld1;fchs sequence. */
8265 if (real_isnegzero (&r))
8266 return 8;
8267 if (real_identical (&r, &dconstm1))
8268 return 9;
8270 return 0;
8273 /* Return the opcode of the special instruction to be used to load
8274 the constant X. */
8276 const char *
8277 standard_80387_constant_opcode (rtx x)
8279 switch (standard_80387_constant_p (x))
8281 case 1:
8282 return "fldz";
8283 case 2:
8284 return "fld1";
8285 case 3:
8286 return "fldlg2";
8287 case 4:
8288 return "fldln2";
8289 case 5:
8290 return "fldl2e";
8291 case 6:
8292 return "fldl2t";
8293 case 7:
8294 return "fldpi";
8295 case 8:
8296 case 9:
8297 return "#";
8298 default:
8299 gcc_unreachable ();
8303 /* Return the CONST_DOUBLE representing the 80387 constant that is
8304 loaded by the specified special instruction. The argument IDX
8305 matches the return value from standard_80387_constant_p. */
8307 rtx
8308 standard_80387_constant_rtx (int idx)
8310 int i;
8312 if (! ext_80387_constants_init)
8313 init_ext_80387_constants ();
8315 switch (idx)
8317 case 3:
8318 case 4:
8319 case 5:
8320 case 6:
8321 case 7:
8322 i = idx - 3;
8323 break;
8325 default:
8326 gcc_unreachable ();
8329 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8330 XFmode);
8333 /* Return 1 if X is all 0s and 2 if X is all 1s
8334 in a supported SSE/AVX vector mode. */
8336 int
8337 standard_sse_constant_p (rtx x)
8339 enum machine_mode mode = GET_MODE (x);
8341 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8342 return 1;
8343 if (vector_all_ones_operand (x, mode))
8344 switch (mode)
8346 case V16QImode:
8347 case V8HImode:
8348 case V4SImode:
8349 case V2DImode:
8350 if (TARGET_SSE2)
8351 return 2;
8352 case V32QImode:
8353 case V16HImode:
8354 case V8SImode:
8355 case V4DImode:
8356 if (TARGET_AVX2)
8357 return 2;
8358 default:
8359 break;
8362 return 0;
8365 /* Return the opcode of the special instruction to be used to load
8366 the constant X. */
8368 const char *
8369 standard_sse_constant_opcode (rtx insn, rtx x)
8371 switch (standard_sse_constant_p (x))
8373 case 1:
8374 switch (get_attr_mode (insn))
8376 case MODE_TI:
8377 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8378 return "%vpxor\t%0, %d0";
8379 case MODE_V2DF:
8380 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8381 return "%vxorpd\t%0, %d0";
8382 case MODE_V4SF:
8383 return "%vxorps\t%0, %d0";
8385 case MODE_OI:
8386 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8387 return "vpxor\t%x0, %x0, %x0";
8388 case MODE_V4DF:
8389 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8390 return "vxorpd\t%x0, %x0, %x0";
8391 case MODE_V8SF:
8392 return "vxorps\t%x0, %x0, %x0";
8394 default:
8395 break;
8398 case 2:
8399 if (TARGET_AVX)
8400 return "vpcmpeqd\t%0, %0, %0";
8401 else
8402 return "pcmpeqd\t%0, %0";
8404 default:
8405 break;
8407 gcc_unreachable ();
8410 /* Returns true if OP contains a symbol reference */
8412 bool
8413 symbolic_reference_mentioned_p (rtx op)
8415 const char *fmt;
8416 int i;
8418 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8419 return true;
8421 fmt = GET_RTX_FORMAT (GET_CODE (op));
8422 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8424 if (fmt[i] == 'E')
8426 int j;
8428 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8429 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8430 return true;
8433 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8434 return true;
8437 return false;
8440 /* Return true if it is appropriate to emit `ret' instructions in the
8441 body of a function. Do this only if the epilogue is simple, needing a
8442 couple of insns. Prior to reloading, we can't tell how many registers
8443 must be saved, so return false then. Return false if there is no frame
8444 marker to de-allocate. */
8446 bool
8447 ix86_can_use_return_insn_p (void)
8449 struct ix86_frame frame;
8451 if (! reload_completed || frame_pointer_needed)
8452 return 0;
8454 /* Don't allow more than 32k pop, since that's all we can do
8455 with one instruction. */
8456 if (crtl->args.pops_args && crtl->args.size >= 32768)
8457 return 0;
8459 ix86_compute_frame_layout (&frame);
8460 return (frame.stack_pointer_offset == UNITS_PER_WORD
8461 && (frame.nregs + frame.nsseregs) == 0);
8464 /* Value should be nonzero if functions must have frame pointers.
8465 Zero means the frame pointer need not be set up (and parms may
8466 be accessed via the stack pointer) in functions that seem suitable. */
8468 static bool
8469 ix86_frame_pointer_required (void)
8471 /* If we accessed previous frames, then the generated code expects
8472 to be able to access the saved ebp value in our frame. */
8473 if (cfun->machine->accesses_prev_frame)
8474 return true;
8476 /* Several x86 OSes need a frame pointer for other reasons,
8477 usually pertaining to setjmp. */
8478 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8479 return true;
8481 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8482 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8483 return true;
8485 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8486 turns off the frame pointer by default. Turn it back on now if
8487 we've not got a leaf function. */
8488 if (TARGET_OMIT_LEAF_FRAME_POINTER
8489 && (!current_function_is_leaf
8490 || ix86_current_function_calls_tls_descriptor))
8491 return true;
8493 if (crtl->profile && !flag_fentry)
8494 return true;
8496 return false;
8499 /* Record that the current function accesses previous call frames. */
8501 void
8502 ix86_setup_frame_addresses (void)
8504 cfun->machine->accesses_prev_frame = 1;
8507 #ifndef USE_HIDDEN_LINKONCE
8508 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8509 # define USE_HIDDEN_LINKONCE 1
8510 # else
8511 # define USE_HIDDEN_LINKONCE 0
8512 # endif
8513 #endif
8515 static int pic_labels_used;
8517 /* Fills in the label name that should be used for a pc thunk for
8518 the given register. */
8520 static void
8521 get_pc_thunk_name (char name[32], unsigned int regno)
8523 gcc_assert (!TARGET_64BIT);
8525 if (USE_HIDDEN_LINKONCE)
8526 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8527 else
8528 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8532 /* This function generates code for -fpic that loads %ebx with
8533 the return address of the caller and then returns. */
8535 static void
8536 ix86_code_end (void)
8538 rtx xops[2];
8539 int regno;
8541 for (regno = AX_REG; regno <= SP_REG; regno++)
8543 char name[32];
8544 tree decl;
8546 if (!(pic_labels_used & (1 << regno)))
8547 continue;
8549 get_pc_thunk_name (name, regno);
8551 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8552 get_identifier (name),
8553 build_function_type_list (void_type_node, NULL_TREE));
8554 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8555 NULL_TREE, void_type_node);
8556 TREE_PUBLIC (decl) = 1;
8557 TREE_STATIC (decl) = 1;
8559 #if TARGET_MACHO
8560 if (TARGET_MACHO)
8562 switch_to_section (darwin_sections[text_coal_section]);
8563 fputs ("\t.weak_definition\t", asm_out_file);
8564 assemble_name (asm_out_file, name);
8565 fputs ("\n\t.private_extern\t", asm_out_file);
8566 assemble_name (asm_out_file, name);
8567 putc ('\n', asm_out_file);
8568 ASM_OUTPUT_LABEL (asm_out_file, name);
8569 DECL_WEAK (decl) = 1;
8571 else
8572 #endif
8573 if (USE_HIDDEN_LINKONCE)
8575 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8577 targetm.asm_out.unique_section (decl, 0);
8578 switch_to_section (get_named_section (decl, NULL, 0));
8580 targetm.asm_out.globalize_label (asm_out_file, name);
8581 fputs ("\t.hidden\t", asm_out_file);
8582 assemble_name (asm_out_file, name);
8583 putc ('\n', asm_out_file);
8584 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8586 else
8588 switch_to_section (text_section);
8589 ASM_OUTPUT_LABEL (asm_out_file, name);
8592 DECL_INITIAL (decl) = make_node (BLOCK);
8593 current_function_decl = decl;
8594 init_function_start (decl);
8595 first_function_block_is_cold = false;
8596 /* Make sure unwind info is emitted for the thunk if needed. */
8597 final_start_function (emit_barrier (), asm_out_file, 1);
8599 /* Pad stack IP move with 4 instructions (two NOPs count
8600 as one instruction). */
8601 if (TARGET_PAD_SHORT_FUNCTION)
8603 int i = 8;
8605 while (i--)
8606 fputs ("\tnop\n", asm_out_file);
8609 xops[0] = gen_rtx_REG (Pmode, regno);
8610 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8611 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8612 fputs ("\tret\n", asm_out_file);
8613 final_end_function ();
8614 init_insn_lengths ();
8615 free_after_compilation (cfun);
8616 set_cfun (NULL);
8617 current_function_decl = NULL;
8620 if (flag_split_stack)
8621 file_end_indicate_split_stack ();
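/* For a 32-bit PIC build, each thunk emitted by the loop above looks
   roughly like this (illustrative AT&T assembly):
       __x86.get_pc_thunk.bx:
               mov     (%esp), %ebx
               ret
   i.e. it copies its own return address, the address of the
   instruction following the call, into the requested register.  */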
8624 /* Emit code for the SET_GOT patterns. */
8626 const char *
8627 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8629 rtx xops[3];
8631 xops[0] = dest;
8633 if (TARGET_VXWORKS_RTP && flag_pic)
8635 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8636 xops[2] = gen_rtx_MEM (Pmode,
8637 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8638 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8640 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8641 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8642 an unadorned address. */
8643 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8644 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8645 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8646 return "";
8649 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8651 if (!flag_pic)
8653 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8655 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8657 #if TARGET_MACHO
8658 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8659 is what will be referenced by the Mach-O PIC subsystem. */
8660 if (!label)
8661 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8662 #endif
8664 targetm.asm_out.internal_label (asm_out_file, "L",
8665 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8667 else
8669 char name[32];
8670 get_pc_thunk_name (name, REGNO (dest));
8671 pic_labels_used |= 1 << REGNO (dest);
8673 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8674 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8675 output_asm_insn ("call\t%X2", xops);
8676 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8677 is what will be referenced by the Mach-O PIC subsystem. */
8678 #if TARGET_MACHO
8679 if (!label)
8680 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8681 else
8682 targetm.asm_out.internal_label (asm_out_file, "L",
8683 CODE_LABEL_NUMBER (label));
8684 #endif
8687 if (!TARGET_MACHO)
8688 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8690 return "";
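/* Illustrative output of the routine above for a PIC prologue using
   %ebx (AT&T syntax, exact labels vary):
       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx
   In the non-PIC case the thunk call is replaced by a move from a
   local label, and on VxWorks RTP the GOT base comes from
   VXWORKS_GOTT_BASE / VXWORKS_GOTT_INDEX instead.  */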
8693 /* Generate a "push" pattern for input ARG. */
8695 static rtx
8696 gen_push (rtx arg)
8698 struct machine_function *m = cfun->machine;
8700 if (m->fs.cfa_reg == stack_pointer_rtx)
8701 m->fs.cfa_offset += UNITS_PER_WORD;
8702 m->fs.sp_offset += UNITS_PER_WORD;
8704 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8705 arg = gen_rtx_REG (word_mode, REGNO (arg));
8707 return gen_rtx_SET (VOIDmode,
8708 gen_rtx_MEM (word_mode,
8709 gen_rtx_PRE_DEC (Pmode,
8710 stack_pointer_rtx)),
8711 arg);
8714 /* Generate a "pop" pattern for input ARG. */
8716 static rtx
8717 gen_pop (rtx arg)
8719 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8720 arg = gen_rtx_REG (word_mode, REGNO (arg));
8722 return gen_rtx_SET (VOIDmode,
8723 arg,
8724 gen_rtx_MEM (word_mode,
8725 gen_rtx_POST_INC (Pmode,
8726 stack_pointer_rtx)));
8729 /* Return >= 0 if there is an unused call-clobbered register available
8730 for the entire function. */
8732 static unsigned int
8733 ix86_select_alt_pic_regnum (void)
8735 if (current_function_is_leaf
8736 && !crtl->profile
8737 && !ix86_current_function_calls_tls_descriptor)
8739 int i, drap;
8740 /* Can't use the same register for both PIC and DRAP. */
8741 if (crtl->drap_reg)
8742 drap = REGNO (crtl->drap_reg);
8743 else
8744 drap = -1;
8745 for (i = 2; i >= 0; --i)
8746 if (i != drap && !df_regs_ever_live_p (i))
8747 return i;
8750 return INVALID_REGNUM;
8753 /* Return TRUE if we need to save REGNO. */
8755 static bool
8756 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8758 if (pic_offset_table_rtx
8759 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8760 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8761 || crtl->profile
8762 || crtl->calls_eh_return
8763 || crtl->uses_const_pool))
8764 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8766 if (crtl->calls_eh_return && maybe_eh_return)
8768 unsigned i;
8769 for (i = 0; ; i++)
8771 unsigned test = EH_RETURN_DATA_REGNO (i);
8772 if (test == INVALID_REGNUM)
8773 break;
8774 if (test == regno)
8775 return true;
8779 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8780 return true;
8782 return (df_regs_ever_live_p (regno)
8783 && !call_used_regs[regno]
8784 && !fixed_regs[regno]
8785 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8788 /* Return the number of saved general purpose registers. */
8790 static int
8791 ix86_nsaved_regs (void)
8793 int nregs = 0;
8794 int regno;
8796 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8797 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8798 nregs ++;
8799 return nregs;
8802 /* Return the number of saved SSE registers. */
8804 static int
8805 ix86_nsaved_sseregs (void)
8807 int nregs = 0;
8808 int regno;
8810 if (!TARGET_64BIT_MS_ABI)
8811 return 0;
8812 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8813 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8814 nregs ++;
8815 return nregs;
8818 /* Given FROM and TO register numbers, say whether this elimination is
8819 allowed. If stack alignment is needed, we can only replace argument
8820 pointer with hard frame pointer, or replace frame pointer with stack
8821 pointer. Otherwise, frame pointer elimination is automatically
8822 handled and all other eliminations are valid. */
8824 static bool
8825 ix86_can_eliminate (const int from, const int to)
8827 if (stack_realign_fp)
8828 return ((from == ARG_POINTER_REGNUM
8829 && to == HARD_FRAME_POINTER_REGNUM)
8830 || (from == FRAME_POINTER_REGNUM
8831 && to == STACK_POINTER_REGNUM));
8832 else
8833 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8836 /* Return the offset between two registers, one to be eliminated, and the other
8837 its replacement, at the start of a routine. */
8839 HOST_WIDE_INT
8840 ix86_initial_elimination_offset (int from, int to)
8842 struct ix86_frame frame;
8843 ix86_compute_frame_layout (&frame);
8845 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8846 return frame.hard_frame_pointer_offset;
8847 else if (from == FRAME_POINTER_REGNUM
8848 && to == HARD_FRAME_POINTER_REGNUM)
8849 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8850 else
8852 gcc_assert (to == STACK_POINTER_REGNUM);
8854 if (from == ARG_POINTER_REGNUM)
8855 return frame.stack_pointer_offset;
8857 gcc_assert (from == FRAME_POINTER_REGNUM);
8858 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8862 /* In a dynamically-aligned function, we can't know the offset from
8863 stack pointer to frame pointer, so we must ensure that setjmp
8864 eliminates fp against the hard fp (%ebp) rather than trying to
8865 index from %esp up to the top of the frame across a gap that is
8866 of unknown (at compile-time) size. */
8867 static rtx
8868 ix86_builtin_setjmp_frame_value (void)
8870 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8873 /* When using -fsplit-stack, the allocation routines set a field in
8874 the TCB to the bottom of the stack plus this much space, measured
8875 in bytes. */
8877 #define SPLIT_STACK_AVAILABLE 256
8879 /* Fill in the ix86_frame structure describing the frame of the currently compiled function. */
8881 static void
8882 ix86_compute_frame_layout (struct ix86_frame *frame)
8884 unsigned int stack_alignment_needed;
8885 HOST_WIDE_INT offset;
8886 unsigned int preferred_alignment;
8887 HOST_WIDE_INT size = get_frame_size ();
8888 HOST_WIDE_INT to_allocate;
8890 frame->nregs = ix86_nsaved_regs ();
8891 frame->nsseregs = ix86_nsaved_sseregs ();
8893 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8894 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8896 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8897 for function prologues and leaf functions. */
8898 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8899 && (!current_function_is_leaf || cfun->calls_alloca != 0
8900 || ix86_current_function_calls_tls_descriptor))
8902 preferred_alignment = 16;
8903 stack_alignment_needed = 16;
8904 crtl->preferred_stack_boundary = 128;
8905 crtl->stack_alignment_needed = 128;
8908 gcc_assert (!size || stack_alignment_needed);
8909 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8910 gcc_assert (preferred_alignment <= stack_alignment_needed);
8912 /* For SEH we have to limit the amount of code movement into the prologue.
8913 At present we do this via a BLOCKAGE, at which point there's very little
8914 scheduling that can be done, which means that there's very little point
8915 in doing anything except PUSHs. */
8916 if (TARGET_SEH)
8917 cfun->machine->use_fast_prologue_epilogue = false;
8919 /* During the reload iteration the number of registers saved can change.
8920 Recompute the value as needed. Do not recompute when the number of registers
8921 didn't change, as reload does multiple calls to the function and does not
8922 expect the decision to change within a single iteration. */
8923 else if (!optimize_function_for_size_p (cfun)
8924 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8926 int count = frame->nregs;
8927 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8929 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8931 /* The fast prologue uses move instead of push to save registers. This
8932 is significantly longer, but also executes faster as modern hardware
8933 can execute the moves in parallel, but can't do that for push/pop.
8935 Be careful about choosing which prologue to emit: when the function takes
8936 many instructions to execute we may use the slow version, as well as in
8937 case the function is known to be outside a hot spot (this is known with
8938 feedback only). Weight the size of the function by the number of registers
8939 to save, as it is cheap to use one or two push instructions but very
8940 slow to use many of them. */
8941 if (count)
8942 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8943 if (node->frequency < NODE_FREQUENCY_NORMAL
8944 || (flag_branch_probabilities
8945 && node->frequency < NODE_FREQUENCY_HOT))
8946 cfun->machine->use_fast_prologue_epilogue = false;
8947 else
8948 cfun->machine->use_fast_prologue_epilogue
8949 = !expensive_function_p (count);
8952 frame->save_regs_using_mov
8953 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8954 /* If static stack checking is enabled and done with probes,
8955 the registers need to be saved before allocating the frame. */
8956 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8958 /* Skip return address. */
8959 offset = UNITS_PER_WORD;
8961 /* Skip pushed static chain. */
8962 if (ix86_static_chain_on_stack)
8963 offset += UNITS_PER_WORD;
8965 /* Skip saved base pointer. */
8966 if (frame_pointer_needed)
8967 offset += UNITS_PER_WORD;
8968 frame->hfp_save_offset = offset;
8970 /* The traditional frame pointer location is at the top of the frame. */
8971 frame->hard_frame_pointer_offset = offset;
8973 /* Register save area */
8974 offset += frame->nregs * UNITS_PER_WORD;
8975 frame->reg_save_offset = offset;
8977 /* Align and set SSE register save area. */
8978 if (frame->nsseregs)
8980 /* The only ABI that has saved SSE registers (Win64) also has a
8981 16-byte aligned default stack, and thus we don't need to be
8982 within the re-aligned local stack frame to save them. */
8983 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8984 offset = (offset + 16 - 1) & -16;
8985 offset += frame->nsseregs * 16;
8987 frame->sse_reg_save_offset = offset;
8989 /* The re-aligned stack starts here. Values before this point are not
8990 directly comparable with values below this point. In order to make
8991 sure that no value happens to be the same before and after, force
8992 the alignment computation below to add a non-zero value. */
8993 if (stack_realign_fp)
8994 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8996 /* Va-arg area */
8997 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8998 offset += frame->va_arg_size;
9000 /* Align start of frame for local function. */
9001 if (stack_realign_fp
9002 || offset != frame->sse_reg_save_offset
9003 || size != 0
9004 || !current_function_is_leaf
9005 || cfun->calls_alloca
9006 || ix86_current_function_calls_tls_descriptor)
9007 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9009 /* Frame pointer points here. */
9010 frame->frame_pointer_offset = offset;
9012 offset += size;
9014 /* Add the outgoing arguments area. It can be skipped if we eliminated
9015 all the function calls as dead code.
9016 Skipping is however impossible when the function calls alloca. The alloca
9017 expander assumes that the last crtl->outgoing_args_size bytes
9018 of the stack frame are unused. */
9019 if (ACCUMULATE_OUTGOING_ARGS
9020 && (!current_function_is_leaf || cfun->calls_alloca
9021 || ix86_current_function_calls_tls_descriptor))
9023 offset += crtl->outgoing_args_size;
9024 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9026 else
9027 frame->outgoing_arguments_size = 0;
9029 /* Align stack boundary. Only needed if we're calling another function
9030 or using alloca. */
9031 if (!current_function_is_leaf || cfun->calls_alloca
9032 || ix86_current_function_calls_tls_descriptor)
9033 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9035 /* We've reached end of stack frame. */
9036 frame->stack_pointer_offset = offset;
9038 /* Size prologue needs to allocate. */
9039 to_allocate = offset - frame->sse_reg_save_offset;
9041 if ((!to_allocate && frame->nregs <= 1)
9042 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9043 frame->save_regs_using_mov = false;
9045 if (ix86_using_red_zone ()
9046 && current_function_sp_is_unchanging
9047 && current_function_is_leaf
9048 && !ix86_current_function_calls_tls_descriptor)
9050 frame->red_zone_size = to_allocate;
9051 if (frame->save_regs_using_mov)
9052 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9053 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9054 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9056 else
9057 frame->red_zone_size = 0;
9058 frame->stack_pointer_offset -= frame->red_zone_size;
9060 /* The SEH frame pointer location is near the bottom of the frame.
9061 This is enforced by the fact that the difference between the
9062 stack pointer and the frame pointer is limited to 240 bytes in
9063 the unwind data structure. */
9064 if (TARGET_SEH)
9066 HOST_WIDE_INT diff;
9068 /* If we can leave the frame pointer where it is, do so. */
9069 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9070 if (diff > 240 || (diff & 15) != 0)
9072 /* Ideally we'd determine what portion of the local stack frame
9073 (within the constraint of the lowest 240) is most heavily used.
9074 But without that complication, simply bias the frame pointer
9075 by 128 bytes so as to maximize the amount of the local stack
9076 frame that is addressable with 8-bit offsets. */
9077 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
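/* Rough picture of the layout computed above, from higher to lower
   addresses (illustrative; bracketed parts are optional):
       return address
       [pushed static chain]
       [saved frame pointer]        <- hard_frame_pointer_offset
       saved general registers      <- reg_save_offset
       [saved SSE registers]        <- sse_reg_save_offset (Win64 only)
       [va_arg register save area]
       local variables              <- frame_pointer_offset
       [outgoing arguments]
                                    <- stack_pointer_offset
   less any red zone subtracted for leaf functions, and with the SEH
   adjustment above possibly moving the hard frame pointer down.  */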
9082 /* This is semi-inlined memory_address_length, but simplified
9083 since we know that we're always dealing with reg+offset, and
9084 to avoid having to create and discard all that rtl. */
9086 static inline int
9087 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9089 int len = 4;
9091 if (offset == 0)
9093 /* EBP and R13 cannot be encoded without an offset. */
9094 len = (regno == BP_REG || regno == R13_REG);
9096 else if (IN_RANGE (offset, -128, 127))
9097 len = 1;
9099 /* ESP and R12 must be encoded with a SIB byte. */
9100 if (regno == SP_REG || regno == R12_REG)
9101 len++;
9103 return len;
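/* A few worked examples of the extra encoding bytes computed above:
   (SP_REG, 0)   -> 1 byte  (no displacement, but ESP needs a SIB byte),
   (BP_REG, 0)   -> 1 byte  (EBP cannot be encoded without a disp8),
   (AX_REG, 64)  -> 1 byte  (fits in a signed 8-bit displacement),
   (AX_REG, 256) -> 4 bytes (requires a 32-bit displacement).  */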
9106 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9107 The valid base registers are taken from CFUN->MACHINE->FS. */
9109 static rtx
9110 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9112 const struct machine_function *m = cfun->machine;
9113 rtx base_reg = NULL;
9114 HOST_WIDE_INT base_offset = 0;
9116 if (m->use_fast_prologue_epilogue)
9118 /* Choose the base register most likely to allow the most scheduling
9119 opportunities. Generally FP is valid throughout the function,
9120 while DRAP must be reloaded within the epilogue. But choose either
9121 over the SP due to increased encoding size. */
9123 if (m->fs.fp_valid)
9125 base_reg = hard_frame_pointer_rtx;
9126 base_offset = m->fs.fp_offset - cfa_offset;
9128 else if (m->fs.drap_valid)
9130 base_reg = crtl->drap_reg;
9131 base_offset = 0 - cfa_offset;
9133 else if (m->fs.sp_valid)
9135 base_reg = stack_pointer_rtx;
9136 base_offset = m->fs.sp_offset - cfa_offset;
9139 else
9141 HOST_WIDE_INT toffset;
9142 int len = 16, tlen;
9144 /* Choose the base register with the smallest address encoding.
9145 With a tie, choose FP > DRAP > SP. */
9146 if (m->fs.sp_valid)
9148 base_reg = stack_pointer_rtx;
9149 base_offset = m->fs.sp_offset - cfa_offset;
9150 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9152 if (m->fs.drap_valid)
9154 toffset = 0 - cfa_offset;
9155 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9156 if (tlen <= len)
9158 base_reg = crtl->drap_reg;
9159 base_offset = toffset;
9160 len = tlen;
9163 if (m->fs.fp_valid)
9165 toffset = m->fs.fp_offset - cfa_offset;
9166 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9167 if (tlen <= len)
9169 base_reg = hard_frame_pointer_rtx;
9170 base_offset = toffset;
9171 len = tlen;
9175 gcc_assert (base_reg != NULL);
9177 return plus_constant (base_reg, base_offset);
9180 /* Emit code to save registers in the prologue. */
9182 static void
9183 ix86_emit_save_regs (void)
9185 unsigned int regno;
9186 rtx insn;
9188 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9189 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9191 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9192 RTX_FRAME_RELATED_P (insn) = 1;
9196 /* Emit a single register save at CFA - CFA_OFFSET. */
9198 static void
9199 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9200 HOST_WIDE_INT cfa_offset)
9202 struct machine_function *m = cfun->machine;
9203 rtx reg = gen_rtx_REG (mode, regno);
9204 rtx mem, addr, base, insn;
9206 addr = choose_baseaddr (cfa_offset);
9207 mem = gen_frame_mem (mode, addr);
9209 /* For SSE saves, we need to indicate the 128-bit alignment. */
9210 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9212 insn = emit_move_insn (mem, reg);
9213 RTX_FRAME_RELATED_P (insn) = 1;
9215 base = addr;
9216 if (GET_CODE (base) == PLUS)
9217 base = XEXP (base, 0);
9218 gcc_checking_assert (REG_P (base));
9220 /* When saving registers into a re-aligned local stack frame, avoid
9221 any tricky guessing by dwarf2out. */
9222 if (m->fs.realigned)
9224 gcc_checking_assert (stack_realign_drap);
9226 if (regno == REGNO (crtl->drap_reg))
9228 /* A bit of a hack. We force the DRAP register to be saved in
9229 the re-aligned stack frame, which provides us with a copy
9230 of the CFA that will last past the prologue. Install it. */
9231 gcc_checking_assert (cfun->machine->fs.fp_valid);
9232 addr = plus_constant (hard_frame_pointer_rtx,
9233 cfun->machine->fs.fp_offset - cfa_offset);
9234 mem = gen_rtx_MEM (mode, addr);
9235 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9237 else
9239 /* The frame pointer is a stable reference within the
9240 aligned frame. Use it. */
9241 gcc_checking_assert (cfun->machine->fs.fp_valid);
9242 addr = plus_constant (hard_frame_pointer_rtx,
9243 cfun->machine->fs.fp_offset - cfa_offset);
9244 mem = gen_rtx_MEM (mode, addr);
9245 add_reg_note (insn, REG_CFA_EXPRESSION,
9246 gen_rtx_SET (VOIDmode, mem, reg));
9250 /* The memory may not be relative to the current CFA register,
9251 which means that we may need to generate a new pattern for
9252 use by the unwind info. */
9253 else if (base != m->fs.cfa_reg)
9255 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9256 mem = gen_rtx_MEM (mode, addr);
9257 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9261 /* Emit code to save registers using MOV insns.
9262 First register is stored at CFA - CFA_OFFSET. */
9263 static void
9264 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9266 unsigned int regno;
9268 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9269 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9271 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9272 cfa_offset -= UNITS_PER_WORD;
9276 /* Emit code to save SSE registers using MOV insns.
9277 First register is stored at CFA - CFA_OFFSET. */
9278 static void
9279 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9281 unsigned int regno;
9283 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9284 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9286 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9287 cfa_offset -= 16;
9291 static GTY(()) rtx queued_cfa_restores;
9293 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9294 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9295 Don't add the note if the previously saved value will be left untouched
9296 within stack red-zone till return, as unwinders can find the same value
9297 in the register and on the stack. */
9299 static void
9300 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9302 if (!crtl->shrink_wrapped
9303 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9304 return;
9306 if (insn)
9308 add_reg_note (insn, REG_CFA_RESTORE, reg);
9309 RTX_FRAME_RELATED_P (insn) = 1;
9311 else
9312 queued_cfa_restores
9313 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9316 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9318 static void
9319 ix86_add_queued_cfa_restore_notes (rtx insn)
9321 rtx last;
9322 if (!queued_cfa_restores)
9323 return;
9324 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9326 XEXP (last, 1) = REG_NOTES (insn);
9327 REG_NOTES (insn) = queued_cfa_restores;
9328 queued_cfa_restores = NULL_RTX;
9329 RTX_FRAME_RELATED_P (insn) = 1;
9332 /* Expand prologue or epilogue stack adjustment.
9333 The pattern exists to put a dependency on all ebp-based memory accesses.
9334 STYLE should be negative if instructions should be marked as frame related,
9335 zero if %r11 register is live and cannot be freely used and positive
9336 otherwise. */
9338 static void
9339 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9340 int style, bool set_cfa)
9342 struct machine_function *m = cfun->machine;
9343 rtx insn;
9344 bool add_frame_related_expr = false;
9346 if (Pmode == SImode)
9347 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9348 else if (x86_64_immediate_operand (offset, DImode))
9349 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9350 else
9352 rtx tmp;
9353 /* r11 is used by indirect sibcall return as well, set before the
9354 epilogue and used after the epilogue. */
9355 if (style)
9356 tmp = gen_rtx_REG (DImode, R11_REG);
9357 else
9359 gcc_assert (src != hard_frame_pointer_rtx
9360 && dest != hard_frame_pointer_rtx);
9361 tmp = hard_frame_pointer_rtx;
9363 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9364 if (style < 0)
9365 add_frame_related_expr = true;
9367 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9370 insn = emit_insn (insn);
9371 if (style >= 0)
9372 ix86_add_queued_cfa_restore_notes (insn);
9374 if (set_cfa)
9376 rtx r;
9378 gcc_assert (m->fs.cfa_reg == src);
9379 m->fs.cfa_offset += INTVAL (offset);
9380 m->fs.cfa_reg = dest;
9382 r = gen_rtx_PLUS (Pmode, src, offset);
9383 r = gen_rtx_SET (VOIDmode, dest, r);
9384 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9385 RTX_FRAME_RELATED_P (insn) = 1;
9387 else if (style < 0)
9389 RTX_FRAME_RELATED_P (insn) = 1;
9390 if (add_frame_related_expr)
9392 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9393 r = gen_rtx_SET (VOIDmode, dest, r);
9394 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9398 if (dest == stack_pointer_rtx)
9400 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9401 bool valid = m->fs.sp_valid;
9403 if (src == hard_frame_pointer_rtx)
9405 valid = m->fs.fp_valid;
9406 ooffset = m->fs.fp_offset;
9408 else if (src == crtl->drap_reg)
9410 valid = m->fs.drap_valid;
9411 ooffset = 0;
9413 else
9415 /* Else there are two possibilities: SP itself, which we set
9416 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9417 taken care of by hand along the eh_return path. */
9418 gcc_checking_assert (src == stack_pointer_rtx
9419 || offset == const0_rtx);
9422 m->fs.sp_offset = ooffset - INTVAL (offset);
9423 m->fs.sp_valid = valid;
9427 /* Find an available register to be used as dynamic realign argument
9428 pointer register. Such a register will be written in the prologue and
9429 used at the beginning of the body, so it must not be
9430 1. parameter passing register.
9431 2. GOT pointer.
9432 We reuse static-chain register if it is available. Otherwise, we
9433 use DI for i386 and R13 for x86-64. We chose R13 since it has
9434 shorter encoding.
9436 Return: the regno of chosen register. */
9438 static unsigned int
9439 find_drap_reg (void)
9441 tree decl = cfun->decl;
9443 if (TARGET_64BIT)
9445 /* Use R13 for a nested function or a function that needs a static chain.
9446 Since function with tail call may use any caller-saved
9447 registers in epilogue, DRAP must not use caller-saved
9448 register in such case. */
9449 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9450 return R13_REG;
9452 return R10_REG;
9454 else
9456 /* Use DI for a nested function or a function that needs a static chain.
9457 Since function with tail call may use any caller-saved
9458 registers in epilogue, DRAP must not use caller-saved
9459 register in such case. */
9460 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9461 return DI_REG;
9463 /* Reuse static chain register if it isn't used for parameter
9464 passing. */
9465 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9467 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9468 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9469 return CX_REG;
9471 return DI_REG;
9475 /* Return minimum incoming stack alignment. */
9477 static unsigned int
9478 ix86_minimum_incoming_stack_boundary (bool sibcall)
9480 unsigned int incoming_stack_boundary;
9482 /* Prefer the one specified at command line. */
9483 if (ix86_user_incoming_stack_boundary)
9484 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9485 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9486 when -mstackrealign is used, this isn't a sibcall check, and the
9487 estimated stack alignment is 128 bits. */
9488 else if (!sibcall
9489 && !TARGET_64BIT
9490 && ix86_force_align_arg_pointer
9491 && crtl->stack_alignment_estimated == 128)
9492 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9493 else
9494 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9496 /* Incoming stack alignment can be changed on individual functions
9497 via force_align_arg_pointer attribute. We use the smallest
9498 incoming stack boundary. */
9499 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9500 && lookup_attribute (ix86_force_align_arg_pointer_string,
9501 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9502 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9504 /* The incoming stack frame has to be aligned at least at
9505 parm_stack_boundary. */
9506 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9507 incoming_stack_boundary = crtl->parm_stack_boundary;
9509 /* Stack at entrance of main is aligned by runtime. We use the
9510 smallest incoming stack boundary. */
9511 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9512 && DECL_NAME (current_function_decl)
9513 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9514 && DECL_FILE_SCOPE_P (current_function_decl))
9515 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9517 return incoming_stack_boundary;
9520 /* Update incoming stack boundary and estimated stack alignment. */
9522 static void
9523 ix86_update_stack_boundary (void)
9525 ix86_incoming_stack_boundary
9526 = ix86_minimum_incoming_stack_boundary (false);
9528 /* x86_64 vararg needs 16byte stack alignment for register save
9529 area. */
9530 if (TARGET_64BIT
9531 && cfun->stdarg
9532 && crtl->stack_alignment_estimated < 128)
9533 crtl->stack_alignment_estimated = 128;
9536 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9537 needed or an rtx for DRAP otherwise. */
9539 static rtx
9540 ix86_get_drap_rtx (void)
9542 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9543 crtl->need_drap = true;
9545 if (stack_realign_drap)
9547 /* Assign DRAP to vDRAP and return vDRAP. */
9548 unsigned int regno = find_drap_reg ();
9549 rtx drap_vreg;
9550 rtx arg_ptr;
9551 rtx seq, insn;
9553 arg_ptr = gen_rtx_REG (Pmode, regno);
9554 crtl->drap_reg = arg_ptr;
9556 start_sequence ();
9557 drap_vreg = copy_to_reg (arg_ptr);
9558 seq = get_insns ();
9559 end_sequence ();
9561 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9562 if (!optimize)
9564 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9565 RTX_FRAME_RELATED_P (insn) = 1;
9567 return drap_vreg;
9569 else
9570 return NULL;
9573 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9575 static rtx
9576 ix86_internal_arg_pointer (void)
9578 return virtual_incoming_args_rtx;
9581 struct scratch_reg {
9582 rtx reg;
9583 bool saved;
9586 /* Return a short-lived scratch register for use on function entry.
9587 In 32-bit mode, it is valid only after the registers are saved
9588 in the prologue. This register must be released by means of
9589 release_scratch_register_on_entry once it is dead. */
9591 static void
9592 get_scratch_register_on_entry (struct scratch_reg *sr)
9594 int regno;
9596 sr->saved = false;
9598 if (TARGET_64BIT)
9600 /* We always use R11 in 64-bit mode. */
9601 regno = R11_REG;
9603 else
9605 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9606 bool fastcall_p
9607 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9608 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9609 int regparm = ix86_function_regparm (fntype, decl);
9610 int drap_regno
9611 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9613 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9614 for the static chain register. */
9615 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9616 && drap_regno != AX_REG)
9617 regno = AX_REG;
9618 else if (regparm < 2 && drap_regno != DX_REG)
9619 regno = DX_REG;
9620 /* ecx is the static chain register. */
9621 else if (regparm < 3 && !fastcall_p && !static_chain_p
9622 && drap_regno != CX_REG)
9623 regno = CX_REG;
9624 else if (ix86_save_reg (BX_REG, true))
9625 regno = BX_REG;
9626 /* esi is the static chain register. */
9627 else if (!(regparm == 3 && static_chain_p)
9628 && ix86_save_reg (SI_REG, true))
9629 regno = SI_REG;
9630 else if (ix86_save_reg (DI_REG, true))
9631 regno = DI_REG;
9632 else
9634 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9635 sr->saved = true;
9639 sr->reg = gen_rtx_REG (Pmode, regno);
9640 if (sr->saved)
9642 rtx insn = emit_insn (gen_push (sr->reg));
9643 RTX_FRAME_RELATED_P (insn) = 1;
9647 /* Release a scratch register obtained from the preceding function. */
9649 static void
9650 release_scratch_register_on_entry (struct scratch_reg *sr)
9652 if (sr->saved)
9654 rtx x, insn = emit_insn (gen_pop (sr->reg));
9656 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9657 RTX_FRAME_RELATED_P (insn) = 1;
9658 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9659 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9660 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9664 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
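/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 (the default in
   GCC's defaults.h unless a target overrides it) this is a 4096-byte,
   i.e. one-page, probing interval.  */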
9666 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9668 static void
9669 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9671 /* We skip the probe for the first interval + a small dope of 4 words and
9672 probe that many bytes past the specified size to maintain a protection
9673 area at the bottom of the stack. */
9674 const int dope = 4 * UNITS_PER_WORD;
9675 rtx size_rtx = GEN_INT (size), last;
9677 /* See if we have a constant small number of probes to generate. If so,
9678 that's the easy case. The run-time loop is made up of 11 insns in the
9679 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9680 for n # of intervals. */
9681 if (size <= 5 * PROBE_INTERVAL)
9683 HOST_WIDE_INT i, adjust;
9684 bool first_probe = true;
9686 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9687 values of N from 1 until it exceeds SIZE. If only one probe is
9688 needed, this will not generate any code. Then adjust and probe
9689 to PROBE_INTERVAL + SIZE. */
9690 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9692 if (first_probe)
9694 adjust = 2 * PROBE_INTERVAL + dope;
9695 first_probe = false;
9697 else
9698 adjust = PROBE_INTERVAL;
9700 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9701 plus_constant (stack_pointer_rtx, -adjust)));
9702 emit_stack_probe (stack_pointer_rtx);
9705 if (first_probe)
9706 adjust = size + PROBE_INTERVAL + dope;
9707 else
9708 adjust = size + PROBE_INTERVAL - i;
9710 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9711 plus_constant (stack_pointer_rtx, -adjust)));
9712 emit_stack_probe (stack_pointer_rtx);
9714 /* Adjust back to account for the additional first interval. */
9715 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9716 plus_constant (stack_pointer_rtx,
9717 PROBE_INTERVAL + dope)));
9720 /* Otherwise, do the same as above, but in a loop. Note that we must be
9721 extra careful with variables wrapping around because we might be at
9722 the very top (or the very bottom) of the address space and we have
9723 to be able to handle this case properly; in particular, we use an
9724 equality test for the loop condition. */
9725 else
9727 HOST_WIDE_INT rounded_size;
9728 struct scratch_reg sr;
9730 get_scratch_register_on_entry (&sr);
9733 /* Step 1: round SIZE to the previous multiple of the interval. */
9735 rounded_size = size & -PROBE_INTERVAL;
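/* size & -PROBE_INTERVAL rounds SIZE down to a multiple of the probing
   interval, e.g. 10000 & -4096 == 8192; the sub-interval remainder is
   handled by the extra probe in step 4 below.  */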
9738 /* Step 2: compute initial and final value of the loop counter. */
9740 /* SP = SP_0 + PROBE_INTERVAL. */
9741 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9742 plus_constant (stack_pointer_rtx,
9743 - (PROBE_INTERVAL + dope))));
9745 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9746 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9747 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9748 gen_rtx_PLUS (Pmode, sr.reg,
9749 stack_pointer_rtx)));
9752 /* Step 3: the loop
9754 while (SP != LAST_ADDR)
9756 SP = SP + PROBE_INTERVAL
9757 probe at SP
9760 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9761 values of N from 1 until it is equal to ROUNDED_SIZE. */
9763 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9766 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9767 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9769 if (size != rounded_size)
9771 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9772 plus_constant (stack_pointer_rtx,
9773 rounded_size - size)));
9774 emit_stack_probe (stack_pointer_rtx);
9777 /* Adjust back to account for the additional first interval. */
9778 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9779 plus_constant (stack_pointer_rtx,
9780 PROBE_INTERVAL + dope)));
9782 release_scratch_register_on_entry (&sr);
9785 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9787 /* Even if the stack pointer isn't the CFA register, we need to correctly
9788 describe the adjustments made to it, in particular differentiate the
9789 frame-related ones from the frame-unrelated ones. */
9790 if (size > 0)
9792 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9793 XVECEXP (expr, 0, 0)
9794 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9795 plus_constant (stack_pointer_rtx, -size));
9796 XVECEXP (expr, 0, 1)
9797 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9798 plus_constant (stack_pointer_rtx,
9799 PROBE_INTERVAL + dope + size));
9800 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9801 RTX_FRAME_RELATED_P (last) = 1;
9803 cfun->machine->fs.sp_offset += size;
9806 /* Make sure nothing is scheduled before we are done. */
9807 emit_insn (gen_blockage ());
9810 /* Adjust the stack pointer up to REG while probing it. */
9812 const char *
9813 output_adjust_stack_and_probe (rtx reg)
9815 static int labelno = 0;
9816 char loop_lab[32], end_lab[32];
9817 rtx xops[2];
9819 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9820 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9822 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9824 /* Jump to END_LAB if SP == LAST_ADDR. */
9825 xops[0] = stack_pointer_rtx;
9826 xops[1] = reg;
9827 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9828 fputs ("\tje\t", asm_out_file);
9829 assemble_name_raw (asm_out_file, end_lab);
9830 fputc ('\n', asm_out_file);
9832 /* SP = SP + PROBE_INTERVAL. */
9833 xops[1] = GEN_INT (PROBE_INTERVAL);
9834 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9836 /* Probe at SP. */
9837 xops[1] = const0_rtx;
9838 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9840 fprintf (asm_out_file, "\tjmp\t");
9841 assemble_name_raw (asm_out_file, loop_lab);
9842 fputc ('\n', asm_out_file);
9844 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9846 return "";
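/* For reference, on a 32-bit target with a 4096-byte probe interval, and
   assuming the scratch register landed in %eax, the loop emitted above
   comes out roughly as the following AT&T-syntax sequence (the LPSRL/LPSRE
   label spelling is assembler dependent):

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:
*/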
9849 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9850 inclusive. These are offsets from the current stack pointer. */
9852 static void
9853 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9855 /* See if we have a constant small number of probes to generate. If so,
9856 that's the easy case. The run-time loop is made up of 7 insns in the
9857 generic case while the compile-time loop is made up of n insns for n #
9858 of intervals. */
9859 if (size <= 7 * PROBE_INTERVAL)
9861 HOST_WIDE_INT i;
9863 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9864 it exceeds SIZE. If only one probe is needed, this will not
9865 generate any code. Then probe at FIRST + SIZE. */
9866 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9867 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9869 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9872 /* Otherwise, do the same as above, but in a loop. Note that we must be
9873 extra careful with variables wrapping around because we might be at
9874 the very top (or the very bottom) of the address space and we have
9875 to be able to handle this case properly; in particular, we use an
9876 equality test for the loop condition. */
9877 else
9879 HOST_WIDE_INT rounded_size, last;
9880 struct scratch_reg sr;
9882 get_scratch_register_on_entry (&sr);
9885 /* Step 1: round SIZE to the previous multiple of the interval. */
9887 rounded_size = size & -PROBE_INTERVAL;
9890 /* Step 2: compute initial and final value of the loop counter. */
9892 /* TEST_OFFSET = FIRST. */
9893 emit_move_insn (sr.reg, GEN_INT (-first));
9895 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9896 last = first + rounded_size;
9899 /* Step 3: the loop
9901 while (TEST_ADDR != LAST_ADDR)
9903 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9904 probe at TEST_ADDR
9907 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9908 until it is equal to ROUNDED_SIZE. */
9910 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9913 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9914 that SIZE is equal to ROUNDED_SIZE. */
9916 if (size != rounded_size)
9917 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9918 stack_pointer_rtx,
9919 sr.reg),
9920 rounded_size - size));
9922 release_scratch_register_on_entry (&sr);
9925 /* Make sure nothing is scheduled before we are done. */
9926 emit_insn (gen_blockage ());
9929 /* Probe a range of stack addresses from REG to END, inclusive. These are
9930 offsets from the current stack pointer. */
9932 const char *
9933 output_probe_stack_range (rtx reg, rtx end)
9935 static int labelno = 0;
9936 char loop_lab[32], end_lab[32];
9937 rtx xops[3];
9939 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9940 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9942 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9944 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9945 xops[0] = reg;
9946 xops[1] = end;
9947 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9948 fputs ("\tje\t", asm_out_file);
9949 assemble_name_raw (asm_out_file, end_lab);
9950 fputc ('\n', asm_out_file);
9952 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9953 xops[1] = GEN_INT (PROBE_INTERVAL);
9954 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9956 /* Probe at TEST_ADDR. */
9957 xops[0] = stack_pointer_rtx;
9958 xops[1] = reg;
9959 xops[2] = const0_rtx;
9960 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9962 fprintf (asm_out_file, "\tjmp\t");
9963 assemble_name_raw (asm_out_file, loop_lab);
9964 fputc ('\n', asm_out_file);
9966 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9968 return "";
9971 /* Finalize the stack_realign_needed flag, which guides generation of the
9972 prologue/epilogue in the correct form. */
9973 static void
9974 ix86_finalize_stack_realign_flags (void)
9976 /* Check if stack realignment is really needed after reload, and
9977 store the result in cfun. */
9978 unsigned int incoming_stack_boundary
9979 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9980 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9981 unsigned int stack_realign = (incoming_stack_boundary
9982 < (current_function_is_leaf
9983 ? crtl->max_used_stack_slot_alignment
9984 : crtl->stack_alignment_needed));
9986 if (crtl->stack_realign_finalized)
9988 /* After stack_realign_needed is finalized, we can no longer
9989 change it. */
9990 gcc_assert (crtl->stack_realign_needed == stack_realign);
9991 return;
9994 /* If the only reason for frame_pointer_needed is that we conservatively
9995 assumed stack realignment might be needed, but in the end nothing that
9996 needed the stack alignment had been spilled, clear frame_pointer_needed
9997 and say we don't need stack realignment. */
9998 if (stack_realign
9999 && !crtl->need_drap
10000 && frame_pointer_needed
10001 && current_function_is_leaf
10002 && flag_omit_frame_pointer
10003 && current_function_sp_is_unchanging
10004 && !ix86_current_function_calls_tls_descriptor
10005 && !crtl->accesses_prior_frames
10006 && !cfun->calls_alloca
10007 && !crtl->calls_eh_return
10008 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10009 && !ix86_frame_pointer_required ()
10010 && get_frame_size () == 0
10011 && ix86_nsaved_sseregs () == 0
10012 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10014 HARD_REG_SET set_up_by_prologue, prologue_used;
10015 basic_block bb;
10017 CLEAR_HARD_REG_SET (prologue_used);
10018 CLEAR_HARD_REG_SET (set_up_by_prologue);
10019 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10020 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10021 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10022 HARD_FRAME_POINTER_REGNUM);
10023 FOR_EACH_BB (bb)
10025 rtx insn;
10026 FOR_BB_INSNS (bb, insn)
10027 if (NONDEBUG_INSN_P (insn)
10028 && requires_stack_frame_p (insn, prologue_used,
10029 set_up_by_prologue))
10031 crtl->stack_realign_needed = stack_realign;
10032 crtl->stack_realign_finalized = true;
10033 return;
10037 frame_pointer_needed = false;
10038 stack_realign = false;
10039 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10040 crtl->stack_alignment_needed = incoming_stack_boundary;
10041 crtl->stack_alignment_estimated = incoming_stack_boundary;
10042 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10043 crtl->preferred_stack_boundary = incoming_stack_boundary;
10044 df_finish_pass (true);
10045 df_scan_alloc (NULL);
10046 df_scan_blocks ();
10047 df_compute_regs_ever_live (true);
10048 df_analyze ();
10051 crtl->stack_realign_needed = stack_realign;
10052 crtl->stack_realign_finalized = true;
10055 /* Expand the prologue into a bunch of separate insns. */
10057 void
10058 ix86_expand_prologue (void)
10060 struct machine_function *m = cfun->machine;
10061 rtx insn, t;
10062 bool pic_reg_used;
10063 struct ix86_frame frame;
10064 HOST_WIDE_INT allocate;
10065 bool int_registers_saved;
10067 ix86_finalize_stack_realign_flags ();
10069 /* DRAP should not coexist with stack_realign_fp */
10070 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10072 memset (&m->fs, 0, sizeof (m->fs));
10074 /* Initialize CFA state for before the prologue. */
10075 m->fs.cfa_reg = stack_pointer_rtx;
10076 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10078 /* Track SP offset to the CFA. We continue tracking this after we've
10079 swapped the CFA register away from SP. In the case of re-alignment
10080 this is fudged; we're interested in offsets within the local frame. */
10081 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10082 m->fs.sp_valid = true;
10084 ix86_compute_frame_layout (&frame);
10086 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10088 /* We should have already generated an error for any use of
10089 ms_hook on a nested function. */
10090 gcc_checking_assert (!ix86_static_chain_on_stack);
10092 /* Check if profiling is active and we should use the
10093 profile-before-prologue variant. If so, issue a sorry. */
10094 if (crtl->profile && flag_fentry != 0)
10095 sorry ("ms_hook_prologue attribute isn%'t compatible "
10096 "with -mfentry for 32-bit");
10098 /* In ix86_asm_output_function_label we emitted:
10099 8b ff movl.s %edi,%edi
10100 55 push %ebp
10101 8b ec movl.s %esp,%ebp
10103 This matches the hookable function prologue in Win32 API
10104 functions in Microsoft Windows XP Service Pack 2 and newer.
10105 Wine uses this to enable Windows apps to hook the Win32 API
10106 functions provided by Wine.
10108 What that means is that we've already set up the frame pointer. */
10110 if (frame_pointer_needed
10111 && !(crtl->drap_reg && crtl->stack_realign_needed))
10113 rtx push, mov;
10115 /* We've decided to use the frame pointer already set up.
10116 Describe this to the unwinder by pretending that both
10117 push and mov insns happen right here.
10119 Putting the unwind info here at the end of the ms_hook
10120 is done so that we can make absolutely certain we get
10121 the required byte sequence at the start of the function,
10122 rather than relying on an assembler that can produce
10123 the exact encoding required.
10125 However it does mean (in the unpatched case) that we have
10126 a 1 insn window where the asynchronous unwind info is
10127 incorrect. However, if we placed the unwind info at
10128 its correct location we would have incorrect unwind info
10129 in the patched case. Which is probably all moot since
10130 I don't expect Wine generates dwarf2 unwind info for the
10131 system libraries that use this feature. */
10133 insn = emit_insn (gen_blockage ());
10135 push = gen_push (hard_frame_pointer_rtx);
10136 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10137 stack_pointer_rtx);
10138 RTX_FRAME_RELATED_P (push) = 1;
10139 RTX_FRAME_RELATED_P (mov) = 1;
10141 RTX_FRAME_RELATED_P (insn) = 1;
10142 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10143 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10145 /* Note that gen_push incremented m->fs.cfa_offset, even
10146 though we didn't emit the push insn here. */
10147 m->fs.cfa_reg = hard_frame_pointer_rtx;
10148 m->fs.fp_offset = m->fs.cfa_offset;
10149 m->fs.fp_valid = true;
10151 else
10153 /* The frame pointer is not needed so pop %ebp again.
10154 This leaves us with a pristine state. */
10155 emit_insn (gen_pop (hard_frame_pointer_rtx));
10159 /* The first insn of a function that accepts its static chain on the
10160 stack is to push the register that would be filled in by a direct
10161 call. This insn will be skipped by the trampoline. */
10162 else if (ix86_static_chain_on_stack)
10164 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10165 emit_insn (gen_blockage ());
10167 /* We don't want to interpret this push insn as a register save,
10168 only as a stack adjustment. The real copy of the register as
10169 a save will be done later, if needed. */
10170 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10171 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10172 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10173 RTX_FRAME_RELATED_P (insn) = 1;
10176 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10177 DRAP is needed and stack realignment is really needed after reload. */
10178 if (stack_realign_drap)
10180 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10182 /* Only need to push parameter pointer reg if it is caller saved. */
10183 if (!call_used_regs[REGNO (crtl->drap_reg)])
10185 /* Push arg pointer reg */
10186 insn = emit_insn (gen_push (crtl->drap_reg));
10187 RTX_FRAME_RELATED_P (insn) = 1;
10190 /* Grab the argument pointer. */
10191 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10192 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10193 RTX_FRAME_RELATED_P (insn) = 1;
10194 m->fs.cfa_reg = crtl->drap_reg;
10195 m->fs.cfa_offset = 0;
10197 /* Align the stack. */
10198 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10199 stack_pointer_rtx,
10200 GEN_INT (-align_bytes)));
10201 RTX_FRAME_RELATED_P (insn) = 1;
10203 /* Replicate the return address on the stack so that return
10204 address can be reached via (argp - 1) slot. This is needed
10205 to implement macro RETURN_ADDR_RTX and intrinsic function
10206 expand_builtin_return_addr etc. */
10207 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10208 t = gen_frame_mem (word_mode, t);
10209 insn = emit_insn (gen_push (t));
10210 RTX_FRAME_RELATED_P (insn) = 1;
10212 /* For the purposes of frame and register save area addressing,
10213 we've started over with a new frame. */
10214 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10215 m->fs.realigned = true;
10218 if (frame_pointer_needed && !m->fs.fp_valid)
10220 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10221 slower on all targets. Also sdb doesn't like it. */
10222 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10223 RTX_FRAME_RELATED_P (insn) = 1;
10225 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10227 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10228 RTX_FRAME_RELATED_P (insn) = 1;
10230 if (m->fs.cfa_reg == stack_pointer_rtx)
10231 m->fs.cfa_reg = hard_frame_pointer_rtx;
10232 m->fs.fp_offset = m->fs.sp_offset;
10233 m->fs.fp_valid = true;
10237 int_registers_saved = (frame.nregs == 0);
10239 if (!int_registers_saved)
10241 /* If saving registers via PUSH, do so now. */
10242 if (!frame.save_regs_using_mov)
10244 ix86_emit_save_regs ();
10245 int_registers_saved = true;
10246 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10249 /* When using the red zone we may start register saving before allocating
10250 the stack frame, saving one cycle of the prologue. However, avoid
10251 doing this if we have to probe the stack; at least on x86_64 the
10252 stack probe can turn into a call that clobbers a red zone location. */
10253 else if (ix86_using_red_zone ()
10254 && (! TARGET_STACK_PROBE
10255 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10257 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10258 int_registers_saved = true;
10262 if (stack_realign_fp)
10264 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10265 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10267 /* The computation of the size of the re-aligned stack frame means
10268 that we must allocate the size of the register save area before
10269 performing the actual alignment. Otherwise we cannot guarantee
10270 that there's enough storage above the realignment point. */
10271 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10272 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10273 GEN_INT (m->fs.sp_offset
10274 - frame.sse_reg_save_offset),
10275 -1, false);
10277 /* Align the stack. */
10278 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10279 stack_pointer_rtx,
10280 GEN_INT (-align_bytes)));
10282 /* For the purposes of register save area addressing, the stack
10283 pointer is no longer valid. As for the value of sp_offset,
10284 see ix86_compute_frame_layout, which we need to match in order
10285 to pass verification of stack_pointer_offset at the end. */
10286 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10287 m->fs.sp_valid = false;
10290 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10292 if (flag_stack_usage_info)
10294 /* We start to count from ARG_POINTER. */
10295 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10297 /* If it was realigned, take into account the fake frame. */
10298 if (stack_realign_drap)
10300 if (ix86_static_chain_on_stack)
10301 stack_size += UNITS_PER_WORD;
10303 if (!call_used_regs[REGNO (crtl->drap_reg)])
10304 stack_size += UNITS_PER_WORD;
10306 /* This over-estimates by 1 minimal-stack-alignment-unit but
10307 mitigates that by counting in the new return address slot. */
10308 current_function_dynamic_stack_size
10309 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10312 current_function_static_stack_size = stack_size;
10315 /* The stack has already been decremented by the instruction calling us
10316 so probe if the size is non-negative to preserve the protection area. */
10317 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10319 /* We expect the registers to be saved when probes are used. */
10320 gcc_assert (int_registers_saved);
10322 if (STACK_CHECK_MOVING_SP)
10324 ix86_adjust_stack_and_probe (allocate);
10325 allocate = 0;
10327 else
10329 HOST_WIDE_INT size = allocate;
10331 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10332 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10334 if (TARGET_STACK_PROBE)
10335 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10336 else
10337 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10341 if (allocate == 0)
10343 else if (!ix86_target_stack_probe ()
10344 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10346 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10347 GEN_INT (-allocate), -1,
10348 m->fs.cfa_reg == stack_pointer_rtx);
10350 else
10352 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10353 rtx r10 = NULL;
10354 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10356 bool eax_live = false;
10357 bool r10_live = false;
10359 if (TARGET_64BIT)
10360 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10361 if (!TARGET_64BIT_MS_ABI)
10362 eax_live = ix86_eax_live_at_start_p ();
10364 if (eax_live)
10366 emit_insn (gen_push (eax));
10367 allocate -= UNITS_PER_WORD;
10369 if (r10_live)
10371 r10 = gen_rtx_REG (Pmode, R10_REG);
10372 emit_insn (gen_push (r10));
10373 allocate -= UNITS_PER_WORD;
10376 emit_move_insn (eax, GEN_INT (allocate));
10377 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10379 /* Use the fact that AX still contains ALLOCATE. */
10380 adjust_stack_insn = (Pmode == DImode
10381 ? gen_pro_epilogue_adjust_stack_di_sub
10382 : gen_pro_epilogue_adjust_stack_si_sub);
10384 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10385 stack_pointer_rtx, eax));
10387 /* Note that SEH directives need to continue tracking the stack
10388 pointer even after the frame pointer has been set up. */
10389 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10391 if (m->fs.cfa_reg == stack_pointer_rtx)
10392 m->fs.cfa_offset += allocate;
10394 RTX_FRAME_RELATED_P (insn) = 1;
10395 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10396 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10397 plus_constant (stack_pointer_rtx,
10398 -allocate)));
10400 m->fs.sp_offset += allocate;
10402 if (r10_live && eax_live)
10404 t = choose_baseaddr (m->fs.sp_offset - allocate);
10405 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10406 gen_frame_mem (word_mode, t));
10407 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10408 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10409 gen_frame_mem (word_mode, t));
10411 else if (eax_live || r10_live)
10413 t = choose_baseaddr (m->fs.sp_offset - allocate);
10414 emit_move_insn (gen_rtx_REG (word_mode,
10415 (eax_live ? AX_REG : R10_REG)),
10416 gen_frame_mem (word_mode, t));
10419 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10421 /* If we haven't already set up the frame pointer, do so now. */
10422 if (frame_pointer_needed && !m->fs.fp_valid)
10424 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10425 GEN_INT (frame.stack_pointer_offset
10426 - frame.hard_frame_pointer_offset));
10427 insn = emit_insn (insn);
10428 RTX_FRAME_RELATED_P (insn) = 1;
10429 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10431 if (m->fs.cfa_reg == stack_pointer_rtx)
10432 m->fs.cfa_reg = hard_frame_pointer_rtx;
10433 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10434 m->fs.fp_valid = true;
10437 if (!int_registers_saved)
10438 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10439 if (frame.nsseregs)
10440 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10442 pic_reg_used = false;
10443 if (pic_offset_table_rtx
10444 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10445 || crtl->profile))
10447 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10449 if (alt_pic_reg_used != INVALID_REGNUM)
10450 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10452 pic_reg_used = true;
10455 if (pic_reg_used)
10457 if (TARGET_64BIT)
10459 if (ix86_cmodel == CM_LARGE_PIC)
10461 rtx label, tmp_reg;
10463 gcc_assert (Pmode == DImode);
10464 label = gen_label_rtx ();
10465 emit_label (label);
10466 LABEL_PRESERVE_P (label) = 1;
10467 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10468 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10469 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10470 label));
10471 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10472 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10473 pic_offset_table_rtx, tmp_reg));
10475 else
10476 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10478 else
10480 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10481 RTX_FRAME_RELATED_P (insn) = 1;
10482 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10486 /* In the pic_reg_used case, make sure that the got load isn't deleted
10487 when mcount needs it. Blockage to avoid call movement across mcount
10488 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10489 note. */
10490 if (crtl->profile && !flag_fentry && pic_reg_used)
10491 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10493 if (crtl->drap_reg && !crtl->stack_realign_needed)
10495 /* vDRAP is set up, but after reload it turns out stack realignment
10496 isn't necessary; here we emit prologue code to set up DRAP
10497 without the stack realignment adjustment. */
10498 t = choose_baseaddr (0);
10499 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10502 /* Prevent instructions from being scheduled into register save push
10503 sequence when access to the redzone area is done through frame pointer.
10504 The offset between the frame pointer and the stack pointer is calculated
10505 relative to the value of the stack pointer at the end of the function
10506 prologue, and moving instructions that access redzone area via frame
10507 pointer inside push sequence violates this assumption. */
10508 if (frame_pointer_needed && frame.red_zone_size)
10509 emit_insn (gen_memory_blockage ());
10511 /* Emit cld instruction if stringops are used in the function. */
10512 if (TARGET_CLD && ix86_current_function_needs_cld)
10513 emit_insn (gen_cld ());
10515 /* SEH requires that the prologue end within 256 bytes of the start of
10516 the function. Prevent instruction schedules that would extend that.
10517 Further, prevent alloca modifications to the stack pointer from being
10518 combined with prologue modifications. */
10519 if (TARGET_SEH)
10520 emit_insn (gen_prologue_use (stack_pointer_rtx));
10523 /* Emit code to restore REG using a POP insn. */
10525 static void
10526 ix86_emit_restore_reg_using_pop (rtx reg)
10528 struct machine_function *m = cfun->machine;
10529 rtx insn = emit_insn (gen_pop (reg));
10531 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10532 m->fs.sp_offset -= UNITS_PER_WORD;
10534 if (m->fs.cfa_reg == crtl->drap_reg
10535 && REGNO (reg) == REGNO (crtl->drap_reg))
10537 /* Previously we'd represented the CFA as an expression
10538 like *(%ebp - 8). We've just popped that value from
10539 the stack, which means we need to reset the CFA to
10540 the drap register. This will remain until we restore
10541 the stack pointer. */
10542 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10543 RTX_FRAME_RELATED_P (insn) = 1;
10545 /* This means that the DRAP register is valid for addressing too. */
10546 m->fs.drap_valid = true;
10547 return;
10550 if (m->fs.cfa_reg == stack_pointer_rtx)
10552 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10553 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10554 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10555 RTX_FRAME_RELATED_P (insn) = 1;
10557 m->fs.cfa_offset -= UNITS_PER_WORD;
10560 /* When the frame pointer is the CFA, and we pop it, we are
10561 swapping back to the stack pointer as the CFA. This happens
10562 for stack frames that don't allocate other data, so we assume
10563 the stack pointer is now pointing at the return address, i.e.
10564 the function entry state, which makes the offset be 1 word. */
10565 if (reg == hard_frame_pointer_rtx)
10567 m->fs.fp_valid = false;
10568 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10570 m->fs.cfa_reg = stack_pointer_rtx;
10571 m->fs.cfa_offset -= UNITS_PER_WORD;
10573 add_reg_note (insn, REG_CFA_DEF_CFA,
10574 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10575 GEN_INT (m->fs.cfa_offset)));
10576 RTX_FRAME_RELATED_P (insn) = 1;
10581 /* Emit code to restore saved registers using POP insns. */
10583 static void
10584 ix86_emit_restore_regs_using_pop (void)
10586 unsigned int regno;
10588 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10589 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10590 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10593 /* Emit code and notes for the LEAVE instruction. */
10595 static void
10596 ix86_emit_leave (void)
10598 struct machine_function *m = cfun->machine;
10599 rtx insn = emit_insn (ix86_gen_leave ());
10601 ix86_add_queued_cfa_restore_notes (insn);
10603 gcc_assert (m->fs.fp_valid);
10604 m->fs.sp_valid = true;
10605 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10606 m->fs.fp_valid = false;
10608 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10610 m->fs.cfa_reg = stack_pointer_rtx;
10611 m->fs.cfa_offset = m->fs.sp_offset;
10613 add_reg_note (insn, REG_CFA_DEF_CFA,
10614 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10615 RTX_FRAME_RELATED_P (insn) = 1;
10617 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10618 m->fs.fp_offset);
10621 /* Emit code to restore saved registers using MOV insns.
10622 First register is restored from CFA - CFA_OFFSET. */
10623 static void
10624 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10625 bool maybe_eh_return)
10627 struct machine_function *m = cfun->machine;
10628 unsigned int regno;
10630 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10631 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10633 rtx reg = gen_rtx_REG (word_mode, regno);
10634 rtx insn, mem;
10636 mem = choose_baseaddr (cfa_offset);
10637 mem = gen_frame_mem (word_mode, mem);
10638 insn = emit_move_insn (reg, mem);
10640 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10642 /* Previously we'd represented the CFA as an expression
10643 like *(%ebp - 8). We've just popped that value from
10644 the stack, which means we need to reset the CFA to
10645 the drap register. This will remain until we restore
10646 the stack pointer. */
10647 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10648 RTX_FRAME_RELATED_P (insn) = 1;
10650 /* This means that the DRAP register is valid for addressing. */
10651 m->fs.drap_valid = true;
10653 else
10654 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10656 cfa_offset -= UNITS_PER_WORD;
10660 /* Emit code to restore saved registers using MOV insns.
10661 First register is restored from CFA - CFA_OFFSET. */
10662 static void
10663 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10664 bool maybe_eh_return)
10666 unsigned int regno;
10668 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10669 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10671 rtx reg = gen_rtx_REG (V4SFmode, regno);
10672 rtx mem;
10674 mem = choose_baseaddr (cfa_offset);
10675 mem = gen_rtx_MEM (V4SFmode, mem);
10676 set_mem_align (mem, 128);
10677 emit_move_insn (reg, mem);
10679 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10681 cfa_offset -= 16;
10685 /* Emit vzeroupper if needed. */
10687 void
10688 ix86_maybe_emit_epilogue_vzeroupper (void)
10690 if (TARGET_VZEROUPPER
10691 && !TREE_THIS_VOLATILE (cfun->decl)
10692 && !cfun->machine->caller_return_avx256_p)
10693 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
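/* vzeroupper clears the upper 128 bits of all YMM registers, which avoids
   the AVX-to-SSE transition penalty in callers that only execute legacy
   SSE code; it must not be emitted when the function hands a 256-bit AVX
   value back to its caller, since that would clobber the upper half of
   the return value.  */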
10696 /* Restore function stack, frame, and registers. */
10698 void
10699 ix86_expand_epilogue (int style)
10701 struct machine_function *m = cfun->machine;
10702 struct machine_frame_state frame_state_save = m->fs;
10703 struct ix86_frame frame;
10704 bool restore_regs_via_mov;
10705 bool using_drap;
10707 ix86_finalize_stack_realign_flags ();
10708 ix86_compute_frame_layout (&frame);
10710 m->fs.sp_valid = (!frame_pointer_needed
10711 || (current_function_sp_is_unchanging
10712 && !stack_realign_fp));
10713 gcc_assert (!m->fs.sp_valid
10714 || m->fs.sp_offset == frame.stack_pointer_offset);
10716 /* The FP must be valid if the frame pointer is present. */
10717 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10718 gcc_assert (!m->fs.fp_valid
10719 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10721 /* We must have *some* valid pointer to the stack frame. */
10722 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10724 /* The DRAP is never valid at this point. */
10725 gcc_assert (!m->fs.drap_valid);
10727 /* See the comment about red zone and frame
10728 pointer usage in ix86_expand_prologue. */
10729 if (frame_pointer_needed && frame.red_zone_size)
10730 emit_insn (gen_memory_blockage ());
10732 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10733 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10735 /* Determine the CFA offset of the end of the red-zone. */
10736 m->fs.red_zone_offset = 0;
10737 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10739 /* The red-zone begins below the return address. */
10740 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10742 /* When the register save area is in the aligned portion of
10743 the stack, determine the maximum runtime displacement that
10744 matches up with the aligned frame. */
10745 if (stack_realign_drap)
10746 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10747 + UNITS_PER_WORD);
10750 /* Special care must be taken for the normal return case of a function
10751 using eh_return: the eax and edx registers are marked as saved, but
10752 not restored along this path. Adjust the save location to match. */
10753 if (crtl->calls_eh_return && style != 2)
10754 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10756 /* EH_RETURN requires the use of moves to function properly. */
10757 if (crtl->calls_eh_return)
10758 restore_regs_via_mov = true;
10759 /* SEH requires the use of pops to identify the epilogue. */
10760 else if (TARGET_SEH)
10761 restore_regs_via_mov = false;
10762 /* If we're only restoring one register and sp is not valid, then
10763 use a move instruction to restore the register, since it's
10764 less work than reloading sp and popping the register. */
10765 else if (!m->fs.sp_valid && frame.nregs <= 1)
10766 restore_regs_via_mov = true;
10767 else if (TARGET_EPILOGUE_USING_MOVE
10768 && cfun->machine->use_fast_prologue_epilogue
10769 && (frame.nregs > 1
10770 || m->fs.sp_offset != frame.reg_save_offset))
10771 restore_regs_via_mov = true;
10772 else if (frame_pointer_needed
10773 && !frame.nregs
10774 && m->fs.sp_offset != frame.reg_save_offset)
10775 restore_regs_via_mov = true;
10776 else if (frame_pointer_needed
10777 && TARGET_USE_LEAVE
10778 && cfun->machine->use_fast_prologue_epilogue
10779 && frame.nregs == 1)
10780 restore_regs_via_mov = true;
10781 else
10782 restore_regs_via_mov = false;
10784 if (restore_regs_via_mov || frame.nsseregs)
10786 /* Ensure that the entire register save area is addressable via
10787 the stack pointer, if we will restore via sp. */
10788 if (TARGET_64BIT
10789 && m->fs.sp_offset > 0x7fffffff
10790 && !(m->fs.fp_valid || m->fs.drap_valid)
10791 && (frame.nsseregs + frame.nregs) != 0)
10793 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10794 GEN_INT (m->fs.sp_offset
10795 - frame.sse_reg_save_offset),
10796 style,
10797 m->fs.cfa_reg == stack_pointer_rtx);
10801 /* If there are any SSE registers to restore, then we have to do it
10802 via moves, since there's obviously no pop for SSE regs. */
10803 if (frame.nsseregs)
10804 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10805 style == 2);
10807 if (restore_regs_via_mov)
10809 rtx t;
10811 if (frame.nregs)
10812 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10814 /* eh_return epilogues need %ecx added to the stack pointer. */
10815 if (style == 2)
10817 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10819 /* Stack align doesn't work with eh_return. */
10820 gcc_assert (!stack_realign_drap);
10821 /* Neither do regparm nested functions. */
10822 gcc_assert (!ix86_static_chain_on_stack);
10824 if (frame_pointer_needed)
10826 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10827 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10828 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10830 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10831 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10833 /* Note that we use SA as a temporary CFA, as the return
10834 address is at the proper place relative to it. We
10835 pretend this happens at the FP restore insn because
10836 prior to this insn the FP would be stored at the wrong
10837 offset relative to SA, and after this insn we have no
10838 other reasonable register to use for the CFA. We don't
10839 bother resetting the CFA to the SP for the duration of
10840 the return insn. */
10841 add_reg_note (insn, REG_CFA_DEF_CFA,
10842 plus_constant (sa, UNITS_PER_WORD));
10843 ix86_add_queued_cfa_restore_notes (insn);
10844 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10845 RTX_FRAME_RELATED_P (insn) = 1;
10847 m->fs.cfa_reg = sa;
10848 m->fs.cfa_offset = UNITS_PER_WORD;
10849 m->fs.fp_valid = false;
10851 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10852 const0_rtx, style, false);
10854 else
10856 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10857 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10858 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10859 ix86_add_queued_cfa_restore_notes (insn);
10861 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10862 if (m->fs.cfa_offset != UNITS_PER_WORD)
10864 m->fs.cfa_offset = UNITS_PER_WORD;
10865 add_reg_note (insn, REG_CFA_DEF_CFA,
10866 plus_constant (stack_pointer_rtx,
10867 UNITS_PER_WORD));
10868 RTX_FRAME_RELATED_P (insn) = 1;
10871 m->fs.sp_offset = UNITS_PER_WORD;
10872 m->fs.sp_valid = true;
10875 else
10877 /* SEH requires that the function end with (1) a stack adjustment
10878 if necessary, (2) a sequence of pops, and (3) a return or
10879 jump instruction. Prevent insns from the function body from
10880 being scheduled into this sequence. */
10881 if (TARGET_SEH)
10883 /* Prevent a catch region from being adjacent to the standard
10884 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10885 several other flags that would be interesting to test are
10886 set up yet. */
10887 if (flag_non_call_exceptions)
10888 emit_insn (gen_nops (const1_rtx));
10889 else
10890 emit_insn (gen_blockage ());
10893 /* The first step is to deallocate the stack frame so that we can
10894 pop the registers. */
10895 if (!m->fs.sp_valid)
10897 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10898 GEN_INT (m->fs.fp_offset
10899 - frame.reg_save_offset),
10900 style, false);
10902 else if (m->fs.sp_offset != frame.reg_save_offset)
10904 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10905 GEN_INT (m->fs.sp_offset
10906 - frame.reg_save_offset),
10907 style,
10908 m->fs.cfa_reg == stack_pointer_rtx);
10911 ix86_emit_restore_regs_using_pop ();
10914 /* If we used a frame pointer and haven't already got rid of it,
10915 then do so now. */
10916 if (m->fs.fp_valid)
10918 /* If the stack pointer is valid and pointing at the frame
10919 pointer store address, then we only need a pop. */
10920 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10921 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10922 /* Leave results in shorter dependency chains on CPUs that are
10923 able to grok it fast. */
10924 else if (TARGET_USE_LEAVE
10925 || optimize_function_for_size_p (cfun)
10926 || !cfun->machine->use_fast_prologue_epilogue)
10927 ix86_emit_leave ();
10928 else
10930 pro_epilogue_adjust_stack (stack_pointer_rtx,
10931 hard_frame_pointer_rtx,
10932 const0_rtx, style, !using_drap);
10933 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10937 if (using_drap)
10939 int param_ptr_offset = UNITS_PER_WORD;
10940 rtx insn;
10942 gcc_assert (stack_realign_drap);
10944 if (ix86_static_chain_on_stack)
10945 param_ptr_offset += UNITS_PER_WORD;
10946 if (!call_used_regs[REGNO (crtl->drap_reg)])
10947 param_ptr_offset += UNITS_PER_WORD;
10949 insn = emit_insn (gen_rtx_SET
10950 (VOIDmode, stack_pointer_rtx,
10951 gen_rtx_PLUS (Pmode,
10952 crtl->drap_reg,
10953 GEN_INT (-param_ptr_offset))));
10954 m->fs.cfa_reg = stack_pointer_rtx;
10955 m->fs.cfa_offset = param_ptr_offset;
10956 m->fs.sp_offset = param_ptr_offset;
10957 m->fs.realigned = false;
10959 add_reg_note (insn, REG_CFA_DEF_CFA,
10960 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10961 GEN_INT (param_ptr_offset)));
10962 RTX_FRAME_RELATED_P (insn) = 1;
10964 if (!call_used_regs[REGNO (crtl->drap_reg)])
10965 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10968 /* At this point the stack pointer must be valid, and we must have
10969 restored all of the registers. We may not have deallocated the
10970 entire stack frame. We've delayed this until now because it may
10971 be possible to merge the local stack deallocation with the
10972 deallocation forced by ix86_static_chain_on_stack. */
10973 gcc_assert (m->fs.sp_valid);
10974 gcc_assert (!m->fs.fp_valid);
10975 gcc_assert (!m->fs.realigned);
10976 if (m->fs.sp_offset != UNITS_PER_WORD)
10978 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10979 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10980 style, true);
10982 else
10983 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10985 /* Sibcall epilogues don't want a return instruction. */
10986 if (style == 0)
10988 m->fs = frame_state_save;
10989 return;
10992 /* Emit vzeroupper if needed. */
10993 ix86_maybe_emit_epilogue_vzeroupper ();
10995 if (crtl->args.pops_args && crtl->args.size)
10997 rtx popc = GEN_INT (crtl->args.pops_args);
10999 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11000 address, do an explicit add, and jump indirectly to the caller. */
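/* A minimal sketch of the sequence emitted below (illustrative assembly,
   not generated verbatim by this file; N stands for crtl->args.pops_args):

     popl  %ecx        ; move the return address into %ecx
     addl  $N, %esp    ; drop the >= 64K argument area
     jmp   *%ecx       ; return to the caller indirectly  */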
11002 if (crtl->args.pops_args >= 65536)
11004 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11005 rtx insn;
11007 /* There is no "pascal" calling convention in any 64bit ABI. */
11008 gcc_assert (!TARGET_64BIT);
11010 insn = emit_insn (gen_pop (ecx));
11011 m->fs.cfa_offset -= UNITS_PER_WORD;
11012 m->fs.sp_offset -= UNITS_PER_WORD;
11014 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11015 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11016 add_reg_note (insn, REG_CFA_REGISTER,
11017 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11018 RTX_FRAME_RELATED_P (insn) = 1;
11020 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11021 popc, -1, true);
11022 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11024 else
11025 emit_jump_insn (gen_simple_return_pop_internal (popc));
11027 else
11028 emit_jump_insn (gen_simple_return_internal ());
11030 /* Restore the state back to the state from the prologue,
11031 so that it's correct for the next epilogue. */
11032 m->fs = frame_state_save;
11035 /* Reset from the function's potential modifications. */
11037 static void
11038 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11039 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11041 if (pic_offset_table_rtx)
11042 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11043 #if TARGET_MACHO
11044 /* Mach-O doesn't support labels at the end of objects, so if
11045 it looks like we might want one, insert a NOP. */
11047 rtx insn = get_last_insn ();
11048 rtx deleted_debug_label = NULL_RTX;
11049 while (insn
11050 && NOTE_P (insn)
11051 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11053 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11054 notes alone; instead set their CODE_LABEL_NUMBER to -1,
11055 otherwise there would be code generation differences
11056 between -g and -g0. */
11057 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11058 deleted_debug_label = insn;
11059 insn = PREV_INSN (insn);
11061 if (insn
11062 && (LABEL_P (insn)
11063 || (NOTE_P (insn)
11064 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11065 fputs ("\tnop\n", file);
11066 else if (deleted_debug_label)
11067 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11068 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11069 CODE_LABEL_NUMBER (insn) = -1;
11071 #endif
11075 /* Return a scratch register to use in the split stack prologue. The
11076 split stack prologue is used for -fsplit-stack. It is the first
11077 instructions in the function, even before the regular prologue.
11078 The scratch register can be any caller-saved register which is not
11079 used for parameters or for the static chain. */
11081 static unsigned int
11082 split_stack_prologue_scratch_regno (void)
11084 if (TARGET_64BIT)
11085 return R11_REG;
11086 else
11088 bool is_fastcall;
11089 int regparm;
11091 is_fastcall = (lookup_attribute ("fastcall",
11092 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11093 != NULL);
11094 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11096 if (is_fastcall)
11098 if (DECL_STATIC_CHAIN (cfun->decl))
11100 sorry ("-fsplit-stack does not support fastcall with "
11101 "nested function");
11102 return INVALID_REGNUM;
11104 return AX_REG;
11106 else if (regparm < 3)
11108 if (!DECL_STATIC_CHAIN (cfun->decl))
11109 return CX_REG;
11110 else
11112 if (regparm >= 2)
11114 sorry ("-fsplit-stack does not support 2 register "
11115 " parameters for a nested function");
11116 return INVALID_REGNUM;
11118 return DX_REG;
11121 else
11123 /* FIXME: We could make this work by pushing a register
11124 around the addition and comparison. */
11125 sorry ("-fsplit-stack does not support 3 register parameters");
11126 return INVALID_REGNUM;
11131 /* A SYMBOL_REF for the function which allocates new stack space for
11132 -fsplit-stack. */
11134 static GTY(()) rtx split_stack_fn;
11136 /* A SYMBOL_REF for the __morestack_large_model function, used with
11137 the large code model. */
11139 static GTY(()) rtx split_stack_fn_large;
11141 /* Handle -fsplit-stack. These are the first instructions in the
11142 function, even before the regular prologue. */
11144 void
11145 ix86_expand_split_stack_prologue (void)
11147 struct ix86_frame frame;
11148 HOST_WIDE_INT allocate;
11149 unsigned HOST_WIDE_INT args_size;
11150 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11151 rtx scratch_reg = NULL_RTX;
11152 rtx varargs_label = NULL_RTX;
11153 rtx fn;
11155 gcc_assert (flag_split_stack && reload_completed);
11157 ix86_finalize_stack_realign_flags ();
11158 ix86_compute_frame_layout (&frame);
11159 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11161 /* This is the label we will branch to if we have enough stack
11162 space. We expect the basic block reordering pass to reverse this
11163 branch if optimizing, so that we branch in the unlikely case. */
11164 label = gen_label_rtx ();
11166 /* We need to compare the stack pointer minus the frame size with
11167 the stack boundary in the TCB. The stack boundary always gives
11168 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11169 can compare directly. Otherwise we need to do an addition. */
11171 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11172 UNSPEC_STACK_CHECK);
11173 limit = gen_rtx_CONST (Pmode, limit);
11174 limit = gen_rtx_MEM (Pmode, limit);
11175 if (allocate < SPLIT_STACK_AVAILABLE)
11176 current = stack_pointer_rtx;
11177 else
11179 unsigned int scratch_regno;
11180 rtx offset;
11182 /* We need a scratch register to hold the stack pointer minus
11183 the required frame size. Since this is the very start of the
11184 function, the scratch register can be any caller-saved
11185 register which is not used for parameters. */
11186 offset = GEN_INT (- allocate);
11187 scratch_regno = split_stack_prologue_scratch_regno ();
11188 if (scratch_regno == INVALID_REGNUM)
11189 return;
11190 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11191 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11193 /* We don't use ix86_gen_add3 in this case because it will
11194 want to split to lea, but when not optimizing the insn
11195 will not be split after this point. */
11196 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11197 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11198 offset)));
11200 else
11202 emit_move_insn (scratch_reg, offset);
11203 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11204 stack_pointer_rtx));
11206 current = scratch_reg;
11209 ix86_expand_branch (GEU, current, limit, label);
11210 jump_insn = get_last_insn ();
11211 JUMP_LABEL (jump_insn) = label;
11213 /* Mark the jump as very likely to be taken. */
11214 add_reg_note (jump_insn, REG_BR_PROB,
11215 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11217 if (split_stack_fn == NULL_RTX)
11218 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11219 fn = split_stack_fn;
11221 /* Get more stack space. We pass in the desired stack space and the
11222 size of the arguments to copy to the new stack. In 32-bit mode
11223 we push the parameters; __morestack will return on a new stack
11224 anyhow. In 64-bit mode we pass the parameters in r10 and
11225 r11. */
11226 allocate_rtx = GEN_INT (allocate);
11227 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11228 call_fusage = NULL_RTX;
11229 if (TARGET_64BIT)
11231 rtx reg10, reg11;
11233 reg10 = gen_rtx_REG (Pmode, R10_REG);
11234 reg11 = gen_rtx_REG (Pmode, R11_REG);
11236 /* If this function uses a static chain, it will be in %r10.
11237 Preserve it across the call to __morestack. */
11238 if (DECL_STATIC_CHAIN (cfun->decl))
11240 rtx rax;
11242 rax = gen_rtx_REG (word_mode, AX_REG);
11243 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11244 use_reg (&call_fusage, rax);
11247 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11249 HOST_WIDE_INT argval;
11251 gcc_assert (Pmode == DImode);
11252 /* When using the large model we need to load the address
11253 into a register, and we've run out of registers. So we
11254 switch to a different calling convention, and we call a
11255 different function: __morestack_large_model. We pass the
11256 argument size in the upper 32 bits of r10 and pass the
11257 frame size in the lower 32 bits. */
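/* A worked example of the packing described above (illustrative values
   only): with allocate == 0x1000 and args_size == 0x20, the argval
   computation below yields
     argval == (0x20 << 32) + 0x1000 == 0x0000002000001000,
   i.e. %r10 carries the argument size in its upper 32 bits and the frame
   size in its lower 32 bits.  The two 16-bit shifts are equivalent to a
   single shift by 32 on hosts with a 64-bit HOST_WIDE_INT.  */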
11258 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11259 gcc_assert ((args_size & 0xffffffff) == args_size);
11261 if (split_stack_fn_large == NULL_RTX)
11262 split_stack_fn_large =
11263 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11265 if (ix86_cmodel == CM_LARGE_PIC)
11267 rtx label, x;
11269 label = gen_label_rtx ();
11270 emit_label (label);
11271 LABEL_PRESERVE_P (label) = 1;
11272 emit_insn (gen_set_rip_rex64 (reg10, label));
11273 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11274 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11275 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11276 UNSPEC_GOT);
11277 x = gen_rtx_CONST (Pmode, x);
11278 emit_move_insn (reg11, x);
11279 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11280 x = gen_const_mem (Pmode, x);
11281 emit_move_insn (reg11, x);
11283 else
11284 emit_move_insn (reg11, split_stack_fn_large);
11286 fn = reg11;
11288 argval = ((args_size << 16) << 16) + allocate;
11289 emit_move_insn (reg10, GEN_INT (argval));
11291 else
11293 emit_move_insn (reg10, allocate_rtx);
11294 emit_move_insn (reg11, GEN_INT (args_size));
11295 use_reg (&call_fusage, reg11);
11298 use_reg (&call_fusage, reg10);
11300 else
11302 emit_insn (gen_push (GEN_INT (args_size)));
11303 emit_insn (gen_push (allocate_rtx));
11305 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11306 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11307 NULL_RTX, false);
11308 add_function_usage_to (call_insn, call_fusage);
11310 /* In order to make call/return prediction work right, we now need
11311 to execute a return instruction. See
11312 libgcc/config/i386/morestack.S for the details on how this works.
11314 For flow purposes gcc must not see this as a return
11315 instruction--we need control flow to continue at the subsequent
11316 label. Therefore, we use an unspec. */
11317 gcc_assert (crtl->args.pops_args < 65536);
11318 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11320 /* If we are in 64-bit mode and this function uses a static chain,
11321 we saved %r10 in %rax before calling __morestack. */
11322 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11323 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11324 gen_rtx_REG (word_mode, AX_REG));
11326 /* If this function calls va_start, we need to store a pointer to
11327 the arguments on the old stack, because they may not have been
11328 all copied to the new stack. At this point the old stack can be
11329 found at the frame pointer value used by __morestack, because
11330 __morestack has set that up before calling back to us. Here we
11331 store that pointer in a scratch register, and in
11332 ix86_expand_prologue we store the scratch register in a stack
11333 slot. */
11334 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11336 unsigned int scratch_regno;
11337 rtx frame_reg;
11338 int words;
11340 scratch_regno = split_stack_prologue_scratch_regno ();
11341 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11342 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11344 /* 64-bit:
11345 fp -> old fp value
11346 return address within this function
11347 return address of caller of this function
11348 stack arguments
11349 So we add three words to get to the stack arguments.
11351 32-bit:
11352 fp -> old fp value
11353 return address within this function
11354 first argument to __morestack
11355 second argument to __morestack
11356 return address of caller of this function
11357 stack arguments
11358 So we add five words to get to the stack arguments.
11360 words = TARGET_64BIT ? 3 : 5;
11361 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11362 gen_rtx_PLUS (Pmode, frame_reg,
11363 GEN_INT (words * UNITS_PER_WORD))));
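/* A rough sketch of the insn emitted above for the 64-bit case
   (illustrative, assuming UNITS_PER_WORD == 8 and %r11 as the scratch
   register returned by split_stack_prologue_scratch_regno):

     leaq  24(%rbp), %r11

   so the scratch register ends up pointing at the first stack argument
   in the caller's frame.  */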
11365 varargs_label = gen_label_rtx ();
11366 emit_jump_insn (gen_jump (varargs_label));
11367 JUMP_LABEL (get_last_insn ()) = varargs_label;
11369 emit_barrier ();
11372 emit_label (label);
11373 LABEL_NUSES (label) = 1;
11375 /* If this function calls va_start, we now have to set the scratch
11376 register for the case where we do not call __morestack. In this
11377 case we need to set it based on the stack pointer. */
11378 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11380 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11381 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11382 GEN_INT (UNITS_PER_WORD))));
11384 emit_label (varargs_label);
11385 LABEL_NUSES (varargs_label) = 1;
11389 /* We may have to tell the dataflow pass that the split stack prologue
11390 is initializing a scratch register. */
11392 static void
11393 ix86_live_on_entry (bitmap regs)
11395 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11397 gcc_assert (flag_split_stack);
11398 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11402 /* Determine if OP is a suitable SUBREG RTX for an address. */
11404 static bool
11405 ix86_address_subreg_operand (rtx op)
11407 enum machine_mode mode;
11409 if (!REG_P (op))
11410 return false;
11412 mode = GET_MODE (op);
11414 if (GET_MODE_CLASS (mode) != MODE_INT)
11415 return false;
11417 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11418 failures when the register is one word out of a two word structure. */
11419 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11420 return false;
11422 /* Allow only SUBREGs of non-eliminable hard registers. */
11423 return register_no_elim_operand (op, mode);
11426 /* Extract the parts of an RTL expression that is a valid memory address
11427 for an instruction. Return 0 if the structure of the address is
11428 grossly off. Return -1 if the address contains ASHIFT, so it is not
11429 strictly valid, but is still used for computing the length of the lea instruction. */
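/* Illustrative examples of the decomposition (assumed shapes, not
   exhaustive):
     (plus:SI (reg:SI bx) (const_int 16))
       -> base = bx, index = NULL, scale = 1, disp = 16, return 1;
     (plus:SI (mult:SI (reg:SI si) (const_int 4)) (reg:SI bx))
       -> base = bx, index = si, scale = 4, disp = NULL, return 1;
     (ashift:SI (reg:SI si) (const_int 2))
       -> index = si, scale = 4, return -1 (lea length computation only).  */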
11431 int
11432 ix86_decompose_address (rtx addr, struct ix86_address *out)
11434 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11435 rtx base_reg, index_reg;
11436 HOST_WIDE_INT scale = 1;
11437 rtx scale_rtx = NULL_RTX;
11438 rtx tmp;
11439 int retval = 1;
11440 enum ix86_address_seg seg = SEG_DEFAULT;
11442 /* Allow zero-extended SImode addresses; they will be
11443 emitted with the addr32 prefix. */
11444 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11446 if (GET_CODE (addr) == ZERO_EXTEND
11447 && GET_MODE (XEXP (addr, 0)) == SImode)
11448 addr = XEXP (addr, 0);
11449 else if (GET_CODE (addr) == AND
11450 && const_32bit_mask (XEXP (addr, 1), DImode))
11452 addr = XEXP (addr, 0);
11454 /* Adjust SUBREGs. */
11455 if (GET_CODE (addr) == SUBREG
11456 && GET_MODE (SUBREG_REG (addr)) == SImode)
11457 addr = SUBREG_REG (addr);
11458 else if (GET_MODE (addr) == DImode)
11459 addr = gen_rtx_SUBREG (SImode, addr, 0);
11460 else
11461 return 0;
11465 if (REG_P (addr))
11466 base = addr;
11467 else if (GET_CODE (addr) == SUBREG)
11469 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11470 base = addr;
11471 else
11472 return 0;
11474 else if (GET_CODE (addr) == PLUS)
11476 rtx addends[4], op;
11477 int n = 0, i;
11479 op = addr;
11482 if (n >= 4)
11483 return 0;
11484 addends[n++] = XEXP (op, 1);
11485 op = XEXP (op, 0);
11487 while (GET_CODE (op) == PLUS);
11488 if (n >= 4)
11489 return 0;
11490 addends[n] = op;
11492 for (i = n; i >= 0; --i)
11494 op = addends[i];
11495 switch (GET_CODE (op))
11497 case MULT:
11498 if (index)
11499 return 0;
11500 index = XEXP (op, 0);
11501 scale_rtx = XEXP (op, 1);
11502 break;
11504 case ASHIFT:
11505 if (index)
11506 return 0;
11507 index = XEXP (op, 0);
11508 tmp = XEXP (op, 1);
11509 if (!CONST_INT_P (tmp))
11510 return 0;
11511 scale = INTVAL (tmp);
11512 if ((unsigned HOST_WIDE_INT) scale > 3)
11513 return 0;
11514 scale = 1 << scale;
11515 break;
11517 case ZERO_EXTEND:
11518 op = XEXP (op, 0);
11519 if (GET_CODE (op) != UNSPEC)
11520 return 0;
11521 /* FALLTHRU */
11523 case UNSPEC:
11524 if (XINT (op, 1) == UNSPEC_TP
11525 && TARGET_TLS_DIRECT_SEG_REFS
11526 && seg == SEG_DEFAULT)
11527 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11528 else
11529 return 0;
11530 break;
11532 case SUBREG:
11533 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11534 return 0;
11535 /* FALLTHRU */
11537 case REG:
11538 if (!base)
11539 base = op;
11540 else if (!index)
11541 index = op;
11542 else
11543 return 0;
11544 break;
11546 case CONST:
11547 case CONST_INT:
11548 case SYMBOL_REF:
11549 case LABEL_REF:
11550 if (disp)
11551 return 0;
11552 disp = op;
11553 break;
11555 default:
11556 return 0;
11560 else if (GET_CODE (addr) == MULT)
11562 index = XEXP (addr, 0); /* index*scale */
11563 scale_rtx = XEXP (addr, 1);
11565 else if (GET_CODE (addr) == ASHIFT)
11567 /* We're called for lea too, which implements ashift on occasion. */
11568 index = XEXP (addr, 0);
11569 tmp = XEXP (addr, 1);
11570 if (!CONST_INT_P (tmp))
11571 return 0;
11572 scale = INTVAL (tmp);
11573 if ((unsigned HOST_WIDE_INT) scale > 3)
11574 return 0;
11575 scale = 1 << scale;
11576 retval = -1;
11578 else
11579 disp = addr; /* displacement */
11581 if (index)
11583 if (REG_P (index))
11585 else if (GET_CODE (index) == SUBREG
11586 && ix86_address_subreg_operand (SUBREG_REG (index)))
11588 else
11589 return 0;
11592 /* Address override works only on the (%reg) part of %fs:(%reg). */
11593 if (seg != SEG_DEFAULT
11594 && ((base && GET_MODE (base) != word_mode)
11595 || (index && GET_MODE (index) != word_mode)))
11596 return 0;
11598 /* Extract the integral value of scale. */
11599 if (scale_rtx)
11601 if (!CONST_INT_P (scale_rtx))
11602 return 0;
11603 scale = INTVAL (scale_rtx);
11606 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11607 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11609 /* Avoid useless 0 displacement. */
11610 if (disp == const0_rtx && (base || index))
11611 disp = NULL_RTX;
11613 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11614 if (base_reg && index_reg && scale == 1
11615 && (index_reg == arg_pointer_rtx
11616 || index_reg == frame_pointer_rtx
11617 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11619 rtx tmp;
11620 tmp = base, base = index, index = tmp;
11621 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11624 /* Special case: %ebp cannot be encoded as a base without a displacement.
11625 Similarly %r13. */
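/* (Background note, assuming the standard ModR/M encoding rules: with
   mod == 00, a base field of 101 selects disp32 (or RIP-relative) rather
   than %ebp/%r13, so using those registers as a base always needs at
   least a zero disp8; hence the const0_rtx assignment below.)  */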
11626 if (!disp
11627 && base_reg
11628 && (base_reg == hard_frame_pointer_rtx
11629 || base_reg == frame_pointer_rtx
11630 || base_reg == arg_pointer_rtx
11631 || (REG_P (base_reg)
11632 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11633 || REGNO (base_reg) == R13_REG))))
11634 disp = const0_rtx;
11636 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11637 Avoid this by transforming to [%esi+0].
11638 Reload calls address legitimization without cfun defined, so we need
11639 to test cfun for being non-NULL. */
11640 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11641 && base_reg && !index_reg && !disp
11642 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11643 disp = const0_rtx;
11645 /* Special case: encode reg+reg instead of reg*2. */
11646 if (!base && index && scale == 2)
11647 base = index, base_reg = index_reg, scale = 1;
11649 /* Special case: scaling cannot be encoded without base or displacement. */
11650 if (!base && !disp && index && scale != 1)
11651 disp = const0_rtx;
11653 out->base = base;
11654 out->index = index;
11655 out->disp = disp;
11656 out->scale = scale;
11657 out->seg = seg;
11659 return retval;
11662 /* Return cost of the memory address x.
11663 For i386, it is better to use a complex address than let gcc copy
11664 the address into a reg and make a new pseudo. But not if the address
11665 requires two regs - that would mean more pseudos with longer
11666 lifetimes. */
11667 static int
11668 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11670 struct ix86_address parts;
11671 int cost = 1;
11672 int ok = ix86_decompose_address (x, &parts);
11674 gcc_assert (ok);
11676 if (parts.base && GET_CODE (parts.base) == SUBREG)
11677 parts.base = SUBREG_REG (parts.base);
11678 if (parts.index && GET_CODE (parts.index) == SUBREG)
11679 parts.index = SUBREG_REG (parts.index);
11681 /* Attempt to minimize number of registers in the address. */
11682 if ((parts.base
11683 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11684 || (parts.index
11685 && (!REG_P (parts.index)
11686 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11687 cost++;
11689 if (parts.base
11690 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11691 && parts.index
11692 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11693 && parts.base != parts.index)
11694 cost++;
11696 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11697 since its predecode logic can't detect the length of instructions
11698 and it degenerates to vector decoding. Increase the cost of such
11699 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11700 to split such addresses or even refuse such addresses at all.
11702 The following addressing modes are affected:
11703 [base+scale*index]
11704 [scale*index+disp]
11705 [base+index]
11707 The first and last case may be avoidable by explicitly coding the zero in
11708 the memory address, but I don't have an AMD-K6 machine handy to check this
11709 theory. */
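/* For example (illustrative): [%esi+%edi*2] and [%esi+%edi] both hit the
   penalty below, while [%esi+4+%edi*2] does not, since the displacement
   moves the ModR/M mod field away from 00.  */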
11711 if (TARGET_K6
11712 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11713 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11714 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11715 cost += 10;
11717 return cost;
11720 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11721 this is used to form addresses to local data when -fPIC is in
11722 use. */
11724 static bool
11725 darwin_local_data_pic (rtx disp)
11727 return (GET_CODE (disp) == UNSPEC
11728 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11731 /* Determine if a given RTX is a valid constant. We already know this
11732 satisfies CONSTANT_P. */
11734 static bool
11735 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11737 switch (GET_CODE (x))
11739 case CONST:
11740 x = XEXP (x, 0);
11742 if (GET_CODE (x) == PLUS)
11744 if (!CONST_INT_P (XEXP (x, 1)))
11745 return false;
11746 x = XEXP (x, 0);
11749 if (TARGET_MACHO && darwin_local_data_pic (x))
11750 return true;
11752 /* Only some unspecs are valid as "constants". */
11753 if (GET_CODE (x) == UNSPEC)
11754 switch (XINT (x, 1))
11756 case UNSPEC_GOT:
11757 case UNSPEC_GOTOFF:
11758 case UNSPEC_PLTOFF:
11759 return TARGET_64BIT;
11760 case UNSPEC_TPOFF:
11761 case UNSPEC_NTPOFF:
11762 x = XVECEXP (x, 0, 0);
11763 return (GET_CODE (x) == SYMBOL_REF
11764 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11765 case UNSPEC_DTPOFF:
11766 x = XVECEXP (x, 0, 0);
11767 return (GET_CODE (x) == SYMBOL_REF
11768 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11769 default:
11770 return false;
11773 /* We must have drilled down to a symbol. */
11774 if (GET_CODE (x) == LABEL_REF)
11775 return true;
11776 if (GET_CODE (x) != SYMBOL_REF)
11777 return false;
11778 /* FALLTHRU */
11780 case SYMBOL_REF:
11781 /* TLS symbols are never valid. */
11782 if (SYMBOL_REF_TLS_MODEL (x))
11783 return false;
11785 /* DLLIMPORT symbols are never valid. */
11786 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11787 && SYMBOL_REF_DLLIMPORT_P (x))
11788 return false;
11790 #if TARGET_MACHO
11791 /* mdynamic-no-pic */
11792 if (MACHO_DYNAMIC_NO_PIC_P)
11793 return machopic_symbol_defined_p (x);
11794 #endif
11795 break;
11797 case CONST_DOUBLE:
11798 if (GET_MODE (x) == TImode
11799 && x != CONST0_RTX (TImode)
11800 && !TARGET_64BIT)
11801 return false;
11802 break;
11804 case CONST_VECTOR:
11805 if (!standard_sse_constant_p (x))
11806 return false;
11808 default:
11809 break;
11812 /* Otherwise we handle everything else in the move patterns. */
11813 return true;
11816 /* Determine if it's legal to put X into the constant pool. This
11817 is not possible for the address of thread-local symbols, which
11818 is checked above. */
11820 static bool
11821 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11823 /* We can always put integral constants and vectors in memory. */
11824 switch (GET_CODE (x))
11826 case CONST_INT:
11827 case CONST_DOUBLE:
11828 case CONST_VECTOR:
11829 return false;
11831 default:
11832 break;
11834 return !ix86_legitimate_constant_p (mode, x);
11838 /* Nonzero if the constant value X is a legitimate general operand
11839 when generating PIC code. It is given that flag_pic is on and
11840 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11842 bool
11843 legitimate_pic_operand_p (rtx x)
11845 rtx inner;
11847 switch (GET_CODE (x))
11849 case CONST:
11850 inner = XEXP (x, 0);
11851 if (GET_CODE (inner) == PLUS
11852 && CONST_INT_P (XEXP (inner, 1)))
11853 inner = XEXP (inner, 0);
11855 /* Only some unspecs are valid as "constants". */
11856 if (GET_CODE (inner) == UNSPEC)
11857 switch (XINT (inner, 1))
11859 case UNSPEC_GOT:
11860 case UNSPEC_GOTOFF:
11861 case UNSPEC_PLTOFF:
11862 return TARGET_64BIT;
11863 case UNSPEC_TPOFF:
11864 x = XVECEXP (inner, 0, 0);
11865 return (GET_CODE (x) == SYMBOL_REF
11866 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11867 case UNSPEC_MACHOPIC_OFFSET:
11868 return legitimate_pic_address_disp_p (x);
11869 default:
11870 return false;
11872 /* FALLTHRU */
11874 case SYMBOL_REF:
11875 case LABEL_REF:
11876 return legitimate_pic_address_disp_p (x);
11878 default:
11879 return true;
11883 /* Determine if a given CONST RTX is a valid memory displacement
11884 in PIC mode. */
11886 bool
11887 legitimate_pic_address_disp_p (rtx disp)
11889 bool saw_plus;
11891 /* In 64bit mode we can allow direct addresses of symbols and labels
11892 when they are not dynamic symbols. */
11893 if (TARGET_64BIT)
11895 rtx op0 = disp, op1;
11897 switch (GET_CODE (disp))
11899 case LABEL_REF:
11900 return true;
11902 case CONST:
11903 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11904 break;
11905 op0 = XEXP (XEXP (disp, 0), 0);
11906 op1 = XEXP (XEXP (disp, 0), 1);
11907 if (!CONST_INT_P (op1)
11908 || INTVAL (op1) >= 16*1024*1024
11909 || INTVAL (op1) < -16*1024*1024)
11910 break;
11911 if (GET_CODE (op0) == LABEL_REF)
11912 return true;
11913 if (GET_CODE (op0) == CONST
11914 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11915 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11916 return true;
11917 if (GET_CODE (op0) == UNSPEC
11918 && XINT (op0, 1) == UNSPEC_PCREL)
11919 return true;
11920 if (GET_CODE (op0) != SYMBOL_REF)
11921 break;
11922 /* FALLTHRU */
11924 case SYMBOL_REF:
11925 /* TLS references should always be enclosed in UNSPEC. */
11926 if (SYMBOL_REF_TLS_MODEL (op0))
11927 return false;
11928 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11929 && ix86_cmodel != CM_LARGE_PIC)
11930 return true;
11931 break;
11933 default:
11934 break;
11937 if (GET_CODE (disp) != CONST)
11938 return false;
11939 disp = XEXP (disp, 0);
11941 if (TARGET_64BIT)
11943 /* It is unsafe to allow PLUS expressions. This limits the allowed
11944 distance of GOT tables. We should not need these anyway. */
11945 if (GET_CODE (disp) != UNSPEC
11946 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11947 && XINT (disp, 1) != UNSPEC_GOTOFF
11948 && XINT (disp, 1) != UNSPEC_PCREL
11949 && XINT (disp, 1) != UNSPEC_PLTOFF))
11950 return false;
11952 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11953 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11954 return false;
11955 return true;
11958 saw_plus = false;
11959 if (GET_CODE (disp) == PLUS)
11961 if (!CONST_INT_P (XEXP (disp, 1)))
11962 return false;
11963 disp = XEXP (disp, 0);
11964 saw_plus = true;
11967 if (TARGET_MACHO && darwin_local_data_pic (disp))
11968 return true;
11970 if (GET_CODE (disp) != UNSPEC)
11971 return false;
11973 switch (XINT (disp, 1))
11975 case UNSPEC_GOT:
11976 if (saw_plus)
11977 return false;
11978 /* We need to check for both symbols and labels because VxWorks loads
11979 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11980 details. */
11981 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11982 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11983 case UNSPEC_GOTOFF:
11984 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11985 While the ABI also specifies a 32bit relocation, we don't produce it
11986 in the small PIC model at all. */
11987 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11988 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11989 && !TARGET_64BIT)
11990 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11991 return false;
11992 case UNSPEC_GOTTPOFF:
11993 case UNSPEC_GOTNTPOFF:
11994 case UNSPEC_INDNTPOFF:
11995 if (saw_plus)
11996 return false;
11997 disp = XVECEXP (disp, 0, 0);
11998 return (GET_CODE (disp) == SYMBOL_REF
11999 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12000 case UNSPEC_NTPOFF:
12001 disp = XVECEXP (disp, 0, 0);
12002 return (GET_CODE (disp) == SYMBOL_REF
12003 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12004 case UNSPEC_DTPOFF:
12005 disp = XVECEXP (disp, 0, 0);
12006 return (GET_CODE (disp) == SYMBOL_REF
12007 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12010 return false;
12013 /* Recognizes RTL expressions that are valid memory addresses for an
12014 instruction. The MODE argument is the machine mode for the MEM
12015 expression that wants to use this address.
12017 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12018 convert common non-canonical forms to canonical form so that they will
12019 be recognized. */
12021 static bool
12022 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12023 rtx addr, bool strict)
12025 struct ix86_address parts;
12026 rtx base, index, disp;
12027 HOST_WIDE_INT scale;
12029 /* Since a constant address in x32 is sign extended to 64 bits,
12030 we have to reject addresses from 0x80000000 to 0xffffffff. */
12031 if (TARGET_X32
12032 && CONST_INT_P (addr)
12033 && INTVAL (addr) < 0)
12034 return false;
12036 if (ix86_decompose_address (addr, &parts) <= 0)
12037 /* Decomposition failed. */
12038 return false;
12040 base = parts.base;
12041 index = parts.index;
12042 disp = parts.disp;
12043 scale = parts.scale;
12045 /* Validate base register. */
12046 if (base)
12048 rtx reg;
12050 if (REG_P (base))
12051 reg = base;
12052 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12053 reg = SUBREG_REG (base);
12054 else
12055 /* Base is not a register. */
12056 return false;
12058 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12059 return false;
12061 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12062 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12063 /* Base is not valid. */
12064 return false;
12067 /* Validate index register. */
12068 if (index)
12070 rtx reg;
12072 if (REG_P (index))
12073 reg = index;
12074 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12075 reg = SUBREG_REG (index);
12076 else
12077 /* Index is not a register. */
12078 return false;
12080 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12081 return false;
12083 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12084 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12085 /* Index is not valid. */
12086 return false;
12089 /* Index and base should have the same mode. */
12090 if (base && index
12091 && GET_MODE (base) != GET_MODE (index))
12092 return false;
12094 /* Validate scale factor. */
12095 if (scale != 1)
12097 if (!index)
12098 /* Scale without index. */
12099 return false;
12101 if (scale != 2 && scale != 4 && scale != 8)
12102 /* Scale is not a valid multiplier. */
12103 return false;
12106 /* Validate displacement. */
12107 if (disp)
12109 if (GET_CODE (disp) == CONST
12110 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12111 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12112 switch (XINT (XEXP (disp, 0), 1))
12114 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12115 used. While the ABI also specifies 32bit relocations, we don't produce
12116 them at all and use IP relative addressing instead. */
12117 case UNSPEC_GOT:
12118 case UNSPEC_GOTOFF:
12119 gcc_assert (flag_pic);
12120 if (!TARGET_64BIT)
12121 goto is_legitimate_pic;
12123 /* 64bit address unspec. */
12124 return false;
12126 case UNSPEC_GOTPCREL:
12127 case UNSPEC_PCREL:
12128 gcc_assert (flag_pic);
12129 goto is_legitimate_pic;
12131 case UNSPEC_GOTTPOFF:
12132 case UNSPEC_GOTNTPOFF:
12133 case UNSPEC_INDNTPOFF:
12134 case UNSPEC_NTPOFF:
12135 case UNSPEC_DTPOFF:
12136 break;
12138 case UNSPEC_STACK_CHECK:
12139 gcc_assert (flag_split_stack);
12140 break;
12142 default:
12143 /* Invalid address unspec. */
12144 return false;
12147 else if (SYMBOLIC_CONST (disp)
12148 && (flag_pic
12149 || (TARGET_MACHO
12150 #if TARGET_MACHO
12151 && MACHOPIC_INDIRECT
12152 && !machopic_operand_p (disp)
12153 #endif
12157 is_legitimate_pic:
12158 if (TARGET_64BIT && (index || base))
12160 /* foo@dtpoff(%rX) is ok. */
12161 if (GET_CODE (disp) != CONST
12162 || GET_CODE (XEXP (disp, 0)) != PLUS
12163 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12164 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12165 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12166 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12167 /* Non-constant pic memory reference. */
12168 return false;
12170 else if ((!TARGET_MACHO || flag_pic)
12171 && ! legitimate_pic_address_disp_p (disp))
12172 /* Displacement is an invalid pic construct. */
12173 return false;
12174 #if TARGET_MACHO
12175 else if (MACHO_DYNAMIC_NO_PIC_P
12176 && !ix86_legitimate_constant_p (Pmode, disp))
12177 /* displacement must be referenced via a non_lazy_pointer */
12178 return false;
12179 #endif
12181 /* This code used to verify that a symbolic pic displacement
12182 includes the pic_offset_table_rtx register.
12184 While this is a good idea, unfortunately these constructs may
12185 be created by the "adds using lea" optimization for incorrect
12186 code like:
12188 int a;
12189 int foo(int i)
12191 return *(&a+i);
12194 This code is nonsensical, but results in addressing the
12195 GOT table with the pic_offset_table_rtx base. We can't
12196 just refuse it easily, since it gets matched by the
12197 "addsi3" pattern, which later gets split to lea in the
12198 case where the output register differs from the input. While this
12199 could be handled by a separate addsi pattern for this case
12200 that never results in lea, disabling this test seems to be the
12201 easier and correct fix for the crash. */
12203 else if (GET_CODE (disp) != LABEL_REF
12204 && !CONST_INT_P (disp)
12205 && (GET_CODE (disp) != CONST
12206 || !ix86_legitimate_constant_p (Pmode, disp))
12207 && (GET_CODE (disp) != SYMBOL_REF
12208 || !ix86_legitimate_constant_p (Pmode, disp)))
12209 /* Displacement is not constant. */
12210 return false;
12211 else if (TARGET_64BIT
12212 && !x86_64_immediate_operand (disp, VOIDmode))
12213 /* Displacement is out of range. */
12214 return false;
12217 /* Everything looks valid. */
12218 return true;
12221 /* Determine if a given RTX is a valid constant address. */
12223 bool
12224 constant_address_p (rtx x)
12226 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12229 /* Return a unique alias set for the GOT. */
12231 static alias_set_type
12232 ix86_GOT_alias_set (void)
12234 static alias_set_type set = -1;
12235 if (set == -1)
12236 set = new_alias_set ();
12237 return set;
12240 /* Return a legitimate reference for ORIG (an address) using the
12241 register REG. If REG is 0, a new pseudo is generated.
12243 There are two types of references that must be handled:
12245 1. Global data references must load the address from the GOT, via
12246 the PIC reg. An insn is emitted to do this load, and the reg is
12247 returned.
12249 2. Static data references, constant pool addresses, and code labels
12250 compute the address as an offset from the GOT, whose base is in
12251 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12252 differentiate them from global data objects. The returned
12253 address is the PIC reg + an unspec constant.
12255 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12256 reg also appears in the address. */
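/* A sketch of the addresses produced (assumed shapes, 32-bit case, with
   %ebx holding pic_offset_table_rtx):

     global data:  (mem (plus (reg ebx) (const (unspec [sym] UNSPEC_GOT))))
     local data:   (plus (reg ebx) (const (unspec [sym] UNSPEC_GOTOFF)))

   The first loads the object's address from the GOT; the second forms the
   address directly as an offset from the GOT base.  */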
12258 static rtx
12259 legitimize_pic_address (rtx orig, rtx reg)
12261 rtx addr = orig;
12262 rtx new_rtx = orig;
12263 rtx base;
12265 #if TARGET_MACHO
12266 if (TARGET_MACHO && !TARGET_64BIT)
12268 if (reg == 0)
12269 reg = gen_reg_rtx (Pmode);
12270 /* Use the generic Mach-O PIC machinery. */
12271 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12273 #endif
12275 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12276 new_rtx = addr;
12277 else if (TARGET_64BIT
12278 && ix86_cmodel != CM_SMALL_PIC
12279 && gotoff_operand (addr, Pmode))
12281 rtx tmpreg;
12282 /* This symbol may be referenced via a displacement from the PIC
12283 base address (@GOTOFF). */
12285 if (reload_in_progress)
12286 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12287 if (GET_CODE (addr) == CONST)
12288 addr = XEXP (addr, 0);
12289 if (GET_CODE (addr) == PLUS)
12291 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12292 UNSPEC_GOTOFF);
12293 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12295 else
12296 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12297 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12298 if (!reg)
12299 tmpreg = gen_reg_rtx (Pmode);
12300 else
12301 tmpreg = reg;
12302 emit_move_insn (tmpreg, new_rtx);
12304 if (reg != 0)
12306 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12307 tmpreg, 1, OPTAB_DIRECT);
12308 new_rtx = reg;
12310 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12312 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12314 /* This symbol may be referenced via a displacement from the PIC
12315 base address (@GOTOFF). */
12317 if (reload_in_progress)
12318 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12319 if (GET_CODE (addr) == CONST)
12320 addr = XEXP (addr, 0);
12321 if (GET_CODE (addr) == PLUS)
12323 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12324 UNSPEC_GOTOFF);
12325 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12327 else
12328 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12329 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12330 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12332 if (reg != 0)
12334 emit_move_insn (reg, new_rtx);
12335 new_rtx = reg;
12338 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12339 /* We can't use @GOTOFF for text labels on VxWorks;
12340 see gotoff_operand. */
12341 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12343 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12345 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12346 return legitimize_dllimport_symbol (addr, true);
12347 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12348 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12349 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12351 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12352 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12356 /* For x64 PE-COFF there is no GOT table, so we use the address
12357 directly. */
12358 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12360 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12361 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12363 if (reg == 0)
12364 reg = gen_reg_rtx (Pmode);
12365 emit_move_insn (reg, new_rtx);
12366 new_rtx = reg;
12368 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12370 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12371 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12372 new_rtx = gen_const_mem (Pmode, new_rtx);
12373 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12375 if (reg == 0)
12376 reg = gen_reg_rtx (Pmode);
12377 /* Use gen_movsi directly, otherwise the address is loaded
12378 into a register for CSE. We don't want to CSE these addresses;
12379 instead we CSE addresses from the GOT table, so skip this. */
12380 emit_insn (gen_movsi (reg, new_rtx));
12381 new_rtx = reg;
12383 else
12385 /* This symbol must be referenced via a load from the
12386 Global Offset Table (@GOT). */
12388 if (reload_in_progress)
12389 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12390 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12391 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12392 if (TARGET_64BIT)
12393 new_rtx = force_reg (Pmode, new_rtx);
12394 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12395 new_rtx = gen_const_mem (Pmode, new_rtx);
12396 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12398 if (reg == 0)
12399 reg = gen_reg_rtx (Pmode);
12400 emit_move_insn (reg, new_rtx);
12401 new_rtx = reg;
12404 else
12406 if (CONST_INT_P (addr)
12407 && !x86_64_immediate_operand (addr, VOIDmode))
12409 if (reg)
12411 emit_move_insn (reg, addr);
12412 new_rtx = reg;
12414 else
12415 new_rtx = force_reg (Pmode, addr);
12417 else if (GET_CODE (addr) == CONST)
12419 addr = XEXP (addr, 0);
12421 /* We must match stuff we generate before. Assume the only
12422 unspecs that can get here are ours. Not that we could do
12423 anything with them anyway.... */
12424 if (GET_CODE (addr) == UNSPEC
12425 || (GET_CODE (addr) == PLUS
12426 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12427 return orig;
12428 gcc_assert (GET_CODE (addr) == PLUS);
12430 if (GET_CODE (addr) == PLUS)
12432 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12434 /* Check first to see if this is a constant offset from a @GOTOFF
12435 symbol reference. */
12436 if (gotoff_operand (op0, Pmode)
12437 && CONST_INT_P (op1))
12439 if (!TARGET_64BIT)
12441 if (reload_in_progress)
12442 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12443 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12444 UNSPEC_GOTOFF);
12445 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12446 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12447 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12449 if (reg != 0)
12451 emit_move_insn (reg, new_rtx);
12452 new_rtx = reg;
12455 else
12457 if (INTVAL (op1) < -16*1024*1024
12458 || INTVAL (op1) >= 16*1024*1024)
12460 if (!x86_64_immediate_operand (op1, Pmode))
12461 op1 = force_reg (Pmode, op1);
12462 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12466 else
12468 base = legitimize_pic_address (XEXP (addr, 0), reg);
12469 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12470 base == reg ? NULL_RTX : reg);
12472 if (CONST_INT_P (new_rtx))
12473 new_rtx = plus_constant (base, INTVAL (new_rtx));
12474 else
12476 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12478 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12479 new_rtx = XEXP (new_rtx, 1);
12481 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12486 return new_rtx;
12489 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12491 static rtx
12492 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12494 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12496 if (GET_MODE (tp) != tp_mode)
12498 gcc_assert (GET_MODE (tp) == SImode);
12499 gcc_assert (tp_mode == DImode);
12501 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12504 if (to_reg)
12505 tp = copy_to_mode_reg (tp_mode, tp);
12507 return tp;
12510 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12512 static GTY(()) rtx ix86_tls_symbol;
12514 static rtx
12515 ix86_tls_get_addr (void)
12517 if (!ix86_tls_symbol)
12519 const char *sym
12520 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12521 ? "___tls_get_addr" : "__tls_get_addr");
12523 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12526 return ix86_tls_symbol;
12529 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12531 static GTY(()) rtx ix86_tls_module_base_symbol;
12533 static rtx
12534 ix86_tls_module_base (void)
12536 if (!ix86_tls_module_base_symbol)
12538 ix86_tls_module_base_symbol
12539 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12541 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12542 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12545 return ix86_tls_module_base_symbol;
12548 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12549 false if we expect this to be used for a memory address and true if
12550 we expect to load the address into a register. */
12552 static rtx
12553 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12555 rtx dest, base, off;
12556 rtx pic = NULL_RTX, tp = NULL_RTX;
12557 enum machine_mode tp_mode = Pmode;
12558 int type;
12560 switch (model)
12562 case TLS_MODEL_GLOBAL_DYNAMIC:
12563 dest = gen_reg_rtx (Pmode);
12565 if (!TARGET_64BIT)
12567 if (flag_pic)
12568 pic = pic_offset_table_rtx;
12569 else
12571 pic = gen_reg_rtx (Pmode);
12572 emit_insn (gen_set_got (pic));
12576 if (TARGET_GNU2_TLS)
12578 if (TARGET_64BIT)
12579 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12580 else
12581 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12583 tp = get_thread_pointer (Pmode, true);
12584 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12586 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12588 else
12590 rtx caddr = ix86_tls_get_addr ();
12592 if (TARGET_64BIT)
12594 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12596 start_sequence ();
12597 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12598 caddr));
12599 insns = get_insns ();
12600 end_sequence ();
12602 RTL_CONST_CALL_P (insns) = 1;
12603 emit_libcall_block (insns, dest, rax, x);
12605 else
12606 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12608 break;
12610 case TLS_MODEL_LOCAL_DYNAMIC:
12611 base = gen_reg_rtx (Pmode);
12613 if (!TARGET_64BIT)
12615 if (flag_pic)
12616 pic = pic_offset_table_rtx;
12617 else
12619 pic = gen_reg_rtx (Pmode);
12620 emit_insn (gen_set_got (pic));
12624 if (TARGET_GNU2_TLS)
12626 rtx tmp = ix86_tls_module_base ();
12628 if (TARGET_64BIT)
12629 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12630 else
12631 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12633 tp = get_thread_pointer (Pmode, true);
12634 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12635 gen_rtx_MINUS (Pmode, tmp, tp));
12637 else
12639 rtx caddr = ix86_tls_get_addr ();
12641 if (TARGET_64BIT)
12643 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12645 start_sequence ();
12646 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12647 caddr));
12648 insns = get_insns ();
12649 end_sequence ();
12651 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12652 share the LD_BASE result with other LD model accesses. */
12653 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12654 UNSPEC_TLS_LD_BASE);
12656 RTL_CONST_CALL_P (insns) = 1;
12657 emit_libcall_block (insns, base, rax, eqv);
12659 else
12660 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12663 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12664 off = gen_rtx_CONST (Pmode, off);
12666 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12668 if (TARGET_GNU2_TLS)
12670 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12672 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12674 break;
12676 case TLS_MODEL_INITIAL_EXEC:
12677 if (TARGET_64BIT)
12679 if (TARGET_SUN_TLS)
12681 /* The Sun linker took the AMD64 TLS spec literally
12682 and can only handle %rax as destination of the
12683 initial executable code sequence. */
12685 dest = gen_reg_rtx (Pmode);
12686 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12687 return dest;
12690 /* Generate DImode references to avoid %fs:(%reg32)
12691 problems and linker IE->LE relaxation bug. */
12692 tp_mode = DImode;
12693 pic = NULL;
12694 type = UNSPEC_GOTNTPOFF;
12696 else if (flag_pic)
12698 if (reload_in_progress)
12699 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12700 pic = pic_offset_table_rtx;
12701 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12703 else if (!TARGET_ANY_GNU_TLS)
12705 pic = gen_reg_rtx (Pmode);
12706 emit_insn (gen_set_got (pic));
12707 type = UNSPEC_GOTTPOFF;
12709 else
12711 pic = NULL;
12712 type = UNSPEC_INDNTPOFF;
12715 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12716 off = gen_rtx_CONST (tp_mode, off);
12717 if (pic)
12718 off = gen_rtx_PLUS (tp_mode, pic, off);
12719 off = gen_const_mem (tp_mode, off);
12720 set_mem_alias_set (off, ix86_GOT_alias_set ());
12722 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12724 base = get_thread_pointer (tp_mode,
12725 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12726 off = force_reg (tp_mode, off);
12727 return gen_rtx_PLUS (tp_mode, base, off);
12729 else
12731 base = get_thread_pointer (Pmode, true);
12732 dest = gen_reg_rtx (Pmode);
12733 emit_insn (ix86_gen_sub3 (dest, base, off));
12735 break;
12737 case TLS_MODEL_LOCAL_EXEC:
12738 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12739 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12740 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12741 off = gen_rtx_CONST (Pmode, off);
12743 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12745 base = get_thread_pointer (Pmode,
12746 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12747 return gen_rtx_PLUS (Pmode, base, off);
12749 else
12751 base = get_thread_pointer (Pmode, true);
12752 dest = gen_reg_rtx (Pmode);
12753 emit_insn (ix86_gen_sub3 (dest, base, off));
12755 break;
12757 default:
12758 gcc_unreachable ();
12761 return dest;
12764 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12765 to symbol DECL. */
12767 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12768 htab_t dllimport_map;
12770 static tree
12771 get_dllimport_decl (tree decl)
12773 struct tree_map *h, in;
12774 void **loc;
12775 const char *name;
12776 const char *prefix;
12777 size_t namelen, prefixlen;
12778 char *imp_name;
12779 tree to;
12780 rtx rtl;
12782 if (!dllimport_map)
12783 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12785 in.hash = htab_hash_pointer (decl);
12786 in.base.from = decl;
12787 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12788 h = (struct tree_map *) *loc;
12789 if (h)
12790 return h->to;
12792 *loc = h = ggc_alloc_tree_map ();
12793 h->hash = in.hash;
12794 h->base.from = decl;
12795 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12796 VAR_DECL, NULL, ptr_type_node);
12797 DECL_ARTIFICIAL (to) = 1;
12798 DECL_IGNORED_P (to) = 1;
12799 DECL_EXTERNAL (to) = 1;
12800 TREE_READONLY (to) = 1;
12802 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12803 name = targetm.strip_name_encoding (name);
12804 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12805 ? "*__imp_" : "*__imp__";
12806 namelen = strlen (name);
12807 prefixlen = strlen (prefix);
12808 imp_name = (char *) alloca (namelen + prefixlen + 1);
12809 memcpy (imp_name, prefix, prefixlen);
12810 memcpy (imp_name + prefixlen, name, namelen + 1);
12812 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12813 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12814 SET_SYMBOL_REF_DECL (rtl, to);
12815 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12817 rtl = gen_const_mem (Pmode, rtl);
12818 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12820 SET_DECL_RTL (to, rtl);
12821 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12823 return to;
12826 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12827 true if we require the result to be a register. */
12829 static rtx
12830 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12832 tree imp_decl;
12833 rtx x;
12835 gcc_assert (SYMBOL_REF_DECL (symbol));
12836 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12838 x = DECL_RTL (imp_decl);
12839 if (want_reg)
12840 x = force_reg (Pmode, x);
12841 return x;
12844 /* Try machine-dependent ways of modifying an illegitimate address
12845 to be legitimate. If we find one, return the new, valid address.
12846 This macro is used in only one place: `memory_address' in explow.c.
12848 OLDX is the address as it was before break_out_memory_refs was called.
12849 In some cases it is useful to look at this to decide what needs to be done.
12851 It is always safe for this macro to do nothing. It exists to recognize
12852 opportunities to optimize the output.
12854 For the 80386, we handle X+REG by loading X into a register R and
12855 using R+REG. R will go in a general reg and indexing will be used.
12856 However, if REG is a broken-out memory address or multiplication,
12857 nothing needs to be done because REG can certainly go in a general reg.
12859 When -fpic is used, special handling is needed for symbolic references.
12860 See comments by legitimize_pic_address in i386.c for details. */
12862 static rtx
12863 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12864 enum machine_mode mode)
12866 int changed = 0;
12867 unsigned log;
12869 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12870 if (log)
12871 return legitimize_tls_address (x, (enum tls_model) log, false);
12872 if (GET_CODE (x) == CONST
12873 && GET_CODE (XEXP (x, 0)) == PLUS
12874 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12875 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12877 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12878 (enum tls_model) log, false);
12879 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12882 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12884 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12885 return legitimize_dllimport_symbol (x, true);
12886 if (GET_CODE (x) == CONST
12887 && GET_CODE (XEXP (x, 0)) == PLUS
12888 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12889 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12891 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12892 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12896 if (flag_pic && SYMBOLIC_CONST (x))
12897 return legitimize_pic_address (x, 0);
12899 #if TARGET_MACHO
12900 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12901 return machopic_indirect_data_reference (x, 0);
12902 #endif
12904   /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
12905 if (GET_CODE (x) == ASHIFT
12906 && CONST_INT_P (XEXP (x, 1))
12907 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12909 changed = 1;
12910 log = INTVAL (XEXP (x, 1));
12911 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12912 GEN_INT (1 << log));
12915 if (GET_CODE (x) == PLUS)
12917 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12919 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12920 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12921 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12923 changed = 1;
12924 log = INTVAL (XEXP (XEXP (x, 0), 1));
12925 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12926 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12927 GEN_INT (1 << log));
12930 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12931 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12932 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12934 changed = 1;
12935 log = INTVAL (XEXP (XEXP (x, 1), 1));
12936 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12937 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12938 GEN_INT (1 << log));
12941 /* Put multiply first if it isn't already. */
12942 if (GET_CODE (XEXP (x, 1)) == MULT)
12944 rtx tmp = XEXP (x, 0);
12945 XEXP (x, 0) = XEXP (x, 1);
12946 XEXP (x, 1) = tmp;
12947 changed = 1;
12950 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12951 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12952 created by virtual register instantiation, register elimination, and
12953 similar optimizations. */
12954 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12956 changed = 1;
12957 x = gen_rtx_PLUS (Pmode,
12958 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12959 XEXP (XEXP (x, 1), 0)),
12960 XEXP (XEXP (x, 1), 1));
12963 /* Canonicalize
12964 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12965 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12966 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12967 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12968 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12969 && CONSTANT_P (XEXP (x, 1)))
12971 rtx constant;
12972 rtx other = NULL_RTX;
12974 if (CONST_INT_P (XEXP (x, 1)))
12976 constant = XEXP (x, 1);
12977 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12979 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12981 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12982 other = XEXP (x, 1);
12984 else
12985 constant = 0;
12987 if (constant)
12989 changed = 1;
12990 x = gen_rtx_PLUS (Pmode,
12991 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12992 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12993 plus_constant (other, INTVAL (constant)));
12997 if (changed && ix86_legitimate_address_p (mode, x, false))
12998 return x;
13000 if (GET_CODE (XEXP (x, 0)) == MULT)
13002 changed = 1;
13003 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13006 if (GET_CODE (XEXP (x, 1)) == MULT)
13008 changed = 1;
13009 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13012 if (changed
13013 && REG_P (XEXP (x, 1))
13014 && REG_P (XEXP (x, 0)))
13015 return x;
13017 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13019 changed = 1;
13020 x = legitimize_pic_address (x, 0);
13023 if (changed && ix86_legitimate_address_p (mode, x, false))
13024 return x;
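      /* As a last resort, if one side of the sum is already a register,
	 force the other side into a fresh pseudo so the address becomes a
	 simple reg + reg form.  */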
13026 if (REG_P (XEXP (x, 0)))
13028 rtx temp = gen_reg_rtx (Pmode);
13029 rtx val = force_operand (XEXP (x, 1), temp);
13030 if (val != temp)
13032 if (GET_MODE (val) != Pmode)
13033 val = convert_to_mode (Pmode, val, 1);
13034 emit_move_insn (temp, val);
13037 XEXP (x, 1) = temp;
13038 return x;
13041 else if (REG_P (XEXP (x, 1)))
13043 rtx temp = gen_reg_rtx (Pmode);
13044 rtx val = force_operand (XEXP (x, 0), temp);
13045 if (val != temp)
13047 if (GET_MODE (val) != Pmode)
13048 val = convert_to_mode (Pmode, val, 1);
13049 emit_move_insn (temp, val);
13052 XEXP (x, 0) = temp;
13053 return x;
13057 return x;
13060 /* Print an integer constant expression in assembler syntax. Addition
13061 and subtraction are the only arithmetic that may appear in these
13062 expressions. FILE is the stdio stream to write to, X is the rtx, and
13063 CODE is the operand print code from the output string. */
13065 static void
13066 output_pic_addr_const (FILE *file, rtx x, int code)
13068 char buf[256];
13070 switch (GET_CODE (x))
13072 case PC:
13073 gcc_assert (flag_pic);
13074 putc ('.', file);
13075 break;
13077 case SYMBOL_REF:
13078 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13079 output_addr_const (file, x);
13080 else
13082 const char *name = XSTR (x, 0);
13084 /* Mark the decl as referenced so that cgraph will
13085 output the function. */
13086 if (SYMBOL_REF_DECL (x))
13087 mark_decl_referenced (SYMBOL_REF_DECL (x));
13089 #if TARGET_MACHO
13090 if (MACHOPIC_INDIRECT
13091 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13092 name = machopic_indirection_name (x, /*stub_p=*/true);
13093 #endif
13094 assemble_name (file, name);
13096 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13097 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13098 fputs ("@PLT", file);
13099 break;
13101 case LABEL_REF:
13102 x = XEXP (x, 0);
13103 /* FALLTHRU */
13104 case CODE_LABEL:
13105 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13106 assemble_name (asm_out_file, buf);
13107 break;
13109 case CONST_INT:
13110 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13111 break;
13113 case CONST:
13114 /* This used to output parentheses around the expression,
13115 but that does not work on the 386 (either ATT or BSD assembler). */
13116 output_pic_addr_const (file, XEXP (x, 0), code);
13117 break;
13119 case CONST_DOUBLE:
13120 if (GET_MODE (x) == VOIDmode)
13122 /* We can use %d if the number is <32 bits and positive. */
13123 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13124 fprintf (file, "0x%lx%08lx",
13125 (unsigned long) CONST_DOUBLE_HIGH (x),
13126 (unsigned long) CONST_DOUBLE_LOW (x));
13127 else
13128 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13130 else
13131 /* We can't handle floating point constants;
13132 TARGET_PRINT_OPERAND must handle them. */
13133 output_operand_lossage ("floating constant misused");
13134 break;
13136 case PLUS:
13137 /* Some assemblers need integer constants to appear first. */
13138 if (CONST_INT_P (XEXP (x, 0)))
13140 output_pic_addr_const (file, XEXP (x, 0), code);
13141 putc ('+', file);
13142 output_pic_addr_const (file, XEXP (x, 1), code);
13144 else
13146 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13147 output_pic_addr_const (file, XEXP (x, 1), code);
13148 putc ('+', file);
13149 output_pic_addr_const (file, XEXP (x, 0), code);
13151 break;
13153 case MINUS:
13154 if (!TARGET_MACHO)
13155 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13156 output_pic_addr_const (file, XEXP (x, 0), code);
13157 putc ('-', file);
13158 output_pic_addr_const (file, XEXP (x, 1), code);
13159 if (!TARGET_MACHO)
13160 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13161 break;
13163 case UNSPEC:
13164 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13166 bool f = i386_asm_output_addr_const_extra (file, x);
13167 gcc_assert (f);
13168 break;
13171 gcc_assert (XVECLEN (x, 0) == 1);
13172 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13173 switch (XINT (x, 1))
13175 case UNSPEC_GOT:
13176 fputs ("@GOT", file);
13177 break;
13178 case UNSPEC_GOTOFF:
13179 fputs ("@GOTOFF", file);
13180 break;
13181 case UNSPEC_PLTOFF:
13182 fputs ("@PLTOFF", file);
13183 break;
13184 case UNSPEC_PCREL:
13185 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13186 "(%rip)" : "[rip]", file);
13187 break;
13188 case UNSPEC_GOTPCREL:
13189 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13190 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13191 break;
13192 case UNSPEC_GOTTPOFF:
13193 /* FIXME: This might be @TPOFF in Sun ld too. */
13194 fputs ("@gottpoff", file);
13195 break;
13196 case UNSPEC_TPOFF:
13197 fputs ("@tpoff", file);
13198 break;
13199 case UNSPEC_NTPOFF:
13200 if (TARGET_64BIT)
13201 fputs ("@tpoff", file);
13202 else
13203 fputs ("@ntpoff", file);
13204 break;
13205 case UNSPEC_DTPOFF:
13206 fputs ("@dtpoff", file);
13207 break;
13208 case UNSPEC_GOTNTPOFF:
13209 if (TARGET_64BIT)
13210 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13211 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13212 else
13213 fputs ("@gotntpoff", file);
13214 break;
13215 case UNSPEC_INDNTPOFF:
13216 fputs ("@indntpoff", file);
13217 break;
13218 #if TARGET_MACHO
13219 case UNSPEC_MACHOPIC_OFFSET:
13220 putc ('-', file);
13221 machopic_output_function_base_name (file);
13222 break;
13223 #endif
13224 default:
13225 output_operand_lossage ("invalid UNSPEC as operand");
13226 break;
13228 break;
13230 default:
13231 output_operand_lossage ("invalid expression as operand");
13235 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13236 We need to emit DTP-relative relocations. */
13238 static void ATTRIBUTE_UNUSED
13239 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13241 fputs (ASM_LONG, file);
13242 output_addr_const (file, x);
13243 fputs ("@dtpoff", file);
13244 switch (size)
13246 case 4:
13247 break;
13248 case 8:
13249 fputs (", 0", file);
13250 break;
13251 default:
13252 gcc_unreachable ();
13256 /* Return true if X is a representation of the PIC register. This copes
13257 with calls from ix86_find_base_term, where the register might have
13258 been replaced by a cselib value. */
13260 static bool
13261 ix86_pic_register_p (rtx x)
13263 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13264 return (pic_offset_table_rtx
13265 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13266 else
13267 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13270 /* Helper function for ix86_delegitimize_address.
13271 Attempt to delegitimize TLS local-exec accesses. */
13273 static rtx
13274 ix86_delegitimize_tls_address (rtx orig_x)
13276 rtx x = orig_x, unspec;
13277 struct ix86_address addr;
13279 if (!TARGET_TLS_DIRECT_SEG_REFS)
13280 return orig_x;
13281 if (MEM_P (x))
13282 x = XEXP (x, 0);
13283 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13284 return orig_x;
13285 if (ix86_decompose_address (x, &addr) == 0
13286 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13287 || addr.disp == NULL_RTX
13288 || GET_CODE (addr.disp) != CONST)
13289 return orig_x;
13290 unspec = XEXP (addr.disp, 0);
13291 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13292 unspec = XEXP (unspec, 0);
13293 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13294 return orig_x;
13295 x = XVECEXP (unspec, 0, 0);
13296 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13297 if (unspec != XEXP (addr.disp, 0))
13298 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13299 if (addr.index)
13301 rtx idx = addr.index;
13302 if (addr.scale != 1)
13303 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13304 x = gen_rtx_PLUS (Pmode, idx, x);
13306 if (addr.base)
13307 x = gen_rtx_PLUS (Pmode, addr.base, x);
13308 if (MEM_P (orig_x))
13309 x = replace_equiv_address_nv (orig_x, x);
13310 return x;
13313 /* In the name of slightly smaller debug output, and to cater to
13314 general assembler lossage, recognize PIC+GOTOFF and turn it back
13315 into a direct symbol reference.
13317 On Darwin, this is necessary to avoid a crash, because Darwin
13318 has a different PIC label for each routine but the DWARF debugging
13319 information is not associated with any particular routine, so it's
13320 necessary to remove references to the PIC label from RTL stored by
13321 the DWARF output code. */
13323 static rtx
13324 ix86_delegitimize_address (rtx x)
13326 rtx orig_x = delegitimize_mem_from_attrs (x);
13327 /* addend is NULL or some rtx if x is something+GOTOFF where
13328 something doesn't include the PIC register. */
13329 rtx addend = NULL_RTX;
13330 /* reg_addend is NULL or a multiple of some register. */
13331 rtx reg_addend = NULL_RTX;
13332 /* const_addend is NULL or a const_int. */
13333 rtx const_addend = NULL_RTX;
13334 /* This is the result, or NULL. */
13335 rtx result = NULL_RTX;
13337 x = orig_x;
13339 if (MEM_P (x))
13340 x = XEXP (x, 0);
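  /* In 64-bit mode GOT and PC-relative references are wrapped in UNSPECs
     rather than expressed via the PIC register; strip UNSPEC_GOTPCREL and
     UNSPEC_PCREL wrappers here.  */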
13342 if (TARGET_64BIT)
13344 if (GET_CODE (x) == CONST
13345 && GET_CODE (XEXP (x, 0)) == PLUS
13346 && GET_MODE (XEXP (x, 0)) == Pmode
13347 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13348 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13349 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13351 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13352 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13353 if (MEM_P (orig_x))
13354 x = replace_equiv_address_nv (orig_x, x);
13355 return x;
13357 if (GET_CODE (x) != CONST
13358 || GET_CODE (XEXP (x, 0)) != UNSPEC
13359 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13360 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13361 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13362 return ix86_delegitimize_tls_address (orig_x);
13363 x = XVECEXP (XEXP (x, 0), 0, 0);
13364 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13366 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13367 GET_MODE (x), 0);
13368 if (x == NULL_RTX)
13369 return orig_x;
13371 return x;
13374 if (GET_CODE (x) != PLUS
13375 || GET_CODE (XEXP (x, 1)) != CONST)
13376 return ix86_delegitimize_tls_address (orig_x);
13378 if (ix86_pic_register_p (XEXP (x, 0)))
13379 /* %ebx + GOT/GOTOFF */
13381 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13383 /* %ebx + %reg * scale + GOT/GOTOFF */
13384 reg_addend = XEXP (x, 0);
13385 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13386 reg_addend = XEXP (reg_addend, 1);
13387 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13388 reg_addend = XEXP (reg_addend, 0);
13389 else
13391 reg_addend = NULL_RTX;
13392 addend = XEXP (x, 0);
13395 else
13396 addend = XEXP (x, 0);
13398 x = XEXP (XEXP (x, 1), 0);
13399 if (GET_CODE (x) == PLUS
13400 && CONST_INT_P (XEXP (x, 1)))
13402 const_addend = XEXP (x, 1);
13403 x = XEXP (x, 0);
13406 if (GET_CODE (x) == UNSPEC
13407 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13408 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13409 result = XVECEXP (x, 0, 0);
13411 if (TARGET_MACHO && darwin_local_data_pic (x)
13412 && !MEM_P (orig_x))
13413 result = XVECEXP (x, 0, 0);
13415 if (! result)
13416 return ix86_delegitimize_tls_address (orig_x);
13418 if (const_addend)
13419 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13420 if (reg_addend)
13421 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13422 if (addend)
13424 /* If the rest of original X doesn't involve the PIC register, add
13425 addend and subtract pic_offset_table_rtx. This can happen e.g.
13426 for code like:
13427 leal (%ebx, %ecx, 4), %ecx
13429 movl foo@GOTOFF(%ecx), %edx
13430 in which case we return (%ecx - %ebx) + foo. */
13431 if (pic_offset_table_rtx)
13432 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13433 pic_offset_table_rtx),
13434 result);
13435 else
13436 return orig_x;
13438 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13440 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13441 if (result == NULL_RTX)
13442 return orig_x;
13444 return result;
13447 /* If X is a machine specific address (i.e. a symbol or label being
13448 referenced as a displacement from the GOT implemented using an
13449 UNSPEC), then return the base term. Otherwise return X. */
13452 ix86_find_base_term (rtx x)
13454 rtx term;
13456 if (TARGET_64BIT)
13458 if (GET_CODE (x) != CONST)
13459 return x;
13460 term = XEXP (x, 0);
13461 if (GET_CODE (term) == PLUS
13462 && (CONST_INT_P (XEXP (term, 1))
13463 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13464 term = XEXP (term, 0);
13465 if (GET_CODE (term) != UNSPEC
13466 || (XINT (term, 1) != UNSPEC_GOTPCREL
13467 && XINT (term, 1) != UNSPEC_PCREL))
13468 return x;
13470 return XVECEXP (term, 0, 0);
13473 return ix86_delegitimize_address (x);
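/* Print to FILE the one- or two-letter condition suffix for comparison
   CODE in CC mode MODE.  If REVERSE is nonzero the condition is reversed
   first.  FP selects the alternate spellings used with fcmov, where some
   assemblers mishandle the usual integer suffixes.  */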
13476 static void
13477 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13478 int fp, FILE *file)
13480 const char *suffix;
13482 if (mode == CCFPmode || mode == CCFPUmode)
13484 code = ix86_fp_compare_code_to_integer (code);
13485 mode = CCmode;
13487 if (reverse)
13488 code = reverse_condition (code);
13490 switch (code)
13492 case EQ:
13493 switch (mode)
13495 case CCAmode:
13496 suffix = "a";
13497 break;
13499 case CCCmode:
13500 suffix = "c";
13501 break;
13503 case CCOmode:
13504 suffix = "o";
13505 break;
13507 case CCSmode:
13508 suffix = "s";
13509 break;
13511 default:
13512 suffix = "e";
13514 break;
13515 case NE:
13516 switch (mode)
13518 case CCAmode:
13519 suffix = "na";
13520 break;
13522 case CCCmode:
13523 suffix = "nc";
13524 break;
13526 case CCOmode:
13527 suffix = "no";
13528 break;
13530 case CCSmode:
13531 suffix = "ns";
13532 break;
13534 default:
13535 suffix = "ne";
13537 break;
13538 case GT:
13539 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13540 suffix = "g";
13541 break;
13542 case GTU:
13543 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13544 Those same assemblers have the same but opposite lossage on cmov. */
13545 if (mode == CCmode)
13546 suffix = fp ? "nbe" : "a";
13547 else if (mode == CCCmode)
13548 suffix = "b";
13549 else
13550 gcc_unreachable ();
13551 break;
13552 case LT:
13553 switch (mode)
13555 case CCNOmode:
13556 case CCGOCmode:
13557 suffix = "s";
13558 break;
13560 case CCmode:
13561 case CCGCmode:
13562 suffix = "l";
13563 break;
13565 default:
13566 gcc_unreachable ();
13568 break;
13569 case LTU:
13570 gcc_assert (mode == CCmode || mode == CCCmode);
13571 suffix = "b";
13572 break;
13573 case GE:
13574 switch (mode)
13576 case CCNOmode:
13577 case CCGOCmode:
13578 suffix = "ns";
13579 break;
13581 case CCmode:
13582 case CCGCmode:
13583 suffix = "ge";
13584 break;
13586 default:
13587 gcc_unreachable ();
13589 break;
13590 case GEU:
13591 /* ??? As above. */
13592 gcc_assert (mode == CCmode || mode == CCCmode);
13593 suffix = fp ? "nb" : "ae";
13594 break;
13595 case LE:
13596 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13597 suffix = "le";
13598 break;
13599 case LEU:
13600 /* ??? As above. */
13601 if (mode == CCmode)
13602 suffix = "be";
13603 else if (mode == CCCmode)
13604 suffix = fp ? "nb" : "ae";
13605 else
13606 gcc_unreachable ();
13607 break;
13608 case UNORDERED:
13609 suffix = fp ? "u" : "p";
13610 break;
13611 case ORDERED:
13612 suffix = fp ? "nu" : "np";
13613 break;
13614 default:
13615 gcc_unreachable ();
13617 fputs (suffix, file);
13620 /* Print the name of register X to FILE based on its machine mode and number.
13621 If CODE is 'w', pretend the mode is HImode.
13622 If CODE is 'b', pretend the mode is QImode.
13623 If CODE is 'k', pretend the mode is SImode.
13624 If CODE is 'q', pretend the mode is DImode.
13625 If CODE is 'x', pretend the mode is V4SFmode.
13626 If CODE is 't', pretend the mode is V8SFmode.
13627 If CODE is 'h', pretend the reg is the 'high' byte register.
13628    If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13629 If CODE is 'd', duplicate the operand for AVX instruction.
13632 void
13633 print_reg (rtx x, int code, FILE *file)
13635 const char *reg;
13636 bool duplicated = code == 'd' && TARGET_AVX;
13638 gcc_assert (x == pc_rtx
13639 || (REGNO (x) != ARG_POINTER_REGNUM
13640 && REGNO (x) != FRAME_POINTER_REGNUM
13641 && REGNO (x) != FLAGS_REG
13642 && REGNO (x) != FPSR_REG
13643 && REGNO (x) != FPCR_REG));
13645 if (ASSEMBLER_DIALECT == ASM_ATT)
13646 putc ('%', file);
13648 if (x == pc_rtx)
13650 gcc_assert (TARGET_64BIT);
13651 fputs ("rip", file);
13652 return;
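  /* Fold the print code down to an operand size in bytes; 0 and 3 are
     special markers for the high byte register and the x87 "st(0)" form.  */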
13655 if (code == 'w' || MMX_REG_P (x))
13656 code = 2;
13657 else if (code == 'b')
13658 code = 1;
13659 else if (code == 'k')
13660 code = 4;
13661 else if (code == 'q')
13662 code = 8;
13663 else if (code == 'y')
13664 code = 3;
13665 else if (code == 'h')
13666 code = 0;
13667 else if (code == 'x')
13668 code = 16;
13669 else if (code == 't')
13670 code = 32;
13671 else
13672 code = GET_MODE_SIZE (GET_MODE (x));
13674   /* Irritatingly, AMD extended registers use a different naming convention
13675      from the normal registers: "r%d[bwd]".  */
13676 if (REX_INT_REG_P (x))
13678 gcc_assert (TARGET_64BIT);
13679 putc ('r', file);
13680 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13681 switch (code)
13683 case 0:
13684 error ("extended registers have no high halves");
13685 break;
13686 case 1:
13687 putc ('b', file);
13688 break;
13689 case 2:
13690 putc ('w', file);
13691 break;
13692 case 4:
13693 putc ('d', file);
13694 break;
13695 case 8:
13696 /* no suffix */
13697 break;
13698 default:
13699 error ("unsupported operand size for extended register");
13700 break;
13702 return;
13705 reg = NULL;
13706 switch (code)
13708 case 3:
13709 if (STACK_TOP_P (x))
13711 reg = "st(0)";
13712 break;
13714 /* FALLTHRU */
13715 case 8:
13716 case 4:
13717 case 12:
13718 if (! ANY_FP_REG_P (x))
13719 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13720 /* FALLTHRU */
13721 case 16:
13722 case 2:
13723 normal:
13724 reg = hi_reg_name[REGNO (x)];
13725 break;
13726 case 1:
13727 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13728 goto normal;
13729 reg = qi_reg_name[REGNO (x)];
13730 break;
13731 case 0:
13732 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13733 goto normal;
13734 reg = qi_high_reg_name[REGNO (x)];
13735 break;
13736 case 32:
13737 if (SSE_REG_P (x))
13739 gcc_assert (!duplicated);
13740 putc ('y', file);
13741 fputs (hi_reg_name[REGNO (x)] + 1, file);
13742 return;
13744 break;
13745 default:
13746 gcc_unreachable ();
13749 fputs (reg, file);
13750 if (duplicated)
13752 if (ASSEMBLER_DIALECT == ASM_ATT)
13753 fprintf (file, ", %%%s", reg);
13754 else
13755 fprintf (file, ", %s", reg);
13759 /* Locate some local-dynamic symbol still in use by this function
13760 so that we can print its name in some tls_local_dynamic_base
13761 pattern. */
13763 static int
13764 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13766 rtx x = *px;
13768 if (GET_CODE (x) == SYMBOL_REF
13769 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13771 cfun->machine->some_ld_name = XSTR (x, 0);
13772 return 1;
13775 return 0;
13778 static const char *
13779 get_some_local_dynamic_name (void)
13781 rtx insn;
13783 if (cfun->machine->some_ld_name)
13784 return cfun->machine->some_ld_name;
13786 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13787 if (NONDEBUG_INSN_P (insn)
13788 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13789 return cfun->machine->some_ld_name;
13791 return NULL;
13794 /* Meaning of CODE:
13795 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13796 C -- print opcode suffix for set/cmov insn.
13797 c -- like C, but print reversed condition
13798 F,f -- likewise, but for floating-point.
13799 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13800 otherwise nothing
13801 R -- print the prefix for register names.
13802 z -- print the opcode suffix for the size of the current operand.
13803 Z -- likewise, with special suffixes for x87 instructions.
13804 * -- print a star (in certain assembler syntax)
13805 A -- print an absolute memory reference.
13806 E -- print address with DImode register names if TARGET_64BIT.
13807 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13808    s -- print a shift double count, followed by the assembler's argument
13809 delimiter.
13810 b -- print the QImode name of the register for the indicated operand.
13811 %b0 would print %al if operands[0] is reg 0.
13812 w -- likewise, print the HImode name of the register.
13813 k -- likewise, print the SImode name of the register.
13814 q -- likewise, print the DImode name of the register.
13815 x -- likewise, print the V4SFmode name of the register.
13816 t -- likewise, print the V8SFmode name of the register.
13817 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13818 y -- print "st(0)" instead of "st" as a register.
13819 d -- print duplicated register operand for AVX instruction.
13820 D -- print condition for SSE cmp instruction.
13821 P -- if PIC, print an @PLT suffix.
13822 p -- print raw symbol name.
13823 X -- don't print any sort of PIC '@' suffix for a symbol.
13824 & -- print some in-use local-dynamic symbol name.
13825 H -- print a memory address offset by 8; used for sse high-parts
13826 Y -- print condition for XOP pcom* instruction.
13827 + -- print a branch hint as 'cs' or 'ds' prefix
13828 ; -- print a semicolon (after prefixes due to bug in older gas).
13829 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13830 @ -- print a segment register of thread base pointer load
13831 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13834 void
13835 ix86_print_operand (FILE *file, rtx x, int code)
13837 if (code)
13839 switch (code)
13841 case '*':
13842 if (ASSEMBLER_DIALECT == ASM_ATT)
13843 putc ('*', file);
13844 return;
13846 case '&':
13848 const char *name = get_some_local_dynamic_name ();
13849 if (name == NULL)
13850 output_operand_lossage ("'%%&' used without any "
13851 "local dynamic TLS references");
13852 else
13853 assemble_name (file, name);
13854 return;
13857 case 'A':
13858 switch (ASSEMBLER_DIALECT)
13860 case ASM_ATT:
13861 putc ('*', file);
13862 break;
13864 case ASM_INTEL:
13865 /* Intel syntax. For absolute addresses, registers should not
13866 be surrounded by braces. */
13867 if (!REG_P (x))
13869 putc ('[', file);
13870 ix86_print_operand (file, x, 0);
13871 putc (']', file);
13872 return;
13874 break;
13876 default:
13877 gcc_unreachable ();
13880 ix86_print_operand (file, x, 0);
13881 return;
13883 case 'E':
13884 /* Wrap address in an UNSPEC to declare special handling. */
13885 if (TARGET_64BIT)
13886 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13888 output_address (x);
13889 return;
13891 case 'L':
13892 if (ASSEMBLER_DIALECT == ASM_ATT)
13893 putc ('l', file);
13894 return;
13896 case 'W':
13897 if (ASSEMBLER_DIALECT == ASM_ATT)
13898 putc ('w', file);
13899 return;
13901 case 'B':
13902 if (ASSEMBLER_DIALECT == ASM_ATT)
13903 putc ('b', file);
13904 return;
13906 case 'Q':
13907 if (ASSEMBLER_DIALECT == ASM_ATT)
13908 putc ('l', file);
13909 return;
13911 case 'S':
13912 if (ASSEMBLER_DIALECT == ASM_ATT)
13913 putc ('s', file);
13914 return;
13916 case 'T':
13917 if (ASSEMBLER_DIALECT == ASM_ATT)
13918 putc ('t', file);
13919 return;
13921 case 'z':
13922 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13924 /* Opcodes don't get size suffixes if using Intel opcodes. */
13925 if (ASSEMBLER_DIALECT == ASM_INTEL)
13926 return;
13928 switch (GET_MODE_SIZE (GET_MODE (x)))
13930 case 1:
13931 putc ('b', file);
13932 return;
13934 case 2:
13935 putc ('w', file);
13936 return;
13938 case 4:
13939 putc ('l', file);
13940 return;
13942 case 8:
13943 putc ('q', file);
13944 return;
13946 default:
13947 output_operand_lossage
13948 ("invalid operand size for operand code '%c'", code);
13949 return;
13953 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13954 warning
13955 (0, "non-integer operand used with operand code '%c'", code);
13956 /* FALLTHRU */
13958 case 'Z':
13959 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13960 if (ASSEMBLER_DIALECT == ASM_INTEL)
13961 return;
13963 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13965 switch (GET_MODE_SIZE (GET_MODE (x)))
13967 case 2:
13968 #ifdef HAVE_AS_IX86_FILDS
13969 putc ('s', file);
13970 #endif
13971 return;
13973 case 4:
13974 putc ('l', file);
13975 return;
13977 case 8:
13978 #ifdef HAVE_AS_IX86_FILDQ
13979 putc ('q', file);
13980 #else
13981 fputs ("ll", file);
13982 #endif
13983 return;
13985 default:
13986 break;
13989 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13991 /* 387 opcodes don't get size suffixes
13992 if the operands are registers. */
13993 if (STACK_REG_P (x))
13994 return;
13996 switch (GET_MODE_SIZE (GET_MODE (x)))
13998 case 4:
13999 putc ('s', file);
14000 return;
14002 case 8:
14003 putc ('l', file);
14004 return;
14006 case 12:
14007 case 16:
14008 putc ('t', file);
14009 return;
14011 default:
14012 break;
14015 else
14017 output_operand_lossage
14018 ("invalid operand type used with operand code '%c'", code);
14019 return;
14022 output_operand_lossage
14023 ("invalid operand size for operand code '%c'", code);
14024 return;
14026 case 'd':
14027 case 'b':
14028 case 'w':
14029 case 'k':
14030 case 'q':
14031 case 'h':
14032 case 't':
14033 case 'y':
14034 case 'x':
14035 case 'X':
14036 case 'P':
14037 case 'p':
14038 break;
14040 case 's':
14041 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14043 ix86_print_operand (file, x, 0);
14044 fputs (", ", file);
14046 return;
14048 case 'D':
14049	  /* A little bit of braindamage here: the SSE compare instructions
14050	     use completely different names for the comparisons than the
14051	     fp conditional moves do.  */
14052 if (TARGET_AVX)
14054 switch (GET_CODE (x))
14056 case EQ:
14057 fputs ("eq", file);
14058 break;
14059 case UNEQ:
14060 fputs ("eq_us", file);
14061 break;
14062 case LT:
14063 fputs ("lt", file);
14064 break;
14065 case UNLT:
14066 fputs ("nge", file);
14067 break;
14068 case LE:
14069 fputs ("le", file);
14070 break;
14071 case UNLE:
14072 fputs ("ngt", file);
14073 break;
14074 case UNORDERED:
14075 fputs ("unord", file);
14076 break;
14077 case NE:
14078 fputs ("neq", file);
14079 break;
14080 case LTGT:
14081 fputs ("neq_oq", file);
14082 break;
14083 case GE:
14084 fputs ("ge", file);
14085 break;
14086 case UNGE:
14087 fputs ("nlt", file);
14088 break;
14089 case GT:
14090 fputs ("gt", file);
14091 break;
14092 case UNGT:
14093 fputs ("nle", file);
14094 break;
14095 case ORDERED:
14096 fputs ("ord", file);
14097 break;
14098 default:
14099 output_operand_lossage ("operand is not a condition code, "
14100 "invalid operand code 'D'");
14101 return;
14104 else
14106 switch (GET_CODE (x))
14108 case EQ:
14109 case UNEQ:
14110 fputs ("eq", file);
14111 break;
14112 case LT:
14113 case UNLT:
14114 fputs ("lt", file);
14115 break;
14116 case LE:
14117 case UNLE:
14118 fputs ("le", file);
14119 break;
14120 case UNORDERED:
14121 fputs ("unord", file);
14122 break;
14123 case NE:
14124 case LTGT:
14125 fputs ("neq", file);
14126 break;
14127 case UNGE:
14128 case GE:
14129 fputs ("nlt", file);
14130 break;
14131 case UNGT:
14132 case GT:
14133 fputs ("nle", file);
14134 break;
14135 case ORDERED:
14136 fputs ("ord", file);
14137 break;
14138 default:
14139 output_operand_lossage ("operand is not a condition code, "
14140 "invalid operand code 'D'");
14141 return;
14144 return;
14145 case 'O':
14146 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14147 if (ASSEMBLER_DIALECT == ASM_ATT)
14149 switch (GET_MODE (x))
14151 case HImode: putc ('w', file); break;
14152 case SImode:
14153 case SFmode: putc ('l', file); break;
14154 case DImode:
14155 case DFmode: putc ('q', file); break;
14156 default: gcc_unreachable ();
14158 putc ('.', file);
14160 #endif
14161 return;
14162 case 'C':
14163 if (!COMPARISON_P (x))
14165 output_operand_lossage ("operand is neither a constant nor a "
14166 "condition code, invalid operand code "
14167 "'C'");
14168 return;
14170 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14171 return;
14172 case 'F':
14173 if (!COMPARISON_P (x))
14175 output_operand_lossage ("operand is neither a constant nor a "
14176 "condition code, invalid operand code "
14177 "'F'");
14178 return;
14180 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14181 if (ASSEMBLER_DIALECT == ASM_ATT)
14182 putc ('.', file);
14183 #endif
14184 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14185 return;
14187 /* Like above, but reverse condition */
14188 case 'c':
14189 /* Check to see if argument to %c is really a constant
14190 and not a condition code which needs to be reversed. */
14191 if (!COMPARISON_P (x))
14193 output_operand_lossage ("operand is neither a constant nor a "
14194 "condition code, invalid operand "
14195 "code 'c'");
14196 return;
14198 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14199 return;
14200 case 'f':
14201 if (!COMPARISON_P (x))
14203 output_operand_lossage ("operand is neither a constant nor a "
14204 "condition code, invalid operand "
14205 "code 'f'");
14206 return;
14208 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14209 if (ASSEMBLER_DIALECT == ASM_ATT)
14210 putc ('.', file);
14211 #endif
14212 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14213 return;
14215 case 'H':
14216 if (!offsettable_memref_p (x))
14218 output_operand_lossage ("operand is not an offsettable memory "
14219 "reference, invalid operand "
14220 "code 'H'");
14221 return;
14223 /* It doesn't actually matter what mode we use here, as we're
14224 only going to use this for printing. */
14225 x = adjust_address_nv (x, DImode, 8);
14226 break;
14228 case '+':
14230 rtx x;
14232 if (!optimize
14233 || optimize_function_for_size_p (cfun)
14234 || !TARGET_BRANCH_PREDICTION_HINTS)
14235 return;
14237 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14238 if (x)
14240 int pred_val = INTVAL (XEXP (x, 0));
14242 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14243 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14245 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14246 bool cputaken
14247 = final_forward_branch_p (current_output_insn) == 0;
14249 /* Emit hints only in the case default branch prediction
14250 heuristics would fail. */
14251 if (taken != cputaken)
14253 /* We use 3e (DS) prefix for taken branches and
14254 2e (CS) prefix for not taken branches. */
14255 if (taken)
14256 fputs ("ds ; ", file);
14257 else
14258 fputs ("cs ; ", file);
14262 return;
14265 case 'Y':
14266 switch (GET_CODE (x))
14268 case NE:
14269 fputs ("neq", file);
14270 break;
14271 case EQ:
14272 fputs ("eq", file);
14273 break;
14274 case GE:
14275 case GEU:
14276 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14277 break;
14278 case GT:
14279 case GTU:
14280 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14281 break;
14282 case LE:
14283 case LEU:
14284 fputs ("le", file);
14285 break;
14286 case LT:
14287 case LTU:
14288 fputs ("lt", file);
14289 break;
14290 case UNORDERED:
14291 fputs ("unord", file);
14292 break;
14293 case ORDERED:
14294 fputs ("ord", file);
14295 break;
14296 case UNEQ:
14297 fputs ("ueq", file);
14298 break;
14299 case UNGE:
14300 fputs ("nlt", file);
14301 break;
14302 case UNGT:
14303 fputs ("nle", file);
14304 break;
14305 case UNLE:
14306 fputs ("ule", file);
14307 break;
14308 case UNLT:
14309 fputs ("ult", file);
14310 break;
14311 case LTGT:
14312 fputs ("une", file);
14313 break;
14314 default:
14315 output_operand_lossage ("operand is not a condition code, "
14316 "invalid operand code 'Y'");
14317 return;
14319 return;
14321 case ';':
14322 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14323 putc (';', file);
14324 #endif
14325 return;
14327 case '@':
14328 if (ASSEMBLER_DIALECT == ASM_ATT)
14329 putc ('%', file);
14331 /* The kernel uses a different segment register for performance
14332 reasons; a system call would not have to trash the userspace
14333 segment register, which would be expensive. */
14334 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14335 fputs ("fs", file);
14336 else
14337 fputs ("gs", file);
14338 return;
14340 case '~':
14341 putc (TARGET_AVX2 ? 'i' : 'f', file);
14342 return;
14344 case '^':
14345 if (TARGET_64BIT && Pmode != word_mode)
14346 fputs ("addr32 ", file);
14347 return;
14349 default:
14350 output_operand_lossage ("invalid operand code '%c'", code);
14354 if (REG_P (x))
14355 print_reg (x, code, file);
14357 else if (MEM_P (x))
14359 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14360 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14361 && GET_MODE (x) != BLKmode)
14363 const char * size;
14364 switch (GET_MODE_SIZE (GET_MODE (x)))
14366 case 1: size = "BYTE"; break;
14367 case 2: size = "WORD"; break;
14368 case 4: size = "DWORD"; break;
14369 case 8: size = "QWORD"; break;
14370 case 12: size = "TBYTE"; break;
14371 case 16:
14372 if (GET_MODE (x) == XFmode)
14373 size = "TBYTE";
14374 else
14375 size = "XMMWORD";
14376 break;
14377 case 32: size = "YMMWORD"; break;
14378 default:
14379 gcc_unreachable ();
14382	  /* Check for explicit size override (codes 'b', 'w', 'k',
14383	     'q' and 'x').  */
14384 if (code == 'b')
14385 size = "BYTE";
14386 else if (code == 'w')
14387 size = "WORD";
14388 else if (code == 'k')
14389 size = "DWORD";
14390 else if (code == 'q')
14391 size = "QWORD";
14392 else if (code == 'x')
14393 size = "XMMWORD";
14395 fputs (size, file);
14396 fputs (" PTR ", file);
14399 x = XEXP (x, 0);
14400 /* Avoid (%rip) for call operands. */
14401 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14402 && !CONST_INT_P (x))
14403 output_addr_const (file, x);
14404 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14405 output_operand_lossage ("invalid constraints for operand");
14406 else
14407 output_address (x);
14410 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14412 REAL_VALUE_TYPE r;
14413 long l;
14415 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14416 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14418 if (ASSEMBLER_DIALECT == ASM_ATT)
14419 putc ('$', file);
14420 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14421 if (code == 'q')
14422 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14423 else
14424 fprintf (file, "0x%08x", (unsigned int) l);
14427 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14429 REAL_VALUE_TYPE r;
14430 long l[2];
14432 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14433 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14435 if (ASSEMBLER_DIALECT == ASM_ATT)
14436 putc ('$', file);
14437 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14440 /* These float cases don't actually occur as immediate operands. */
14441 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14443 char dstr[30];
14445 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14446 fputs (dstr, file);
14449 else
14451 /* We have patterns that allow zero sets of memory, for instance.
14452 In 64-bit mode, we should probably support all 8-byte vectors,
14453 since we can in fact encode that into an immediate. */
14454 if (GET_CODE (x) == CONST_VECTOR)
14456 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14457 x = const0_rtx;
14460 if (code != 'P' && code != 'p')
14462 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14464 if (ASSEMBLER_DIALECT == ASM_ATT)
14465 putc ('$', file);
14467 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14468 || GET_CODE (x) == LABEL_REF)
14470 if (ASSEMBLER_DIALECT == ASM_ATT)
14471 putc ('$', file);
14472 else
14473 fputs ("OFFSET FLAT:", file);
14476 if (CONST_INT_P (x))
14477 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14478 else if (flag_pic || MACHOPIC_INDIRECT)
14479 output_pic_addr_const (file, x, code);
14480 else
14481 output_addr_const (file, x);
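/* Return true if CODE is one of the punctuation characters handled by
   ix86_print_operand above.  */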
14485 static bool
14486 ix86_print_operand_punct_valid_p (unsigned char code)
14488 return (code == '@' || code == '*' || code == '+' || code == '&'
14489 || code == ';' || code == '~' || code == '^');
14492 /* Print a memory operand whose address is ADDR. */
14494 static void
14495 ix86_print_operand_address (FILE *file, rtx addr)
14497 struct ix86_address parts;
14498 rtx base, index, disp;
14499 int scale;
14500 int ok;
14501 bool vsib = false;
14502 int code = 0;
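  /* An UNSPEC_VSIBADDR wraps the scalar base address together with the
     vector index register and scale; unpack those parts here and print the
     base address through the normal path below.  */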
14504 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14506 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14507 gcc_assert (parts.index == NULL_RTX);
14508 parts.index = XVECEXP (addr, 0, 1);
14509 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14510 addr = XVECEXP (addr, 0, 0);
14511 vsib = true;
14513 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14515 gcc_assert (TARGET_64BIT);
14516 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14517 code = 'q';
14519 else
14520 ok = ix86_decompose_address (addr, &parts);
14522 gcc_assert (ok);
14524 if (parts.base && GET_CODE (parts.base) == SUBREG)
14526 rtx tmp = SUBREG_REG (parts.base);
14527 parts.base = simplify_subreg (GET_MODE (parts.base),
14528 tmp, GET_MODE (tmp), 0);
14531 if (parts.index && GET_CODE (parts.index) == SUBREG)
14533 rtx tmp = SUBREG_REG (parts.index);
14534 parts.index = simplify_subreg (GET_MODE (parts.index),
14535 tmp, GET_MODE (tmp), 0);
14538 base = parts.base;
14539 index = parts.index;
14540 disp = parts.disp;
14541 scale = parts.scale;
14543 switch (parts.seg)
14545 case SEG_DEFAULT:
14546 break;
14547 case SEG_FS:
14548 case SEG_GS:
14549 if (ASSEMBLER_DIALECT == ASM_ATT)
14550 putc ('%', file);
14551 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14552 break;
14553 default:
14554 gcc_unreachable ();
14557 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14558 if (TARGET_64BIT && !base && !index)
14560 rtx symbol = disp;
14562 if (GET_CODE (disp) == CONST
14563 && GET_CODE (XEXP (disp, 0)) == PLUS
14564 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14565 symbol = XEXP (XEXP (disp, 0), 0);
14567 if (GET_CODE (symbol) == LABEL_REF
14568 || (GET_CODE (symbol) == SYMBOL_REF
14569 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14570 base = pc_rtx;
14572 if (!base && !index)
14576       /* A displacement-only address requires special attention.  */
14576 if (CONST_INT_P (disp))
14578 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14579 fputs ("ds:", file);
14580 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14582 else if (flag_pic)
14583 output_pic_addr_const (file, disp, 0);
14584 else
14585 output_addr_const (file, disp);
14587 else
14589 /* Print SImode register names for zero-extended
14590 addresses to force addr32 prefix. */
14591 if (TARGET_64BIT
14592 && (GET_CODE (addr) == ZERO_EXTEND
14593 || GET_CODE (addr) == AND))
14595 gcc_assert (!code);
14596 code = 'l';
14599 if (ASSEMBLER_DIALECT == ASM_ATT)
14601 if (disp)
14603 if (flag_pic)
14604 output_pic_addr_const (file, disp, 0);
14605 else if (GET_CODE (disp) == LABEL_REF)
14606 output_asm_label (disp);
14607 else
14608 output_addr_const (file, disp);
14611 putc ('(', file);
14612 if (base)
14613 print_reg (base, code, file);
14614 if (index)
14616 putc (',', file);
14617 print_reg (index, vsib ? 0 : code, file);
14618 if (scale != 1 || vsib)
14619 fprintf (file, ",%d", scale);
14621 putc (')', file);
14623 else
14625 rtx offset = NULL_RTX;
14627 if (disp)
14629 /* Pull out the offset of a symbol; print any symbol itself. */
14630 if (GET_CODE (disp) == CONST
14631 && GET_CODE (XEXP (disp, 0)) == PLUS
14632 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14634 offset = XEXP (XEXP (disp, 0), 1);
14635 disp = gen_rtx_CONST (VOIDmode,
14636 XEXP (XEXP (disp, 0), 0));
14639 if (flag_pic)
14640 output_pic_addr_const (file, disp, 0);
14641 else if (GET_CODE (disp) == LABEL_REF)
14642 output_asm_label (disp);
14643 else if (CONST_INT_P (disp))
14644 offset = disp;
14645 else
14646 output_addr_const (file, disp);
14649 putc ('[', file);
14650 if (base)
14652 print_reg (base, code, file);
14653 if (offset)
14655 if (INTVAL (offset) >= 0)
14656 putc ('+', file);
14657 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14660 else if (offset)
14661 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14662 else
14663 putc ('0', file);
14665 if (index)
14667 putc ('+', file);
14668 print_reg (index, vsib ? 0 : code, file);
14669 if (scale != 1 || vsib)
14670 fprintf (file, "*%d", scale);
14672 putc (']', file);
14677 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14679 static bool
14680 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14682 rtx op;
14684 if (GET_CODE (x) != UNSPEC)
14685 return false;
14687 op = XVECEXP (x, 0, 0);
14688 switch (XINT (x, 1))
14690 case UNSPEC_GOTTPOFF:
14691 output_addr_const (file, op);
14692 /* FIXME: This might be @TPOFF in Sun ld. */
14693 fputs ("@gottpoff", file);
14694 break;
14695 case UNSPEC_TPOFF:
14696 output_addr_const (file, op);
14697 fputs ("@tpoff", file);
14698 break;
14699 case UNSPEC_NTPOFF:
14700 output_addr_const (file, op);
14701 if (TARGET_64BIT)
14702 fputs ("@tpoff", file);
14703 else
14704 fputs ("@ntpoff", file);
14705 break;
14706 case UNSPEC_DTPOFF:
14707 output_addr_const (file, op);
14708 fputs ("@dtpoff", file);
14709 break;
14710 case UNSPEC_GOTNTPOFF:
14711 output_addr_const (file, op);
14712 if (TARGET_64BIT)
14713 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14714 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14715 else
14716 fputs ("@gotntpoff", file);
14717 break;
14718 case UNSPEC_INDNTPOFF:
14719 output_addr_const (file, op);
14720 fputs ("@indntpoff", file);
14721 break;
14722 #if TARGET_MACHO
14723 case UNSPEC_MACHOPIC_OFFSET:
14724 output_addr_const (file, op);
14725 putc ('-', file);
14726 machopic_output_function_base_name (file);
14727 break;
14728 #endif
14730 case UNSPEC_STACK_CHECK:
14732 int offset;
14734 gcc_assert (flag_split_stack);
14736 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14737 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14738 #else
14739 gcc_unreachable ();
14740 #endif
14742 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14744 break;
14746 default:
14747 return false;
14750 return true;
14753 /* Split one or more double-mode RTL references into pairs of half-mode
14754 references. The RTL can be REG, offsettable MEM, integer constant, or
14755 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14756 split and "num" is its length. lo_half and hi_half are output arrays
14757 that parallel "operands". */
14759 void
14760 split_double_mode (enum machine_mode mode, rtx operands[],
14761 int num, rtx lo_half[], rtx hi_half[])
14763 enum machine_mode half_mode;
14764 unsigned int byte;
14766 switch (mode)
14768 case TImode:
14769 half_mode = DImode;
14770 break;
14771 case DImode:
14772 half_mode = SImode;
14773 break;
14774 default:
14775 gcc_unreachable ();
14778 byte = GET_MODE_SIZE (half_mode);
14780 while (num--)
14782 rtx op = operands[num];
14784       /* simplify_subreg refuses to split volatile memory addresses,
14785	  but we still have to handle them.  */
14786 if (MEM_P (op))
14788 lo_half[num] = adjust_address (op, half_mode, 0);
14789 hi_half[num] = adjust_address (op, half_mode, byte);
14791 else
14793 lo_half[num] = simplify_gen_subreg (half_mode, op,
14794 GET_MODE (op) == VOIDmode
14795 ? mode : GET_MODE (op), 0);
14796 hi_half[num] = simplify_gen_subreg (half_mode, op,
14797 GET_MODE (op) == VOIDmode
14798 ? mode : GET_MODE (op), byte);
14803 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14804 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14805 is the expression of the binary operation. The output may either be
14806 emitted here, or returned to the caller, like all output_* functions.
14808 There is no guarantee that the operands are the same mode, as they
14809 might be within FLOAT or FLOAT_EXTEND expressions. */
14811 #ifndef SYSV386_COMPAT
14812 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14813 wants to fix the assemblers because that causes incompatibility
14814 with gcc. No-one wants to fix gcc because that causes
14815 incompatibility with assemblers... You can use the option of
14816 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14817 #define SYSV386_COMPAT 1
14818 #endif
14820 const char *
14821 output_387_binary_op (rtx insn, rtx *operands)
14823 static char buf[40];
14824 const char *p;
14825 const char *ssep;
14826 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14828 #ifdef ENABLE_CHECKING
14829   /* Even if we do not want to check the inputs, this documents the input
14830      constraints, which helps in understanding the following code.  */
14831 if (STACK_REG_P (operands[0])
14832 && ((REG_P (operands[1])
14833 && REGNO (operands[0]) == REGNO (operands[1])
14834 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14835 || (REG_P (operands[2])
14836 && REGNO (operands[0]) == REGNO (operands[2])
14837 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14838 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14839 ; /* ok */
14840 else
14841 gcc_assert (is_sse);
14842 #endif
14844 switch (GET_CODE (operands[3]))
14846 case PLUS:
14847 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14848 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14849 p = "fiadd";
14850 else
14851 p = "fadd";
14852 ssep = "vadd";
14853 break;
14855 case MINUS:
14856 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14857 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14858 p = "fisub";
14859 else
14860 p = "fsub";
14861 ssep = "vsub";
14862 break;
14864 case MULT:
14865 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14866 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14867 p = "fimul";
14868 else
14869 p = "fmul";
14870 ssep = "vmul";
14871 break;
14873 case DIV:
14874 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14875 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14876 p = "fidiv";
14877 else
14878 p = "fdiv";
14879 ssep = "vdiv";
14880 break;
14882 default:
14883 gcc_unreachable ();
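  /* Scalar SSE/AVX math: for AVX use the three-operand 'v'-prefixed
     mnemonic; otherwise drop the leading 'v' for the two-operand form.
     The ss/sd suffix is chosen by the operand mode.  */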
14886 if (is_sse)
14888 if (TARGET_AVX)
14890 strcpy (buf, ssep);
14891 if (GET_MODE (operands[0]) == SFmode)
14892 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14893 else
14894 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14896 else
14898 strcpy (buf, ssep + 1);
14899 if (GET_MODE (operands[0]) == SFmode)
14900 strcat (buf, "ss\t{%2, %0|%0, %2}");
14901 else
14902 strcat (buf, "sd\t{%2, %0|%0, %2}");
14904 return buf;
14906 strcpy (buf, p);
14908 switch (GET_CODE (operands[3]))
14910 case MULT:
14911 case PLUS:
14912 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14914 rtx temp = operands[2];
14915 operands[2] = operands[1];
14916 operands[1] = temp;
14919       /* We know operands[0] == operands[1].  */
14921 if (MEM_P (operands[2]))
14923 p = "%Z2\t%2";
14924 break;
14927 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14929 if (STACK_TOP_P (operands[0]))
14930 /* How is it that we are storing to a dead operand[2]?
14931 Well, presumably operands[1] is dead too. We can't
14932 store the result to st(0) as st(0) gets popped on this
14933 instruction. Instead store to operands[2] (which I
14934 think has to be st(1)). st(1) will be popped later.
14935 gcc <= 2.8.1 didn't have this check and generated
14936 assembly code that the Unixware assembler rejected. */
14937 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14938 else
14939 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14940 break;
14943 if (STACK_TOP_P (operands[0]))
14944 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14945 else
14946 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14947 break;
14949 case MINUS:
14950 case DIV:
14951 if (MEM_P (operands[1]))
14953 p = "r%Z1\t%1";
14954 break;
14957 if (MEM_P (operands[2]))
14959 p = "%Z2\t%2";
14960 break;
14963 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14965 #if SYSV386_COMPAT
14966 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14967 derived assemblers, confusingly reverse the direction of
14968 the operation for fsub{r} and fdiv{r} when the
14969 destination register is not st(0). The Intel assembler
14970 doesn't have this brain damage. Read !SYSV386_COMPAT to
14971 figure out what the hardware really does. */
14972 if (STACK_TOP_P (operands[0]))
14973 p = "{p\t%0, %2|rp\t%2, %0}";
14974 else
14975 p = "{rp\t%2, %0|p\t%0, %2}";
14976 #else
14977 if (STACK_TOP_P (operands[0]))
14978 /* As above for fmul/fadd, we can't store to st(0). */
14979 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14980 else
14981 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14982 #endif
14983 break;
14986 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14988 #if SYSV386_COMPAT
14989 if (STACK_TOP_P (operands[0]))
14990 p = "{rp\t%0, %1|p\t%1, %0}";
14991 else
14992 p = "{p\t%1, %0|rp\t%0, %1}";
14993 #else
14994 if (STACK_TOP_P (operands[0]))
14995 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14996 else
14997 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14998 #endif
14999 break;
15002 if (STACK_TOP_P (operands[0]))
15004 if (STACK_TOP_P (operands[1]))
15005 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15006 else
15007 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15008 break;
15010 else if (STACK_TOP_P (operands[1]))
15012 #if SYSV386_COMPAT
15013 p = "{\t%1, %0|r\t%0, %1}";
15014 #else
15015 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15016 #endif
15018 else
15020 #if SYSV386_COMPAT
15021 p = "{r\t%2, %0|\t%0, %2}";
15022 #else
15023 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15024 #endif
15026 break;
15028 default:
15029 gcc_unreachable ();
15032 strcat (buf, p);
15033 return buf;
15036 /* Return needed mode for entity in optimize_mode_switching pass. */
15039 ix86_mode_needed (int entity, rtx insn)
15041 enum attr_i387_cw mode;
15043   /* The mode UNINITIALIZED is used to store the control word after a
15044      function call or ASM pattern.  The mode ANY specifies that the function
15045      has no requirements on the control word and makes no changes to the
15046      bits we are interested in.  */
15048 if (CALL_P (insn)
15049 || (NONJUMP_INSN_P (insn)
15050 && (asm_noperands (PATTERN (insn)) >= 0
15051 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15052 return I387_CW_UNINITIALIZED;
15054 if (recog_memoized (insn) < 0)
15055 return I387_CW_ANY;
15057 mode = get_attr_i387_cw (insn);
15059 switch (entity)
15061 case I387_TRUNC:
15062 if (mode == I387_CW_TRUNC)
15063 return mode;
15064 break;
15066 case I387_FLOOR:
15067 if (mode == I387_CW_FLOOR)
15068 return mode;
15069 break;
15071 case I387_CEIL:
15072 if (mode == I387_CW_CEIL)
15073 return mode;
15074 break;
15076 case I387_MASK_PM:
15077 if (mode == I387_CW_MASK_PM)
15078 return mode;
15079 break;
15081 default:
15082 gcc_unreachable ();
15085 return I387_CW_ANY;
15088 /* Output code to initialize control word copies used by trunc?f?i and
15089 rounding patterns. CURRENT_MODE is set to current control word,
15090 while NEW_MODE is set to new control word. */
15092 void
15093 emit_i387_cw_initialization (int mode)
15095 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15096 rtx new_mode;
15098 enum ix86_stack_slot slot;
15100 rtx reg = gen_reg_rtx (HImode);
15102 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15103 emit_move_insn (reg, copy_rtx (stored_mode));
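  /* Adjust the rounding-control / precision-mask bits in the copied
     control word, either with full 16-bit AND/OR operations or with a
     direct bit-field insert, depending on the target and optimization
     settings.  */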
15105 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15106 || optimize_function_for_size_p (cfun))
15108 switch (mode)
15110 case I387_CW_TRUNC:
15111 /* round toward zero (truncate) */
15112 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15113 slot = SLOT_CW_TRUNC;
15114 break;
15116 case I387_CW_FLOOR:
15117 /* round down toward -oo */
15118 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15119 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15120 slot = SLOT_CW_FLOOR;
15121 break;
15123 case I387_CW_CEIL:
15124 /* round up toward +oo */
15125 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15126 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15127 slot = SLOT_CW_CEIL;
15128 break;
15130 case I387_CW_MASK_PM:
15131 /* mask precision exception for nearbyint() */
15132 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15133 slot = SLOT_CW_MASK_PM;
15134 break;
15136 default:
15137 gcc_unreachable ();
15140 else
15142 switch (mode)
15144 case I387_CW_TRUNC:
15145 /* round toward zero (truncate) */
15146 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15147 slot = SLOT_CW_TRUNC;
15148 break;
15150 case I387_CW_FLOOR:
15151 /* round down toward -oo */
15152 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15153 slot = SLOT_CW_FLOOR;
15154 break;
15156 case I387_CW_CEIL:
15157 /* round up toward +oo */
15158 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15159 slot = SLOT_CW_CEIL;
15160 break;
15162 case I387_CW_MASK_PM:
15163 /* mask precision exception for nearbyint() */
15164 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15165 slot = SLOT_CW_MASK_PM;
15166 break;
15168 default:
15169 gcc_unreachable ();
15173 gcc_assert (slot < MAX_386_STACK_LOCALS);
15175 new_mode = assign_386_stack_local (HImode, slot);
15176 emit_move_insn (new_mode, reg);
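/* Illustrative note (not part of the original source): the x87 control
   word's rounding-control field is bits 10-11, which is why the code
   above manipulates the 0x0c00 mask:
     0x0000  round to nearest (even)
     0x0400  round down toward -oo   (I387_CW_FLOOR)
     0x0800  round up toward +oo     (I387_CW_CEIL)
     0x0c00  round toward zero       (I387_CW_TRUNC)
   Setting bit 5 (0x0020) masks the precision exception, as needed for
   I387_CW_MASK_PM.  */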
15179 /* Output code for INSN to convert a float to a signed int. OPERANDS
15180 are the insn operands. The output may be [HSD]Imode and the input
15181 operand may be [SDX]Fmode. */
15183 const char *
15184 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15186 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15187 int dimode_p = GET_MODE (operands[0]) == DImode;
15188 int round_mode = get_attr_i387_cw (insn);
15190 /* Jump through a hoop or two for DImode, since the hardware has no
15191 non-popping instruction. We used to do this a different way, but
15192 that was somewhat fragile and broke with post-reload splitters. */
15193 if ((dimode_p || fisttp) && !stack_top_dies)
15194 output_asm_insn ("fld\t%y1", operands);
15196 gcc_assert (STACK_TOP_P (operands[1]));
15197 gcc_assert (MEM_P (operands[0]));
15198 gcc_assert (GET_MODE (operands[1]) != TFmode);
15200 if (fisttp)
15201 output_asm_insn ("fisttp%Z0\t%0", operands);
15202 else
15204 if (round_mode != I387_CW_ANY)
15205 output_asm_insn ("fldcw\t%3", operands);
15206 if (stack_top_dies || dimode_p)
15207 output_asm_insn ("fistp%Z0\t%0", operands);
15208 else
15209 output_asm_insn ("fist%Z0\t%0", operands);
15210 if (round_mode != I387_CW_ANY)
15211 output_asm_insn ("fldcw\t%2", operands);
15214 return "";
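/* Sketch of the non-fisttp path above (illustrative only, using the
   operand placeholders from the templates):
       fld     %y1        ; keep a copy when the value must survive
       fldcw   %3         ; switch to the truncating control word
       fistp%Z0 %0        ; convert and store (popping the stack)
       fldcw   %2         ; restore the original control word          */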
15217 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15218 have the values zero or one, indicates the ffreep insn's operand
15219 from the OPERANDS array. */
15221 static const char *
15222 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15224 if (TARGET_USE_FFREEP)
15225 #ifdef HAVE_AS_IX86_FFREEP
15226 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15227 #else
15229 static char retval[32];
15230 int regno = REGNO (operands[opno]);
15232 gcc_assert (FP_REGNO_P (regno));
15234 regno -= FIRST_STACK_REG;
15236 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15237 return retval;
15239 #endif
15241 return opno ? "fstp\t%y1" : "fstp\t%y0";
15245 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15246 should be used. UNORDERED_P is true when fucom should be used. */
15248 const char *
15249 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15251 int stack_top_dies;
15252 rtx cmp_op0, cmp_op1;
15253 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15255 if (eflags_p)
15257 cmp_op0 = operands[0];
15258 cmp_op1 = operands[1];
15260 else
15262 cmp_op0 = operands[1];
15263 cmp_op1 = operands[2];
15266 if (is_sse)
15268 if (GET_MODE (operands[0]) == SFmode)
15269 if (unordered_p)
15270 return "%vucomiss\t{%1, %0|%0, %1}";
15271 else
15272 return "%vcomiss\t{%1, %0|%0, %1}";
15273 else
15274 if (unordered_p)
15275 return "%vucomisd\t{%1, %0|%0, %1}";
15276 else
15277 return "%vcomisd\t{%1, %0|%0, %1}";
15280 gcc_assert (STACK_TOP_P (cmp_op0));
15282 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15284 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15286 if (stack_top_dies)
15288 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15289 return output_387_ffreep (operands, 1);
15291 else
15292 return "ftst\n\tfnstsw\t%0";
15295 if (STACK_REG_P (cmp_op1)
15296 && stack_top_dies
15297 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15298 && REGNO (cmp_op1) != FIRST_STACK_REG)
15300 /* If the top of the 387 stack dies, and the other operand
15301 is also a stack register that dies, then this must be a
15302 `fcompp' float compare */
15304 if (eflags_p)
15306 /* There is no double popping fcomi variant. Fortunately,
15307 eflags is immune from the fstp's cc clobbering. */
15308 if (unordered_p)
15309 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15310 else
15311 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15312 return output_387_ffreep (operands, 0);
15314 else
15316 if (unordered_p)
15317 return "fucompp\n\tfnstsw\t%0";
15318 else
15319 return "fcompp\n\tfnstsw\t%0";
15322 else
15324 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15326 static const char * const alt[16] =
15328 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15329 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15330 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15331 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15333 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15334 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15335 NULL,
15336 NULL,
15338 "fcomi\t{%y1, %0|%0, %y1}",
15339 "fcomip\t{%y1, %0|%0, %y1}",
15340 "fucomi\t{%y1, %0|%0, %y1}",
15341 "fucomip\t{%y1, %0|%0, %y1}",
15343 NULL,
15344 NULL,
15345 NULL,
15346 NULL
15349 int mask;
15350 const char *ret;
15352 mask = eflags_p << 3;
15353 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15354 mask |= unordered_p << 1;
15355 mask |= stack_top_dies;
15357 gcc_assert (mask < 16);
15358 ret = alt[mask];
15359 gcc_assert (ret);
15361 return ret;
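/* Worked example (illustrative): for an eflags compare (eflags_p = 1)
   of two stack registers (cmp_op1 is not MODE_INT, so that bit is 0),
   unordered (unordered_p = 1), with the stack top dying
   (stack_top_dies = 1), the mask is (1<<3) | 0 | (1<<1) | 1 = 11,
   which selects "fucomip\t{%y1, %0|%0, %y1}" from the table above.  */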
15365 void
15366 ix86_output_addr_vec_elt (FILE *file, int value)
15368 const char *directive = ASM_LONG;
15370 #ifdef ASM_QUAD
15371 if (TARGET_LP64)
15372 directive = ASM_QUAD;
15373 #else
15374 gcc_assert (!TARGET_64BIT);
15375 #endif
15377 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15380 void
15381 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15383 const char *directive = ASM_LONG;
15385 #ifdef ASM_QUAD
15386 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15387 directive = ASM_QUAD;
15388 #else
15389 gcc_assert (!TARGET_64BIT);
15390 #endif
15391 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15392 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15393 fprintf (file, "%s%s%d-%s%d\n",
15394 directive, LPREFIX, value, LPREFIX, rel);
15395 else if (HAVE_AS_GOTOFF_IN_DATA)
15396 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15397 #if TARGET_MACHO
15398 else if (TARGET_MACHO)
15400 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15401 machopic_output_function_base_name (file);
15402 putc ('\n', file);
15404 #endif
15405 else
15406 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15407 GOT_SYMBOL_NAME, LPREFIX, value);
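/* Example output (illustrative; LPREFIX is typically ".L"): the 64-bit
   branch above prints a label difference such as ".long .L4-.L2"
   (".quad" when the case vector mode is DImode), while 32-bit code with
   @GOTOFF support in data prints ".long .L4@GOTOFF".  */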
15410 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15411 for the target. */
15413 void
15414 ix86_expand_clear (rtx dest)
15416 rtx tmp;
15418 /* We play register width games, which are only valid after reload. */
15419 gcc_assert (reload_completed);
15421 /* Avoid HImode and its attendant prefix byte. */
15422 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15423 dest = gen_rtx_REG (SImode, REGNO (dest));
15424 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15426 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15427 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15429 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15430 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15433 emit_insn (tmp);
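/* Illustrative note: "xor %reg, %reg" clobbers the flags while
   "mov $0, %reg" does not, which is why the xor form above is wrapped
   in a PARALLEL with an explicit FLAGS_REG clobber.  */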
15436 /* X is an unchanging MEM. If it is a constant pool reference, return
15437 the constant pool rtx, else NULL. */
15440 maybe_get_pool_constant (rtx x)
15442 x = ix86_delegitimize_address (XEXP (x, 0));
15444 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15445 return get_pool_constant (x);
15447 return NULL_RTX;
15450 void
15451 ix86_expand_move (enum machine_mode mode, rtx operands[])
15453 rtx op0, op1;
15454 enum tls_model model;
15456 op0 = operands[0];
15457 op1 = operands[1];
15459 if (GET_CODE (op1) == SYMBOL_REF)
15461 model = SYMBOL_REF_TLS_MODEL (op1);
15462 if (model)
15464 op1 = legitimize_tls_address (op1, model, true);
15465 op1 = force_operand (op1, op0);
15466 if (op1 == op0)
15467 return;
15468 if (GET_MODE (op1) != mode)
15469 op1 = convert_to_mode (mode, op1, 1);
15471 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15472 && SYMBOL_REF_DLLIMPORT_P (op1))
15473 op1 = legitimize_dllimport_symbol (op1, false);
15475 else if (GET_CODE (op1) == CONST
15476 && GET_CODE (XEXP (op1, 0)) == PLUS
15477 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15479 rtx addend = XEXP (XEXP (op1, 0), 1);
15480 rtx symbol = XEXP (XEXP (op1, 0), 0);
15481 rtx tmp = NULL;
15483 model = SYMBOL_REF_TLS_MODEL (symbol);
15484 if (model)
15485 tmp = legitimize_tls_address (symbol, model, true);
15486 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15487 && SYMBOL_REF_DLLIMPORT_P (symbol))
15488 tmp = legitimize_dllimport_symbol (symbol, true);
15490 if (tmp)
15492 tmp = force_operand (tmp, NULL);
15493 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15494 op0, 1, OPTAB_DIRECT);
15495 if (tmp == op0)
15496 return;
15497 if (GET_MODE (tmp) != mode)
15498 op1 = convert_to_mode (mode, tmp, 1);
15502 if ((flag_pic || MACHOPIC_INDIRECT)
15503 && symbolic_operand (op1, mode))
15505 if (TARGET_MACHO && !TARGET_64BIT)
15507 #if TARGET_MACHO
15508 /* dynamic-no-pic */
15509 if (MACHOPIC_INDIRECT)
15511 rtx temp = ((reload_in_progress
15512 || ((op0 && REG_P (op0))
15513 && mode == Pmode))
15514 ? op0 : gen_reg_rtx (Pmode));
15515 op1 = machopic_indirect_data_reference (op1, temp);
15516 if (MACHOPIC_PURE)
15517 op1 = machopic_legitimize_pic_address (op1, mode,
15518 temp == op1 ? 0 : temp);
15520 if (op0 != op1 && GET_CODE (op0) != MEM)
15522 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15523 emit_insn (insn);
15524 return;
15526 if (GET_CODE (op0) == MEM)
15527 op1 = force_reg (Pmode, op1);
15528 else
15530 rtx temp = op0;
15531 if (GET_CODE (temp) != REG)
15532 temp = gen_reg_rtx (Pmode);
15533 temp = legitimize_pic_address (op1, temp);
15534 if (temp == op0)
15535 return;
15536 op1 = temp;
15538 /* dynamic-no-pic */
15539 #endif
15541 else
15543 if (MEM_P (op0))
15544 op1 = force_reg (mode, op1);
15545 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15547 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15548 op1 = legitimize_pic_address (op1, reg);
15549 if (op0 == op1)
15550 return;
15551 if (GET_MODE (op1) != mode)
15552 op1 = convert_to_mode (mode, op1, 1);
15556 else
15558 if (MEM_P (op0)
15559 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15560 || !push_operand (op0, mode))
15561 && MEM_P (op1))
15562 op1 = force_reg (mode, op1);
15564 if (push_operand (op0, mode)
15565 && ! general_no_elim_operand (op1, mode))
15566 op1 = copy_to_mode_reg (mode, op1);
15568 /* Force large constants in 64bit compilation into register
15569 to get them CSEed. */
15570 if (can_create_pseudo_p ()
15571 && (mode == DImode) && TARGET_64BIT
15572 && immediate_operand (op1, mode)
15573 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15574 && !register_operand (op0, mode)
15575 && optimize)
15576 op1 = copy_to_mode_reg (mode, op1);
15578 if (can_create_pseudo_p ()
15579 && FLOAT_MODE_P (mode)
15580 && GET_CODE (op1) == CONST_DOUBLE)
15582 /* If we are loading a floating point constant to a register,
15583 force the value to memory now, since we'll get better code
15584 out of the back end. */
15586 op1 = validize_mem (force_const_mem (mode, op1));
15587 if (!register_operand (op0, mode))
15589 rtx temp = gen_reg_rtx (mode);
15590 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15591 emit_move_insn (op0, temp);
15592 return;
15597 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15600 void
15601 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15603 rtx op0 = operands[0], op1 = operands[1];
15604 unsigned int align = GET_MODE_ALIGNMENT (mode);
15606 /* Force constants other than zero into memory. We do not know how
15607 the instructions used to build constants modify the upper 64 bits
15608 of the register; once we have that information we may be able
15609 to handle some of them more efficiently. */
15610 if (can_create_pseudo_p ()
15611 && register_operand (op0, mode)
15612 && (CONSTANT_P (op1)
15613 || (GET_CODE (op1) == SUBREG
15614 && CONSTANT_P (SUBREG_REG (op1))))
15615 && !standard_sse_constant_p (op1))
15616 op1 = validize_mem (force_const_mem (mode, op1));
15618 /* We need to check memory alignment for SSE mode since attributes
15619 can make operands unaligned. */
15620 if (can_create_pseudo_p ()
15621 && SSE_REG_MODE_P (mode)
15622 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15623 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15625 rtx tmp[2];
15627 /* ix86_expand_vector_move_misalign() does not like constants ... */
15628 if (CONSTANT_P (op1)
15629 || (GET_CODE (op1) == SUBREG
15630 && CONSTANT_P (SUBREG_REG (op1))))
15631 op1 = validize_mem (force_const_mem (mode, op1));
15633 /* ... nor both arguments in memory. */
15634 if (!register_operand (op0, mode)
15635 && !register_operand (op1, mode))
15636 op1 = force_reg (mode, op1);
15638 tmp[0] = op0; tmp[1] = op1;
15639 ix86_expand_vector_move_misalign (mode, tmp);
15640 return;
15643 /* Make operand1 a register if it isn't already. */
15644 if (can_create_pseudo_p ()
15645 && !register_operand (op0, mode)
15646 && !register_operand (op1, mode))
15648 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15649 return;
15652 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15655 /* Split 32-byte AVX unaligned load and store if needed. */
15657 static void
15658 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15660 rtx m;
15661 rtx (*extract) (rtx, rtx, rtx);
15662 rtx (*move_unaligned) (rtx, rtx);
15663 enum machine_mode mode;
15665 switch (GET_MODE (op0))
15667 default:
15668 gcc_unreachable ();
15669 case V32QImode:
15670 extract = gen_avx_vextractf128v32qi;
15671 move_unaligned = gen_avx_movdqu256;
15672 mode = V16QImode;
15673 break;
15674 case V8SFmode:
15675 extract = gen_avx_vextractf128v8sf;
15676 move_unaligned = gen_avx_movups256;
15677 mode = V4SFmode;
15678 break;
15679 case V4DFmode:
15680 extract = gen_avx_vextractf128v4df;
15681 move_unaligned = gen_avx_movupd256;
15682 mode = V2DFmode;
15683 break;
15686 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15688 rtx r = gen_reg_rtx (mode);
15689 m = adjust_address (op1, mode, 0);
15690 emit_move_insn (r, m);
15691 m = adjust_address (op1, mode, 16);
15692 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15693 emit_move_insn (op0, r);
15695 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15697 m = adjust_address (op0, mode, 0);
15698 emit_insn (extract (m, op1, const0_rtx));
15699 m = adjust_address (op0, mode, 16);
15700 emit_insn (extract (m, op1, const1_rtx));
15702 else
15703 emit_insn (move_unaligned (op0, op1));
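/* Rough shape of the split (illustrative): a 32-byte unaligned load
   with TARGET_AVX256_SPLIT_UNALIGNED_LOAD becomes something like
       vmovups      (%rax), %xmm0
       vinsertf128  $1, 16(%rax), %ymm0, %ymm0
   and an unaligned store with TARGET_AVX256_SPLIT_UNALIGNED_STORE
   becomes two 16-byte vextractf128 stores.  */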
15706 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15707 straight to ix86_expand_vector_move. */
15708 /* Code generation for scalar reg-reg moves of single and double precision data:
15709 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15710 movaps reg, reg
15711 else
15712 movss reg, reg
15713 if (x86_sse_partial_reg_dependency == true)
15714 movapd reg, reg
15715 else
15716 movsd reg, reg
15718 Code generation for scalar loads of double precision data:
15719 if (x86_sse_split_regs == true)
15720 movlpd mem, reg (gas syntax)
15721 else
15722 movsd mem, reg
15724 Code generation for unaligned packed loads of single precision data
15725 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15726 if (x86_sse_unaligned_move_optimal)
15727 movups mem, reg
15729 if (x86_sse_partial_reg_dependency == true)
15731 xorps reg, reg
15732 movlps mem, reg
15733 movhps mem+8, reg
15735 else
15737 movlps mem, reg
15738 movhps mem+8, reg
15741 Code generation for unaligned packed loads of double precision data
15742 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15743 if (x86_sse_unaligned_move_optimal)
15744 movupd mem, reg
15746 if (x86_sse_split_regs == true)
15748 movlpd mem, reg
15749 movhpd mem+8, reg
15751 else
15753 movsd mem, reg
15754 movhpd mem+8, reg
15758 void
15759 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15761 rtx op0, op1, m;
15763 op0 = operands[0];
15764 op1 = operands[1];
15766 if (TARGET_AVX)
15768 switch (GET_MODE_CLASS (mode))
15770 case MODE_VECTOR_INT:
15771 case MODE_INT:
15772 switch (GET_MODE_SIZE (mode))
15774 case 16:
15775 /* If we're optimizing for size, movups is the smallest. */
15776 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15778 op0 = gen_lowpart (V4SFmode, op0);
15779 op1 = gen_lowpart (V4SFmode, op1);
15780 emit_insn (gen_sse_movups (op0, op1));
15781 return;
15783 op0 = gen_lowpart (V16QImode, op0);
15784 op1 = gen_lowpart (V16QImode, op1);
15785 emit_insn (gen_sse2_movdqu (op0, op1));
15786 break;
15787 case 32:
15788 op0 = gen_lowpart (V32QImode, op0);
15789 op1 = gen_lowpart (V32QImode, op1);
15790 ix86_avx256_split_vector_move_misalign (op0, op1);
15791 break;
15792 default:
15793 gcc_unreachable ();
15795 break;
15796 case MODE_VECTOR_FLOAT:
15797 op0 = gen_lowpart (mode, op0);
15798 op1 = gen_lowpart (mode, op1);
15800 switch (mode)
15802 case V4SFmode:
15803 emit_insn (gen_sse_movups (op0, op1));
15804 break;
15805 case V8SFmode:
15806 ix86_avx256_split_vector_move_misalign (op0, op1);
15807 break;
15808 case V2DFmode:
15809 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15811 op0 = gen_lowpart (V4SFmode, op0);
15812 op1 = gen_lowpart (V4SFmode, op1);
15813 emit_insn (gen_sse_movups (op0, op1));
15814 return;
15816 emit_insn (gen_sse2_movupd (op0, op1));
15817 break;
15818 case V4DFmode:
15819 ix86_avx256_split_vector_move_misalign (op0, op1);
15820 break;
15821 default:
15822 gcc_unreachable ();
15824 break;
15826 default:
15827 gcc_unreachable ();
15830 return;
15833 if (MEM_P (op1))
15835 /* If we're optimizing for size, movups is the smallest. */
15836 if (optimize_insn_for_size_p ()
15837 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15839 op0 = gen_lowpart (V4SFmode, op0);
15840 op1 = gen_lowpart (V4SFmode, op1);
15841 emit_insn (gen_sse_movups (op0, op1));
15842 return;
15845 /* ??? If we have typed data, then it would appear that using
15846 movdqu is the only way to get unaligned data loaded with
15847 integer type. */
15848 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15850 op0 = gen_lowpart (V16QImode, op0);
15851 op1 = gen_lowpart (V16QImode, op1);
15852 emit_insn (gen_sse2_movdqu (op0, op1));
15853 return;
15856 if (TARGET_SSE2 && mode == V2DFmode)
15858 rtx zero;
15860 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15862 op0 = gen_lowpart (V2DFmode, op0);
15863 op1 = gen_lowpart (V2DFmode, op1);
15864 emit_insn (gen_sse2_movupd (op0, op1));
15865 return;
15868 /* When SSE registers are split into halves, we can avoid
15869 writing to the top half twice. */
15870 if (TARGET_SSE_SPLIT_REGS)
15872 emit_clobber (op0);
15873 zero = op0;
15875 else
15877 /* ??? Not sure about the best option for the Intel chips.
15878 The following would seem to satisfy; the register is
15879 entirely cleared, breaking the dependency chain. We
15880 then store to the upper half, with a dependency depth
15881 of one. A rumor has it that Intel recommends two movsd
15882 followed by an unpacklpd, but this is unconfirmed. And
15883 given that the dependency depth of the unpacklpd would
15884 still be one, I'm not sure why this would be better. */
15885 zero = CONST0_RTX (V2DFmode);
15888 m = adjust_address (op1, DFmode, 0);
15889 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15890 m = adjust_address (op1, DFmode, 8);
15891 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15893 else
15895 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15897 op0 = gen_lowpart (V4SFmode, op0);
15898 op1 = gen_lowpart (V4SFmode, op1);
15899 emit_insn (gen_sse_movups (op0, op1));
15900 return;
15903 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15904 emit_move_insn (op0, CONST0_RTX (mode));
15905 else
15906 emit_clobber (op0);
15908 if (mode != V4SFmode)
15909 op0 = gen_lowpart (V4SFmode, op0);
15910 m = adjust_address (op1, V2SFmode, 0);
15911 emit_insn (gen_sse_loadlps (op0, op0, m));
15912 m = adjust_address (op1, V2SFmode, 8);
15913 emit_insn (gen_sse_loadhps (op0, op0, m));
15916 else if (MEM_P (op0))
15918 /* If we're optimizing for size, movups is the smallest. */
15919 if (optimize_insn_for_size_p ()
15920 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15922 op0 = gen_lowpart (V4SFmode, op0);
15923 op1 = gen_lowpart (V4SFmode, op1);
15924 emit_insn (gen_sse_movups (op0, op1));
15925 return;
15928 /* ??? Similar to above, only less clear because of quote
15929 typeless stores unquote. */
15930 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15931 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15933 op0 = gen_lowpart (V16QImode, op0);
15934 op1 = gen_lowpart (V16QImode, op1);
15935 emit_insn (gen_sse2_movdqu (op0, op1));
15936 return;
15939 if (TARGET_SSE2 && mode == V2DFmode)
15941 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15943 op0 = gen_lowpart (V2DFmode, op0);
15944 op1 = gen_lowpart (V2DFmode, op1);
15945 emit_insn (gen_sse2_movupd (op0, op1));
15947 else
15949 m = adjust_address (op0, DFmode, 0);
15950 emit_insn (gen_sse2_storelpd (m, op1));
15951 m = adjust_address (op0, DFmode, 8);
15952 emit_insn (gen_sse2_storehpd (m, op1));
15955 else
15957 if (mode != V4SFmode)
15958 op1 = gen_lowpart (V4SFmode, op1);
15960 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15962 op0 = gen_lowpart (V4SFmode, op0);
15963 emit_insn (gen_sse_movups (op0, op1));
15965 else
15967 m = adjust_address (op0, V2SFmode, 0);
15968 emit_insn (gen_sse_storelps (m, op1));
15969 m = adjust_address (op0, V2SFmode, 8);
15970 emit_insn (gen_sse_storehps (m, op1));
15974 else
15975 gcc_unreachable ();
15978 /* Expand a push in MODE. This is some mode for which we do not support
15979 proper push instructions, at least from the registers that we expect
15980 the value to live in. */
15982 void
15983 ix86_expand_push (enum machine_mode mode, rtx x)
15985 rtx tmp;
15987 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15988 GEN_INT (-GET_MODE_SIZE (mode)),
15989 stack_pointer_rtx, 1, OPTAB_DIRECT);
15990 if (tmp != stack_pointer_rtx)
15991 emit_move_insn (stack_pointer_rtx, tmp);
15993 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15995 /* When we push an operand onto the stack, it has to be aligned at least
15996 at the function argument boundary. However, since we don't have
15997 the argument type, we can't determine the actual argument
15998 boundary. */
15999 emit_move_insn (tmp, x);
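/* A sketch of the expansion above (illustrative, 32-bit case):
       sub  $<size>, %esp
       mov  <value>, (%esp)
   i.e. the stack pointer is adjusted explicitly and the value stored
   through it, instead of using a real push instruction.  */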
16002 /* Helper function of ix86_fixup_binary_operands to canonicalize
16003 operand order. Returns true if the operands should be swapped. */
16005 static bool
16006 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16007 rtx operands[])
16009 rtx dst = operands[0];
16010 rtx src1 = operands[1];
16011 rtx src2 = operands[2];
16013 /* If the operation is not commutative, we can't do anything. */
16014 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16015 return false;
16017 /* Highest priority is that src1 should match dst. */
16018 if (rtx_equal_p (dst, src1))
16019 return false;
16020 if (rtx_equal_p (dst, src2))
16021 return true;
16023 /* Next highest priority is that immediate constants come second. */
16024 if (immediate_operand (src2, mode))
16025 return false;
16026 if (immediate_operand (src1, mode))
16027 return true;
16029 /* Lowest priority is that memory references should come second. */
16030 if (MEM_P (src2))
16031 return false;
16032 if (MEM_P (src1))
16033 return true;
16035 return false;
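/* For instance (illustrative): for "dst = mem + dst" the operands are
   swapped so that src1 matches dst and "add mem, dst" can be used;
   for "dst = 5 + reg" they are swapped so the immediate ends up as the
   second source.  */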
16039 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16040 destination to use for the operation. If different from the true
16041 destination in operands[0], a copy operation will be required. */
16044 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16045 rtx operands[])
16047 rtx dst = operands[0];
16048 rtx src1 = operands[1];
16049 rtx src2 = operands[2];
16051 /* Canonicalize operand order. */
16052 if (ix86_swap_binary_operands_p (code, mode, operands))
16054 rtx temp;
16056 /* It is invalid to swap operands of different modes. */
16057 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16059 temp = src1;
16060 src1 = src2;
16061 src2 = temp;
16064 /* Both source operands cannot be in memory. */
16065 if (MEM_P (src1) && MEM_P (src2))
16067 /* Optimization: Only read from memory once. */
16068 if (rtx_equal_p (src1, src2))
16070 src2 = force_reg (mode, src2);
16071 src1 = src2;
16073 else
16074 src2 = force_reg (mode, src2);
16077 /* If the destination is memory, and we do not have matching source
16078 operands, do things in registers. */
16079 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16080 dst = gen_reg_rtx (mode);
16082 /* Source 1 cannot be a constant. */
16083 if (CONSTANT_P (src1))
16084 src1 = force_reg (mode, src1);
16086 /* Source 1 cannot be a non-matching memory. */
16087 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16088 src1 = force_reg (mode, src1);
16090 /* Improve address combine. */
16091 if (code == PLUS
16092 && GET_MODE_CLASS (mode) == MODE_INT
16093 && MEM_P (src2))
16094 src2 = force_reg (mode, src2);
16096 operands[1] = src1;
16097 operands[2] = src2;
16098 return dst;
16101 /* Similarly, but assume that the destination has already been
16102 set up properly. */
16104 void
16105 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16106 enum machine_mode mode, rtx operands[])
16108 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16109 gcc_assert (dst == operands[0]);
16112 /* Attempt to expand a binary operator. Make the expansion closer to the
16113 actual machine than just general_operand, which would allow 3 separate
16114 memory references (one output, two input) in a single insn. */
16116 void
16117 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16118 rtx operands[])
16120 rtx src1, src2, dst, op, clob;
16122 dst = ix86_fixup_binary_operands (code, mode, operands);
16123 src1 = operands[1];
16124 src2 = operands[2];
16126 /* Emit the instruction. */
16128 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16129 if (reload_in_progress)
16131 /* Reload doesn't know about the flags register, and doesn't know that
16132 it doesn't want to clobber it. We can only do this with PLUS. */
16133 gcc_assert (code == PLUS);
16134 emit_insn (op);
16136 else if (reload_completed
16137 && code == PLUS
16138 && !rtx_equal_p (dst, src1))
16140 /* This is going to be an LEA; avoid splitting it later. */
16141 emit_insn (op);
16143 else
16145 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16146 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16149 /* Fix up the destination if needed. */
16150 if (dst != operands[0])
16151 emit_move_insn (operands[0], dst);
16154 /* Return TRUE or FALSE depending on whether the binary operator meets the
16155 appropriate constraints. */
16157 bool
16158 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16159 rtx operands[3])
16161 rtx dst = operands[0];
16162 rtx src1 = operands[1];
16163 rtx src2 = operands[2];
16165 /* Both source operands cannot be in memory. */
16166 if (MEM_P (src1) && MEM_P (src2))
16167 return false;
16169 /* Canonicalize operand order for commutative operators. */
16170 if (ix86_swap_binary_operands_p (code, mode, operands))
16172 rtx temp = src1;
16173 src1 = src2;
16174 src2 = temp;
16177 /* If the destination is memory, we must have a matching source operand. */
16178 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16179 return false;
16181 /* Source 1 cannot be a constant. */
16182 if (CONSTANT_P (src1))
16183 return false;
16185 /* Source 1 cannot be a non-matching memory. */
16186 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16187 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16188 return (code == AND
16189 && (mode == HImode
16190 || mode == SImode
16191 || (TARGET_64BIT && mode == DImode))
16192 && satisfies_constraint_L (src2));
16194 return true;
16197 /* Attempt to expand a unary operator. Make the expansion closer to the
16198 actual machine than just general_operand, which would allow 2 separate
16199 memory references (one output, one input) in a single insn. */
16201 void
16202 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16203 rtx operands[])
16205 int matching_memory;
16206 rtx src, dst, op, clob;
16208 dst = operands[0];
16209 src = operands[1];
16211 /* If the destination is memory, and we do not have matching source
16212 operands, do things in registers. */
16213 matching_memory = 0;
16214 if (MEM_P (dst))
16216 if (rtx_equal_p (dst, src))
16217 matching_memory = 1;
16218 else
16219 dst = gen_reg_rtx (mode);
16222 /* When source operand is memory, destination must match. */
16223 if (MEM_P (src) && !matching_memory)
16224 src = force_reg (mode, src);
16226 /* Emit the instruction. */
16228 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16229 if (reload_in_progress || code == NOT)
16231 /* Reload doesn't know about the flags register, and doesn't know that
16232 it doesn't want to clobber it. */
16233 gcc_assert (code == NOT);
16234 emit_insn (op);
16236 else
16238 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16239 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16242 /* Fix up the destination if needed. */
16243 if (dst != operands[0])
16244 emit_move_insn (operands[0], dst);
16247 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16248 divisor are within the range [0-255]. */
16250 void
16251 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16252 bool signed_p)
16254 rtx end_label, qimode_label;
16255 rtx insn, div, mod;
16256 rtx scratch, tmp0, tmp1, tmp2;
16257 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16258 rtx (*gen_zero_extend) (rtx, rtx);
16259 rtx (*gen_test_ccno_1) (rtx, rtx);
16261 switch (mode)
16263 case SImode:
16264 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16265 gen_test_ccno_1 = gen_testsi_ccno_1;
16266 gen_zero_extend = gen_zero_extendqisi2;
16267 break;
16268 case DImode:
16269 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16270 gen_test_ccno_1 = gen_testdi_ccno_1;
16271 gen_zero_extend = gen_zero_extendqidi2;
16272 break;
16273 default:
16274 gcc_unreachable ();
16277 end_label = gen_label_rtx ();
16278 qimode_label = gen_label_rtx ();
16280 scratch = gen_reg_rtx (mode);
16282 /* Use 8bit unsigned divmod if dividend and divisor are within
16283 the range [0-255]. */
16284 emit_move_insn (scratch, operands[2]);
16285 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16286 scratch, 1, OPTAB_DIRECT);
16287 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16288 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16289 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16290 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16291 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16292 pc_rtx);
16293 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16294 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16295 JUMP_LABEL (insn) = qimode_label;
16297 /* Generate original signed/unsigned divmod. */
16298 div = gen_divmod4_1 (operands[0], operands[1],
16299 operands[2], operands[3]);
16300 emit_insn (div);
16302 /* Branch to the end. */
16303 emit_jump_insn (gen_jump (end_label));
16304 emit_barrier ();
16306 /* Generate 8bit unsigned divide. */
16307 emit_label (qimode_label);
16308 /* Don't use operands[0] for result of 8bit divide since not all
16309 registers support QImode ZERO_EXTRACT. */
16310 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16311 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16312 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16313 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16315 if (signed_p)
16317 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16318 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16320 else
16322 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16323 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16326 /* Extract remainder from AH. */
16327 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16328 if (REG_P (operands[1]))
16329 insn = emit_move_insn (operands[1], tmp1);
16330 else
16332 /* Need a new scratch register since the old one has result
16333 of 8bit divide. */
16334 scratch = gen_reg_rtx (mode);
16335 emit_move_insn (scratch, tmp1);
16336 insn = emit_move_insn (operands[1], scratch);
16338 set_unique_reg_note (insn, REG_EQUAL, mod);
16340 /* Zero extend quotient from AL. */
16341 tmp1 = gen_lowpart (QImode, tmp0);
16342 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16343 set_unique_reg_note (insn, REG_EQUAL, div);
16345 emit_label (end_label);
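/* Rough shape of the sequence emitted above (illustrative
   pseudo-assembly; operands[2] is the dividend, operands[3] the
   divisor):
       mov    dividend, scratch
       or     divisor,  scratch
       test   $-0x100,  scratch     ; do both values fit in 8 bits?
       je     .Lqimode
       (i)div divisor               ; full-width divide
       jmp    .Ldone
   .Lqimode:
       divb   divisor               ; AL = quotient, AH = remainder
       ; remainder extracted from AH, quotient zero-extended from AL
   .Ldone:                                                            */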
16348 #define LEA_MAX_STALL (3)
16349 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16351 /* Increase given DISTANCE in half-cycles according to
16352 dependencies between PREV and NEXT instructions.
16353 Add 1 half-cycle if there is no dependency and
16354 go to the next cycle if there is some dependency. */
16356 static unsigned int
16357 increase_distance (rtx prev, rtx next, unsigned int distance)
16359 df_ref *use_rec;
16360 df_ref *def_rec;
16362 if (!prev || !next)
16363 return distance + (distance & 1) + 2;
16365 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16366 return distance + 1;
16368 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16369 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16370 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16371 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16372 return distance + (distance & 1) + 2;
16374 return distance + 1;
16377 /* Function checks if instruction INSN defines register number
16378 REGNO1 or REGNO2. */
16380 static bool
16381 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16382 rtx insn)
16384 df_ref *def_rec;
16386 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16387 if (DF_REF_REG_DEF_P (*def_rec)
16388 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16389 && (regno1 == DF_REF_REGNO (*def_rec)
16390 || regno2 == DF_REF_REGNO (*def_rec)))
16392 return true;
16395 return false;
16398 /* Function checks if instruction INSN uses register number
16399 REGNO as a part of address expression. */
16401 static bool
16402 insn_uses_reg_mem (unsigned int regno, rtx insn)
16404 df_ref *use_rec;
16406 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16407 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16408 return true;
16410 return false;
16413 /* Search backward for non-agu definition of register number REGNO1
16414 or register number REGNO2 in basic block starting from instruction
16415 START up to head of basic block or instruction INSN.
16417 The function stores true in *FOUND if a definition was found
16418 and false otherwise.
16420 Distance in half-cycles between START and found instruction or head
16421 of BB is added to DISTANCE and returned. */
16423 static int
16424 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16425 rtx insn, int distance,
16426 rtx start, bool *found)
16428 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16429 rtx prev = start;
16430 rtx next = NULL;
16432 *found = false;
16434 while (prev
16435 && prev != insn
16436 && distance < LEA_SEARCH_THRESHOLD)
16438 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16440 distance = increase_distance (prev, next, distance);
16441 if (insn_defines_reg (regno1, regno2, prev))
16443 if (recog_memoized (prev) < 0
16444 || get_attr_type (prev) != TYPE_LEA)
16446 *found = true;
16447 return distance;
16451 next = prev;
16453 if (prev == BB_HEAD (bb))
16454 break;
16456 prev = PREV_INSN (prev);
16459 return distance;
16462 /* Search backward for non-agu definition of register number REGNO1
16463 or register number REGNO2 in INSN's basic block until
16464 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16465 2. Reach neighbour BBs boundary, or
16466 3. Reach agu definition.
16467 Returns the distance between the non-agu definition point and INSN.
16468 If no definition point, returns -1. */
16470 static int
16471 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16472 rtx insn)
16474 basic_block bb = BLOCK_FOR_INSN (insn);
16475 int distance = 0;
16476 bool found = false;
16478 if (insn != BB_HEAD (bb))
16479 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16480 distance, PREV_INSN (insn),
16481 &found);
16483 if (!found && distance < LEA_SEARCH_THRESHOLD)
16485 edge e;
16486 edge_iterator ei;
16487 bool simple_loop = false;
16489 FOR_EACH_EDGE (e, ei, bb->preds)
16490 if (e->src == bb)
16492 simple_loop = true;
16493 break;
16496 if (simple_loop)
16497 distance = distance_non_agu_define_in_bb (regno1, regno2,
16498 insn, distance,
16499 BB_END (bb), &found);
16500 else
16502 int shortest_dist = -1;
16503 bool found_in_bb = false;
16505 FOR_EACH_EDGE (e, ei, bb->preds)
16507 int bb_dist
16508 = distance_non_agu_define_in_bb (regno1, regno2,
16509 insn, distance,
16510 BB_END (e->src),
16511 &found_in_bb);
16512 if (found_in_bb)
16514 if (shortest_dist < 0)
16515 shortest_dist = bb_dist;
16516 else if (bb_dist > 0)
16517 shortest_dist = MIN (bb_dist, shortest_dist);
16519 found = true;
16523 distance = shortest_dist;
16527 /* get_attr_type may modify recog data. We want to make sure
16528 that recog data is valid for instruction INSN, on which
16529 distance_non_agu_define is called. INSN is unchanged here. */
16530 extract_insn_cached (insn);
16532 if (!found)
16533 return -1;
16535 return distance >> 1;
16538 /* Return the distance in half-cycles between INSN and the next
16539 insn that uses register number REGNO in a memory address, added
16540 to DISTANCE. Return -1 if REGNO is set.
16542 Put true value into *FOUND if register usage was found and
16543 false otherwise.
16544 Put true value into *REDEFINED if register redefinition was
16545 found and false otherwise. */
16547 static int
16548 distance_agu_use_in_bb (unsigned int regno,
16549 rtx insn, int distance, rtx start,
16550 bool *found, bool *redefined)
16552 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16553 rtx next = start;
16554 rtx prev = NULL;
16556 *found = false;
16557 *redefined = false;
16559 while (next
16560 && next != insn
16561 && distance < LEA_SEARCH_THRESHOLD)
16563 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16565 distance = increase_distance(prev, next, distance);
16566 if (insn_uses_reg_mem (regno, next))
16568 /* Return DISTANCE if OP0 is used in memory
16569 address in NEXT. */
16570 *found = true;
16571 return distance;
16574 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16576 /* Return -1 if OP0 is set in NEXT. */
16577 *redefined = true;
16578 return -1;
16581 prev = next;
16584 if (next == BB_END (bb))
16585 break;
16587 next = NEXT_INSN (next);
16590 return distance;
16593 /* Return the distance between INSN and the next insn that uses
16594 register number REGNO0 in a memory address. Return -1 if no such
16595 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16597 static int
16598 distance_agu_use (unsigned int regno0, rtx insn)
16600 basic_block bb = BLOCK_FOR_INSN (insn);
16601 int distance = 0;
16602 bool found = false;
16603 bool redefined = false;
16605 if (insn != BB_END (bb))
16606 distance = distance_agu_use_in_bb (regno0, insn, distance,
16607 NEXT_INSN (insn),
16608 &found, &redefined);
16610 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16612 edge e;
16613 edge_iterator ei;
16614 bool simple_loop = false;
16616 FOR_EACH_EDGE (e, ei, bb->succs)
16617 if (e->dest == bb)
16619 simple_loop = true;
16620 break;
16623 if (simple_loop)
16624 distance = distance_agu_use_in_bb (regno0, insn,
16625 distance, BB_HEAD (bb),
16626 &found, &redefined);
16627 else
16629 int shortest_dist = -1;
16630 bool found_in_bb = false;
16631 bool redefined_in_bb = false;
16633 FOR_EACH_EDGE (e, ei, bb->succs)
16635 int bb_dist
16636 = distance_agu_use_in_bb (regno0, insn,
16637 distance, BB_HEAD (e->dest),
16638 &found_in_bb, &redefined_in_bb);
16639 if (found_in_bb)
16641 if (shortest_dist < 0)
16642 shortest_dist = bb_dist;
16643 else if (bb_dist > 0)
16644 shortest_dist = MIN (bb_dist, shortest_dist);
16646 found = true;
16650 distance = shortest_dist;
16654 if (!found || redefined)
16655 return -1;
16657 return distance >> 1;
16660 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16661 there is a dilemma of choosing LEA or ADD.
16662 Negative value: ADD is preferred over LEA
16663 Zero: Neutral
16664 Positive value: LEA is preferred over ADD */
16665 #define IX86_LEA_PRIORITY 0
16667 /* Return true if usage of lea INSN has performance advantage
16668 over a sequence of instructions. The instruction sequence has
16669 SPLIT_COST cycles higher latency than the lea latency. */
16671 bool
16672 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16673 unsigned int regno2, unsigned int split_cost)
16675 int dist_define, dist_use;
16677 dist_define = distance_non_agu_define (regno1, regno2, insn);
16678 dist_use = distance_agu_use (regno0, insn);
16680 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16682 /* If there is no non-AGU operand definition, no AGU
16683 operand usage and the split cost is 0, then both the lea
16684 and non-lea variants have the same priority. Currently
16685 we prefer lea for 64-bit code and non-lea on 32-bit
16686 code. */
16687 if (dist_use < 0 && split_cost == 0)
16688 return TARGET_64BIT || IX86_LEA_PRIORITY;
16689 else
16690 return true;
16693 /* With a longer definition distance, lea is preferable.
16694 Here we change it to take into account splitting cost and
16695 lea priority. */
16696 dist_define += split_cost + IX86_LEA_PRIORITY;
16698 /* If there is no use in a memory address then we just check
16699 that the split cost does not exceed the AGU stall. */
16700 if (dist_use < 0)
16701 return dist_define >= LEA_MAX_STALL;
16703 /* If this insn has both backward non-agu dependence and forward
16704 agu dependence, the one with the shorter distance takes effect. */
16705 return dist_define >= dist_use;
16708 /* Return true if it is legal to clobber flags by INSN and
16709 false otherwise. */
16711 static bool
16712 ix86_ok_to_clobber_flags (rtx insn)
16714 basic_block bb = BLOCK_FOR_INSN (insn);
16715 df_ref *use;
16716 bitmap live;
16718 while (insn)
16720 if (NONDEBUG_INSN_P (insn))
16722 for (use = DF_INSN_USES (insn); *use; use++)
16723 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16724 return false;
16726 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16727 return true;
16730 if (insn == BB_END (bb))
16731 break;
16733 insn = NEXT_INSN (insn);
16736 live = df_get_live_out(bb);
16737 return !REGNO_REG_SET_P (live, FLAGS_REG);
16740 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16741 move and add to avoid AGU stalls. */
16743 bool
16744 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16746 unsigned int regno0 = true_regnum (operands[0]);
16747 unsigned int regno1 = true_regnum (operands[1]);
16748 unsigned int regno2 = true_regnum (operands[2]);
16750 /* Check if we need to optimize. */
16751 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16752 return false;
16754 /* Check it is correct to split here. */
16755 if (!ix86_ok_to_clobber_flags(insn))
16756 return false;
16758 /* We need to split only adds with a non-destructive
16759 destination operand. */
16760 if (regno0 == regno1 || regno0 == regno2)
16761 return false;
16762 else
16763 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16766 /* Return true if we should emit lea instruction instead of mov
16767 instruction. */
16769 bool
16770 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16772 unsigned int regno0;
16773 unsigned int regno1;
16775 /* Check if we need to optimize. */
16776 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16777 return false;
16779 /* Use lea for reg to reg moves only. */
16780 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16781 return false;
16783 regno0 = true_regnum (operands[0]);
16784 regno1 = true_regnum (operands[1]);
16786 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16789 /* Return true if we need to split lea into a sequence of
16790 instructions to avoid AGU stalls. */
16792 bool
16793 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16795 unsigned int regno0 = true_regnum (operands[0]) ;
16796 unsigned int regno1 = -1;
16797 unsigned int regno2 = -1;
16798 unsigned int split_cost = 0;
16799 struct ix86_address parts;
16800 int ok;
16802 /* Check we need to optimize. */
16803 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16804 return false;
16806 /* Check it is correct to split here. */
16807 if (!ix86_ok_to_clobber_flags(insn))
16808 return false;
16810 ok = ix86_decompose_address (operands[1], &parts);
16811 gcc_assert (ok);
16813 /* We should not split into add if a non-legitimate pic
16814 operand is used as the displacement. */
16815 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16816 return false;
16818 if (parts.base)
16819 regno1 = true_regnum (parts.base);
16820 if (parts.index)
16821 regno2 = true_regnum (parts.index);
16823 /* Compute how many cycles we will add to execution time
16824 if we split the lea into a sequence of instructions. */
16825 if (parts.base || parts.index)
16827 /* Have to use a mov instruction if the non-destructive
16828 destination form is used. */
16829 if (regno1 != regno0 && regno2 != regno0)
16830 split_cost += 1;
16832 /* Have to add index to base if both exist. */
16833 if (parts.base && parts.index)
16834 split_cost += 1;
16836 /* Have to use shift and adds if scale is 2 or greater. */
16837 if (parts.scale > 1)
16839 if (regno0 != regno1)
16840 split_cost += 1;
16841 else if (regno2 == regno0)
16842 split_cost += 4;
16843 else
16844 split_cost += parts.scale;
16847 /* Have to use add instruction with immediate if
16848 disp is non zero. */
16849 if (parts.disp && parts.disp != const0_rtx)
16850 split_cost += 1;
16852 /* Subtract the price of lea. */
16853 split_cost -= 1;
16856 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16859 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16860 matches destination. RTX includes clobber of FLAGS_REG. */
16862 static void
16863 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16864 rtx dst, rtx src)
16866 rtx op, clob;
16868 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16869 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16871 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16874 /* Split lea instructions into a sequence of instructions
16875 which are executed on ALU to avoid AGU stalls.
16876 It is assumed that it is allowed to clobber the flags register
16877 at lea position. */
16879 extern void
16880 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16882 unsigned int regno0 = true_regnum (operands[0]) ;
16883 unsigned int regno1 = INVALID_REGNUM;
16884 unsigned int regno2 = INVALID_REGNUM;
16885 struct ix86_address parts;
16886 rtx tmp;
16887 int ok, adds;
16889 ok = ix86_decompose_address (operands[1], &parts);
16890 gcc_assert (ok);
16892 if (parts.base)
16894 if (GET_MODE (parts.base) != mode)
16895 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16896 regno1 = true_regnum (parts.base);
16899 if (parts.index)
16901 if (GET_MODE (parts.index) != mode)
16902 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16903 regno2 = true_regnum (parts.index);
16906 if (parts.scale > 1)
16908 /* Case r1 = r1 + ... */
16909 if (regno1 == regno0)
16911 /* If we have a case r1 = r1 + C * r1 then we
16912 should use multiplication which is very
16913 expensive. Assume the cost model is wrong if we
16914 have such a case here. */
16915 gcc_assert (regno2 != regno0);
16917 for (adds = parts.scale; adds > 0; adds--)
16918 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16920 else
16922 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16923 if (regno0 != regno2)
16924 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16926 /* Use shift for scaling. */
16927 ix86_emit_binop (ASHIFT, mode, operands[0],
16928 GEN_INT (exact_log2 (parts.scale)));
16930 if (parts.base)
16931 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16933 if (parts.disp && parts.disp != const0_rtx)
16934 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16937 else if (!parts.base && !parts.index)
16939 gcc_assert(parts.disp);
16940 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16942 else
16944 if (!parts.base)
16946 if (regno0 != regno2)
16947 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16949 else if (!parts.index)
16951 if (regno0 != regno1)
16952 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16954 else
16956 if (regno0 == regno1)
16957 tmp = parts.index;
16958 else if (regno0 == regno2)
16959 tmp = parts.base;
16960 else
16962 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16963 tmp = parts.index;
16966 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16969 if (parts.disp && parts.disp != const0_rtx)
16970 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
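/* Example of the split performed above (illustrative, assuming the
   destination, base and index are distinct registers):
       lea  disp(base, index, 4), dst
   becomes
       mov  index, dst
       shl  $2, dst
       add  base, dst
       add  $disp, dst                                                 */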
16974 /* Return true if it is ok to optimize an ADD operation to LEA
16975 operation to avoid flag register consumption. For most processors,
16976 ADD is faster than LEA. For processors like ATOM, if the
16977 destination register of LEA holds an actual address which will be
16978 used soon, LEA is better; otherwise ADD is better. */
16980 bool
16981 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16983 unsigned int regno0 = true_regnum (operands[0]);
16984 unsigned int regno1 = true_regnum (operands[1]);
16985 unsigned int regno2 = true_regnum (operands[2]);
16987 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16988 if (regno0 != regno1 && regno0 != regno2)
16989 return true;
16991 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16992 return false;
16994 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16997 /* Return true if destination reg of SET_BODY is shift count of
16998 USE_BODY. */
17000 static bool
17001 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17003 rtx set_dest;
17004 rtx shift_rtx;
17005 int i;
17007 /* Retrieve destination of SET_BODY. */
17008 switch (GET_CODE (set_body))
17010 case SET:
17011 set_dest = SET_DEST (set_body);
17012 if (!set_dest || !REG_P (set_dest))
17013 return false;
17014 break;
17015 case PARALLEL:
17016 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17017 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17018 use_body))
17019 return true;
17020 default:
17021 return false;
17022 break;
17025 /* Retrieve shift count of USE_BODY. */
17026 switch (GET_CODE (use_body))
17028 case SET:
17029 shift_rtx = XEXP (use_body, 1);
17030 break;
17031 case PARALLEL:
17032 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17033 if (ix86_dep_by_shift_count_body (set_body,
17034 XVECEXP (use_body, 0, i)))
17035 return true;
17036 default:
17037 return false;
17038 break;
17041 if (shift_rtx
17042 && (GET_CODE (shift_rtx) == ASHIFT
17043 || GET_CODE (shift_rtx) == LSHIFTRT
17044 || GET_CODE (shift_rtx) == ASHIFTRT
17045 || GET_CODE (shift_rtx) == ROTATE
17046 || GET_CODE (shift_rtx) == ROTATERT))
17048 rtx shift_count = XEXP (shift_rtx, 1);
17050 /* Return true if shift count is dest of SET_BODY. */
17051 if (REG_P (shift_count)
17052 && true_regnum (set_dest) == true_regnum (shift_count))
17053 return true;
17056 return false;
17059 /* Return true if destination reg of SET_INSN is shift count of
17060 USE_INSN. */
17062 bool
17063 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17065 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17066 PATTERN (use_insn));
17069 /* Return TRUE or FALSE depending on whether the unary operator meets the
17070 appropriate constraints. */
17072 bool
17073 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17074 enum machine_mode mode ATTRIBUTE_UNUSED,
17075 rtx operands[2] ATTRIBUTE_UNUSED)
17077 /* If one of the operands is memory, source and destination must match. */
17078 if ((MEM_P (operands[0])
17079 || MEM_P (operands[1]))
17080 && ! rtx_equal_p (operands[0], operands[1]))
17081 return false;
17082 return true;
17085 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17086 are ok, keeping in mind the possible movddup alternative. */
17088 bool
17089 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17091 if (MEM_P (operands[0]))
17092 return rtx_equal_p (operands[0], operands[1 + high]);
17093 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17094 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17095 return true;
17098 /* Post-reload splitter for converting an SF or DFmode value in an
17099 SSE register into an unsigned SImode. */
17101 void
17102 ix86_split_convert_uns_si_sse (rtx operands[])
17104 enum machine_mode vecmode;
17105 rtx value, large, zero_or_two31, input, two31, x;
17107 large = operands[1];
17108 zero_or_two31 = operands[2];
17109 input = operands[3];
17110 two31 = operands[4];
17111 vecmode = GET_MODE (large);
17112 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17114 /* Load up the value into the low element. We must ensure that the other
17115 elements are valid floats -- zero is the easiest such value. */
17116 if (MEM_P (input))
17118 if (vecmode == V4SFmode)
17119 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17120 else
17121 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17123 else
17125 input = gen_rtx_REG (vecmode, REGNO (input));
17126 emit_move_insn (value, CONST0_RTX (vecmode));
17127 if (vecmode == V4SFmode)
17128 emit_insn (gen_sse_movss (value, value, input));
17129 else
17130 emit_insn (gen_sse2_movsd (value, value, input));
17133 emit_move_insn (large, two31);
17134 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17136 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17137 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17139 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17140 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17142 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17143 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17145 large = gen_rtx_REG (V4SImode, REGNO (large));
17146 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17148 x = gen_rtx_REG (V4SImode, REGNO (value));
17149 if (vecmode == V4SFmode)
17150 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17151 else
17152 emit_insn (gen_sse2_cvttpd2dq (x, value));
17153 value = x;
17155 emit_insn (gen_xorv4si3 (value, value, large));
17158 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17159 Expects the 64-bit DImode to be supplied in a pair of integral
17160 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17161 -mfpmath=sse, !optimize_size only. */
17163 void
17164 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17166 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17167 rtx int_xmm, fp_xmm;
17168 rtx biases, exponents;
17169 rtx x;
17171 int_xmm = gen_reg_rtx (V4SImode);
17172 if (TARGET_INTER_UNIT_MOVES)
17173 emit_insn (gen_movdi_to_sse (int_xmm, input));
17174 else if (TARGET_SSE_SPLIT_REGS)
17176 emit_clobber (int_xmm);
17177 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17179 else
17181 x = gen_reg_rtx (V2DImode);
17182 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17183 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17186 x = gen_rtx_CONST_VECTOR (V4SImode,
17187 gen_rtvec (4, GEN_INT (0x43300000UL),
17188 GEN_INT (0x45300000UL),
17189 const0_rtx, const0_rtx));
17190 exponents = validize_mem (force_const_mem (V4SImode, x));
17192 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17193 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17195 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17196 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17197 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17198 (0x1.0p84 + double(fp_value_hi_xmm)).
17199 Note these exponents differ by 32. */
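  /* Worked example: for input hi:lo = 3:5 the two lanes now hold the doubles
     0x1.0p52 + 5 and 0x1.0p84 + 3 * 0x1.0p32.  Subtracting the 0x1.0p52 and
     0x1.0p84 biases below and summing the halves yields 3 * 2^32 + 5, the
     original unsigned 64-bit value; the final addition is the only place
     where rounding to DFmode can occur.  */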
17201 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17203 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17204 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17205 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17206 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17207 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17208 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17209 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17210 biases = validize_mem (force_const_mem (V2DFmode, biases));
17211 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17213 /* Add the upper and lower DFmode values together. */
17214 if (TARGET_SSE3)
17215 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17216 else
17218 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17219 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17220 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17223 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17226 /* Not used, but eases macroization of patterns. */
17227 void
17228 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17229 rtx input ATTRIBUTE_UNUSED)
17231 gcc_unreachable ();
17234 /* Convert an unsigned SImode value into a DFmode. Only currently used
17235 for SSE, but applicable anywhere. */
17237 void
17238 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17240 REAL_VALUE_TYPE TWO31r;
17241 rtx x, fp;
17243 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17244 NULL, 1, OPTAB_DIRECT);
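  /* X now holds INPUT - 2^31 interpreted as a signed SImode value, so the
     signed int->double conversion below is exact, and adding 0x1.0p31 back
     restores the unsigned value: e.g. INPUT = 0xffffffff becomes
     0x7fffffff = 2147483647, converts to 2147483647.0, and the final
     addition gives 4294967295.0.  */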
17246 fp = gen_reg_rtx (DFmode);
17247 emit_insn (gen_floatsidf2 (fp, x));
17249 real_ldexp (&TWO31r, &dconst1, 31);
17250 x = const_double_from_real_value (TWO31r, DFmode);
17252 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17253 if (x != target)
17254 emit_move_insn (target, x);
17257 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17258 32-bit mode; otherwise we have a direct convert instruction. */
17260 void
17261 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17263 REAL_VALUE_TYPE TWO32r;
17264 rtx fp_lo, fp_hi, x;
17266 fp_lo = gen_reg_rtx (DFmode);
17267 fp_hi = gen_reg_rtx (DFmode);
17269 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17271 real_ldexp (&TWO32r, &dconst1, 32);
17272 x = const_double_from_real_value (TWO32r, DFmode);
17273 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17275 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
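  /* The result is (double) hi * 0x1.0p32 + (double) (unsigned) lo.
     E.g. for input -1 (hi = -1, lo = 0xffffffff) this is
     -1.0 * 2^32 + 4294967295.0 = -1.0.  */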
17277 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17278 0, OPTAB_DIRECT);
17279 if (x != target)
17280 emit_move_insn (target, x);
17283 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17284 For x86_32, -mfpmath=sse, !optimize_size only. */
17285 void
17286 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17288 REAL_VALUE_TYPE ONE16r;
17289 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17291 real_ldexp (&ONE16r, &dconst1, 16);
17292 x = const_double_from_real_value (ONE16r, SFmode);
17293 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17294 NULL, 0, OPTAB_DIRECT);
17295 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17296 NULL, 0, OPTAB_DIRECT);
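  /* The value is rebuilt as (float) (INPUT >> 16) * 2^16 + (float) (INPUT & 0xffff);
     e.g. 0x12345678 gives 4660.0 * 65536.0 + 22136.0 = 305419896.0.  Both 16-bit
     halves are exactly representable in SFmode, so the only rounding happens in
     the final addition.  */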
17297 fp_hi = gen_reg_rtx (SFmode);
17298 fp_lo = gen_reg_rtx (SFmode);
17299 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17300 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17301 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17302 0, OPTAB_DIRECT);
17303 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17304 0, OPTAB_DIRECT);
17305 if (!rtx_equal_p (target, fp_hi))
17306 emit_move_insn (target, fp_hi);
17309 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17310 a vector of unsigned ints VAL to vector of floats TARGET. */
17312 void
17313 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17315 rtx tmp[8];
17316 REAL_VALUE_TYPE TWO16r;
17317 enum machine_mode intmode = GET_MODE (val);
17318 enum machine_mode fltmode = GET_MODE (target);
17319 rtx (*cvt) (rtx, rtx);
17321 if (intmode == V4SImode)
17322 cvt = gen_floatv4siv4sf2;
17323 else
17324 cvt = gen_floatv8siv8sf2;
17325 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17326 tmp[0] = force_reg (intmode, tmp[0]);
17327 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17328 OPTAB_DIRECT);
17329 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17330 NULL_RTX, 1, OPTAB_DIRECT);
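  /* Same per-element trick as ix86_expand_convert_uns_sisf_sse: each lane is
     split into its low and high 16-bit halves, both of which convert to
     SFmode exactly, and recombined below as hi * 2^16 + lo.  */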
17331 tmp[3] = gen_reg_rtx (fltmode);
17332 emit_insn (cvt (tmp[3], tmp[1]));
17333 tmp[4] = gen_reg_rtx (fltmode);
17334 emit_insn (cvt (tmp[4], tmp[2]));
17335 real_ldexp (&TWO16r, &dconst1, 16);
17336 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17337 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17338 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17339 OPTAB_DIRECT);
17340 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17341 OPTAB_DIRECT);
17342 if (tmp[7] != target)
17343 emit_move_insn (target, tmp[7]);
17346 /* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp. fix_trunc*
17347 patterns can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17348 This is done by using just the signed conversion if VAL < 0x1p31, and otherwise
17349 by subtracting 0x1p31 first and xoring in the 0x80000000 stored in *XORP afterwards. */
17351 rtx
17352 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17354 REAL_VALUE_TYPE TWO31r;
17355 rtx two31r, tmp[4];
17356 enum machine_mode mode = GET_MODE (val);
17357 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17358 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17359 rtx (*cmp) (rtx, rtx, rtx, rtx);
17360 int i;
17362 for (i = 0; i < 3; i++)
17363 tmp[i] = gen_reg_rtx (mode);
17364 real_ldexp (&TWO31r, &dconst1, 31);
17365 two31r = const_double_from_real_value (TWO31r, scalarmode);
17366 two31r = ix86_build_const_vector (mode, 1, two31r);
17367 two31r = force_reg (mode, two31r);
17368 switch (mode)
17370 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17371 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17372 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17373 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17374 default: gcc_unreachable ();
17376 tmp[3] = gen_rtx_LE (mode, two31r, val);
17377 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17378 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17379 0, OPTAB_DIRECT);
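  /* tmp[0] is an all-ones mask for the elements with VAL >= 0x1p31, so tmp[1]
     is 0x1p31 for those elements and 0.0 otherwise.  Subtracting tmp[1] puts
     every element into signed SImode range, and the caller xors *XORP
     (0x80000000 for the adjusted elements) into the truncated result:
     e.g. 3e9 -> 3e9 - 2^31 = 852516352.0 -> 852516352 ^ 0x80000000 = 3000000000.  */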
17380 if (intmode == V4SImode || TARGET_AVX2)
17381 *xorp = expand_simple_binop (intmode, ASHIFT,
17382 gen_lowpart (intmode, tmp[0]),
17383 GEN_INT (31), NULL_RTX, 0,
17384 OPTAB_DIRECT);
17385 else
17387 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17388 two31 = ix86_build_const_vector (intmode, 1, two31);
17389 *xorp = expand_simple_binop (intmode, AND,
17390 gen_lowpart (intmode, tmp[0]),
17391 two31, NULL_RTX, 0,
17392 OPTAB_DIRECT);
17394 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17395 0, OPTAB_DIRECT);
17398 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17399 then replicate the value for all elements of the vector
17400 register. */
17402 static rtx
17403 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17405 int i, n_elt;
17406 rtvec v;
17407 enum machine_mode scalar_mode;
17409 switch (mode)
17411 case V32QImode:
17412 case V16QImode:
17413 case V16HImode:
17414 case V8HImode:
17415 case V8SImode:
17416 case V4SImode:
17417 case V4DImode:
17418 case V2DImode:
17419 gcc_assert (vect);
17420 case V8SFmode:
17421 case V4SFmode:
17422 case V4DFmode:
17423 case V2DFmode:
17424 n_elt = GET_MODE_NUNITS (mode);
17425 v = rtvec_alloc (n_elt);
17426 scalar_mode = GET_MODE_INNER (mode);
17428 RTVEC_ELT (v, 0) = value;
17430 for (i = 1; i < n_elt; ++i)
17431 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17433 return gen_rtx_CONST_VECTOR (mode, v);
17435 default:
17436 gcc_unreachable ();
17440 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17441 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17442 for an SSE register. If VECT is true, then replicate the mask for
17443 all elements of the vector register. If INVERT is true, then create
17444 a mask excluding the sign bit. */
17446 static rtx
17447 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17449 enum machine_mode vec_mode, imode;
17450 HOST_WIDE_INT hi, lo;
17451 int shift = 63;
17452 rtx v;
17453 rtx mask;
17455 /* Find the sign bit, sign extended to 2*HWI. */
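  /* E.g. for V4SFmode this builds the vector constant with 0x80000000
     (i.e. -0.0f) in the selected elements, or 0x7fffffff when INVERT is
     true; VECT chooses between all elements and just the low one.  */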
17456 switch (mode)
17458 case V8SImode:
17459 case V4SImode:
17460 case V8SFmode:
17461 case V4SFmode:
17462 vec_mode = mode;
17463 mode = GET_MODE_INNER (mode);
17464 imode = SImode;
17465 lo = 0x80000000, hi = lo < 0;
17466 break;
17468 case V4DImode:
17469 case V2DImode:
17470 case V4DFmode:
17471 case V2DFmode:
17472 vec_mode = mode;
17473 mode = GET_MODE_INNER (mode);
17474 imode = DImode;
17475 if (HOST_BITS_PER_WIDE_INT >= 64)
17476 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17477 else
17478 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17479 break;
17481 case TImode:
17482 case TFmode:
17483 vec_mode = VOIDmode;
17484 if (HOST_BITS_PER_WIDE_INT >= 64)
17486 imode = TImode;
17487 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17489 else
17491 rtvec vec;
17493 imode = DImode;
17494 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17496 if (invert)
17498 lo = ~lo, hi = ~hi;
17499 v = constm1_rtx;
17501 else
17502 v = const0_rtx;
17504 mask = immed_double_const (lo, hi, imode);
17506 vec = gen_rtvec (2, v, mask);
17507 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17508 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17510 return v;
17512 break;
17514 default:
17515 gcc_unreachable ();
17518 if (invert)
17519 lo = ~lo, hi = ~hi;
17521 /* Force this value into the low part of a fp vector constant. */
17522 mask = immed_double_const (lo, hi, imode);
17523 mask = gen_lowpart (mode, mask);
17525 if (vec_mode == VOIDmode)
17526 return force_reg (mode, mask);
17528 v = ix86_build_const_vector (vec_mode, vect, mask);
17529 return force_reg (vec_mode, v);
17532 /* Generate code for floating point ABS or NEG. */
17534 void
17535 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17536 rtx operands[])
17538 rtx mask, set, dst, src;
17539 bool use_sse = false;
17540 bool vector_mode = VECTOR_MODE_P (mode);
17541 enum machine_mode vmode = mode;
17543 if (vector_mode)
17544 use_sse = true;
17545 else if (mode == TFmode)
17546 use_sse = true;
17547 else if (TARGET_SSE_MATH)
17549 use_sse = SSE_FLOAT_MODE_P (mode);
17550 if (mode == SFmode)
17551 vmode = V4SFmode;
17552 else if (mode == DFmode)
17553 vmode = V2DFmode;
17556 /* NEG and ABS performed with SSE use bitwise mask operations.
17557 Create the appropriate mask now. */
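  /* ABS clears the sign bit (x & ~SIGNMASK, hence the inverted mask below),
     while NEG flips it (x ^ SIGNMASK).  E.g. for DFmode, ABS ands with
     0x7fffffffffffffff and NEG xors with 0x8000000000000000.  */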
17558 if (use_sse)
17559 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17560 else
17561 mask = NULL_RTX;
17563 dst = operands[0];
17564 src = operands[1];
17566 set = gen_rtx_fmt_e (code, mode, src);
17567 set = gen_rtx_SET (VOIDmode, dst, set);
17569 if (mask)
17571 rtx use, clob;
17572 rtvec par;
17574 use = gen_rtx_USE (VOIDmode, mask);
17575 if (vector_mode)
17576 par = gen_rtvec (2, set, use);
17577 else
17579 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17580 par = gen_rtvec (3, set, use, clob);
17582 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17584 else
17585 emit_insn (set);
17588 /* Expand a copysign operation. Special case operand 0 being a constant. */
17590 void
17591 ix86_expand_copysign (rtx operands[])
17593 enum machine_mode mode, vmode;
17594 rtx dest, op0, op1, mask, nmask;
17596 dest = operands[0];
17597 op0 = operands[1];
17598 op1 = operands[2];
17600 mode = GET_MODE (dest);
17602 if (mode == SFmode)
17603 vmode = V4SFmode;
17604 else if (mode == DFmode)
17605 vmode = V2DFmode;
17606 else
17607 vmode = mode;
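  /* At the bit level copysign (op0, op1) is (op0 & ~SIGNMASK) | (op1 & SIGNMASK);
     e.g. copysign (-2.0, 3.0) keeps the magnitude bits of -2.0 and the clear
     sign bit of 3.0, giving 2.0.  */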
17609 if (GET_CODE (op0) == CONST_DOUBLE)
17611 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17613 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17614 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17616 if (mode == SFmode || mode == DFmode)
17618 if (op0 == CONST0_RTX (mode))
17619 op0 = CONST0_RTX (vmode);
17620 else
17622 rtx v = ix86_build_const_vector (vmode, false, op0);
17624 op0 = force_reg (vmode, v);
17627 else if (op0 != CONST0_RTX (mode))
17628 op0 = force_reg (mode, op0);
17630 mask = ix86_build_signbit_mask (vmode, 0, 0);
17632 if (mode == SFmode)
17633 copysign_insn = gen_copysignsf3_const;
17634 else if (mode == DFmode)
17635 copysign_insn = gen_copysigndf3_const;
17636 else
17637 copysign_insn = gen_copysigntf3_const;
17639 emit_insn (copysign_insn (dest, op0, op1, mask));
17641 else
17643 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17645 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17646 mask = ix86_build_signbit_mask (vmode, 0, 0);
17648 if (mode == SFmode)
17649 copysign_insn = gen_copysignsf3_var;
17650 else if (mode == DFmode)
17651 copysign_insn = gen_copysigndf3_var;
17652 else
17653 copysign_insn = gen_copysigntf3_var;
17655 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17659 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17660 be a constant, and so has already been expanded into a vector constant. */
17662 void
17663 ix86_split_copysign_const (rtx operands[])
17665 enum machine_mode mode, vmode;
17666 rtx dest, op0, mask, x;
17668 dest = operands[0];
17669 op0 = operands[1];
17670 mask = operands[3];
17672 mode = GET_MODE (dest);
17673 vmode = GET_MODE (mask);
17675 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17676 x = gen_rtx_AND (vmode, dest, mask);
17677 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17679 if (op0 != CONST0_RTX (vmode))
17681 x = gen_rtx_IOR (vmode, dest, op0);
17682 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17686 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17687 so we have to do two masks. */
17689 void
17690 ix86_split_copysign_var (rtx operands[])
17692 enum machine_mode mode, vmode;
17693 rtx dest, scratch, op0, op1, mask, nmask, x;
17695 dest = operands[0];
17696 scratch = operands[1];
17697 op0 = operands[2];
17698 op1 = operands[3];
17699 nmask = operands[4];
17700 mask = operands[5];
17702 mode = GET_MODE (dest);
17703 vmode = GET_MODE (mask);
17705 if (rtx_equal_p (op0, op1))
17707 /* Shouldn't happen often (it's useless, obviously), but when it does
17708 we'd generate incorrect code if we continue below. */
17709 emit_move_insn (dest, op0);
17710 return;
17713 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17715 gcc_assert (REGNO (op1) == REGNO (scratch));
17717 x = gen_rtx_AND (vmode, scratch, mask);
17718 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17720 dest = mask;
17721 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17722 x = gen_rtx_NOT (vmode, dest);
17723 x = gen_rtx_AND (vmode, x, op0);
17724 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17726 else
17728 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17730 x = gen_rtx_AND (vmode, scratch, mask);
17732 else /* alternative 2,4 */
17734 gcc_assert (REGNO (mask) == REGNO (scratch));
17735 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17736 x = gen_rtx_AND (vmode, scratch, op1);
17738 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17740 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17742 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17743 x = gen_rtx_AND (vmode, dest, nmask);
17745 else /* alternative 3,4 */
17747 gcc_assert (REGNO (nmask) == REGNO (dest));
17748 dest = nmask;
17749 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17750 x = gen_rtx_AND (vmode, dest, op0);
17752 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17755 x = gen_rtx_IOR (vmode, dest, scratch);
17756 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17759 /* Return TRUE or FALSE depending on whether the first SET in INSN
17760 has source and destination with matching CC modes, and that the
17761 CC mode is at least as constrained as REQ_MODE. */
17763 bool
17764 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17766 rtx set;
17767 enum machine_mode set_mode;
17769 set = PATTERN (insn);
17770 if (GET_CODE (set) == PARALLEL)
17771 set = XVECEXP (set, 0, 0);
17772 gcc_assert (GET_CODE (set) == SET);
17773 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17775 set_mode = GET_MODE (SET_DEST (set));
17776 switch (set_mode)
17778 case CCNOmode:
17779 if (req_mode != CCNOmode
17780 && (req_mode != CCmode
17781 || XEXP (SET_SRC (set), 1) != const0_rtx))
17782 return false;
17783 break;
17784 case CCmode:
17785 if (req_mode == CCGCmode)
17786 return false;
17787 /* FALLTHRU */
17788 case CCGCmode:
17789 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17790 return false;
17791 /* FALLTHRU */
17792 case CCGOCmode:
17793 if (req_mode == CCZmode)
17794 return false;
17795 /* FALLTHRU */
17796 case CCZmode:
17797 break;
17799 case CCAmode:
17800 case CCCmode:
17801 case CCOmode:
17802 case CCSmode:
17803 if (set_mode != req_mode)
17804 return false;
17805 break;
17807 default:
17808 gcc_unreachable ();
17811 return GET_MODE (SET_SRC (set)) == set_mode;
17814 /* Generate insn patterns to do an integer compare of OPERANDS. */
17816 static rtx
17817 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17819 enum machine_mode cmpmode;
17820 rtx tmp, flags;
17822 cmpmode = SELECT_CC_MODE (code, op0, op1);
17823 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17825 /* This is very simple, but making the interface the same as in the
17826 FP case makes the rest of the code easier. */
17827 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17828 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17830 /* Return the test that should be put into the flags user, i.e.
17831 the bcc, scc, or cmov instruction. */
17832 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17835 /* Figure out whether to use ordered or unordered fp comparisons.
17836 Return the appropriate mode to use. */
17838 enum machine_mode
17839 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17841 /* ??? In order to make all comparisons reversible, we do all comparisons
17842 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17843 between trapping and nontrapping forms of comparisons, we can make inequality
17844 comparisons trapping again, since that results in better code when using
17845 FCOM based compares. */
17846 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17849 enum machine_mode
17850 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17852 enum machine_mode mode = GET_MODE (op0);
17854 if (SCALAR_FLOAT_MODE_P (mode))
17856 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17857 return ix86_fp_compare_mode (code);
17860 switch (code)
17862 /* Only zero flag is needed. */
17863 case EQ: /* ZF=0 */
17864 case NE: /* ZF!=0 */
17865 return CCZmode;
17866 /* Codes needing carry flag. */
17867 case GEU: /* CF=0 */
17868 case LTU: /* CF=1 */
17869 /* Detect overflow checks. They need just the carry flag. */
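      /* For instance, the classic unsigned overflow idiom (a + b) < a is an
	 LTU/GEU comparison of a PLUS against one of its own operands; only
	 the carry flag of the addition matters for it.  */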
17870 if (GET_CODE (op0) == PLUS
17871 && rtx_equal_p (op1, XEXP (op0, 0)))
17872 return CCCmode;
17873 else
17874 return CCmode;
17875 case GTU: /* CF=0 & ZF=0 */
17876 case LEU: /* CF=1 | ZF=1 */
17877 /* Detect overflow checks. They need just the carry flag. */
17878 if (GET_CODE (op0) == MINUS
17879 && rtx_equal_p (op1, XEXP (op0, 0)))
17880 return CCCmode;
17881 else
17882 return CCmode;
17883 /* Codes possibly doable only with sign flag when
17884 comparing against zero. */
17885 case GE: /* SF=OF or SF=0 */
17886 case LT: /* SF<>OF or SF=1 */
17887 if (op1 == const0_rtx)
17888 return CCGOCmode;
17889 else
17890 /* For other cases Carry flag is not required. */
17891 return CCGCmode;
17892 /* Codes doable only with the sign flag when comparing
17893 against zero, but we lack a jump instruction for that,
17894 so we need to use relational tests against overflow,
17895 which thus needs to be zero. */
17896 case GT: /* ZF=0 & SF=OF */
17897 case LE: /* ZF=1 | SF<>OF */
17898 if (op1 == const0_rtx)
17899 return CCNOmode;
17900 else
17901 return CCGCmode;
17902 /* The strcmp pattern does (use flags), and combine may ask us for the
17903 proper mode. */
17904 case USE:
17905 return CCmode;
17906 default:
17907 gcc_unreachable ();
17911 /* Return the fixed registers used for condition codes. */
17913 static bool
17914 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17916 *p1 = FLAGS_REG;
17917 *p2 = FPSR_REG;
17918 return true;
17921 /* If two condition code modes are compatible, return a condition code
17922 mode which is compatible with both. Otherwise, return
17923 VOIDmode. */
17925 static enum machine_mode
17926 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17928 if (m1 == m2)
17929 return m1;
17931 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17932 return VOIDmode;
17934 if ((m1 == CCGCmode && m2 == CCGOCmode)
17935 || (m1 == CCGOCmode && m2 == CCGCmode))
17936 return CCGCmode;
17938 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17939 return m2;
17940 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17941 return m1;
17943 switch (m1)
17945 default:
17946 gcc_unreachable ();
17948 case CCmode:
17949 case CCGCmode:
17950 case CCGOCmode:
17951 case CCNOmode:
17952 case CCAmode:
17953 case CCCmode:
17954 case CCOmode:
17955 case CCSmode:
17956 case CCZmode:
17957 switch (m2)
17959 default:
17960 return VOIDmode;
17962 case CCmode:
17963 case CCGCmode:
17964 case CCGOCmode:
17965 case CCNOmode:
17966 case CCAmode:
17967 case CCCmode:
17968 case CCOmode:
17969 case CCSmode:
17970 case CCZmode:
17971 return CCmode;
17974 case CCFPmode:
17975 case CCFPUmode:
17976 /* These are only compatible with themselves, which we already
17977 checked above. */
17978 return VOIDmode;
17983 /* Return a comparison we can do that is equivalent to
17984 swap_condition (code), apart possibly from orderedness.
17985 But never change orderedness if TARGET_IEEE_FP, returning
17986 UNKNOWN in that case if necessary. */
17988 static enum rtx_code
17989 ix86_fp_swap_condition (enum rtx_code code)
17991 switch (code)
17993 case GT: /* GTU - CF=0 & ZF=0 */
17994 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17995 case GE: /* GEU - CF=0 */
17996 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17997 case UNLT: /* LTU - CF=1 */
17998 return TARGET_IEEE_FP ? UNKNOWN : GT;
17999 case UNLE: /* LEU - CF=1 | ZF=1 */
18000 return TARGET_IEEE_FP ? UNKNOWN : GE;
18001 default:
18002 return swap_condition (code);
18006 /* Return the cost of comparison CODE using the best strategy for performance.
18007 All of the following functions use the number of instructions as the cost metric.
18008 In the future this should be tweaked to compute bytes for optimize_size and
18009 to take into account the performance of various instructions on various CPUs. */
18011 static int
18012 ix86_fp_comparison_cost (enum rtx_code code)
18014 int arith_cost;
18016 /* The cost of code using bit-twiddling on %ah. */
18017 switch (code)
18019 case UNLE:
18020 case UNLT:
18021 case LTGT:
18022 case GT:
18023 case GE:
18024 case UNORDERED:
18025 case ORDERED:
18026 case UNEQ:
18027 arith_cost = 4;
18028 break;
18029 case LT:
18030 case NE:
18031 case EQ:
18032 case UNGE:
18033 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18034 break;
18035 case LE:
18036 case UNGT:
18037 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18038 break;
18039 default:
18040 gcc_unreachable ();
18043 switch (ix86_fp_comparison_strategy (code))
18045 case IX86_FPCMP_COMI:
18046 return arith_cost > 4 ? 3 : 2;
18047 case IX86_FPCMP_SAHF:
18048 return arith_cost > 4 ? 4 : 3;
18049 default:
18050 return arith_cost;
18054 /* Return the strategy to use for floating-point comparisons. We assume that
18055 fcomi is always preferable where available, since that is also true when looking
18056 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18058 enum ix86_fpcmp_strategy
18059 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18061 /* Do fcomi/sahf based test when profitable. */
18063 if (TARGET_CMOVE)
18064 return IX86_FPCMP_COMI;
18066 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18067 return IX86_FPCMP_SAHF;
18069 return IX86_FPCMP_ARITH;
18072 /* Swap, force into registers, or otherwise massage the two operands
18073 to a fp comparison. The operands are updated in place; the new
18074 comparison code is returned. */
18076 static enum rtx_code
18077 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18079 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18080 rtx op0 = *pop0, op1 = *pop1;
18081 enum machine_mode op_mode = GET_MODE (op0);
18082 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18084 /* All of the unordered compare instructions only work on registers.
18085 The same is true of the fcomi compare instructions. The XFmode
18086 compare instructions require registers except when comparing
18087 against zero or when converting operand 1 from fixed point to
18088 floating point. */
18090 if (!is_sse
18091 && (fpcmp_mode == CCFPUmode
18092 || (op_mode == XFmode
18093 && ! (standard_80387_constant_p (op0) == 1
18094 || standard_80387_constant_p (op1) == 1)
18095 && GET_CODE (op1) != FLOAT)
18096 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18098 op0 = force_reg (op_mode, op0);
18099 op1 = force_reg (op_mode, op1);
18101 else
18103 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18104 things around if they appear profitable, otherwise force op0
18105 into a register. */
18107 if (standard_80387_constant_p (op0) == 0
18108 || (MEM_P (op0)
18109 && ! (standard_80387_constant_p (op1) == 0
18110 || MEM_P (op1))))
18112 enum rtx_code new_code = ix86_fp_swap_condition (code);
18113 if (new_code != UNKNOWN)
18115 rtx tmp;
18116 tmp = op0, op0 = op1, op1 = tmp;
18117 code = new_code;
18121 if (!REG_P (op0))
18122 op0 = force_reg (op_mode, op0);
18124 if (CONSTANT_P (op1))
18126 int tmp = standard_80387_constant_p (op1);
18127 if (tmp == 0)
18128 op1 = validize_mem (force_const_mem (op_mode, op1));
18129 else if (tmp == 1)
18131 if (TARGET_CMOVE)
18132 op1 = force_reg (op_mode, op1);
18134 else
18135 op1 = force_reg (op_mode, op1);
18139 /* Try to rearrange the comparison to make it cheaper. */
18140 if (ix86_fp_comparison_cost (code)
18141 > ix86_fp_comparison_cost (swap_condition (code))
18142 && (REG_P (op1) || can_create_pseudo_p ()))
18144 rtx tmp;
18145 tmp = op0, op0 = op1, op1 = tmp;
18146 code = swap_condition (code);
18147 if (!REG_P (op0))
18148 op0 = force_reg (op_mode, op0);
18151 *pop0 = op0;
18152 *pop1 = op1;
18153 return code;
18156 /* Convert the comparison codes we use to represent an FP comparison to the
18157 integer code that will result in a proper branch. Return UNKNOWN if no such
18158 code is available. */
18160 enum rtx_code
18161 ix86_fp_compare_code_to_integer (enum rtx_code code)
18163 switch (code)
18165 case GT:
18166 return GTU;
18167 case GE:
18168 return GEU;
18169 case ORDERED:
18170 case UNORDERED:
18171 return code;
18172 break;
18173 case UNEQ:
18174 return EQ;
18175 break;
18176 case UNLT:
18177 return LTU;
18178 break;
18179 case UNLE:
18180 return LEU;
18181 break;
18182 case LTGT:
18183 return NE;
18184 break;
18185 default:
18186 return UNKNOWN;
18190 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18192 static rtx
18193 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18195 enum machine_mode fpcmp_mode, intcmp_mode;
18196 rtx tmp, tmp2;
18198 fpcmp_mode = ix86_fp_compare_mode (code);
18199 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18201 /* Do fcomi/sahf based test when profitable. */
18202 switch (ix86_fp_comparison_strategy (code))
18204 case IX86_FPCMP_COMI:
18205 intcmp_mode = fpcmp_mode;
18206 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18207 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18208 tmp);
18209 emit_insn (tmp);
18210 break;
18212 case IX86_FPCMP_SAHF:
18213 intcmp_mode = fpcmp_mode;
18214 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18215 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18216 tmp);
18218 if (!scratch)
18219 scratch = gen_reg_rtx (HImode);
18220 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18221 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18222 break;
18224 case IX86_FPCMP_ARITH:
18225 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18226 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18227 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18228 if (!scratch)
18229 scratch = gen_reg_rtx (HImode);
18230 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18232 /* In the unordered case, we have to check C2 for NaN's, which
18233 doesn't happen to work out to anything nice combination-wise.
18234 So do some bit twiddling on the value we've got in AH to come
18235 up with an appropriate set of condition codes. */
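	 /* After fnstsw, the FPU condition bits end up in AH as C0 = 0x01,
	    C2 = 0x04 and C3 = 0x40, so e.g. the 0x45 masks below test
	    C3|C2|C0 while 0x40 tests C3 ("equal") alone.  */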
18237 intcmp_mode = CCNOmode;
18238 switch (code)
18240 case GT:
18241 case UNGT:
18242 if (code == GT || !TARGET_IEEE_FP)
18244 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18245 code = EQ;
18247 else
18249 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18250 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18251 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18252 intcmp_mode = CCmode;
18253 code = GEU;
18255 break;
18256 case LT:
18257 case UNLT:
18258 if (code == LT && TARGET_IEEE_FP)
18260 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18261 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18262 intcmp_mode = CCmode;
18263 code = EQ;
18265 else
18267 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18268 code = NE;
18270 break;
18271 case GE:
18272 case UNGE:
18273 if (code == GE || !TARGET_IEEE_FP)
18275 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18276 code = EQ;
18278 else
18280 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18281 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18282 code = NE;
18284 break;
18285 case LE:
18286 case UNLE:
18287 if (code == LE && TARGET_IEEE_FP)
18289 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18290 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18291 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18292 intcmp_mode = CCmode;
18293 code = LTU;
18295 else
18297 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18298 code = NE;
18300 break;
18301 case EQ:
18302 case UNEQ:
18303 if (code == EQ && TARGET_IEEE_FP)
18305 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18306 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18307 intcmp_mode = CCmode;
18308 code = EQ;
18310 else
18312 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18313 code = NE;
18315 break;
18316 case NE:
18317 case LTGT:
18318 if (code == NE && TARGET_IEEE_FP)
18320 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18321 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18322 GEN_INT (0x40)));
18323 code = NE;
18325 else
18327 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18328 code = EQ;
18330 break;
18332 case UNORDERED:
18333 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18334 code = NE;
18335 break;
18336 case ORDERED:
18337 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18338 code = EQ;
18339 break;
18341 default:
18342 gcc_unreachable ();
18344 break;
18346 default:
18347 gcc_unreachable();
18350 /* Return the test that should be put into the flags user, i.e.
18351 the bcc, scc, or cmov instruction. */
18352 return gen_rtx_fmt_ee (code, VOIDmode,
18353 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18354 const0_rtx);
18357 static rtx
18358 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18360 rtx ret;
18362 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18363 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18365 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18367 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18368 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18370 else
18371 ret = ix86_expand_int_compare (code, op0, op1);
18373 return ret;
18376 void
18377 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18379 enum machine_mode mode = GET_MODE (op0);
18380 rtx tmp;
18382 switch (mode)
18384 case SFmode:
18385 case DFmode:
18386 case XFmode:
18387 case QImode:
18388 case HImode:
18389 case SImode:
18390 simple:
18391 tmp = ix86_expand_compare (code, op0, op1);
18392 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18393 gen_rtx_LABEL_REF (VOIDmode, label),
18394 pc_rtx);
18395 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18396 return;
18398 case DImode:
18399 if (TARGET_64BIT)
18400 goto simple;
18401 case TImode:
18402 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
18404 rtx lo[2], hi[2], label2;
18405 enum rtx_code code1, code2, code3;
18406 enum machine_mode submode;
18408 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18410 tmp = op0, op0 = op1, op1 = tmp;
18411 code = swap_condition (code);
18414 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18415 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18417 submode = mode == DImode ? SImode : DImode;
18419 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18420 avoid two branches. This costs one extra insn, so disable when
18421 optimizing for size. */
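	/* (hi0 ^ hi1) | (lo0 ^ lo1) is zero if and only if both halves are
	   equal, so a single compare of the OR result against zero decides
	   EQ/NE for the whole double-word value.  */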
18423 if ((code == EQ || code == NE)
18424 && (!optimize_insn_for_size_p ()
18425 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18427 rtx xor0, xor1;
18429 xor1 = hi[0];
18430 if (hi[1] != const0_rtx)
18431 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18432 NULL_RTX, 0, OPTAB_WIDEN);
18434 xor0 = lo[0];
18435 if (lo[1] != const0_rtx)
18436 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18437 NULL_RTX, 0, OPTAB_WIDEN);
18439 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18440 NULL_RTX, 0, OPTAB_WIDEN);
18442 ix86_expand_branch (code, tmp, const0_rtx, label);
18443 return;
18446 /* Otherwise, if we are doing a less-than or greater-than-or-equal
18447 comparison, op1 is a constant and its low word is zero, then we can
18448 just examine the high word. Similarly for a low word of -1 and
18449 less-than-or-equal or greater-than. */
18451 if (CONST_INT_P (hi[1]))
18452 switch (code)
18454 case LT: case LTU: case GE: case GEU:
18455 if (lo[1] == const0_rtx)
18457 ix86_expand_branch (code, hi[0], hi[1], label);
18458 return;
18460 break;
18461 case LE: case LEU: case GT: case GTU:
18462 if (lo[1] == constm1_rtx)
18464 ix86_expand_branch (code, hi[0], hi[1], label);
18465 return;
18467 break;
18468 default:
18469 break;
18472 /* Otherwise, we need two or three jumps. */
18474 label2 = gen_label_rtx ();
18476 code1 = code;
18477 code2 = swap_condition (code);
18478 code3 = unsigned_condition (code);
18480 switch (code)
18482 case LT: case GT: case LTU: case GTU:
18483 break;
18485 case LE: code1 = LT; code2 = GT; break;
18486 case GE: code1 = GT; code2 = LT; break;
18487 case LEU: code1 = LTU; code2 = GTU; break;
18488 case GEU: code1 = GTU; code2 = LTU; break;
18490 case EQ: code1 = UNKNOWN; code2 = NE; break;
18491 case NE: code2 = UNKNOWN; break;
18493 default:
18494 gcc_unreachable ();
18498 * a < b =>
18499 * if (hi(a) < hi(b)) goto true;
18500 * if (hi(a) > hi(b)) goto false;
18501 * if (lo(a) < lo(b)) goto true;
18502 * false:
18505 if (code1 != UNKNOWN)
18506 ix86_expand_branch (code1, hi[0], hi[1], label);
18507 if (code2 != UNKNOWN)
18508 ix86_expand_branch (code2, hi[0], hi[1], label2);
18510 ix86_expand_branch (code3, lo[0], lo[1], label);
18512 if (code2 != UNKNOWN)
18513 emit_label (label2);
18514 return;
18517 default:
18518 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18519 goto simple;
18523 /* Split branch based on floating point condition. */
18524 void
18525 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18526 rtx target1, rtx target2, rtx tmp, rtx pushed)
18528 rtx condition;
18529 rtx i;
18531 if (target2 != pc_rtx)
18533 rtx tmp = target2;
18534 code = reverse_condition_maybe_unordered (code);
18535 target2 = target1;
18536 target1 = tmp;
18539 condition = ix86_expand_fp_compare (code, op1, op2,
18540 tmp);
18542 /* Remove pushed operand from stack. */
18543 if (pushed)
18544 ix86_free_from_memory (GET_MODE (pushed));
18546 i = emit_jump_insn (gen_rtx_SET
18547 (VOIDmode, pc_rtx,
18548 gen_rtx_IF_THEN_ELSE (VOIDmode,
18549 condition, target1, target2)));
18550 if (split_branch_probability >= 0)
18551 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18554 void
18555 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18557 rtx ret;
18559 gcc_assert (GET_MODE (dest) == QImode);
18561 ret = ix86_expand_compare (code, op0, op1);
18562 PUT_MODE (ret, QImode);
18563 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18566 /* Expand a comparison setting or clearing the carry flag. Return true when
18567 successful and set *POP to the comparison operation. */
18568 static bool
18569 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18571 enum machine_mode mode =
18572 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18574 /* Do not handle double-mode compares that go through special path. */
18575 if (mode == (TARGET_64BIT ? TImode : DImode))
18576 return false;
18578 if (SCALAR_FLOAT_MODE_P (mode))
18580 rtx compare_op, compare_seq;
18582 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18584 /* Shortcut: following common codes never translate
18585 into carry flag compares. */
18586 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18587 || code == ORDERED || code == UNORDERED)
18588 return false;
18590 /* These comparisons require zero flag; swap operands so they won't. */
18591 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18592 && !TARGET_IEEE_FP)
18594 rtx tmp = op0;
18595 op0 = op1;
18596 op1 = tmp;
18597 code = swap_condition (code);
18600 /* Try to expand the comparison and verify that we end up with
18601 a carry-flag-based comparison. This fails to be true only when
18602 we decide to expand the comparison using arithmetic, which is not
18603 a very common scenario. */
18604 start_sequence ();
18605 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18606 compare_seq = get_insns ();
18607 end_sequence ();
18609 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18610 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18611 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18612 else
18613 code = GET_CODE (compare_op);
18615 if (code != LTU && code != GEU)
18616 return false;
18618 emit_insn (compare_seq);
18619 *pop = compare_op;
18620 return true;
18623 if (!INTEGRAL_MODE_P (mode))
18624 return false;
18626 switch (code)
18628 case LTU:
18629 case GEU:
18630 break;
18632 /* Convert a==0 into (unsigned)a<1. */
18633 case EQ:
18634 case NE:
18635 if (op1 != const0_rtx)
18636 return false;
18637 op1 = const1_rtx;
18638 code = (code == EQ ? LTU : GEU);
18639 break;
18641 /* Convert a>b into b<a or a>=b+1. */
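    /* E.g. the unsigned test a > 5 becomes a >= 6 (GEU) and a <= 5 becomes
       a < 6 (LTU); both are decided purely by the carry flag of the
       subtraction a - 6.  */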
18642 case GTU:
18643 case LEU:
18644 if (CONST_INT_P (op1))
18646 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18647 /* Bail out on overflow. We still can swap operands but that
18648 would force loading of the constant into register. */
18649 if (op1 == const0_rtx
18650 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18651 return false;
18652 code = (code == GTU ? GEU : LTU);
18654 else
18656 rtx tmp = op1;
18657 op1 = op0;
18658 op0 = tmp;
18659 code = (code == GTU ? LTU : GEU);
18661 break;
18663 /* Convert a>=0 into (unsigned)a<0x80000000. */
18664 case LT:
18665 case GE:
18666 if (mode == DImode || op1 != const0_rtx)
18667 return false;
18668 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18669 code = (code == LT ? GEU : LTU);
18670 break;
18671 case LE:
18672 case GT:
18673 if (mode == DImode || op1 != constm1_rtx)
18674 return false;
18675 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18676 code = (code == LE ? GEU : LTU);
18677 break;
18679 default:
18680 return false;
18682 /* Swapping operands may cause constant to appear as first operand. */
18683 if (!nonimmediate_operand (op0, VOIDmode))
18685 if (!can_create_pseudo_p ())
18686 return false;
18687 op0 = force_reg (mode, op0);
18689 *pop = ix86_expand_compare (code, op0, op1);
18690 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18691 return true;
18694 bool
18695 ix86_expand_int_movcc (rtx operands[])
18697 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18698 rtx compare_seq, compare_op;
18699 enum machine_mode mode = GET_MODE (operands[0]);
18700 bool sign_bit_compare_p = false;
18701 rtx op0 = XEXP (operands[1], 0);
18702 rtx op1 = XEXP (operands[1], 1);
18704 start_sequence ();
18705 compare_op = ix86_expand_compare (code, op0, op1);
18706 compare_seq = get_insns ();
18707 end_sequence ();
18709 compare_code = GET_CODE (compare_op);
18711 if ((op1 == const0_rtx && (code == GE || code == LT))
18712 || (op1 == constm1_rtx && (code == GT || code == LE)))
18713 sign_bit_compare_p = true;
18715 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18716 HImode insns, we'd be swallowed in word prefix ops. */
18718 if ((mode != HImode || TARGET_FAST_PREFIX)
18719 && (mode != (TARGET_64BIT ? TImode : DImode))
18720 && CONST_INT_P (operands[2])
18721 && CONST_INT_P (operands[3]))
18723 rtx out = operands[0];
18724 HOST_WIDE_INT ct = INTVAL (operands[2]);
18725 HOST_WIDE_INT cf = INTVAL (operands[3]);
18726 HOST_WIDE_INT diff;
18728 diff = ct - cf;
18729 /* Sign bit compares are better done using shifts than by using
18730 sbb. */
18731 if (sign_bit_compare_p
18732 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18734 /* Detect overlap between destination and compare sources. */
18735 rtx tmp = out;
18737 if (!sign_bit_compare_p)
18739 rtx flags;
18740 bool fpcmp = false;
18742 compare_code = GET_CODE (compare_op);
18744 flags = XEXP (compare_op, 0);
18746 if (GET_MODE (flags) == CCFPmode
18747 || GET_MODE (flags) == CCFPUmode)
18749 fpcmp = true;
18750 compare_code
18751 = ix86_fp_compare_code_to_integer (compare_code);
18754 /* To simplify rest of code, restrict to the GEU case. */
18755 if (compare_code == LTU)
18757 HOST_WIDE_INT tmp = ct;
18758 ct = cf;
18759 cf = tmp;
18760 compare_code = reverse_condition (compare_code);
18761 code = reverse_condition (code);
18763 else
18765 if (fpcmp)
18766 PUT_CODE (compare_op,
18767 reverse_condition_maybe_unordered
18768 (GET_CODE (compare_op)));
18769 else
18770 PUT_CODE (compare_op,
18771 reverse_condition (GET_CODE (compare_op)));
18773 diff = ct - cf;
18775 if (reg_overlap_mentioned_p (out, op0)
18776 || reg_overlap_mentioned_p (out, op1))
18777 tmp = gen_reg_rtx (mode);
18779 if (mode == DImode)
18780 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18781 else
18782 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18783 flags, compare_op));
18785 else
18787 if (code == GT || code == GE)
18788 code = reverse_condition (code);
18789 else
18791 HOST_WIDE_INT tmp = ct;
18792 ct = cf;
18793 cf = tmp;
18794 diff = ct - cf;
18796 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18799 if (diff == 1)
18802 * cmpl op0,op1
18803 * sbbl dest,dest
18804 * [addl dest, ct]
18806 * Size 5 - 8.
18808 if (ct)
18809 tmp = expand_simple_binop (mode, PLUS,
18810 tmp, GEN_INT (ct),
18811 copy_rtx (tmp), 1, OPTAB_DIRECT);
18813 else if (cf == -1)
18816 * cmpl op0,op1
18817 * sbbl dest,dest
18818 * orl $ct, dest
18820 * Size 8.
18822 tmp = expand_simple_binop (mode, IOR,
18823 tmp, GEN_INT (ct),
18824 copy_rtx (tmp), 1, OPTAB_DIRECT);
18826 else if (diff == -1 && ct)
18829 * cmpl op0,op1
18830 * sbbl dest,dest
18831 * notl dest
18832 * [addl dest, cf]
18834 * Size 8 - 11.
18836 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18837 if (cf)
18838 tmp = expand_simple_binop (mode, PLUS,
18839 copy_rtx (tmp), GEN_INT (cf),
18840 copy_rtx (tmp), 1, OPTAB_DIRECT);
18842 else
18845 * cmpl op0,op1
18846 * sbbl dest,dest
18847 * [notl dest]
18848 * andl cf - ct, dest
18849 * [addl dest, ct]
18851 * Size 8 - 11.
18854 if (cf == 0)
18856 cf = ct;
18857 ct = 0;
18858 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18861 tmp = expand_simple_binop (mode, AND,
18862 copy_rtx (tmp),
18863 gen_int_mode (cf - ct, mode),
18864 copy_rtx (tmp), 1, OPTAB_DIRECT);
18865 if (ct)
18866 tmp = expand_simple_binop (mode, PLUS,
18867 copy_rtx (tmp), GEN_INT (ct),
18868 copy_rtx (tmp), 1, OPTAB_DIRECT);
18871 if (!rtx_equal_p (tmp, out))
18872 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18874 return true;
18877 if (diff < 0)
18879 enum machine_mode cmp_mode = GET_MODE (op0);
18881 HOST_WIDE_INT tmp;
18882 tmp = ct, ct = cf, cf = tmp;
18883 diff = -diff;
18885 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18887 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18889 /* We may be reversing an unordered compare to a normal compare, which
18890 is not valid in general (we may convert a non-trapping condition
18891 to a trapping one); however, on i386 we currently emit all
18892 comparisons unordered. */
18893 compare_code = reverse_condition_maybe_unordered (compare_code);
18894 code = reverse_condition_maybe_unordered (code);
18896 else
18898 compare_code = reverse_condition (compare_code);
18899 code = reverse_condition (code);
18903 compare_code = UNKNOWN;
18904 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18905 && CONST_INT_P (op1))
18907 if (op1 == const0_rtx
18908 && (code == LT || code == GE))
18909 compare_code = code;
18910 else if (op1 == constm1_rtx)
18912 if (code == LE)
18913 compare_code = LT;
18914 else if (code == GT)
18915 compare_code = GE;
18919 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18920 if (compare_code != UNKNOWN
18921 && GET_MODE (op0) == GET_MODE (out)
18922 && (cf == -1 || ct == -1))
18924 /* If lea code below could be used, only optimize
18925 if it results in a 2 insn sequence. */
18927 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18928 || diff == 3 || diff == 5 || diff == 9)
18929 || (compare_code == LT && ct == -1)
18930 || (compare_code == GE && cf == -1))
18933 * notl op1 (if necessary)
18934 * sarl $31, op1
18935 * orl cf, op1
18937 if (ct != -1)
18939 cf = ct;
18940 ct = -1;
18941 code = reverse_condition (code);
18944 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18946 out = expand_simple_binop (mode, IOR,
18947 out, GEN_INT (cf),
18948 out, 1, OPTAB_DIRECT);
18949 if (out != operands[0])
18950 emit_move_insn (operands[0], out);
18952 return true;
18957 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18958 || diff == 3 || diff == 5 || diff == 9)
18959 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18960 && (mode != DImode
18961 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18964 * xorl dest,dest
18965 * cmpl op1,op2
18966 * setcc dest
18967 * lea cf(dest*(ct-cf)),dest
18969 * Size 14.
18971 * This also catches the degenerate setcc-only case.
18974 rtx tmp;
18975 int nops;
18977 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
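	  /* OUT is now 0 or 1; the lea computes cf + OUT * diff, so e.g. with
	     ct = 7 and cf = 3 (diff = 4) the result is 3 or 7 without any
	     branch.  */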
18979 nops = 0;
18980 /* On x86_64 the lea instruction operates on Pmode, so we need
18981 to get the arithmetic done in the proper mode to match. */
18982 if (diff == 1)
18983 tmp = copy_rtx (out);
18984 else
18986 rtx out1;
18987 out1 = copy_rtx (out);
18988 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18989 nops++;
18990 if (diff & 1)
18992 tmp = gen_rtx_PLUS (mode, tmp, out1);
18993 nops++;
18996 if (cf != 0)
18998 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18999 nops++;
19001 if (!rtx_equal_p (tmp, out))
19003 if (nops == 1)
19004 out = force_operand (tmp, copy_rtx (out));
19005 else
19006 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19008 if (!rtx_equal_p (out, operands[0]))
19009 emit_move_insn (operands[0], copy_rtx (out));
19011 return true;
19015 * General case: Jumpful:
19016 * xorl dest,dest cmpl op1, op2
19017 * cmpl op1, op2 movl ct, dest
19018 * setcc dest jcc 1f
19019 * decl dest movl cf, dest
19020 * andl (cf-ct),dest 1:
19021 * addl ct,dest
19023 * Size 20. Size 14.
19025 * This is reasonably steep, but branch mispredict costs are
19026 * high on modern cpus, so consider failing only if optimizing
19027 * for space.
19030 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19031 && BRANCH_COST (optimize_insn_for_speed_p (),
19032 false) >= 2)
19034 if (cf == 0)
19036 enum machine_mode cmp_mode = GET_MODE (op0);
19038 cf = ct;
19039 ct = 0;
19041 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19043 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19045 /* We may be reversing an unordered compare to a normal compare,
19046 which is not valid in general (we may convert a non-trapping
19047 condition to a trapping one); however, on i386 we currently
19048 emit all comparisons unordered. */
19049 code = reverse_condition_maybe_unordered (code);
19051 else
19053 code = reverse_condition (code);
19054 if (compare_code != UNKNOWN)
19055 compare_code = reverse_condition (compare_code);
19059 if (compare_code != UNKNOWN)
19061 /* notl op1 (if needed)
19062 sarl $31, op1
19063 andl (cf-ct), op1
19064 addl ct, op1
19066 For x < 0 (resp. x <= -1) there will be no notl,
19067 so if possible swap the constants to get rid of the
19068 complement.
19069 True/false will be -1/0 while code below (store flag
19070 followed by decrement) is 0/-1, so the constants need
19071 to be exchanged once more. */
19073 if (compare_code == GE || !cf)
19075 code = reverse_condition (code);
19076 compare_code = LT;
19078 else
19080 HOST_WIDE_INT tmp = cf;
19081 cf = ct;
19082 ct = tmp;
19085 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19087 else
19089 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19091 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19092 constm1_rtx,
19093 copy_rtx (out), 1, OPTAB_DIRECT);
19096 out = expand_simple_binop (mode, AND, copy_rtx (out),
19097 gen_int_mode (cf - ct, mode),
19098 copy_rtx (out), 1, OPTAB_DIRECT);
19099 if (ct)
19100 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19101 copy_rtx (out), 1, OPTAB_DIRECT);
19102 if (!rtx_equal_p (out, operands[0]))
19103 emit_move_insn (operands[0], copy_rtx (out));
19105 return true;
19109 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19111 /* Try a few more things with specific constants and a variable. */
19113 optab op;
19114 rtx var, orig_out, out, tmp;
19116 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19117 return false;
19119 /* If one of the two operands is an interesting constant, load a
19120 constant using the code above and mask the variable in with a logical operation. */
19122 if (CONST_INT_P (operands[2]))
19124 var = operands[3];
19125 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19126 operands[3] = constm1_rtx, op = and_optab;
19127 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19128 operands[3] = const0_rtx, op = ior_optab;
19129 else
19130 return false;
19132 else if (CONST_INT_P (operands[3]))
19134 var = operands[2];
19135 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19136 operands[2] = constm1_rtx, op = and_optab;
19137 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19138 operands[2] = const0_rtx, op = ior_optab;
19139 else
19140 return false;
19142 else
19143 return false;
19145 orig_out = operands[0];
19146 tmp = gen_reg_rtx (mode);
19147 operands[0] = tmp;
19149 /* Recurse to get the constant loaded. */
19150 if (ix86_expand_int_movcc (operands) == 0)
19151 return false;
19153 /* Mask in the interesting variable. */
19154 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19155 OPTAB_WIDEN);
19156 if (!rtx_equal_p (out, orig_out))
19157 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19159 return true;
19163 * For comparison with above,
19165 * movl cf,dest
19166 * movl ct,tmp
19167 * cmpl op1,op2
19168 * cmovcc tmp,dest
19170 * Size 15.
19173 if (! nonimmediate_operand (operands[2], mode))
19174 operands[2] = force_reg (mode, operands[2]);
19175 if (! nonimmediate_operand (operands[3], mode))
19176 operands[3] = force_reg (mode, operands[3]);
19178 if (! register_operand (operands[2], VOIDmode)
19179 && (mode == QImode
19180 || ! register_operand (operands[3], VOIDmode)))
19181 operands[2] = force_reg (mode, operands[2]);
19183 if (mode == QImode
19184 && ! register_operand (operands[3], VOIDmode))
19185 operands[3] = force_reg (mode, operands[3]);
19187 emit_insn (compare_seq);
19188 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19189 gen_rtx_IF_THEN_ELSE (mode,
19190 compare_op, operands[2],
19191 operands[3])));
19192 return true;
19195 /* Swap, force into registers, or otherwise massage the two operands
19196 to an sse comparison with a mask result. Thus we differ a bit from
19197 ix86_prepare_fp_compare_args which expects to produce a flags result.
19199 The DEST operand exists to help determine whether to commute commutative
19200 operators. The POP0/POP1 operands are updated in place. The new
19201 comparison code is returned, or UNKNOWN if not implementable. */
19203 static enum rtx_code
19204 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19205 rtx *pop0, rtx *pop1)
19207 rtx tmp;
19209 switch (code)
19211 case LTGT:
19212 case UNEQ:
19213 /* AVX supports all the needed comparisons. */
19214 if (TARGET_AVX)
19215 break;
19216 /* We have no LTGT as an operator. We could implement it with
19217 NE & ORDERED, but this requires an extra temporary. It's
19218 not clear that it's worth it. */
19219 return UNKNOWN;
19221 case LT:
19222 case LE:
19223 case UNGT:
19224 case UNGE:
19225 /* These are supported directly. */
19226 break;
19228 case EQ:
19229 case NE:
19230 case UNORDERED:
19231 case ORDERED:
19232 /* AVX has 3 operand comparisons, no need to swap anything. */
19233 if (TARGET_AVX)
19234 break;
19235 /* For commutative operators, try to canonicalize the destination
19236 operand to be first in the comparison - this helps reload to
19237 avoid extra moves. */
19238 if (!dest || !rtx_equal_p (dest, *pop1))
19239 break;
19240 /* FALLTHRU */
19242 case GE:
19243 case GT:
19244 case UNLE:
19245 case UNLT:
19246 /* These are not supported directly before AVX, and furthermore
19247 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19248 comparison operands to transform into something that is
19249 supported. */
19250 tmp = *pop0;
19251 *pop0 = *pop1;
19252 *pop1 = tmp;
19253 code = swap_condition (code);
19254 break;
19256 default:
19257 gcc_unreachable ();
19260 return code;
19263 /* Detect conditional moves that exactly match min/max operational
19264 semantics. Note that this is IEEE safe, as long as we don't
19265 interchange the operands.
19267 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19268 and TRUE if the operation is successful and instructions are emitted. */
19270 static bool
19271 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19272 rtx cmp_op1, rtx if_true, rtx if_false)
19274 enum machine_mode mode;
19275 bool is_min;
19276 rtx tmp;
19278 if (code == LT)
19280 else if (code == UNGE)
19282 tmp = if_true;
19283 if_true = if_false;
19284 if_false = tmp;
19286 else
19287 return false;
19289 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19290 is_min = true;
19291 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19292 is_min = false;
19293 else
19294 return false;
19296 mode = GET_MODE (dest);
19298 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19299 but MODE may be a vector mode and thus not appropriate. */
19300 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19302 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19303 rtvec v;
19305 if_true = force_reg (mode, if_true);
19306 v = gen_rtvec (2, if_true, if_false);
19307 tmp = gen_rtx_UNSPEC (mode, v, u);
19309 else
19311 code = is_min ? SMIN : SMAX;
19312 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19315 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19316 return true;
19319 /* Expand an sse vector comparison. Return the register with the result. */
19321 static rtx
19322 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19323 rtx op_true, rtx op_false)
19325 enum machine_mode mode = GET_MODE (dest);
19326 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19327 rtx x;
19329 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19330 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19331 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19333 if (optimize
19334 || reg_overlap_mentioned_p (dest, op_true)
19335 || reg_overlap_mentioned_p (dest, op_false))
19336 dest = gen_reg_rtx (mode);
19338 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19339 if (cmp_mode != mode)
19341 x = force_reg (cmp_mode, x);
19342 convert_move (dest, x, false);
19344 else
19345 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19347 return dest;
19350 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19351 operations. This is used for both scalar and vector conditional moves. */
19353 static void
19354 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19356 enum machine_mode mode = GET_MODE (dest);
19357 rtx t2, t3, x;
19359 if (vector_all_ones_operand (op_true, mode)
19360 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19362 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19364 else if (op_false == CONST0_RTX (mode))
19366 op_true = force_reg (mode, op_true);
19367 x = gen_rtx_AND (mode, cmp, op_true);
19368 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19370 else if (op_true == CONST0_RTX (mode))
19372 op_false = force_reg (mode, op_false);
19373 x = gen_rtx_NOT (mode, cmp);
19374 x = gen_rtx_AND (mode, x, op_false);
19375 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19377 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19379 op_false = force_reg (mode, op_false);
19380 x = gen_rtx_IOR (mode, cmp, op_false);
19381 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19383 else if (TARGET_XOP)
19385 op_true = force_reg (mode, op_true);
19387 if (!nonimmediate_operand (op_false, mode))
19388 op_false = force_reg (mode, op_false);
19390 emit_insn (gen_rtx_SET (mode, dest,
19391 gen_rtx_IF_THEN_ELSE (mode, cmp,
19392 op_true,
19393 op_false)));
19395 else
19397 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19399 if (!nonimmediate_operand (op_true, mode))
19400 op_true = force_reg (mode, op_true);
19402 op_false = force_reg (mode, op_false);
19404 switch (mode)
19406 case V4SFmode:
19407 if (TARGET_SSE4_1)
19408 gen = gen_sse4_1_blendvps;
19409 break;
19410 case V2DFmode:
19411 if (TARGET_SSE4_1)
19412 gen = gen_sse4_1_blendvpd;
19413 break;
19414 case V16QImode:
19415 case V8HImode:
19416 case V4SImode:
19417 case V2DImode:
19418 if (TARGET_SSE4_1)
19420 gen = gen_sse4_1_pblendvb;
19421 dest = gen_lowpart (V16QImode, dest);
19422 op_false = gen_lowpart (V16QImode, op_false);
19423 op_true = gen_lowpart (V16QImode, op_true);
19424 cmp = gen_lowpart (V16QImode, cmp);
19426 break;
19427 case V8SFmode:
19428 if (TARGET_AVX)
19429 gen = gen_avx_blendvps256;
19430 break;
19431 case V4DFmode:
19432 if (TARGET_AVX)
19433 gen = gen_avx_blendvpd256;
19434 break;
19435 case V32QImode:
19436 case V16HImode:
19437 case V8SImode:
19438 case V4DImode:
19439 if (TARGET_AVX2)
19441 gen = gen_avx2_pblendvb;
19442 dest = gen_lowpart (V32QImode, dest);
19443 op_false = gen_lowpart (V32QImode, op_false);
19444 op_true = gen_lowpart (V32QImode, op_true);
19445 cmp = gen_lowpart (V32QImode, cmp);
19447 break;
19448 default:
19449 break;
19452 if (gen != NULL)
19453 emit_insn (gen (dest, op_false, op_true, cmp));
19454 else
19456 op_true = force_reg (mode, op_true);
19458 t2 = gen_reg_rtx (mode);
19459 if (optimize)
19460 t3 = gen_reg_rtx (mode);
19461 else
19462 t3 = dest;
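/* Fallback: build the select from plain logic operations.  CMP is an
   all-ones / all-zeros mask per element, so the result computed below is
   dest = (op_true & cmp) | (op_false & ~cmp).  */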
19464 x = gen_rtx_AND (mode, op_true, cmp);
19465 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19467 x = gen_rtx_NOT (mode, cmp);
19468 x = gen_rtx_AND (mode, x, op_false);
19469 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19471 x = gen_rtx_IOR (mode, t3, t2);
19472 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19477 /* Expand a floating-point conditional move. Return true if successful. */
19479 bool
19480 ix86_expand_fp_movcc (rtx operands[])
19482 enum machine_mode mode = GET_MODE (operands[0]);
19483 enum rtx_code code = GET_CODE (operands[1]);
19484 rtx tmp, compare_op;
19485 rtx op0 = XEXP (operands[1], 0);
19486 rtx op1 = XEXP (operands[1], 1);
19488 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19490 enum machine_mode cmode;
19492 /* Since we've no cmove for sse registers, don't force bad register
19493 allocation just to gain access to it. Deny movcc when the
19494 comparison mode doesn't match the move mode. */
19495 cmode = GET_MODE (op0);
19496 if (cmode == VOIDmode)
19497 cmode = GET_MODE (op1);
19498 if (cmode != mode)
19499 return false;
19501 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19502 if (code == UNKNOWN)
19503 return false;
19505 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19506 operands[2], operands[3]))
19507 return true;
19509 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19510 operands[2], operands[3]);
19511 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19512 return true;
19515 /* The floating point conditional move instructions don't directly
19516 support conditions resulting from a signed integer comparison. */
19518 compare_op = ix86_expand_compare (code, op0, op1);
19519 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19521 tmp = gen_reg_rtx (QImode);
19522 ix86_expand_setcc (tmp, code, op0, op1);
19524 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19527 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19528 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19529 operands[2], operands[3])));
19531 return true;
19534 /* Expand a floating-point vector conditional move; a vcond operation
19535 rather than a movcc operation. */
19537 bool
19538 ix86_expand_fp_vcond (rtx operands[])
19540 enum rtx_code code = GET_CODE (operands[3]);
19541 rtx cmp;
19543 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19544 &operands[4], &operands[5]);
19545 if (code == UNKNOWN)
19547 rtx temp;
19548 switch (GET_CODE (operands[3]))
19550 case LTGT:
19551 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19552 operands[5], operands[0], operands[0]);
19553 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19554 operands[5], operands[1], operands[2]);
19555 code = AND;
19556 break;
19557 case UNEQ:
19558 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19559 operands[5], operands[0], operands[0]);
19560 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19561 operands[5], operands[1], operands[2]);
19562 code = IOR;
19563 break;
19564 default:
19565 gcc_unreachable ();
19567 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19568 OPTAB_DIRECT);
19569 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19570 return true;
19573 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19574 operands[5], operands[1], operands[2]))
19575 return true;
19577 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19578 operands[1], operands[2]);
19579 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19580 return true;
19583 /* Expand a signed/unsigned integral vector conditional move. */
19585 bool
19586 ix86_expand_int_vcond (rtx operands[])
19588 enum machine_mode data_mode = GET_MODE (operands[0]);
19589 enum machine_mode mode = GET_MODE (operands[4]);
19590 enum rtx_code code = GET_CODE (operands[3]);
19591 bool negate = false;
19592 rtx x, cop0, cop1;
19594 cop0 = operands[4];
19595 cop1 = operands[5];
19597 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19598 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19599 if ((code == LT || code == GE)
19600 && data_mode == mode
19601 && cop1 == CONST0_RTX (mode)
19602 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19603 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19604 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19605 && (GET_MODE_SIZE (data_mode) == 16
19606 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19608 rtx negop = operands[2 - (code == LT)];
19609 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19610 if (negop == CONST1_RTX (data_mode))
19612 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19613 operands[0], 1, OPTAB_DIRECT);
19614 if (res != operands[0])
19615 emit_move_insn (operands[0], res);
19616 return true;
19618 else if (GET_MODE_INNER (data_mode) != DImode
19619 && vector_all_ones_operand (negop, data_mode))
19621 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19622 operands[0], 0, OPTAB_DIRECT);
19623 if (res != operands[0])
19624 emit_move_insn (operands[0], res);
19625 return true;
19629 if (!nonimmediate_operand (cop1, mode))
19630 cop1 = force_reg (mode, cop1);
19631 if (!general_operand (operands[1], data_mode))
19632 operands[1] = force_reg (data_mode, operands[1]);
19633 if (!general_operand (operands[2], data_mode))
19634 operands[2] = force_reg (data_mode, operands[2]);
19636 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19637 if (TARGET_XOP
19638 && (mode == V16QImode || mode == V8HImode
19639 || mode == V4SImode || mode == V2DImode))
19641 else
19643 /* Canonicalize the comparison to EQ, GT, GTU. */
19644 switch (code)
19646 case EQ:
19647 case GT:
19648 case GTU:
19649 break;
19651 case NE:
19652 case LE:
19653 case LEU:
19654 code = reverse_condition (code);
19655 negate = true;
19656 break;
19658 case GE:
19659 case GEU:
19660 code = reverse_condition (code);
19661 negate = true;
19662 /* FALLTHRU */
19664 case LT:
19665 case LTU:
19666 code = swap_condition (code);
19667 x = cop0, cop0 = cop1, cop1 = x;
19668 break;
19670 default:
19671 gcc_unreachable ();
19674 /* Only SSE4.1/SSE4.2 support V2DImode. */
19675 if (mode == V2DImode)
19677 switch (code)
19679 case EQ:
19680 /* SSE4.1 supports EQ. */
19681 if (!TARGET_SSE4_1)
19682 return false;
19683 break;
19685 case GT:
19686 case GTU:
19687 /* SSE4.2 supports GT/GTU. */
19688 if (!TARGET_SSE4_2)
19689 return false;
19690 break;
19692 default:
19693 gcc_unreachable ();
19697 /* Unsigned parallel compare is not supported by the hardware.
19698 Play some tricks to turn this into a signed comparison
19699 against 0. */
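/* For the 32/64-bit element cases below this is done by subtracting the
   sign-bit constant from both operands, which (mod 2^N) is the same as
   XORing in the sign bit; e.g. for 32-bit elements "a >u b" becomes
   "(a - 0x80000000) >s (b - 0x80000000)".  */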
19700 if (code == GTU)
19702 cop0 = force_reg (mode, cop0);
19704 switch (mode)
19706 case V8SImode:
19707 case V4DImode:
19708 case V4SImode:
19709 case V2DImode:
19711 rtx t1, t2, mask;
19712 rtx (*gen_sub3) (rtx, rtx, rtx);
19714 switch (mode)
19716 case V8SImode: gen_sub3 = gen_subv8si3; break;
19717 case V4DImode: gen_sub3 = gen_subv4di3; break;
19718 case V4SImode: gen_sub3 = gen_subv4si3; break;
19719 case V2DImode: gen_sub3 = gen_subv2di3; break;
19720 default:
19721 gcc_unreachable ();
19723 /* Subtract (-(INT MAX) - 1) from both operands to make
19724 them signed. */
19725 mask = ix86_build_signbit_mask (mode, true, false);
19726 t1 = gen_reg_rtx (mode);
19727 emit_insn (gen_sub3 (t1, cop0, mask));
19729 t2 = gen_reg_rtx (mode);
19730 emit_insn (gen_sub3 (t2, cop1, mask));
19732 cop0 = t1;
19733 cop1 = t2;
19734 code = GT;
19736 break;
19738 case V32QImode:
19739 case V16HImode:
19740 case V16QImode:
19741 case V8HImode:
19742 /* Perform a parallel unsigned saturating subtraction. */
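/* "a >u b" holds exactly when the saturating difference a -us b is
   nonzero, so compare that difference against zero with EQ and flip
   NEGATE to recover the "!=" sense.  */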
19743 x = gen_reg_rtx (mode);
19744 emit_insn (gen_rtx_SET (VOIDmode, x,
19745 gen_rtx_US_MINUS (mode, cop0, cop1)));
19747 cop0 = x;
19748 cop1 = CONST0_RTX (mode);
19749 code = EQ;
19750 negate = !negate;
19751 break;
19753 default:
19754 gcc_unreachable ();
19759 /* Allow the comparison to be done in one mode, but the movcc to
19760 happen in another mode. */
19761 if (data_mode == mode)
19763 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19764 operands[1+negate], operands[2-negate]);
19766 else
19768 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19769 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19770 code, cop0, cop1,
19771 operands[1+negate], operands[2-negate]);
19772 x = gen_lowpart (data_mode, x);
19775 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19776 operands[2-negate]);
19777 return true;
19780 /* Expand a variable vector permutation. */
19782 void
19783 ix86_expand_vec_perm (rtx operands[])
19785 rtx target = operands[0];
19786 rtx op0 = operands[1];
19787 rtx op1 = operands[2];
19788 rtx mask = operands[3];
19789 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19790 enum machine_mode mode = GET_MODE (op0);
19791 enum machine_mode maskmode = GET_MODE (mask);
19792 int w, e, i;
19793 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19795 /* Number of elements in the vector. */
19796 w = GET_MODE_NUNITS (mode);
19797 e = GET_MODE_UNIT_SIZE (mode);
19798 gcc_assert (w <= 32);
19800 if (TARGET_AVX2)
19802 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19804 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19805 a constant shuffle operand. With a tiny bit of effort we can
19806 use VPERMD instead. A re-interpretation stall for V4DFmode is
19807 unfortunate but there's no avoiding it.
19808 Similarly for V16HImode we don't have instructions for variable
19809 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
19810 vpor after preparing suitable masks. */
19812 if (mode == V16HImode)
19814 maskmode = mode = V32QImode;
19815 w = 32;
19816 e = 1;
19818 else
19820 maskmode = mode = V8SImode;
19821 w = 8;
19822 e = 4;
19824 t1 = gen_reg_rtx (maskmode);
19826 /* Replicate the low bits of the V4DImode mask into V8SImode:
19827 mask = { A B C D }
19828 t1 = { A A B B C C D D }. */
19829 for (i = 0; i < w / 2; ++i)
19830 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19831 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19832 vt = force_reg (maskmode, vt);
19833 mask = gen_lowpart (maskmode, mask);
19834 if (maskmode == V8SImode)
19835 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19836 else
19837 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19839 /* Multiply the shuffle indices by two. */
19840 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19841 OPTAB_DIRECT);
19843 /* Add one to the odd shuffle indices:
19844 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19845 for (i = 0; i < w / 2; ++i)
19847 vec[i * 2] = const0_rtx;
19848 vec[i * 2 + 1] = const1_rtx;
19850 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19851 vt = force_const_mem (maskmode, vt);
19852 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19853 OPTAB_DIRECT);
19855 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19856 operands[3] = mask = t1;
19857 target = gen_lowpart (mode, target);
19858 op0 = gen_lowpart (mode, op0);
19859 op1 = gen_lowpart (mode, op1);
19862 switch (mode)
19864 case V8SImode:
19865 /* The VPERMD and VPERMPS instructions already properly ignore
19866 the high bits of the shuffle elements. No need for us to
19867 perform an AND ourselves. */
19868 if (one_operand_shuffle)
19869 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19870 else
19872 t1 = gen_reg_rtx (V8SImode);
19873 t2 = gen_reg_rtx (V8SImode);
19874 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19875 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19876 goto merge_two;
19878 return;
19880 case V8SFmode:
19881 mask = gen_lowpart (V8SFmode, mask);
19882 if (one_operand_shuffle)
19883 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19884 else
19886 t1 = gen_reg_rtx (V8SFmode);
19887 t2 = gen_reg_rtx (V8SFmode);
19888 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19889 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19890 goto merge_two;
19892 return;
19894 case V4SImode:
19895 /* By combining the two 128-bit input vectors into one 256-bit
19896 input vector, we can use VPERMD and VPERMPS for the full
19897 two-operand shuffle. */
19898 t1 = gen_reg_rtx (V8SImode);
19899 t2 = gen_reg_rtx (V8SImode);
19900 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19901 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19902 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19903 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19904 return;
19906 case V4SFmode:
19907 t1 = gen_reg_rtx (V8SFmode);
19908 t2 = gen_reg_rtx (V8SFmode);
19909 mask = gen_lowpart (V4SFmode, mask);
19910 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19911 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19912 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19913 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19914 return;
19916 case V32QImode:
19917 t1 = gen_reg_rtx (V32QImode);
19918 t2 = gen_reg_rtx (V32QImode);
19919 t3 = gen_reg_rtx (V32QImode);
19920 vt2 = GEN_INT (128);
19921 for (i = 0; i < 32; i++)
19922 vec[i] = vt2;
19923 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19924 vt = force_reg (V32QImode, vt);
19925 for (i = 0; i < 32; i++)
19926 vec[i] = i < 16 ? vt2 : const0_rtx;
19927 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19928 vt2 = force_reg (V32QImode, vt2);
19929 /* From mask create two adjusted masks, which contain the same
19930 bits as mask in the low 7 bits of each vector element.
19931 The first mask will have the most significant bit clear
19932 if it requests element from the same 128-bit lane
19933 and MSB set if it requests element from the other 128-bit lane.
19934 The second mask will have the opposite values of the MSB,
19935 and additionally will have its 128-bit lanes swapped.
19936 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19937 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19938 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19939 stands for the other 12 bytes. */
19940 /* Bit 4 of each mask element tells whether the element comes from the
19941 same lane or from the other lane, so shift it up by 3 to the MSB position. */
19942 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19943 gen_lowpart (V4DImode, mask),
19944 GEN_INT (3)));
19945 /* Clear MSB bits from the mask just in case it had them set. */
19946 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19947 /* After this t1 will have MSB set for elements from the other lane. */
19948 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19949 /* Clear bits other than MSB. */
19950 emit_insn (gen_andv32qi3 (t1, t1, vt));
19951 /* Or in the lower bits from mask into t3. */
19952 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19953 /* And invert MSB bits in t1, so MSB is set for elements from the same
19954 lane. */
19955 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19956 /* Swap 128-bit lanes in t3. */
19957 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19958 gen_lowpart (V4DImode, t3),
19959 const2_rtx, GEN_INT (3),
19960 const0_rtx, const1_rtx));
19961 /* And or in the lower bits from mask into t1. */
19962 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19963 if (one_operand_shuffle)
19965 /* Each of these shuffles will put 0s in places where an
19966 element from the other 128-bit lane is needed; otherwise it
19967 will shuffle in the requested value. */
19968 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19969 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19970 /* For t3 the 128-bit lanes are swapped again. */
19971 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19972 gen_lowpart (V4DImode, t3),
19973 const2_rtx, GEN_INT (3),
19974 const0_rtx, const1_rtx));
19975 /* And oring both together leads to the result. */
19976 emit_insn (gen_iorv32qi3 (target, t1, t3));
19977 return;
19980 t4 = gen_reg_rtx (V32QImode);
19981 /* Similar to the one_operand_shuffle code above, just
19982 repeated twice, once for each operand. The merge_two:
19983 code below will merge the two results together. */
19984 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19985 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19986 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19987 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19988 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19989 gen_lowpart (V4DImode, t4),
19990 const2_rtx, GEN_INT (3),
19991 const0_rtx, const1_rtx));
19992 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19993 gen_lowpart (V4DImode, t3),
19994 const2_rtx, GEN_INT (3),
19995 const0_rtx, const1_rtx));
19996 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19997 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19998 t1 = t4;
19999 t2 = t3;
20000 goto merge_two;
20002 default:
20003 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20004 break;
20008 if (TARGET_XOP)
20010 /* The XOP VPPERM insn supports three inputs. By ignoring the
20011 one_operand_shuffle special case, we avoid creating another
20012 set of constant vectors in memory. */
20013 one_operand_shuffle = false;
20015 /* mask = mask & {2*w-1, ...} */
20016 vt = GEN_INT (2*w - 1);
20018 else
20020 /* mask = mask & {w-1, ...} */
20021 vt = GEN_INT (w - 1);
20024 for (i = 0; i < w; i++)
20025 vec[i] = vt;
20026 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20027 mask = expand_simple_binop (maskmode, AND, mask, vt,
20028 NULL_RTX, 0, OPTAB_DIRECT);
20030 /* For non-QImode operations, convert the word permutation control
20031 into a byte permutation control. */
20032 if (mode != V16QImode)
20034 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20035 GEN_INT (exact_log2 (e)),
20036 NULL_RTX, 0, OPTAB_DIRECT);
20038 /* Convert mask to vector of chars. */
20039 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20041 /* Replicate each of the input bytes into byte positions:
20042 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20043 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20044 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20045 for (i = 0; i < 16; ++i)
20046 vec[i] = GEN_INT (i/e * e);
20047 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20048 vt = force_const_mem (V16QImode, vt);
20049 if (TARGET_XOP)
20050 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20051 else
20052 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20054 /* Convert it into the byte positions by doing
20055 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20056 for (i = 0; i < 16; ++i)
20057 vec[i] = GEN_INT (i % e);
20058 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20059 vt = force_const_mem (V16QImode, vt);
20060 emit_insn (gen_addv16qi3 (mask, mask, vt));
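/* At this point each original element index k has been expanded into the
   byte indices e*k, e*k+1, ..., e*k+e-1 within its element, which is the
   form the byte shuffles below expect.  */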
20063 /* The actual shuffle operations all operate on V16QImode. */
20064 op0 = gen_lowpart (V16QImode, op0);
20065 op1 = gen_lowpart (V16QImode, op1);
20066 target = gen_lowpart (V16QImode, target);
20068 if (TARGET_XOP)
20070 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20072 else if (one_operand_shuffle)
20074 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20076 else
20078 rtx xops[6];
20079 bool ok;
20081 /* Shuffle the two input vectors independently. */
20082 t1 = gen_reg_rtx (V16QImode);
20083 t2 = gen_reg_rtx (V16QImode);
20084 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20085 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20087 merge_two:
20088 /* Then merge them together. The key is whether any given control
20089 element contained a bit set that indicates the second word. */
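/* In the paths that reach this label, T1 holds the bytes shuffled from
   OP0 and T2 the bytes shuffled from OP1; the vcond built below picks T2
   wherever the control element had the bit W set and T1 elsewhere.  */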
20090 mask = operands[3];
20091 vt = GEN_INT (w);
20092 if (maskmode == V2DImode && !TARGET_SSE4_1)
20094 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20095 more shuffle to convert the V2DI input mask into a V4SI
20096 input mask. At which point the masking that expand_int_vcond
20097 performs will work as desired. */
20098 rtx t3 = gen_reg_rtx (V4SImode);
20099 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20100 const0_rtx, const0_rtx,
20101 const2_rtx, const2_rtx));
20102 mask = t3;
20103 maskmode = V4SImode;
20104 e = w = 4;
20107 for (i = 0; i < w; i++)
20108 vec[i] = vt;
20109 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20110 vt = force_reg (maskmode, vt);
20111 mask = expand_simple_binop (maskmode, AND, mask, vt,
20112 NULL_RTX, 0, OPTAB_DIRECT);
20114 xops[0] = gen_lowpart (mode, operands[0]);
20115 xops[1] = gen_lowpart (mode, t2);
20116 xops[2] = gen_lowpart (mode, t1);
20117 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20118 xops[4] = mask;
20119 xops[5] = vt;
20120 ok = ix86_expand_int_vcond (xops);
20121 gcc_assert (ok);
20125 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20126 true if we should do zero extension, else sign extension. HIGH_P is
20127 true if we want the N/2 high elements, else the low elements. */
20129 void
20130 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20132 enum machine_mode imode = GET_MODE (operands[1]);
20133 rtx tmp, dest;
20135 if (TARGET_SSE4_1)
20137 rtx (*unpack)(rtx, rtx);
20138 rtx (*extract)(rtx, rtx) = NULL;
20139 enum machine_mode halfmode = BLKmode;
20141 switch (imode)
20143 case V32QImode:
20144 if (unsigned_p)
20145 unpack = gen_avx2_zero_extendv16qiv16hi2;
20146 else
20147 unpack = gen_avx2_sign_extendv16qiv16hi2;
20148 halfmode = V16QImode;
20149 extract
20150 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20151 break;
20152 case V16HImode:
20153 if (unsigned_p)
20154 unpack = gen_avx2_zero_extendv8hiv8si2;
20155 else
20156 unpack = gen_avx2_sign_extendv8hiv8si2;
20157 halfmode = V8HImode;
20158 extract
20159 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20160 break;
20161 case V8SImode:
20162 if (unsigned_p)
20163 unpack = gen_avx2_zero_extendv4siv4di2;
20164 else
20165 unpack = gen_avx2_sign_extendv4siv4di2;
20166 halfmode = V4SImode;
20167 extract
20168 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20169 break;
20170 case V16QImode:
20171 if (unsigned_p)
20172 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20173 else
20174 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20175 break;
20176 case V8HImode:
20177 if (unsigned_p)
20178 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20179 else
20180 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20181 break;
20182 case V4SImode:
20183 if (unsigned_p)
20184 unpack = gen_sse4_1_zero_extendv2siv2di2;
20185 else
20186 unpack = gen_sse4_1_sign_extendv2siv2di2;
20187 break;
20188 default:
20189 gcc_unreachable ();
20192 if (GET_MODE_SIZE (imode) == 32)
20194 tmp = gen_reg_rtx (halfmode);
20195 emit_insn (extract (tmp, operands[1]));
20197 else if (high_p)
20199 /* Shift higher 8 bytes to lower 8 bytes. */
20200 tmp = gen_reg_rtx (imode);
20201 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20202 gen_lowpart (V1TImode, operands[1]),
20203 GEN_INT (64)));
20205 else
20206 tmp = operands[1];
20208 emit_insn (unpack (operands[0], tmp));
20210 else
20212 rtx (*unpack)(rtx, rtx, rtx);
20214 switch (imode)
20216 case V16QImode:
20217 if (high_p)
20218 unpack = gen_vec_interleave_highv16qi;
20219 else
20220 unpack = gen_vec_interleave_lowv16qi;
20221 break;
20222 case V8HImode:
20223 if (high_p)
20224 unpack = gen_vec_interleave_highv8hi;
20225 else
20226 unpack = gen_vec_interleave_lowv8hi;
20227 break;
20228 case V4SImode:
20229 if (high_p)
20230 unpack = gen_vec_interleave_highv4si;
20231 else
20232 unpack = gen_vec_interleave_lowv4si;
20233 break;
20234 default:
20235 gcc_unreachable ();
20238 dest = gen_lowpart (imode, operands[0]);
20240 if (unsigned_p)
20241 tmp = force_reg (imode, CONST0_RTX (imode));
20242 else
20243 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20244 operands[1], pc_rtx, pc_rtx);
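/* Interleaving with a zero vector yields zero extension; interleaving
   with the "0 > x" comparison mask (all ones exactly where the element
   is negative) yields sign extension.  */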
20246 emit_insn (unpack (dest, operands[1], tmp));
20250 /* Expand conditional increment or decrement using adc/sbb instructions.
20251 The default case using setcc followed by the conditional move can be
20252 done by generic code. */
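/* For illustration: for something like "x = a < b ? x + 1 : x" with
   unsigned operands this emits "cmp a, b" followed by "adc x, 0", so the
   carry produced by the compare performs the conditional increment; the
   sbb form covers the decrement and the reversed conditions.  */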
20253 bool
20254 ix86_expand_int_addcc (rtx operands[])
20256 enum rtx_code code = GET_CODE (operands[1]);
20257 rtx flags;
20258 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20259 rtx compare_op;
20260 rtx val = const0_rtx;
20261 bool fpcmp = false;
20262 enum machine_mode mode;
20263 rtx op0 = XEXP (operands[1], 0);
20264 rtx op1 = XEXP (operands[1], 1);
20266 if (operands[3] != const1_rtx
20267 && operands[3] != constm1_rtx)
20268 return false;
20269 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20270 return false;
20271 code = GET_CODE (compare_op);
20273 flags = XEXP (compare_op, 0);
20275 if (GET_MODE (flags) == CCFPmode
20276 || GET_MODE (flags) == CCFPUmode)
20278 fpcmp = true;
20279 code = ix86_fp_compare_code_to_integer (code);
20282 if (code != LTU)
20284 val = constm1_rtx;
20285 if (fpcmp)
20286 PUT_CODE (compare_op,
20287 reverse_condition_maybe_unordered
20288 (GET_CODE (compare_op)));
20289 else
20290 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20293 mode = GET_MODE (operands[0]);
20295 /* Construct either adc or sbb insn. */
20296 if ((code == LTU) == (operands[3] == constm1_rtx))
20298 switch (mode)
20300 case QImode:
20301 insn = gen_subqi3_carry;
20302 break;
20303 case HImode:
20304 insn = gen_subhi3_carry;
20305 break;
20306 case SImode:
20307 insn = gen_subsi3_carry;
20308 break;
20309 case DImode:
20310 insn = gen_subdi3_carry;
20311 break;
20312 default:
20313 gcc_unreachable ();
20316 else
20318 switch (mode)
20320 case QImode:
20321 insn = gen_addqi3_carry;
20322 break;
20323 case HImode:
20324 insn = gen_addhi3_carry;
20325 break;
20326 case SImode:
20327 insn = gen_addsi3_carry;
20328 break;
20329 case DImode:
20330 insn = gen_adddi3_carry;
20331 break;
20332 default:
20333 gcc_unreachable ();
20336 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20338 return true;
20342 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20343 but works for floating point parameters and nonoffsettable memories.
20344 For pushes, it returns just stack offsets; the values will be saved
20345 in the right order. At most four parts are generated. */
20347 static int
20348 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20350 int size;
20352 if (!TARGET_64BIT)
20353 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20354 else
20355 size = (GET_MODE_SIZE (mode) + 4) / 8;
20357 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20358 gcc_assert (size >= 2 && size <= 4);
20360 /* Optimize constant pool reference to immediates. This is used by fp
20361 moves that force all constants to memory to allow combining. */
20362 if (MEM_P (operand) && MEM_READONLY_P (operand))
20364 rtx tmp = maybe_get_pool_constant (operand);
20365 if (tmp)
20366 operand = tmp;
20369 if (MEM_P (operand) && !offsettable_memref_p (operand))
20371 /* The only non-offsettable memories we handle are pushes. */
20372 int ok = push_operand (operand, VOIDmode);
20374 gcc_assert (ok);
20376 operand = copy_rtx (operand);
20377 PUT_MODE (operand, word_mode);
20378 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20379 return size;
20382 if (GET_CODE (operand) == CONST_VECTOR)
20384 enum machine_mode imode = int_mode_for_mode (mode);
20385 /* Caution: if we looked through a constant pool memory above,
20386 the operand may actually have a different mode now. That's
20387 ok, since we want to pun this all the way back to an integer. */
20388 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20389 gcc_assert (operand != NULL);
20390 mode = imode;
20393 if (!TARGET_64BIT)
20395 if (mode == DImode)
20396 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20397 else
20399 int i;
20401 if (REG_P (operand))
20403 gcc_assert (reload_completed);
20404 for (i = 0; i < size; i++)
20405 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20407 else if (offsettable_memref_p (operand))
20409 operand = adjust_address (operand, SImode, 0);
20410 parts[0] = operand;
20411 for (i = 1; i < size; i++)
20412 parts[i] = adjust_address (operand, SImode, 4 * i);
20414 else if (GET_CODE (operand) == CONST_DOUBLE)
20416 REAL_VALUE_TYPE r;
20417 long l[4];
20419 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20420 switch (mode)
20422 case TFmode:
20423 real_to_target (l, &r, mode);
20424 parts[3] = gen_int_mode (l[3], SImode);
20425 parts[2] = gen_int_mode (l[2], SImode);
20426 break;
20427 case XFmode:
20428 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20429 parts[2] = gen_int_mode (l[2], SImode);
20430 break;
20431 case DFmode:
20432 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20433 break;
20434 default:
20435 gcc_unreachable ();
20437 parts[1] = gen_int_mode (l[1], SImode);
20438 parts[0] = gen_int_mode (l[0], SImode);
20440 else
20441 gcc_unreachable ();
20444 else
20446 if (mode == TImode)
20447 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20448 if (mode == XFmode || mode == TFmode)
20450 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20451 if (REG_P (operand))
20453 gcc_assert (reload_completed);
20454 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20455 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20457 else if (offsettable_memref_p (operand))
20459 operand = adjust_address (operand, DImode, 0);
20460 parts[0] = operand;
20461 parts[1] = adjust_address (operand, upper_mode, 8);
20463 else if (GET_CODE (operand) == CONST_DOUBLE)
20465 REAL_VALUE_TYPE r;
20466 long l[4];
20468 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20469 real_to_target (l, &r, mode);
20471 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20472 if (HOST_BITS_PER_WIDE_INT >= 64)
20473 parts[0]
20474 = gen_int_mode
20475 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20476 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20477 DImode);
20478 else
20479 parts[0] = immed_double_const (l[0], l[1], DImode);
20481 if (upper_mode == SImode)
20482 parts[1] = gen_int_mode (l[2], SImode);
20483 else if (HOST_BITS_PER_WIDE_INT >= 64)
20484 parts[1]
20485 = gen_int_mode
20486 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20487 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20488 DImode);
20489 else
20490 parts[1] = immed_double_const (l[2], l[3], DImode);
20492 else
20493 gcc_unreachable ();
20497 return size;
20500 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20501 All required insns are emitted here; the operands array is reused as
20502 scratch, with the destination parts placed in operands 2 onward and
20503 the source parts in operands 6 onward, in the correct order. */
20505 void
20506 ix86_split_long_move (rtx operands[])
20508 rtx part[2][4];
20509 int nparts, i, j;
20510 int push = 0;
20511 int collisions = 0;
20512 enum machine_mode mode = GET_MODE (operands[0]);
20513 bool collisionparts[4];
20515 /* The DFmode expanders may ask us to move a double.
20516 For a 64-bit target this is a single move. By hiding the fact
20517 here we simplify the i386.md splitters. */
20518 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20520 /* Optimize constant pool reference to immediates. This is used by
20521 fp moves that force all constants to memory to allow combining. */
20523 if (MEM_P (operands[1])
20524 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20525 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20526 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20527 if (push_operand (operands[0], VOIDmode))
20529 operands[0] = copy_rtx (operands[0]);
20530 PUT_MODE (operands[0], word_mode);
20532 else
20533 operands[0] = gen_lowpart (DImode, operands[0]);
20534 operands[1] = gen_lowpart (DImode, operands[1]);
20535 emit_move_insn (operands[0], operands[1]);
20536 return;
20539 /* The only non-offsettable memory we handle is push. */
20540 if (push_operand (operands[0], VOIDmode))
20541 push = 1;
20542 else
20543 gcc_assert (!MEM_P (operands[0])
20544 || offsettable_memref_p (operands[0]));
20546 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20547 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20549 /* When emitting a push, take care with source operands on the stack. */
20550 if (push && MEM_P (operands[1])
20551 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20553 rtx src_base = XEXP (part[1][nparts - 1], 0);
20555 /* Compensate for the stack decrement by 4. */
20556 if (!TARGET_64BIT && nparts == 3
20557 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20558 src_base = plus_constant (src_base, 4);
20560 /* src_base refers to the stack pointer and is
20561 automatically decreased by emitted push. */
20562 for (i = 0; i < nparts; i++)
20563 part[1][i] = change_address (part[1][i],
20564 GET_MODE (part[1][i]), src_base);
20567 /* We need to do the copy in the right order in case an address register
20568 of the source overlaps the destination. */
20569 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20571 rtx tmp;
20573 for (i = 0; i < nparts; i++)
20575 collisionparts[i]
20576 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20577 if (collisionparts[i])
20578 collisions++;
20581 /* Collision in the middle part can be handled by reordering. */
20582 if (collisions == 1 && nparts == 3 && collisionparts [1])
20584 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20585 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20587 else if (collisions == 1
20588 && nparts == 4
20589 && (collisionparts [1] || collisionparts [2]))
20591 if (collisionparts [1])
20593 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20594 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20596 else
20598 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20599 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20603 /* If there are more collisions, we can't handle it by reordering.
20604 Do an lea to the last part and use only one colliding move. */
20605 else if (collisions > 1)
20607 rtx base;
20609 collisions = 1;
20611 base = part[0][nparts - 1];
20613 /* Handle the case when the last part isn't valid for lea.
20614 Happens in 64-bit mode storing the 12-byte XFmode. */
20615 if (GET_MODE (base) != Pmode)
20616 base = gen_rtx_REG (Pmode, REGNO (base));
20618 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20619 part[1][0] = replace_equiv_address (part[1][0], base);
20620 for (i = 1; i < nparts; i++)
20622 tmp = plus_constant (base, UNITS_PER_WORD * i);
20623 part[1][i] = replace_equiv_address (part[1][i], tmp);
20628 if (push)
20630 if (!TARGET_64BIT)
20632 if (nparts == 3)
20634 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20635 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20636 stack_pointer_rtx, GEN_INT (-4)));
20637 emit_move_insn (part[0][2], part[1][2]);
20639 else if (nparts == 4)
20641 emit_move_insn (part[0][3], part[1][3]);
20642 emit_move_insn (part[0][2], part[1][2]);
20645 else
20647 /* In 64bit mode we don't have a 32bit push available. In case this is
20648 a register, it is OK - we will just use the larger counterpart. We also
20649 retype memory - this comes from an attempt to avoid the REX prefix on
20650 moving the second half of a TFmode value. */
20651 if (GET_MODE (part[1][1]) == SImode)
20653 switch (GET_CODE (part[1][1]))
20655 case MEM:
20656 part[1][1] = adjust_address (part[1][1], DImode, 0);
20657 break;
20659 case REG:
20660 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20661 break;
20663 default:
20664 gcc_unreachable ();
20667 if (GET_MODE (part[1][0]) == SImode)
20668 part[1][0] = part[1][1];
20671 emit_move_insn (part[0][1], part[1][1]);
20672 emit_move_insn (part[0][0], part[1][0]);
20673 return;
20676 /* Choose correct order to not overwrite the source before it is copied. */
20677 if ((REG_P (part[0][0])
20678 && REG_P (part[1][1])
20679 && (REGNO (part[0][0]) == REGNO (part[1][1])
20680 || (nparts == 3
20681 && REGNO (part[0][0]) == REGNO (part[1][2]))
20682 || (nparts == 4
20683 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20684 || (collisions > 0
20685 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20687 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20689 operands[2 + i] = part[0][j];
20690 operands[6 + i] = part[1][j];
20693 else
20695 for (i = 0; i < nparts; i++)
20697 operands[2 + i] = part[0][i];
20698 operands[6 + i] = part[1][i];
20702 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20703 if (optimize_insn_for_size_p ())
20705 for (j = 0; j < nparts - 1; j++)
20706 if (CONST_INT_P (operands[6 + j])
20707 && operands[6 + j] != const0_rtx
20708 && REG_P (operands[2 + j]))
20709 for (i = j; i < nparts - 1; i++)
20710 if (CONST_INT_P (operands[7 + i])
20711 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20712 operands[7 + i] = operands[2 + j];
20715 for (i = 0; i < nparts; i++)
20716 emit_move_insn (operands[2 + i], operands[6 + i]);
20718 return;
20721 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20722 left shift by a constant, either using a single shift or
20723 a sequence of add instructions. */
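/* E.g. a left shift by 2 may be emitted as two "add reg, reg"
   instructions when twice the add cost does not exceed the cost of a
   constant shift; otherwise a single shift instruction is emitted.  */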
20725 static void
20726 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20728 rtx (*insn)(rtx, rtx, rtx);
20730 if (count == 1
20731 || (count * ix86_cost->add <= ix86_cost->shift_const
20732 && !optimize_insn_for_size_p ()))
20734 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20735 while (count-- > 0)
20736 emit_insn (insn (operand, operand, operand));
20738 else
20740 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20741 emit_insn (insn (operand, operand, GEN_INT (count)));
20745 void
20746 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20748 rtx (*gen_ashl3)(rtx, rtx, rtx);
20749 rtx (*gen_shld)(rtx, rtx, rtx);
20750 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20752 rtx low[2], high[2];
20753 int count;
20755 if (CONST_INT_P (operands[2]))
20757 split_double_mode (mode, operands, 2, low, high);
20758 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20760 if (count >= half_width)
20762 emit_move_insn (high[0], low[1]);
20763 emit_move_insn (low[0], const0_rtx);
20765 if (count > half_width)
20766 ix86_expand_ashl_const (high[0], count - half_width, mode);
20768 else
20770 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20772 if (!rtx_equal_p (operands[0], operands[1]))
20773 emit_move_insn (operands[0], operands[1]);
20775 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20776 ix86_expand_ashl_const (low[0], count, mode);
20778 return;
20781 split_double_mode (mode, operands, 1, low, high);
20783 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20785 if (operands[1] == const1_rtx)
20787 /* Assuming we've chosen QImode-capable registers, 1 << N
20788 can be done with two 32/64-bit shifts, no branches, no cmoves. */
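/* Roughly: clear both halves, test the count bit that selects the upper
   half (bit 5 for a DImode result, bit 6 for TImode), set the low byte of
   HIGH when that bit is set and of LOW when it is clear, then shift both
   halves left by the count (the hardware ignores the upper count bits),
   leaving the single 1 bit in the right half and position.  */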
20789 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20791 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20793 ix86_expand_clear (low[0]);
20794 ix86_expand_clear (high[0]);
20795 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20797 d = gen_lowpart (QImode, low[0]);
20798 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20799 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20800 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20802 d = gen_lowpart (QImode, high[0]);
20803 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20804 s = gen_rtx_NE (QImode, flags, const0_rtx);
20805 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20808 /* Otherwise, we can get the same results by manually performing
20809 a bit extract operation on bit 5/6, and then performing the two
20810 shifts. The two methods of getting 0/1 into low/high are exactly
20811 the same size. Avoiding the shift in the bit extract case helps
20812 pentium4 a bit; no one else seems to care much either way. */
20813 else
20815 enum machine_mode half_mode;
20816 rtx (*gen_lshr3)(rtx, rtx, rtx);
20817 rtx (*gen_and3)(rtx, rtx, rtx);
20818 rtx (*gen_xor3)(rtx, rtx, rtx);
20819 HOST_WIDE_INT bits;
20820 rtx x;
20822 if (mode == DImode)
20824 half_mode = SImode;
20825 gen_lshr3 = gen_lshrsi3;
20826 gen_and3 = gen_andsi3;
20827 gen_xor3 = gen_xorsi3;
20828 bits = 5;
20830 else
20832 half_mode = DImode;
20833 gen_lshr3 = gen_lshrdi3;
20834 gen_and3 = gen_anddi3;
20835 gen_xor3 = gen_xordi3;
20836 bits = 6;
20839 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20840 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20841 else
20842 x = gen_lowpart (half_mode, operands[2]);
20843 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20845 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20846 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20847 emit_move_insn (low[0], high[0]);
20848 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20851 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20852 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20853 return;
20856 if (operands[1] == constm1_rtx)
20858 /* For -1 << N, we can avoid the shld instruction, because we
20859 know that we're shifting 0...31/63 ones into a -1. */
20860 emit_move_insn (low[0], constm1_rtx);
20861 if (optimize_insn_for_size_p ())
20862 emit_move_insn (high[0], low[0]);
20863 else
20864 emit_move_insn (high[0], constm1_rtx);
20866 else
20868 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20870 if (!rtx_equal_p (operands[0], operands[1]))
20871 emit_move_insn (operands[0], operands[1]);
20873 split_double_mode (mode, operands, 1, low, high);
20874 emit_insn (gen_shld (high[0], low[0], operands[2]));
20877 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20879 if (TARGET_CMOVE && scratch)
20881 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20882 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20884 ix86_expand_clear (scratch);
20885 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20887 else
20889 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20890 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20892 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20896 void
20897 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20899 rtx (*gen_ashr3)(rtx, rtx, rtx)
20900 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20901 rtx (*gen_shrd)(rtx, rtx, rtx);
20902 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20904 rtx low[2], high[2];
20905 int count;
20907 if (CONST_INT_P (operands[2]))
20909 split_double_mode (mode, operands, 2, low, high);
20910 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20912 if (count == GET_MODE_BITSIZE (mode) - 1)
20914 emit_move_insn (high[0], high[1]);
20915 emit_insn (gen_ashr3 (high[0], high[0],
20916 GEN_INT (half_width - 1)));
20917 emit_move_insn (low[0], high[0]);
20920 else if (count >= half_width)
20922 emit_move_insn (low[0], high[1]);
20923 emit_move_insn (high[0], low[0]);
20924 emit_insn (gen_ashr3 (high[0], high[0],
20925 GEN_INT (half_width - 1)));
20927 if (count > half_width)
20928 emit_insn (gen_ashr3 (low[0], low[0],
20929 GEN_INT (count - half_width)));
20931 else
20933 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20935 if (!rtx_equal_p (operands[0], operands[1]))
20936 emit_move_insn (operands[0], operands[1]);
20938 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20939 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20942 else
20944 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20946 if (!rtx_equal_p (operands[0], operands[1]))
20947 emit_move_insn (operands[0], operands[1]);
20949 split_double_mode (mode, operands, 1, low, high);
20951 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20952 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20954 if (TARGET_CMOVE && scratch)
20956 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20957 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20959 emit_move_insn (scratch, high[0]);
20960 emit_insn (gen_ashr3 (scratch, scratch,
20961 GEN_INT (half_width - 1)));
20962 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20963 scratch));
20965 else
20967 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20968 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20970 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20975 void
20976 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20978 rtx (*gen_lshr3)(rtx, rtx, rtx)
20979 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20980 rtx (*gen_shrd)(rtx, rtx, rtx);
20981 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20983 rtx low[2], high[2];
20984 int count;
20986 if (CONST_INT_P (operands[2]))
20988 split_double_mode (mode, operands, 2, low, high);
20989 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20991 if (count >= half_width)
20993 emit_move_insn (low[0], high[1]);
20994 ix86_expand_clear (high[0]);
20996 if (count > half_width)
20997 emit_insn (gen_lshr3 (low[0], low[0],
20998 GEN_INT (count - half_width)));
21000 else
21002 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21004 if (!rtx_equal_p (operands[0], operands[1]))
21005 emit_move_insn (operands[0], operands[1]);
21007 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21008 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21011 else
21013 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21015 if (!rtx_equal_p (operands[0], operands[1]))
21016 emit_move_insn (operands[0], operands[1]);
21018 split_double_mode (mode, operands, 1, low, high);
21020 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21021 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21023 if (TARGET_CMOVE && scratch)
21025 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21026 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21028 ix86_expand_clear (scratch);
21029 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21030 scratch));
21032 else
21034 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21035 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21037 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21042 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21043 static void
21044 predict_jump (int prob)
21046 rtx insn = get_last_insn ();
21047 gcc_assert (JUMP_P (insn));
21048 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21051 /* Helper function for the string operations below. Test whether VARIABLE
21052 is aligned to VALUE bytes. If it is, jump to the returned label. */
21053 static rtx
21054 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21056 rtx label = gen_label_rtx ();
21057 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21058 if (GET_MODE (variable) == DImode)
21059 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21060 else
21061 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21062 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21063 1, label);
21064 if (epilogue)
21065 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21066 else
21067 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21068 return label;
21071 /* Decrease COUNTREG by VALUE. */
21072 static void
21073 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21075 rtx (*gen_add)(rtx, rtx, rtx)
21076 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21078 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21081 /* Zero extend possibly SImode EXP to Pmode register. */
21083 ix86_zero_extend_to_Pmode (rtx exp)
21085 if (GET_MODE (exp) != Pmode)
21086 exp = convert_to_mode (Pmode, exp, 1);
21087 return force_reg (Pmode, exp);
21090 /* Divide COUNTREG by SCALE. */
21091 static rtx
21092 scale_counter (rtx countreg, int scale)
21094 rtx sc;
21096 if (scale == 1)
21097 return countreg;
21098 if (CONST_INT_P (countreg))
21099 return GEN_INT (INTVAL (countreg) / scale);
21100 gcc_assert (REG_P (countreg));
21102 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21103 GEN_INT (exact_log2 (scale)),
21104 NULL, 1, OPTAB_DIRECT);
21105 return sc;
21108 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21109 DImode for constant loop counts. */
21111 static enum machine_mode
21112 counter_mode (rtx count_exp)
21114 if (GET_MODE (count_exp) != VOIDmode)
21115 return GET_MODE (count_exp);
21116 if (!CONST_INT_P (count_exp))
21117 return Pmode;
21118 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21119 return DImode;
21120 return SImode;
21123 /* When SRCPTR is non-NULL, output a simple loop to copy memory
21124 from SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21125 the overall size is COUNT, specified in bytes. When SRCPTR is NULL,
21126 output the equivalent loop to set memory to VALUE (supposed to be in MODE).
21128 The size is rounded down to a whole number of chunks moved at once.
21129 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
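/* A sketch of the emitted loop, assuming SRCPTR is non-NULL and writing
   piece for GET_MODE_SIZE (MODE) * UNROLL:

     size = count & -piece;  iter = 0;
     do { copy piece bytes; iter += piece; } while (iter < size);
     destptr += iter;  srcptr += iter;  */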
21132 static void
21133 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21134 rtx destptr, rtx srcptr, rtx value,
21135 rtx count, enum machine_mode mode, int unroll,
21136 int expected_size)
21138 rtx out_label, top_label, iter, tmp;
21139 enum machine_mode iter_mode = counter_mode (count);
21140 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21141 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21142 rtx size;
21143 rtx x_addr;
21144 rtx y_addr;
21145 int i;
21147 top_label = gen_label_rtx ();
21148 out_label = gen_label_rtx ();
21149 iter = gen_reg_rtx (iter_mode);
21151 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21152 NULL, 1, OPTAB_DIRECT);
21153 /* Those two should combine. */
21154 if (piece_size == const1_rtx)
21156 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21157 true, out_label);
21158 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21160 emit_move_insn (iter, const0_rtx);
21162 emit_label (top_label);
21164 tmp = convert_modes (Pmode, iter_mode, iter, true);
21165 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21166 destmem = change_address (destmem, mode, x_addr);
21168 if (srcmem)
21170 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21171 srcmem = change_address (srcmem, mode, y_addr);
21173 /* When unrolling for chips that reorder memory reads and writes,
21174 we can save registers by using a single temporary.
21175 Also using 4 temporaries is overkill in 32bit mode. */
21176 if (!TARGET_64BIT && 0)
21178 for (i = 0; i < unroll; i++)
21180 if (i)
21182 destmem =
21183 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21184 srcmem =
21185 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21187 emit_move_insn (destmem, srcmem);
21190 else
21192 rtx tmpreg[4];
21193 gcc_assert (unroll <= 4);
21194 for (i = 0; i < unroll; i++)
21196 tmpreg[i] = gen_reg_rtx (mode);
21197 if (i)
21199 srcmem =
21200 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21202 emit_move_insn (tmpreg[i], srcmem);
21204 for (i = 0; i < unroll; i++)
21206 if (i)
21208 destmem =
21209 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21211 emit_move_insn (destmem, tmpreg[i]);
21215 else
21216 for (i = 0; i < unroll; i++)
21218 if (i)
21219 destmem =
21220 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21221 emit_move_insn (destmem, value);
21224 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21225 true, OPTAB_LIB_WIDEN);
21226 if (tmp != iter)
21227 emit_move_insn (iter, tmp);
21229 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21230 true, top_label);
21231 if (expected_size != -1)
21233 expected_size /= GET_MODE_SIZE (mode) * unroll;
21234 if (expected_size == 0)
21235 predict_jump (0);
21236 else if (expected_size > REG_BR_PROB_BASE)
21237 predict_jump (REG_BR_PROB_BASE - 1);
21238 else
21239 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21241 else
21242 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21243 iter = ix86_zero_extend_to_Pmode (iter);
21244 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21245 true, OPTAB_LIB_WIDEN);
21246 if (tmp != destptr)
21247 emit_move_insn (destptr, tmp);
21248 if (srcptr)
21250 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21251 true, OPTAB_LIB_WIDEN);
21252 if (tmp != srcptr)
21253 emit_move_insn (srcptr, tmp);
21255 emit_label (out_label);
21258 /* Output "rep; mov" instruction.
21259 Arguments have the same meaning as for the previous function. */
21260 static void
21261 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21262 rtx destptr, rtx srcptr,
21263 rtx count,
21264 enum machine_mode mode)
21266 rtx destexp;
21267 rtx srcexp;
21268 rtx countreg;
21269 HOST_WIDE_INT rounded_count;
21271 /* If the size is known, it is shorter to use rep movs. */
21272 if (mode == QImode && CONST_INT_P (count)
21273 && !(INTVAL (count) & 3))
21274 mode = SImode;
21276 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21277 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21278 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21279 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21280 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21281 if (mode != QImode)
21283 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21284 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21285 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21286 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21287 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21288 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21290 else
21292 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21293 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21295 if (CONST_INT_P (count))
21297 rounded_count = (INTVAL (count)
21298 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21299 destmem = shallow_copy_rtx (destmem);
21300 srcmem = shallow_copy_rtx (srcmem);
21301 set_mem_size (destmem, rounded_count);
21302 set_mem_size (srcmem, rounded_count);
21304 else
21306 if (MEM_SIZE_KNOWN_P (destmem))
21307 clear_mem_size (destmem);
21308 if (MEM_SIZE_KNOWN_P (srcmem))
21309 clear_mem_size (srcmem);
21311 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21312 destexp, srcexp));
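/* Illustrative sketch only (not compiler code): with n = GET_MODE_SIZE (mode),
   the "rep; mov" emitted above behaves roughly like

     countreg = count / n;            // scaled iteration count
     while (countreg--)
       {
         memcpy (dest, src, n);       // one n-byte chunk per iteration
         dest += n;
         src += n;
       }

   DESTEXP and SRCEXP describe the final pointer values,
   destptr + (countreg << log2 (n)) and likewise for srcptr, so later passes
   know where the pointers end up.  */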
21315 /* Output a "rep; stos" instruction.
21316 Arguments have the same meaning as for the previous function. */
21317 static void
21318 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21319 rtx count, enum machine_mode mode,
21320 rtx orig_value)
21322 rtx destexp;
21323 rtx countreg;
21324 HOST_WIDE_INT rounded_count;
21326 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21327 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21328 value = force_reg (mode, gen_lowpart (mode, value));
21329 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21330 if (mode != QImode)
21332 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21333 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21334 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21336 else
21337 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21338 if (orig_value == const0_rtx && CONST_INT_P (count))
21340 rounded_count = (INTVAL (count)
21341 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21342 destmem = shallow_copy_rtx (destmem);
21343 set_mem_size (destmem, rounded_count);
21345 else if (MEM_SIZE_KNOWN_P (destmem))
21346 clear_mem_size (destmem);
21347 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21350 static void
21351 emit_strmov (rtx destmem, rtx srcmem,
21352 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21354 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21355 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21356 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21359 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21360 static void
21361 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21362 rtx destptr, rtx srcptr, rtx count, int max_size)
21364 rtx src, dest;
21365 if (CONST_INT_P (count))
21367 HOST_WIDE_INT countval = INTVAL (count);
21368 int offset = 0;
21370 if ((countval & 0x10) && max_size > 16)
21372 if (TARGET_64BIT)
21374 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21375 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21377 else
21378 gcc_unreachable ();
21379 offset += 16;
21381 if ((countval & 0x08) && max_size > 8)
21383 if (TARGET_64BIT)
21384 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21385 else
21387 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21388 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21390 offset += 8;
21392 if ((countval & 0x04) && max_size > 4)
21394 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21395 offset += 4;
21397 if ((countval & 0x02) && max_size > 2)
21399 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21400 offset += 2;
21402 if ((countval & 0x01) && max_size > 1)
21404 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21405 offset += 1;
21407 return;
21409 if (max_size > 8)
21411 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21412 count, 1, OPTAB_DIRECT);
21413 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21414 count, QImode, 1, 4);
21415 return;
21418 /* When there are stringops, we can cheaply increase dest and src pointers.
21419 Otherwise we save code size by maintaining an offset (zero is readily
21420 available from the preceding rep operation) and using x86 addressing modes. */
21422 if (TARGET_SINGLE_STRINGOP)
21424 if (max_size > 4)
21426 rtx label = ix86_expand_aligntest (count, 4, true);
21427 src = change_address (srcmem, SImode, srcptr);
21428 dest = change_address (destmem, SImode, destptr);
21429 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21430 emit_label (label);
21431 LABEL_NUSES (label) = 1;
21433 if (max_size > 2)
21435 rtx label = ix86_expand_aligntest (count, 2, true);
21436 src = change_address (srcmem, HImode, srcptr);
21437 dest = change_address (destmem, HImode, destptr);
21438 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21439 emit_label (label);
21440 LABEL_NUSES (label) = 1;
21442 if (max_size > 1)
21444 rtx label = ix86_expand_aligntest (count, 1, true);
21445 src = change_address (srcmem, QImode, srcptr);
21446 dest = change_address (destmem, QImode, destptr);
21447 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21448 emit_label (label);
21449 LABEL_NUSES (label) = 1;
21452 else
21454 rtx offset = force_reg (Pmode, const0_rtx);
21455 rtx tmp;
21457 if (max_size > 4)
21459 rtx label = ix86_expand_aligntest (count, 4, true);
21460 src = change_address (srcmem, SImode, srcptr);
21461 dest = change_address (destmem, SImode, destptr);
21462 emit_move_insn (dest, src);
21463 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21464 true, OPTAB_LIB_WIDEN);
21465 if (tmp != offset)
21466 emit_move_insn (offset, tmp);
21467 emit_label (label);
21468 LABEL_NUSES (label) = 1;
21470 if (max_size > 2)
21472 rtx label = ix86_expand_aligntest (count, 2, true);
21473 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21474 src = change_address (srcmem, HImode, tmp);
21475 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21476 dest = change_address (destmem, HImode, tmp);
21477 emit_move_insn (dest, src);
21478 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21479 true, OPTAB_LIB_WIDEN);
21480 if (tmp != offset)
21481 emit_move_insn (offset, tmp);
21482 emit_label (label);
21483 LABEL_NUSES (label) = 1;
21485 if (max_size > 1)
21487 rtx label = ix86_expand_aligntest (count, 1, true);
21488 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21489 src = change_address (srcmem, QImode, tmp);
21490 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21491 dest = change_address (destmem, QImode, tmp);
21492 emit_move_insn (dest, src);
21493 emit_label (label);
21494 LABEL_NUSES (label) = 1;
21499 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21500 static void
21501 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21502 rtx count, int max_size)
21504 count =
21505 expand_simple_binop (counter_mode (count), AND, count,
21506 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21507 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21508 gen_lowpart (QImode, value), count, QImode,
21509 1, max_size / 2);
21512 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21513 static void
21514 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21516 rtx dest;
21518 if (CONST_INT_P (count))
21520 HOST_WIDE_INT countval = INTVAL (count);
21521 int offset = 0;
21523 if ((countval & 0x10) && max_size > 16)
21525 if (TARGET_64BIT)
21527 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21528 emit_insn (gen_strset (destptr, dest, value));
21529 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21530 emit_insn (gen_strset (destptr, dest, value));
21532 else
21533 gcc_unreachable ();
21534 offset += 16;
21536 if ((countval & 0x08) && max_size > 8)
21538 if (TARGET_64BIT)
21540 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21541 emit_insn (gen_strset (destptr, dest, value));
21543 else
21545 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21546 emit_insn (gen_strset (destptr, dest, value));
21547 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21548 emit_insn (gen_strset (destptr, dest, value));
21550 offset += 8;
21552 if ((countval & 0x04) && max_size > 4)
21554 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21555 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21556 offset += 4;
21558 if ((countval & 0x02) && max_size > 2)
21560 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21561 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21562 offset += 2;
21564 if ((countval & 0x01) && max_size > 1)
21566 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21567 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21568 offset += 1;
21570 return;
21572 if (max_size > 32)
21574 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21575 return;
21577 if (max_size > 16)
21579 rtx label = ix86_expand_aligntest (count, 16, true);
21580 if (TARGET_64BIT)
21582 dest = change_address (destmem, DImode, destptr);
21583 emit_insn (gen_strset (destptr, dest, value));
21584 emit_insn (gen_strset (destptr, dest, value));
21586 else
21588 dest = change_address (destmem, SImode, destptr);
21589 emit_insn (gen_strset (destptr, dest, value));
21590 emit_insn (gen_strset (destptr, dest, value));
21591 emit_insn (gen_strset (destptr, dest, value));
21592 emit_insn (gen_strset (destptr, dest, value));
21594 emit_label (label);
21595 LABEL_NUSES (label) = 1;
21597 if (max_size > 8)
21599 rtx label = ix86_expand_aligntest (count, 8, true);
21600 if (TARGET_64BIT)
21602 dest = change_address (destmem, DImode, destptr);
21603 emit_insn (gen_strset (destptr, dest, value));
21605 else
21607 dest = change_address (destmem, SImode, destptr);
21608 emit_insn (gen_strset (destptr, dest, value));
21609 emit_insn (gen_strset (destptr, dest, value));
21611 emit_label (label);
21612 LABEL_NUSES (label) = 1;
21614 if (max_size > 4)
21616 rtx label = ix86_expand_aligntest (count, 4, true);
21617 dest = change_address (destmem, SImode, destptr);
21618 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21619 emit_label (label);
21620 LABEL_NUSES (label) = 1;
21622 if (max_size > 2)
21624 rtx label = ix86_expand_aligntest (count, 2, true);
21625 dest = change_address (destmem, HImode, destptr);
21626 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21627 emit_label (label);
21628 LABEL_NUSES (label) = 1;
21630 if (max_size > 1)
21632 rtx label = ix86_expand_aligntest (count, 1, true);
21633 dest = change_address (destmem, QImode, destptr);
21634 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21635 emit_label (label);
21636 LABEL_NUSES (label) = 1;
21640 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21641 by ALIGN, to DESIRED_ALIGNMENT. */
21642 static void
21643 expand_movmem_prologue (rtx destmem, rtx srcmem,
21644 rtx destptr, rtx srcptr, rtx count,
21645 int align, int desired_alignment)
21647 if (align <= 1 && desired_alignment > 1)
21649 rtx label = ix86_expand_aligntest (destptr, 1, false);
21650 srcmem = change_address (srcmem, QImode, srcptr);
21651 destmem = change_address (destmem, QImode, destptr);
21652 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21653 ix86_adjust_counter (count, 1);
21654 emit_label (label);
21655 LABEL_NUSES (label) = 1;
21657 if (align <= 2 && desired_alignment > 2)
21659 rtx label = ix86_expand_aligntest (destptr, 2, false);
21660 srcmem = change_address (srcmem, HImode, srcptr);
21661 destmem = change_address (destmem, HImode, destptr);
21662 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21663 ix86_adjust_counter (count, 2);
21664 emit_label (label);
21665 LABEL_NUSES (label) = 1;
21667 if (align <= 4 && desired_alignment > 4)
21669 rtx label = ix86_expand_aligntest (destptr, 4, false);
21670 srcmem = change_address (srcmem, SImode, srcptr);
21671 destmem = change_address (destmem, SImode, destptr);
21672 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21673 ix86_adjust_counter (count, 4);
21674 emit_label (label);
21675 LABEL_NUSES (label) = 1;
21677 gcc_assert (desired_alignment <= 8);
21680 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21681 ALIGN_BYTES is how many bytes need to be copied. */
21682 static rtx
21683 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21684 int desired_align, int align_bytes)
21686 rtx src = *srcp;
21687 rtx orig_dst = dst;
21688 rtx orig_src = src;
21689 int off = 0;
21690 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21691 if (src_align_bytes >= 0)
21692 src_align_bytes = desired_align - src_align_bytes;
21693 if (align_bytes & 1)
21695 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21696 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21697 off = 1;
21698 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21700 if (align_bytes & 2)
21702 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21703 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21704 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21705 set_mem_align (dst, 2 * BITS_PER_UNIT);
21706 if (src_align_bytes >= 0
21707 && (src_align_bytes & 1) == (align_bytes & 1)
21708 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21709 set_mem_align (src, 2 * BITS_PER_UNIT);
21710 off = 2;
21711 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21713 if (align_bytes & 4)
21715 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21716 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21717 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21718 set_mem_align (dst, 4 * BITS_PER_UNIT);
21719 if (src_align_bytes >= 0)
21721 unsigned int src_align = 0;
21722 if ((src_align_bytes & 3) == (align_bytes & 3))
21723 src_align = 4;
21724 else if ((src_align_bytes & 1) == (align_bytes & 1))
21725 src_align = 2;
21726 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21727 set_mem_align (src, src_align * BITS_PER_UNIT);
21729 off = 4;
21730 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21732 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21733 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21734 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21735 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21736 if (src_align_bytes >= 0)
21738 unsigned int src_align = 0;
21739 if ((src_align_bytes & 7) == (align_bytes & 7))
21740 src_align = 8;
21741 else if ((src_align_bytes & 3) == (align_bytes & 3))
21742 src_align = 4;
21743 else if ((src_align_bytes & 1) == (align_bytes & 1))
21744 src_align = 2;
21745 if (src_align > (unsigned int) desired_align)
21746 src_align = desired_align;
21747 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21748 set_mem_align (src, src_align * BITS_PER_UNIT);
21750 if (MEM_SIZE_KNOWN_P (orig_dst))
21751 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21752 if (MEM_SIZE_KNOWN_P (orig_src))
21753 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21754 *srcp = src;
21755 return dst;
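/* Worked example (illustrative only): with DESIRED_ALIGN == 8 and
   ALIGN_BYTES == 7, the prologue above emits a 1-byte, then a 2-byte,
   then a 4-byte copy, roughly the C equivalent of

     memcpy (dst, src, 1);
     memcpy (dst + 1, src + 1, 2);
     memcpy (dst + 3, src + 3, 4);    // dst + 7 is now 8-byte aligned

   while updating the recorded MEM alignment and size as it goes.  */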
21758 /* Store enough bytes to DEST to align DEST, known to be aligned by ALIGN,
21759 to DESIRED_ALIGNMENT. */
21760 static void
21761 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21762 int align, int desired_alignment)
21764 if (align <= 1 && desired_alignment > 1)
21766 rtx label = ix86_expand_aligntest (destptr, 1, false);
21767 destmem = change_address (destmem, QImode, destptr);
21768 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21769 ix86_adjust_counter (count, 1);
21770 emit_label (label);
21771 LABEL_NUSES (label) = 1;
21773 if (align <= 2 && desired_alignment > 2)
21775 rtx label = ix86_expand_aligntest (destptr, 2, false);
21776 destmem = change_address (destmem, HImode, destptr);
21777 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21778 ix86_adjust_counter (count, 2);
21779 emit_label (label);
21780 LABEL_NUSES (label) = 1;
21782 if (align <= 4 && desired_alignment > 4)
21784 rtx label = ix86_expand_aligntest (destptr, 4, false);
21785 destmem = change_address (destmem, SImode, destptr);
21786 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21787 ix86_adjust_counter (count, 4);
21788 emit_label (label);
21789 LABEL_NUSES (label) = 1;
21791 gcc_assert (desired_alignment <= 8);
21794 /* Store enough bytes to DST to align DST, known to be aligned by ALIGN,
21795 to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21796 static rtx
21797 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21798 int desired_align, int align_bytes)
21800 int off = 0;
21801 rtx orig_dst = dst;
21802 if (align_bytes & 1)
21804 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21805 off = 1;
21806 emit_insn (gen_strset (destreg, dst,
21807 gen_lowpart (QImode, value)));
21809 if (align_bytes & 2)
21811 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21812 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21813 set_mem_align (dst, 2 * BITS_PER_UNIT);
21814 off = 2;
21815 emit_insn (gen_strset (destreg, dst,
21816 gen_lowpart (HImode, value)));
21818 if (align_bytes & 4)
21820 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21821 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21822 set_mem_align (dst, 4 * BITS_PER_UNIT);
21823 off = 4;
21824 emit_insn (gen_strset (destreg, dst,
21825 gen_lowpart (SImode, value)));
21827 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21828 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21829 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21830 if (MEM_SIZE_KNOWN_P (orig_dst))
21831 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21832 return dst;
21835 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21836 static enum stringop_alg
21837 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21838 int *dynamic_check)
21840 const struct stringop_algs * algs;
21841 bool optimize_for_speed;
21842 /* Algorithms using the rep prefix want at least edi and ecx;
21843 additionally, memset wants eax and memcpy wants esi. Don't
21844 consider such algorithms if the user has appropriated those
21845 registers for their own purposes. */
21846 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21847 || (memset
21848 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21850 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21851 || (alg != rep_prefix_1_byte \
21852 && alg != rep_prefix_4_byte \
21853 && alg != rep_prefix_8_byte))
21854 const struct processor_costs *cost;
21856 /* Even if the string operation call is cold, we still might spend a lot
21857 of time processing large blocks. */
21858 if (optimize_function_for_size_p (cfun)
21859 || (optimize_insn_for_size_p ()
21860 && expected_size != -1 && expected_size < 256))
21861 optimize_for_speed = false;
21862 else
21863 optimize_for_speed = true;
21865 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21867 *dynamic_check = -1;
21868 if (memset)
21869 algs = &cost->memset[TARGET_64BIT != 0];
21870 else
21871 algs = &cost->memcpy[TARGET_64BIT != 0];
21872 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21873 return ix86_stringop_alg;
21874 /* rep; movq or rep; movl is the smallest variant. */
21875 else if (!optimize_for_speed)
21877 if (!count || (count & 3))
21878 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21879 else
21880 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21882 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
21884 else if (expected_size != -1 && expected_size < 4)
21885 return loop_1_byte;
21886 else if (expected_size != -1)
21888 unsigned int i;
21889 enum stringop_alg alg = libcall;
21890 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21892 /* We get here if the algorithms that were not libcall-based
21893 were rep-prefix based and we are unable to use rep prefixes
21894 based on global register usage. Break out of the loop and
21895 use the heuristic below. */
21896 if (algs->size[i].max == 0)
21897 break;
21898 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21900 enum stringop_alg candidate = algs->size[i].alg;
21902 if (candidate != libcall && ALG_USABLE_P (candidate))
21903 alg = candidate;
21904 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21905 last non-libcall inline algorithm. */
21906 if (TARGET_INLINE_ALL_STRINGOPS)
21908 /* When the current size is best copied by a libcall,
21909 but we are still forced to inline, run the heuristic below
21910 that picks code for medium-sized blocks. */
21911 if (alg != libcall)
21912 return alg;
21913 break;
21915 else if (ALG_USABLE_P (candidate))
21916 return candidate;
21919 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21921 /* When asked to inline the call anyway, try to pick a meaningful choice.
21922 We look for the maximal size of block that is faster to copy by hand and
21923 take blocks of at most that size, guessing that the average size will
21924 be roughly half of the block.
21926 If this turns out to be bad, we might simply specify the preferred
21927 choice in ix86_costs. */
21928 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21929 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21931 int max = -1;
21932 enum stringop_alg alg;
21933 int i;
21934 bool any_alg_usable_p = true;
21936 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21938 enum stringop_alg candidate = algs->size[i].alg;
21939 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21941 if (candidate != libcall && candidate
21942 && ALG_USABLE_P (candidate))
21943 max = algs->size[i].max;
21945 /* If there aren't any usable algorithms, then recursing on
21946 smaller sizes isn't going to find anything. Just return the
21947 simple byte-at-a-time copy loop. */
21948 if (!any_alg_usable_p)
21950 /* Pick something reasonable. */
21951 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21952 *dynamic_check = 128;
21953 return loop_1_byte;
21955 if (max == -1)
21956 max = 4096;
21957 alg = decide_alg (count, max / 2, memset, dynamic_check);
21958 gcc_assert (*dynamic_check == -1);
21959 gcc_assert (alg != libcall);
21960 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21961 *dynamic_check = max;
21962 return alg;
21964 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21965 #undef ALG_USABLE_P
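/* Illustrative sketch only: the cost tables consulted above are conceptually
   arrays of {max_size, algorithm} pairs plus a fallback for unknown sizes;
   field order and values below are made up for illustration:

     static const struct stringop_algs example_memcpy =
       {libcall,                           // fallback for unknown sizes
        {{16, loop_1_byte},                // blocks of up to 16 bytes
         {128, loop},                      // up to 128 bytes
         {-1, rep_prefix_4_byte}}};        // everything larger

   decide_alg walks the size[] entries and picks the first usable one whose
   max is at least the expected size (max == -1 meaning "no limit").  */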
21968 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21969 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21970 static int
21971 decide_alignment (int align,
21972 enum stringop_alg alg,
21973 int expected_size)
21975 int desired_align = 0;
21976 switch (alg)
21978 case no_stringop:
21979 gcc_unreachable ();
21980 case loop:
21981 case unrolled_loop:
21982 desired_align = GET_MODE_SIZE (Pmode);
21983 break;
21984 case rep_prefix_8_byte:
21985 desired_align = 8;
21986 break;
21987 case rep_prefix_4_byte:
21988 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21989 copying a whole cache line at once. */
21990 if (TARGET_PENTIUMPRO)
21991 desired_align = 8;
21992 else
21993 desired_align = 4;
21994 break;
21995 case rep_prefix_1_byte:
21996 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21997 copying a whole cache line at once. */
21998 if (TARGET_PENTIUMPRO)
21999 desired_align = 8;
22000 else
22001 desired_align = 1;
22002 break;
22003 case loop_1_byte:
22004 desired_align = 1;
22005 break;
22006 case libcall:
22007 return 0;
22010 if (optimize_size)
22011 desired_align = 1;
22012 if (desired_align < align)
22013 desired_align = align;
22014 if (expected_size != -1 && expected_size < 4)
22015 desired_align = align;
22016 return desired_align;
22019 /* Return the smallest power of 2 greater than VAL. */
22020 static int
22021 smallest_pow2_greater_than (int val)
22023 int ret = 1;
22024 while (ret <= val)
22025 ret <<= 1;
22026 return ret;
22029 /* Expand string move (memcpy) operation. Use i386 string operations
22030 when profitable. expand_setmem contains similar code. The code
22031 depends upon architecture, block size and alignment, but always has
22032 the same overall structure:
22034 1) Prologue guard: Conditional that jumps up to epilogues for small
22035 blocks that can be handled by the epilogue alone. This is faster,
22036 but is also needed for correctness, since the prologue assumes the block
22037 is larger than the desired alignment.
22039 Optional dynamic check for size and libcall for large
22040 blocks is emitted here too, with -minline-stringops-dynamically.
22042 2) Prologue: copy first few bytes in order to get destination
22043 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22044 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22045 copied. We emit either a jump tree on power of two sized
22046 blocks, or a byte loop.
22048 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22049 with specified algorithm.
22051 4) Epilogue: code copying tail of the block that is too small to be
22052 handled by main body (or up to size guarded by prologue guard). */
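/* Illustrative C-level sketch of the emitted structure (not actual code;
   "n" stands for the chunk size of the chosen algorithm):

     if (count < epilogue_size_needed)
       goto epilogue;                              // 1) prologue guard
     while (dest is not aligned to desired_align)
       copy one small piece;                       // 2) alignment prologue
     copy (count / n) * n bytes in n-byte chunks;  // 3) main body
   epilogue:
     copy the remaining count & (epilogue_size_needed - 1) bytes;   // 4)
*/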
22054 bool
22055 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22056 rtx expected_align_exp, rtx expected_size_exp)
22058 rtx destreg;
22059 rtx srcreg;
22060 rtx label = NULL;
22061 rtx tmp;
22062 rtx jump_around_label = NULL;
22063 HOST_WIDE_INT align = 1;
22064 unsigned HOST_WIDE_INT count = 0;
22065 HOST_WIDE_INT expected_size = -1;
22066 int size_needed = 0, epilogue_size_needed;
22067 int desired_align = 0, align_bytes = 0;
22068 enum stringop_alg alg;
22069 int dynamic_check;
22070 bool need_zero_guard = false;
22072 if (CONST_INT_P (align_exp))
22073 align = INTVAL (align_exp);
22074 /* i386 can do misaligned access at a reasonably increased cost. */
22075 if (CONST_INT_P (expected_align_exp)
22076 && INTVAL (expected_align_exp) > align)
22077 align = INTVAL (expected_align_exp);
22078 /* ALIGN is the minimum of destination and source alignment, but we care here
22079 just about destination alignment. */
22080 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22081 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22083 if (CONST_INT_P (count_exp))
22084 count = expected_size = INTVAL (count_exp);
22085 if (CONST_INT_P (expected_size_exp) && count == 0)
22086 expected_size = INTVAL (expected_size_exp);
22088 /* Make sure we don't need to care about overflow later on. */
22089 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22090 return false;
22092 /* Step 0: Decide on preferred algorithm, desired alignment and
22093 size of chunks to be copied by main loop. */
22095 alg = decide_alg (count, expected_size, false, &dynamic_check);
22096 desired_align = decide_alignment (align, alg, expected_size);
22098 if (!TARGET_ALIGN_STRINGOPS)
22099 align = desired_align;
22101 if (alg == libcall)
22102 return false;
22103 gcc_assert (alg != no_stringop);
22104 if (!count)
22105 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22106 destreg = copy_addr_to_reg (XEXP (dst, 0));
22107 srcreg = copy_addr_to_reg (XEXP (src, 0));
22108 switch (alg)
22110 case libcall:
22111 case no_stringop:
22112 gcc_unreachable ();
22113 case loop:
22114 need_zero_guard = true;
22115 size_needed = GET_MODE_SIZE (word_mode);
22116 break;
22117 case unrolled_loop:
22118 need_zero_guard = true;
22119 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22120 break;
22121 case rep_prefix_8_byte:
22122 size_needed = 8;
22123 break;
22124 case rep_prefix_4_byte:
22125 size_needed = 4;
22126 break;
22127 case rep_prefix_1_byte:
22128 size_needed = 1;
22129 break;
22130 case loop_1_byte:
22131 need_zero_guard = true;
22132 size_needed = 1;
22133 break;
22136 epilogue_size_needed = size_needed;
22138 /* Step 1: Prologue guard. */
22140 /* Alignment code needs count to be in register. */
22141 if (CONST_INT_P (count_exp) && desired_align > align)
22143 if (INTVAL (count_exp) > desired_align
22144 && INTVAL (count_exp) > size_needed)
22146 align_bytes
22147 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22148 if (align_bytes <= 0)
22149 align_bytes = 0;
22150 else
22151 align_bytes = desired_align - align_bytes;
22153 if (align_bytes == 0)
22154 count_exp = force_reg (counter_mode (count_exp), count_exp);
22156 gcc_assert (desired_align >= 1 && align >= 1);
22158 /* Ensure that alignment prologue won't copy past end of block. */
22159 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22161 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22162 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22163 Make sure it is a power of 2. */
22164 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22166 if (count)
22168 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22170 /* If main algorithm works on QImode, no epilogue is needed.
22171 For small sizes just don't align anything. */
22172 if (size_needed == 1)
22173 desired_align = align;
22174 else
22175 goto epilogue;
22178 else
22180 label = gen_label_rtx ();
22181 emit_cmp_and_jump_insns (count_exp,
22182 GEN_INT (epilogue_size_needed),
22183 LTU, 0, counter_mode (count_exp), 1, label);
22184 if (expected_size == -1 || expected_size < epilogue_size_needed)
22185 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22186 else
22187 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22191 /* Emit code to decide at runtime whether a library call or inline code
22192 should be used. */
22193 if (dynamic_check != -1)
22195 if (CONST_INT_P (count_exp))
22197 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22199 emit_block_move_via_libcall (dst, src, count_exp, false);
22200 count_exp = const0_rtx;
22201 goto epilogue;
22204 else
22206 rtx hot_label = gen_label_rtx ();
22207 jump_around_label = gen_label_rtx ();
22208 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22209 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22210 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22211 emit_block_move_via_libcall (dst, src, count_exp, false);
22212 emit_jump (jump_around_label);
22213 emit_label (hot_label);
22217 /* Step 2: Alignment prologue. */
22219 if (desired_align > align)
22221 if (align_bytes == 0)
22223 /* Except for the first move in the epilogue, we no longer know
22224 the constant offset in the aliasing info. It doesn't seem worth
22225 the pain to maintain it for the first move, so throw away
22226 the info early. */
22227 src = change_address (src, BLKmode, srcreg);
22228 dst = change_address (dst, BLKmode, destreg);
22229 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22230 desired_align);
22232 else
22234 /* If we know how many bytes need to be stored before dst is
22235 sufficiently aligned, maintain aliasing info accurately. */
22236 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22237 desired_align, align_bytes);
22238 count_exp = plus_constant (count_exp, -align_bytes);
22239 count -= align_bytes;
22241 if (need_zero_guard
22242 && (count < (unsigned HOST_WIDE_INT) size_needed
22243 || (align_bytes == 0
22244 && count < ((unsigned HOST_WIDE_INT) size_needed
22245 + desired_align - align))))
22247 /* It is possible that we copied enough so the main loop will not
22248 execute. */
22249 gcc_assert (size_needed > 1);
22250 if (label == NULL_RTX)
22251 label = gen_label_rtx ();
22252 emit_cmp_and_jump_insns (count_exp,
22253 GEN_INT (size_needed),
22254 LTU, 0, counter_mode (count_exp), 1, label);
22255 if (expected_size == -1
22256 || expected_size < (desired_align - align) / 2 + size_needed)
22257 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22258 else
22259 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22262 if (label && size_needed == 1)
22264 emit_label (label);
22265 LABEL_NUSES (label) = 1;
22266 label = NULL;
22267 epilogue_size_needed = 1;
22269 else if (label == NULL_RTX)
22270 epilogue_size_needed = size_needed;
22272 /* Step 3: Main loop. */
22274 switch (alg)
22276 case libcall:
22277 case no_stringop:
22278 gcc_unreachable ();
22279 case loop_1_byte:
22280 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22281 count_exp, QImode, 1, expected_size);
22282 break;
22283 case loop:
22284 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22285 count_exp, word_mode, 1, expected_size);
22286 break;
22287 case unrolled_loop:
22288 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22289 registers for 4 temporaries anyway. */
22290 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22291 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22292 expected_size);
22293 break;
22294 case rep_prefix_8_byte:
22295 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22296 DImode);
22297 break;
22298 case rep_prefix_4_byte:
22299 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22300 SImode);
22301 break;
22302 case rep_prefix_1_byte:
22303 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22304 QImode);
22305 break;
22307 /* Adjust properly the offset of src and dest memory for aliasing. */
22308 if (CONST_INT_P (count_exp))
22310 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22311 (count / size_needed) * size_needed);
22312 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22313 (count / size_needed) * size_needed);
22315 else
22317 src = change_address (src, BLKmode, srcreg);
22318 dst = change_address (dst, BLKmode, destreg);
22321 /* Step 4: Epilogue to copy the remaining bytes. */
22322 epilogue:
22323 if (label)
22325 /* When the main loop is done, COUNT_EXP might hold the original count,
22326 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22327 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22328 bytes. Compensate if needed. */
22330 if (size_needed < epilogue_size_needed)
22332 tmp =
22333 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22334 GEN_INT (size_needed - 1), count_exp, 1,
22335 OPTAB_DIRECT);
22336 if (tmp != count_exp)
22337 emit_move_insn (count_exp, tmp);
22339 emit_label (label);
22340 LABEL_NUSES (label) = 1;
22343 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22344 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22345 epilogue_size_needed);
22346 if (jump_around_label)
22347 emit_label (jump_around_label);
22348 return true;
22351 /* Helper function for memset. For QImode value 0xXY produce
22352 0xXYXYXYXY of the width specified by MODE. This is essentially
22353 a * 0x01010101, but we can do slightly better than
22354 synth_mult by unwinding the sequence by hand on CPUs with
22355 slow multiply. */
22356 static rtx
22357 promote_duplicated_reg (enum machine_mode mode, rtx val)
22359 enum machine_mode valmode = GET_MODE (val);
22360 rtx tmp;
22361 int nops = mode == DImode ? 3 : 2;
22363 gcc_assert (mode == SImode || mode == DImode);
22364 if (val == const0_rtx)
22365 return copy_to_mode_reg (mode, const0_rtx);
22366 if (CONST_INT_P (val))
22368 HOST_WIDE_INT v = INTVAL (val) & 255;
22370 v |= v << 8;
22371 v |= v << 16;
22372 if (mode == DImode)
22373 v |= (v << 16) << 16;
22374 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22377 if (valmode == VOIDmode)
22378 valmode = QImode;
22379 if (valmode != QImode)
22380 val = gen_lowpart (QImode, val);
22381 if (mode == QImode)
22382 return val;
22383 if (!TARGET_PARTIAL_REG_STALL)
22384 nops--;
22385 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22386 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22387 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22388 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22390 rtx reg = convert_modes (mode, QImode, val, true);
22391 tmp = promote_duplicated_reg (mode, const1_rtx);
22392 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22393 OPTAB_DIRECT);
22395 else
22397 rtx reg = convert_modes (mode, QImode, val, true);
22399 if (!TARGET_PARTIAL_REG_STALL)
22400 if (mode == SImode)
22401 emit_insn (gen_movsi_insv_1 (reg, reg));
22402 else
22403 emit_insn (gen_movdi_insv_1 (reg, reg));
22404 else
22406 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22407 NULL, 1, OPTAB_DIRECT);
22408 reg =
22409 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22411 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22412 NULL, 1, OPTAB_DIRECT);
22413 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22414 if (mode == SImode)
22415 return reg;
22416 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22417 NULL, 1, OPTAB_DIRECT);
22418 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22419 return reg;
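/* Illustrative sketch only (not compiler code): for SImode the promotion
   above computes, in plain C,

     uint32_t promote_byte (uint8_t b)
     {
       uint32_t v = b;
       v |= v << 8;      // 0x000000XY -> 0x0000XYXY
       v |= v << 16;     // 0x0000XYXY -> 0xXYXYXYXY
       return v;         // equal to b * 0x01010101u
     }

   the DImode variant adds one more "v |= v << 32" step.  */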
22423 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22424 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22425 getting alignment from ALIGN to DESIRED_ALIGN. */
22426 static rtx
22427 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22429 rtx promoted_val;
22431 if (TARGET_64BIT
22432 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22433 promoted_val = promote_duplicated_reg (DImode, val);
22434 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22435 promoted_val = promote_duplicated_reg (SImode, val);
22436 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22437 promoted_val = promote_duplicated_reg (HImode, val);
22438 else
22439 promoted_val = val;
22441 return promoted_val;
22444 /* Expand string set operation (memset). Use i386 string operations
22445 when profitable. See the expand_movmem comment for an explanation of
22446 the individual steps performed. */
22447 bool
22448 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22449 rtx expected_align_exp, rtx expected_size_exp)
22451 rtx destreg;
22452 rtx label = NULL;
22453 rtx tmp;
22454 rtx jump_around_label = NULL;
22455 HOST_WIDE_INT align = 1;
22456 unsigned HOST_WIDE_INT count = 0;
22457 HOST_WIDE_INT expected_size = -1;
22458 int size_needed = 0, epilogue_size_needed;
22459 int desired_align = 0, align_bytes = 0;
22460 enum stringop_alg alg;
22461 rtx promoted_val = NULL;
22462 bool force_loopy_epilogue = false;
22463 int dynamic_check;
22464 bool need_zero_guard = false;
22466 if (CONST_INT_P (align_exp))
22467 align = INTVAL (align_exp);
22468 /* i386 can do misaligned access at a reasonably increased cost. */
22469 if (CONST_INT_P (expected_align_exp)
22470 && INTVAL (expected_align_exp) > align)
22471 align = INTVAL (expected_align_exp);
22472 if (CONST_INT_P (count_exp))
22473 count = expected_size = INTVAL (count_exp);
22474 if (CONST_INT_P (expected_size_exp) && count == 0)
22475 expected_size = INTVAL (expected_size_exp);
22477 /* Make sure we don't need to care about overflow later on. */
22478 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22479 return false;
22481 /* Step 0: Decide on preferred algorithm, desired alignment and
22482 size of chunks to be copied by main loop. */
22484 alg = decide_alg (count, expected_size, true, &dynamic_check);
22485 desired_align = decide_alignment (align, alg, expected_size);
22487 if (!TARGET_ALIGN_STRINGOPS)
22488 align = desired_align;
22490 if (alg == libcall)
22491 return false;
22492 gcc_assert (alg != no_stringop);
22493 if (!count)
22494 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22495 destreg = copy_addr_to_reg (XEXP (dst, 0));
22496 switch (alg)
22498 case libcall:
22499 case no_stringop:
22500 gcc_unreachable ();
22501 case loop:
22502 need_zero_guard = true;
22503 size_needed = GET_MODE_SIZE (word_mode);
22504 break;
22505 case unrolled_loop:
22506 need_zero_guard = true;
22507 size_needed = GET_MODE_SIZE (word_mode) * 4;
22508 break;
22509 case rep_prefix_8_byte:
22510 size_needed = 8;
22511 break;
22512 case rep_prefix_4_byte:
22513 size_needed = 4;
22514 break;
22515 case rep_prefix_1_byte:
22516 size_needed = 1;
22517 break;
22518 case loop_1_byte:
22519 need_zero_guard = true;
22520 size_needed = 1;
22521 break;
22523 epilogue_size_needed = size_needed;
22525 /* Step 1: Prologue guard. */
22527 /* Alignment code needs count to be in register. */
22528 if (CONST_INT_P (count_exp) && desired_align > align)
22530 if (INTVAL (count_exp) > desired_align
22531 && INTVAL (count_exp) > size_needed)
22533 align_bytes
22534 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22535 if (align_bytes <= 0)
22536 align_bytes = 0;
22537 else
22538 align_bytes = desired_align - align_bytes;
22540 if (align_bytes == 0)
22542 enum machine_mode mode = SImode;
22543 if (TARGET_64BIT && (count & ~0xffffffff))
22544 mode = DImode;
22545 count_exp = force_reg (mode, count_exp);
22548 /* Do the cheap promotion to allow better CSE across the
22549 main loop and epilogue (i.e. one load of the big constant in
22550 front of all the code). */
22551 if (CONST_INT_P (val_exp))
22552 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22553 desired_align, align);
22554 /* Ensure that alignment prologue won't copy past end of block. */
22555 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22557 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22558 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22559 Make sure it is power of 2. */
22560 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22562 /* To improve performance for small blocks, we jump around the VAL
22563 promotion code. This means that if the promoted VAL is not constant,
22564 we might not use it in the epilogue and have to fall back to the byte
22565 loop variant. */
22566 if (epilogue_size_needed > 2 && !promoted_val)
22567 force_loopy_epilogue = true;
22568 if (count)
22570 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22572 /* If main algorithm works on QImode, no epilogue is needed.
22573 For small sizes just don't align anything. */
22574 if (size_needed == 1)
22575 desired_align = align;
22576 else
22577 goto epilogue;
22580 else
22582 label = gen_label_rtx ();
22583 emit_cmp_and_jump_insns (count_exp,
22584 GEN_INT (epilogue_size_needed),
22585 LTU, 0, counter_mode (count_exp), 1, label);
22586 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22587 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22588 else
22589 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22592 if (dynamic_check != -1)
22594 rtx hot_label = gen_label_rtx ();
22595 jump_around_label = gen_label_rtx ();
22596 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22597 LEU, 0, counter_mode (count_exp), 1, hot_label);
22598 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22599 set_storage_via_libcall (dst, count_exp, val_exp, false);
22600 emit_jump (jump_around_label);
22601 emit_label (hot_label);
22604 /* Step 2: Alignment prologue. */
22606 /* Do the expensive promotion once we branched off the small blocks. */
22607 if (!promoted_val)
22608 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22609 desired_align, align);
22610 gcc_assert (desired_align >= 1 && align >= 1);
22612 if (desired_align > align)
22614 if (align_bytes == 0)
22616 /* Except for the first move in the epilogue, we no longer know
22617 the constant offset in the aliasing info. It doesn't seem worth
22618 the pain to maintain it for the first move, so throw away
22619 the info early. */
22620 dst = change_address (dst, BLKmode, destreg);
22621 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22622 desired_align);
22624 else
22626 /* If we know how many bytes need to be stored before dst is
22627 sufficiently aligned, maintain aliasing info accurately. */
22628 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22629 desired_align, align_bytes);
22630 count_exp = plus_constant (count_exp, -align_bytes);
22631 count -= align_bytes;
22633 if (need_zero_guard
22634 && (count < (unsigned HOST_WIDE_INT) size_needed
22635 || (align_bytes == 0
22636 && count < ((unsigned HOST_WIDE_INT) size_needed
22637 + desired_align - align))))
22639 /* It is possible that we copied enough so the main loop will not
22640 execute. */
22641 gcc_assert (size_needed > 1);
22642 if (label == NULL_RTX)
22643 label = gen_label_rtx ();
22644 emit_cmp_and_jump_insns (count_exp,
22645 GEN_INT (size_needed),
22646 LTU, 0, counter_mode (count_exp), 1, label);
22647 if (expected_size == -1
22648 || expected_size < (desired_align - align) / 2 + size_needed)
22649 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22650 else
22651 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22654 if (label && size_needed == 1)
22656 emit_label (label);
22657 LABEL_NUSES (label) = 1;
22658 label = NULL;
22659 promoted_val = val_exp;
22660 epilogue_size_needed = 1;
22662 else if (label == NULL_RTX)
22663 epilogue_size_needed = size_needed;
22665 /* Step 3: Main loop. */
22667 switch (alg)
22669 case libcall:
22670 case no_stringop:
22671 gcc_unreachable ();
22672 case loop_1_byte:
22673 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22674 count_exp, QImode, 1, expected_size);
22675 break;
22676 case loop:
22677 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22678 count_exp, word_mode, 1, expected_size);
22679 break;
22680 case unrolled_loop:
22681 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22682 count_exp, word_mode, 4, expected_size);
22683 break;
22684 case rep_prefix_8_byte:
22685 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22686 DImode, val_exp);
22687 break;
22688 case rep_prefix_4_byte:
22689 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22690 SImode, val_exp);
22691 break;
22692 case rep_prefix_1_byte:
22693 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22694 QImode, val_exp);
22695 break;
22697 /* Adjust properly the offset of src and dest memory for aliasing. */
22698 if (CONST_INT_P (count_exp))
22699 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22700 (count / size_needed) * size_needed);
22701 else
22702 dst = change_address (dst, BLKmode, destreg);
22704 /* Step 4: Epilogue to copy the remaining bytes. */
22706 if (label)
22708 /* When the main loop is done, COUNT_EXP might hold the original count,
22709 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22710 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22711 bytes. Compensate if needed. */
22713 if (size_needed < epilogue_size_needed)
22715 tmp =
22716 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22717 GEN_INT (size_needed - 1), count_exp, 1,
22718 OPTAB_DIRECT);
22719 if (tmp != count_exp)
22720 emit_move_insn (count_exp, tmp);
22722 emit_label (label);
22723 LABEL_NUSES (label) = 1;
22725 epilogue:
22726 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22728 if (force_loopy_epilogue)
22729 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22730 epilogue_size_needed);
22731 else
22732 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22733 epilogue_size_needed);
22735 if (jump_around_label)
22736 emit_label (jump_around_label);
22737 return true;
22740 /* Expand the appropriate insns for doing strlen if not just doing
22741 repnz; scasb
22743 out = result, initialized with the start address
22744 align_rtx = alignment of the address.
22745 scratch = scratch register, initialized with the start address when
22746 not aligned, otherwise undefined
22748 This is just the body. It needs the initializations mentioned above and
22749 some address computing at the end. These things are done in i386.md. */
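/* Illustrative C-level sketch of the code this expander produces (names and
   types are made up for the sketch, not actual compiler output):

     char *p = out;                            // start address
     unsigned int w, t;
     while (((uintptr_t) p & 3) != 0 && *p)    // check 1..3 unaligned bytes
       p++;
     if (*p)
       do
         {
           w = *(unsigned int *) p;            // 4 aligned bytes at a time
           p += 4;
           t = (w - 0x01010101u) & ~w & 0x80808080u;
         }
       while (t == 0);                         // t != 0 iff some byte of w is zero
     // afterwards p is backed up to the exact zero byte; i386.md then
     // subtracts the start address to form the length
*/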
22751 static void
22752 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22754 int align;
22755 rtx tmp;
22756 rtx align_2_label = NULL_RTX;
22757 rtx align_3_label = NULL_RTX;
22758 rtx align_4_label = gen_label_rtx ();
22759 rtx end_0_label = gen_label_rtx ();
22760 rtx mem;
22761 rtx tmpreg = gen_reg_rtx (SImode);
22762 rtx scratch = gen_reg_rtx (SImode);
22763 rtx cmp;
22765 align = 0;
22766 if (CONST_INT_P (align_rtx))
22767 align = INTVAL (align_rtx);
22769 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22771 /* Is there a known alignment and is it less than 4? */
22772 if (align < 4)
22774 rtx scratch1 = gen_reg_rtx (Pmode);
22775 emit_move_insn (scratch1, out);
22776 /* Is there a known alignment and is it not 2? */
22777 if (align != 2)
22779 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22780 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22782 /* Leave just the 3 lower bits. */
22783 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22784 NULL_RTX, 0, OPTAB_WIDEN);
22786 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22787 Pmode, 1, align_4_label);
22788 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22789 Pmode, 1, align_2_label);
22790 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22791 Pmode, 1, align_3_label);
22793 else
22795 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22796 check whether it is aligned to a 4-byte boundary. */
22798 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22799 NULL_RTX, 0, OPTAB_WIDEN);
22801 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22802 Pmode, 1, align_4_label);
22805 mem = change_address (src, QImode, out);
22807 /* Now compare the bytes. */
22809 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22810 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22811 QImode, 1, end_0_label);
22813 /* Increment the address. */
22814 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22816 /* Not needed with an alignment of 2 */
22817 if (align != 2)
22819 emit_label (align_2_label);
22821 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22822 end_0_label);
22824 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22826 emit_label (align_3_label);
22829 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22830 end_0_label);
22832 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22835 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22836 align this loop; that only makes the program larger and does not
22837 speed it up. */
22838 emit_label (align_4_label);
22840 mem = change_address (src, SImode, out);
22841 emit_move_insn (scratch, mem);
22842 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22844 /* This formula yields a nonzero result iff one of the bytes is zero.
22845 This saves three branches inside the loop and many cycles. */
22847 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22848 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22849 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22850 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22851 gen_int_mode (0x80808080, SImode)));
22852 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22853 align_4_label);
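/* Illustrative sketch only: in plain C the test emitted above is

     unsigned int t = (w - 0x01010101u) & ~w & 0x80808080u;

   where w holds the 4 bytes just loaded from the string; t is nonzero iff
   at least one byte of w is zero, so the code keeps looping (jumps back to
   align_4_label) while t == 0.  */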
22855 if (TARGET_CMOVE)
22857 rtx reg = gen_reg_rtx (SImode);
22858 rtx reg2 = gen_reg_rtx (Pmode);
22859 emit_move_insn (reg, tmpreg);
22860 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22862 /* If zero is not in the first two bytes, move two bytes forward. */
22863 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22864 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22865 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22866 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22867 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22868 reg,
22869 tmpreg)));
22870 /* Emit lea manually to avoid clobbering of flags. */
22871 emit_insn (gen_rtx_SET (SImode, reg2,
22872 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22874 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22875 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22876 emit_insn (gen_rtx_SET (VOIDmode, out,
22877 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22878 reg2,
22879 out)));
22881 else
22883 rtx end_2_label = gen_label_rtx ();
22884 /* Is zero in the first two bytes? */
22886 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22887 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22888 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22889 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22890 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22891 pc_rtx);
22892 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22893 JUMP_LABEL (tmp) = end_2_label;
22895 /* Not in the first two. Move two bytes forward. */
22896 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22897 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22899 emit_label (end_2_label);
22903 /* Avoid a branch in the final byte fixup. */
22904 tmpreg = gen_lowpart (QImode, tmpreg);
22905 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22906 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22907 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22908 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22910 emit_label (end_0_label);
22913 /* Expand strlen. */
22915 bool
22916 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22918 rtx addr, scratch1, scratch2, scratch3, scratch4;
22920 /* The generic case of the strlen expander is long. Avoid expanding it
22921 unless TARGET_INLINE_ALL_STRINGOPS. */
22923 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22924 && !TARGET_INLINE_ALL_STRINGOPS
22925 && !optimize_insn_for_size_p ()
22926 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22927 return false;
22929 addr = force_reg (Pmode, XEXP (src, 0));
22930 scratch1 = gen_reg_rtx (Pmode);
22932 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22933 && !optimize_insn_for_size_p ())
22935 /* It seems that some optimizers do not combine a call like
22936 foo (strlen (bar), strlen (bar));
22937 when the move and the subtraction are done here. The length is
22938 calculated just once when these instructions are emitted inside
22939 output_strlen_unroll(). But since &bar[strlen (bar)] is often
22940 used, and this uses one fewer register for the lifetime of
22941 output_strlen_unroll(), it is still a win. */
22943 emit_move_insn (out, addr);
22945 ix86_expand_strlensi_unroll_1 (out, src, align);
22947 /* strlensi_unroll_1 returns the address of the zero at the end of
22948 the string, like memchr(), so compute the length by subtracting
22949 the start address. */
22950 emit_insn (ix86_gen_sub3 (out, out, addr));
22952 else
22954 rtx unspec;
22956 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22957 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22958 return false;
22960 scratch2 = gen_reg_rtx (Pmode);
22961 scratch3 = gen_reg_rtx (Pmode);
22962 scratch4 = force_reg (Pmode, constm1_rtx);
22964 emit_move_insn (scratch3, addr);
22965 eoschar = force_reg (QImode, eoschar);
22967 src = replace_equiv_address_nv (src, scratch3);
22969 /* If .md starts supporting :P, this can be done in .md. */
22970 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22971 scratch4), UNSPEC_SCAS);
22972 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22973 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22974 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22976 return true;
22979 /* For a given symbol (function), construct code to compute the address of
22980 its PLT entry in the large x86-64 PIC model. */
22981 static rtx
22982 construct_plt_address (rtx symbol)
22984 rtx tmp, unspec;
22986 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22987 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22988 gcc_assert (Pmode == DImode);
22990 tmp = gen_reg_rtx (Pmode);
22991 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22993 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22994 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
22995 return tmp;
22999 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23000 rtx callarg2,
23001 rtx pop, bool sibcall)
23003 /* These registers are call-saved under the MS ABI but may be clobbered
23004 by a SysV callee, so represent them as clobbered across such calls. */
23005 static int clobbered_registers[] = {
23006 XMM6_REG, XMM7_REG, XMM8_REG,
23007 XMM9_REG, XMM10_REG, XMM11_REG,
23008 XMM12_REG, XMM13_REG, XMM14_REG,
23009 XMM15_REG, SI_REG, DI_REG
23011 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23012 rtx use = NULL, call;
23013 unsigned int vec_len;
23015 if (pop == const0_rtx)
23016 pop = NULL;
23017 gcc_assert (!TARGET_64BIT || !pop);
23019 if (TARGET_MACHO && !TARGET_64BIT)
23021 #if TARGET_MACHO
23022 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23023 fnaddr = machopic_indirect_call_target (fnaddr);
23024 #endif
23026 else
23028 /* Static functions and indirect calls don't need the pic register. */
23029 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23030 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23031 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23032 use_reg (&use, pic_offset_table_rtx);
23035 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23037 rtx al = gen_rtx_REG (QImode, AX_REG);
23038 emit_move_insn (al, callarg2);
23039 use_reg (&use, al);
23042 if (ix86_cmodel == CM_LARGE_PIC
23043 && MEM_P (fnaddr)
23044 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23045 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23046 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23047 else if (sibcall
23048 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23049 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23051 fnaddr = XEXP (fnaddr, 0);
23052 if (GET_MODE (fnaddr) != word_mode)
23053 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23054 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23057 vec_len = 0;
23058 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23059 if (retval)
23060 call = gen_rtx_SET (VOIDmode, retval, call);
23061 vec[vec_len++] = call;
23063 if (pop)
23065 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23066 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23067 vec[vec_len++] = pop;
23070 if (TARGET_64BIT_MS_ABI
23071 && (!callarg2 || INTVAL (callarg2) != -2))
23073 unsigned i;
23075 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23076 UNSPEC_MS_TO_SYSV_CALL);
23078 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23079 vec[vec_len++]
23080 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23081 ? TImode : DImode,
23082 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23083 ? TImode : DImode,
23084 clobbered_registers[i]));
23087 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23088 if (TARGET_VZEROUPPER)
23090 int avx256;
23091 if (cfun->machine->callee_pass_avx256_p)
23093 if (cfun->machine->callee_return_avx256_p)
23094 avx256 = callee_return_pass_avx256;
23095 else
23096 avx256 = callee_pass_avx256;
23098 else if (cfun->machine->callee_return_avx256_p)
23099 avx256 = callee_return_avx256;
23100 else
23101 avx256 = call_no_avx256;
23103 if (reload_completed)
23104 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23105 else
23106 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23107 gen_rtvec (1, GEN_INT (avx256)),
23108 UNSPEC_CALL_NEEDS_VZEROUPPER);
23111 if (vec_len > 1)
23112 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23113 call = emit_call_insn (call);
23114 if (use)
23115 CALL_INSN_FUNCTION_USAGE (call) = use;
23117 return call;
23120 void
23121 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23123 rtx pat = PATTERN (insn);
23124 rtvec vec = XVEC (pat, 0);
23125 int len = GET_NUM_ELEM (vec) - 1;
23127 /* Strip off the last entry of the parallel. */
23128 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23129 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23130 if (len == 1)
23131 pat = RTVEC_ELT (vec, 0);
23132 else
23133 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23135 emit_insn (gen_avx_vzeroupper (vzeroupper));
23136 emit_call_insn (pat);
23139 /* Output the assembly for a call instruction. */
23141 const char *
23142 ix86_output_call_insn (rtx insn, rtx call_op)
23144 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23145 bool seh_nop_p = false;
23146 const char *xasm;
23148 if (SIBLING_CALL_P (insn))
23150 if (direct_p)
23151 xasm = "jmp\t%P0";
23152 /* SEH epilogue detection requires the indirect branch case
23153 to include REX.W. */
23154 else if (TARGET_SEH)
23155 xasm = "rex.W jmp %A0";
23156 else
23157 xasm = "jmp\t%A0";
23159 output_asm_insn (xasm, &call_op);
23160 return "";
23163 /* SEH unwinding can require an extra nop to be emitted in several
23164 circumstances. Determine if we have one of those. */
23165 if (TARGET_SEH)
23167 rtx i;
23169 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23171 /* If we get to another real insn, we don't need the nop. */
23172 if (INSN_P (i))
23173 break;
23175 /* If we get to the epilogue note, prevent a catch region from
23176 being adjacent to the standard epilogue sequence. If non-
23177 call-exceptions, we'll have done this during epilogue emission. */
23178 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23179 && !flag_non_call_exceptions
23180 && !can_throw_internal (insn))
23182 seh_nop_p = true;
23183 break;
23187 /* If we didn't find a real insn following the call, prevent the
23188 unwinder from looking into the next function. */
23189 if (i == NULL)
23190 seh_nop_p = true;
23193 if (direct_p)
23194 xasm = "call\t%P0";
23195 else
23196 xasm = "call\t%A0";
23198 output_asm_insn (xasm, &call_op);
23200 if (seh_nop_p)
23201 return "nop";
23203 return "";
23206 /* Clear stack slot assignments remembered from previous functions.
23207 This is called from INIT_EXPANDERS once before RTL is emitted for each
23208 function. */
23210 static struct machine_function *
23211 ix86_init_machine_status (void)
23213 struct machine_function *f;
23215 f = ggc_alloc_cleared_machine_function ();
23216 f->use_fast_prologue_epilogue_nregs = -1;
23217 f->tls_descriptor_call_expanded_p = 0;
23218 f->call_abi = ix86_abi;
23220 return f;
23223 /* Return a MEM corresponding to a stack slot with mode MODE.
23224 Allocate a new slot if necessary.
23226 The RTL for a function can have several slots available: N is
23227 which slot to use. */
23230 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23232 struct stack_local_entry *s;
23234 gcc_assert (n < MAX_386_STACK_LOCALS);
23236 /* Virtual slot is valid only before vregs are instantiated. */
23237 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23239 for (s = ix86_stack_locals; s; s = s->next)
23240 if (s->mode == mode && s->n == n)
23241 return validize_mem (copy_rtx (s->rtl));
23243 s = ggc_alloc_stack_local_entry ();
23244 s->n = n;
23245 s->mode = mode;
23246 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23248 s->next = ix86_stack_locals;
23249 ix86_stack_locals = s;
23250 return validize_mem (s->rtl);
23253 /* Calculate the length of the memory address in the instruction encoding.
23254 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23255 or other prefixes. */
23258 memory_address_length (rtx addr)
23260 struct ix86_address parts;
23261 rtx base, index, disp;
23262 int len;
23263 int ok;
23265 if (GET_CODE (addr) == PRE_DEC
23266 || GET_CODE (addr) == POST_INC
23267 || GET_CODE (addr) == PRE_MODIFY
23268 || GET_CODE (addr) == POST_MODIFY)
23269 return 0;
23271 ok = ix86_decompose_address (addr, &parts);
23272 gcc_assert (ok);
23274 if (parts.base && GET_CODE (parts.base) == SUBREG)
23275 parts.base = SUBREG_REG (parts.base);
23276 if (parts.index && GET_CODE (parts.index) == SUBREG)
23277 parts.index = SUBREG_REG (parts.index);
23279 base = parts.base;
23280 index = parts.index;
23281 disp = parts.disp;
23283 /* Add length of addr32 prefix. */
23284 len = (GET_CODE (addr) == ZERO_EXTEND
23285 || GET_CODE (addr) == AND);
23287 /* Rule of thumb:
23288 - esp as the base always wants an index,
23289 - ebp as the base always wants a displacement,
23290 - r12 as the base always wants an index,
23291 - r13 as the base always wants a displacement. */
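/* Illustrative examples of the extra bytes counted here (opcode and modrm
   excluded, as noted above): "(%eax)" adds 0; "(%ebp)" adds 1 (a forced
   disp8 of 0); "(%esp)" adds 1 (the SIB byte); "4(%ebx)" adds 1 (disp8);
   "1024(%ebx)" adds 4 (disp32); "4(%ebx,%ecx,2)" adds 2 (SIB byte plus
   disp8).  An %fs or %gs segment override adds one more byte.  */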
23293 /* Register Indirect. */
23294 if (base && !index && !disp)
23296 /* esp (for its index) and ebp (for its displacement) need
23297 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23298 code. */
23299 if (REG_P (addr)
23300 && (addr == arg_pointer_rtx
23301 || addr == frame_pointer_rtx
23302 || REGNO (addr) == SP_REG
23303 || REGNO (addr) == BP_REG
23304 || REGNO (addr) == R12_REG
23305 || REGNO (addr) == R13_REG))
23306 len = 1;
23309 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23310 is not disp32 but disp32(%rip), so for plain disp32
23311 a SIB byte is needed, unless print_operand_address
23312 optimizes it into disp32(%rip) or (%rip) is implied
23313 by an UNSPEC. */
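/* E.g. in 64-bit code "foo(%rip)" is encoded with mod=00, r/m=101 plus a
   disp32 (the 4 bytes counted here), whereas an absolute disp32 (e.g. a TLS
   offset) additionally needs a SIB byte (base=101, no index), hence the
   extra byte added below for symbols that cannot be %rip-relative.  */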
23314 else if (disp && !base && !index)
23316 len = 4;
23317 if (TARGET_64BIT)
23319 rtx symbol = disp;
23321 if (GET_CODE (disp) == CONST)
23322 symbol = XEXP (disp, 0);
23323 if (GET_CODE (symbol) == PLUS
23324 && CONST_INT_P (XEXP (symbol, 1)))
23325 symbol = XEXP (symbol, 0);
23327 if (GET_CODE (symbol) != LABEL_REF
23328 && (GET_CODE (symbol) != SYMBOL_REF
23329 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23330 && (GET_CODE (symbol) != UNSPEC
23331 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23332 && XINT (symbol, 1) != UNSPEC_PCREL
23333 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23334 len += 1;
23338 else
23340 /* Find the length of the displacement constant. */
23341 if (disp)
23343 if (base && satisfies_constraint_K (disp))
23344 len = 1;
23345 else
23346 len = 4;
23348 /* ebp always wants a displacement. Similarly r13. */
23349 else if (base && REG_P (base)
23350 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23351 len = 1;
23353 /* An index requires the two-byte modrm form.... */
23354 if (index
23355 /* ...like esp (or r12), which always wants an index. */
23356 || base == arg_pointer_rtx
23357 || base == frame_pointer_rtx
23358 || (base && REG_P (base)
23359 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23360 len += 1;
23363 switch (parts.seg)
23365 case SEG_FS:
23366 case SEG_GS:
23367 len += 1;
23368 break;
23369 default:
23370 break;
23373 return len;
23376 /* Compute the default value for the "length_immediate" attribute. When
23377 SHORTFORM is set, expect that the insn has an 8bit immediate alternative. */
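/* For example, "add $5, %eax" with a short-form alternative needs only an
   imm8 (1 byte), while "add $1000, %eax" needs a full imm32 (4 bytes).
   DImode immediates still count as 4 bytes because they are encoded as
   sign-extended 32bit values, as noted below.  */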
23379 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23381 int len = 0;
23382 int i;
23383 extract_insn_cached (insn);
23384 for (i = recog_data.n_operands - 1; i >= 0; --i)
23385 if (CONSTANT_P (recog_data.operand[i]))
23387 enum attr_mode mode = get_attr_mode (insn);
23389 gcc_assert (!len);
23390 if (shortform && CONST_INT_P (recog_data.operand[i]))
23392 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23393 switch (mode)
23395 case MODE_QI:
23396 len = 1;
23397 continue;
23398 case MODE_HI:
23399 ival = trunc_int_for_mode (ival, HImode);
23400 break;
23401 case MODE_SI:
23402 ival = trunc_int_for_mode (ival, SImode);
23403 break;
23404 default:
23405 break;
23407 if (IN_RANGE (ival, -128, 127))
23409 len = 1;
23410 continue;
23413 switch (mode)
23415 case MODE_QI:
23416 len = 1;
23417 break;
23418 case MODE_HI:
23419 len = 2;
23420 break;
23421 case MODE_SI:
23422 len = 4;
23423 break;
23424 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23425 case MODE_DI:
23426 len = 4;
23427 break;
23428 default:
23429 fatal_insn ("unknown insn mode", insn);
23432 return len;
23434 /* Compute default value for "length_address" attribute. */
23436 ix86_attr_length_address_default (rtx insn)
23438 int i;
23440 if (get_attr_type (insn) == TYPE_LEA)
23442 rtx set = PATTERN (insn), addr;
23444 if (GET_CODE (set) == PARALLEL)
23445 set = XVECEXP (set, 0, 0);
23447 gcc_assert (GET_CODE (set) == SET);
23449 addr = SET_SRC (set);
23450 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23452 if (GET_CODE (addr) == ZERO_EXTEND)
23453 addr = XEXP (addr, 0);
23454 if (GET_CODE (addr) == SUBREG)
23455 addr = SUBREG_REG (addr);
23458 return memory_address_length (addr);
23461 extract_insn_cached (insn);
23462 for (i = recog_data.n_operands - 1; i >= 0; --i)
23463 if (MEM_P (recog_data.operand[i]))
23465 constrain_operands_cached (reload_completed);
23466 if (which_alternative != -1)
23468 const char *constraints = recog_data.constraints[i];
23469 int alt = which_alternative;
23471 while (*constraints == '=' || *constraints == '+')
23472 constraints++;
23473 while (alt-- > 0)
23474 while (*constraints++ != ',')
23476 /* Skip ignored operands. */
23477 if (*constraints == 'X')
23478 continue;
23480 return memory_address_length (XEXP (recog_data.operand[i], 0));
23482 return 0;
23485 /* Compute default value for "length_vex" attribute. It includes
23486 2 or 3 byte VEX prefix and 1 opcode byte. */
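/* For example, an insn in the 0f opcode map that does not set VEX.W and whose
   memory operand (if any) does not need REX.X or REX.B (no r8-r15 base or
   index register) can typically use the 2 byte (C5) VEX prefix, giving
   2 + 1 = 3 here; anything needing VEX.W, the 0f38/0f3a opcode maps, or
   REX.X/REX.B falls back to the 3 byte (C4) prefix, giving 3 + 1 = 4.  */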
23489 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23491 int i;
23493 /* Only the 0f opcode map can use the 2 byte VEX prefix; the VEX W bit
23494 requires the 3 byte VEX prefix. */
23495 if (!has_0f_opcode || has_vex_w)
23496 return 3 + 1;
23498 /* We can always use 2 byte VEX prefix in 32bit. */
23499 if (!TARGET_64BIT)
23500 return 2 + 1;
23502 extract_insn_cached (insn);
23504 for (i = recog_data.n_operands - 1; i >= 0; --i)
23505 if (REG_P (recog_data.operand[i]))
23507 /* REX.W bit uses 3 byte VEX prefix. */
23508 if (GET_MODE (recog_data.operand[i]) == DImode
23509 && GENERAL_REG_P (recog_data.operand[i]))
23510 return 3 + 1;
23512 else
23514 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23515 if (MEM_P (recog_data.operand[i])
23516 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23517 return 3 + 1;
23520 return 2 + 1;
23523 /* Return the maximum number of instructions a cpu can issue. */
23525 static int
23526 ix86_issue_rate (void)
23528 switch (ix86_tune)
23530 case PROCESSOR_PENTIUM:
23531 case PROCESSOR_ATOM:
23532 case PROCESSOR_K6:
23533 return 2;
23535 case PROCESSOR_PENTIUMPRO:
23536 case PROCESSOR_PENTIUM4:
23537 case PROCESSOR_CORE2_32:
23538 case PROCESSOR_CORE2_64:
23539 case PROCESSOR_COREI7_32:
23540 case PROCESSOR_COREI7_64:
23541 case PROCESSOR_ATHLON:
23542 case PROCESSOR_K8:
23543 case PROCESSOR_AMDFAM10:
23544 case PROCESSOR_NOCONA:
23545 case PROCESSOR_GENERIC32:
23546 case PROCESSOR_GENERIC64:
23547 case PROCESSOR_BDVER1:
23548 case PROCESSOR_BDVER2:
23549 case PROCESSOR_BTVER1:
23550 return 3;
23552 default:
23553 return 1;
23557 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23558 by DEP_INSN and nothing else set by DEP_INSN. */
23560 static bool
23561 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23563 rtx set, set2;
23565 /* Simplify the test for uninteresting insns. */
23566 if (insn_type != TYPE_SETCC
23567 && insn_type != TYPE_ICMOV
23568 && insn_type != TYPE_FCMOV
23569 && insn_type != TYPE_IBR)
23570 return false;
23572 if ((set = single_set (dep_insn)) != 0)
23574 set = SET_DEST (set);
23575 set2 = NULL_RTX;
23577 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23578 && XVECLEN (PATTERN (dep_insn), 0) == 2
23579 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23580 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23582 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23583 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23585 else
23586 return false;
23588 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23589 return false;
23591 /* This test is true if the dependent insn reads the flags but
23592 not any other potentially set register. */
23593 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23594 return false;
23596 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23597 return false;
23599 return true;
23602 /* Return true iff USE_INSN has a memory address with operands set by
23603 SET_INSN. */
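/* For example (illustrative): if SET_INSN is "addl $4, %eax" and USE_INSN is
   "movl (%eax), %edx", the address of USE_INSN's memory operand depends on
   %eax, which SET_INSN modifies, so this returns true.  */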
23605 bool
23606 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23608 int i;
23609 extract_insn_cached (use_insn);
23610 for (i = recog_data.n_operands - 1; i >= 0; --i)
23611 if (MEM_P (recog_data.operand[i]))
23613 rtx addr = XEXP (recog_data.operand[i], 0);
23614 return modified_in_p (addr, set_insn) != 0;
23616 return false;
23619 static int
23620 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23622 enum attr_type insn_type, dep_insn_type;
23623 enum attr_memory memory;
23624 rtx set, set2;
23625 int dep_insn_code_number;
23627 /* Anti and output dependencies have zero cost on all CPUs. */
23628 if (REG_NOTE_KIND (link) != 0)
23629 return 0;
23631 dep_insn_code_number = recog_memoized (dep_insn);
23633 /* If we can't recognize the insns, we can't really do anything. */
23634 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23635 return cost;
23637 insn_type = get_attr_type (insn);
23638 dep_insn_type = get_attr_type (dep_insn);
23640 switch (ix86_tune)
23642 case PROCESSOR_PENTIUM:
23643 /* Address Generation Interlock adds a cycle of latency. */
23644 if (insn_type == TYPE_LEA)
23646 rtx addr = PATTERN (insn);
23648 if (GET_CODE (addr) == PARALLEL)
23649 addr = XVECEXP (addr, 0, 0);
23651 gcc_assert (GET_CODE (addr) == SET);
23653 addr = SET_SRC (addr);
23654 if (modified_in_p (addr, dep_insn))
23655 cost += 1;
23657 else if (ix86_agi_dependent (dep_insn, insn))
23658 cost += 1;
23660 /* ??? Compares pair with jump/setcc. */
23661 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23662 cost = 0;
23664 /* Floating point stores require value to be ready one cycle earlier. */
23665 if (insn_type == TYPE_FMOV
23666 && get_attr_memory (insn) == MEMORY_STORE
23667 && !ix86_agi_dependent (dep_insn, insn))
23668 cost += 1;
23669 break;
23671 case PROCESSOR_PENTIUMPRO:
23672 memory = get_attr_memory (insn);
23674 /* INT->FP conversion is expensive. */
23675 if (get_attr_fp_int_src (dep_insn))
23676 cost += 5;
23678 /* There is one cycle extra latency between an FP op and a store. */
23679 if (insn_type == TYPE_FMOV
23680 && (set = single_set (dep_insn)) != NULL_RTX
23681 && (set2 = single_set (insn)) != NULL_RTX
23682 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23683 && MEM_P (SET_DEST (set2)))
23684 cost += 1;
23686 /* Model the ability of the reorder buffer to hide load latency by executing
23687 the load in parallel with the previous instruction, in case the
23688 previous instruction is not needed to compute the address. */
23689 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23690 && !ix86_agi_dependent (dep_insn, insn))
23692 /* Claim moves take one cycle, as the core can issue one load
23693 at a time and the next load can start a cycle later. */
23694 if (dep_insn_type == TYPE_IMOV
23695 || dep_insn_type == TYPE_FMOV)
23696 cost = 1;
23697 else if (cost > 1)
23698 cost--;
23700 break;
23702 case PROCESSOR_K6:
23703 memory = get_attr_memory (insn);
23705 /* The esp dependency is resolved before the instruction is really
23706 finished. */
23707 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23708 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23709 return 1;
23711 /* INT->FP conversion is expensive. */
23712 if (get_attr_fp_int_src (dep_insn))
23713 cost += 5;
23715 /* Model the ability of the reorder buffer to hide load latency by executing
23716 the load in parallel with the previous instruction, in case the
23717 previous instruction is not needed to compute the address. */
23718 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23719 && !ix86_agi_dependent (dep_insn, insn))
23721 /* Claim moves take one cycle, as the core can issue one load
23722 at a time and the next load can start a cycle later. */
23723 if (dep_insn_type == TYPE_IMOV
23724 || dep_insn_type == TYPE_FMOV)
23725 cost = 1;
23726 else if (cost > 2)
23727 cost -= 2;
23728 else
23729 cost = 1;
23731 break;
23733 case PROCESSOR_ATHLON:
23734 case PROCESSOR_K8:
23735 case PROCESSOR_AMDFAM10:
23736 case PROCESSOR_BDVER1:
23737 case PROCESSOR_BDVER2:
23738 case PROCESSOR_BTVER1:
23739 case PROCESSOR_ATOM:
23740 case PROCESSOR_GENERIC32:
23741 case PROCESSOR_GENERIC64:
23742 memory = get_attr_memory (insn);
23744 /* Model the ability of the reorder buffer to hide load latency by executing
23745 the load in parallel with the previous instruction, in case the
23746 previous instruction is not needed to compute the address. */
23747 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23748 && !ix86_agi_dependent (dep_insn, insn))
23750 enum attr_unit unit = get_attr_unit (insn);
23751 int loadcost = 3;
23753 /* Because of the difference between the lengths of the integer and
23754 floating unit pipeline preparation stages, the memory operands
23755 for floating point are cheaper.
23757 ??? For Athlon the difference is most probably 2. */
23758 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23759 loadcost = 3;
23760 else
23761 loadcost = TARGET_ATHLON ? 2 : 0;
23763 if (cost >= loadcost)
23764 cost -= loadcost;
23765 else
23766 cost = 0;
23769 default:
23770 break;
23773 return cost;
23776 /* How many alternative schedules to try. This should be as wide as the
23777 scheduling freedom in the DFA, but no wider. Making this value too
23778 large results in extra work for the scheduler. */
23780 static int
23781 ia32_multipass_dfa_lookahead (void)
23783 switch (ix86_tune)
23785 case PROCESSOR_PENTIUM:
23786 return 2;
23788 case PROCESSOR_PENTIUMPRO:
23789 case PROCESSOR_K6:
23790 return 1;
23792 case PROCESSOR_CORE2_32:
23793 case PROCESSOR_CORE2_64:
23794 case PROCESSOR_COREI7_32:
23795 case PROCESSOR_COREI7_64:
23796 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23797 as the number of instructions that can be executed in one cycle, i.e.,
23798 issue_rate. I wonder why tuning for many CPUs does not do this. */
23799 return ix86_issue_rate ();
23801 default:
23802 return 0;
23808 /* Model the decoder of Core 2/i7.
23809 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23810 track the instruction fetch block boundaries and make sure that long
23811 (9+ byte) instructions are assigned to decoder D0. */
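/* For example, with the parameters below (16 byte fetch block, at most 6
   insns decoded per cycle, 8 byte limit for the secondary decoders), a
   7 byte insn is filtered from the ready list once 12 bytes have already
   been issued in the current block, and a 9+ byte insn is only considered
   as the first insn of a cycle.  (Illustrative of the hooks that follow.)  */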
23813 /* Maximum length of an insn that can be handled by
23814 a secondary decoder unit. '8' for Core 2/i7. */
23815 static int core2i7_secondary_decoder_max_insn_size;
23817 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23818 '16' for Core 2/i7. */
23819 static int core2i7_ifetch_block_size;
23821 /* Maximum number of instructions decoder can handle per cycle.
23822 '6' for Core 2/i7. */
23823 static int core2i7_ifetch_block_max_insns;
23825 typedef struct ix86_first_cycle_multipass_data_ *
23826 ix86_first_cycle_multipass_data_t;
23827 typedef const struct ix86_first_cycle_multipass_data_ *
23828 const_ix86_first_cycle_multipass_data_t;
23830 /* A variable to store target state across calls to max_issue within
23831 one cycle. */
23832 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23833 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23835 /* Initialize DATA. */
23836 static void
23837 core2i7_first_cycle_multipass_init (void *_data)
23839 ix86_first_cycle_multipass_data_t data
23840 = (ix86_first_cycle_multipass_data_t) _data;
23842 data->ifetch_block_len = 0;
23843 data->ifetch_block_n_insns = 0;
23844 data->ready_try_change = NULL;
23845 data->ready_try_change_size = 0;
23848 /* Advancing the cycle; reset ifetch block counts. */
23849 static void
23850 core2i7_dfa_post_advance_cycle (void)
23852 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23854 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23856 data->ifetch_block_len = 0;
23857 data->ifetch_block_n_insns = 0;
23860 static int min_insn_size (rtx);
23862 /* Filter out insns from ready_try that the core will not be able to issue
23863 on the current cycle due to decoder restrictions. */
23864 static void
23865 core2i7_first_cycle_multipass_filter_ready_try
23866 (const_ix86_first_cycle_multipass_data_t data,
23867 char *ready_try, int n_ready, bool first_cycle_insn_p)
23869 while (n_ready--)
23871 rtx insn;
23872 int insn_size;
23874 if (ready_try[n_ready])
23875 continue;
23877 insn = get_ready_element (n_ready);
23878 insn_size = min_insn_size (insn);
23880 if (/* If this is too long an insn for a secondary decoder ... */
23881 (!first_cycle_insn_p
23882 && insn_size > core2i7_secondary_decoder_max_insn_size)
23883 /* ... or it would not fit into the ifetch block ... */
23884 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23885 /* ... or the decoder is full already ... */
23886 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23887 /* ... mask the insn out. */
23889 ready_try[n_ready] = 1;
23891 if (data->ready_try_change)
23892 SET_BIT (data->ready_try_change, n_ready);
23897 /* Prepare for a new round of multipass lookahead scheduling. */
23898 static void
23899 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23900 bool first_cycle_insn_p)
23902 ix86_first_cycle_multipass_data_t data
23903 = (ix86_first_cycle_multipass_data_t) _data;
23904 const_ix86_first_cycle_multipass_data_t prev_data
23905 = ix86_first_cycle_multipass_data;
23907 /* Restore the state from the end of the previous round. */
23908 data->ifetch_block_len = prev_data->ifetch_block_len;
23909 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23911 /* Filter instructions that cannot be issued on current cycle due to
23912 decoder restrictions. */
23913 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23914 first_cycle_insn_p);
23917 /* INSN is being issued in current solution. Account for its impact on
23918 the decoder model. */
23919 static void
23920 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23921 rtx insn, const void *_prev_data)
23923 ix86_first_cycle_multipass_data_t data
23924 = (ix86_first_cycle_multipass_data_t) _data;
23925 const_ix86_first_cycle_multipass_data_t prev_data
23926 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23928 int insn_size = min_insn_size (insn);
23930 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23931 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23932 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23933 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23935 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23936 if (!data->ready_try_change)
23938 data->ready_try_change = sbitmap_alloc (n_ready);
23939 data->ready_try_change_size = n_ready;
23941 else if (data->ready_try_change_size < n_ready)
23943 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23944 n_ready, 0);
23945 data->ready_try_change_size = n_ready;
23947 sbitmap_zero (data->ready_try_change);
23949 /* Filter out insns from ready_try that the core will not be able to issue
23950 on the current cycle due to decoder restrictions. */
23951 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23952 false);
23955 /* Revert the effect on ready_try. */
23956 static void
23957 core2i7_first_cycle_multipass_backtrack (const void *_data,
23958 char *ready_try,
23959 int n_ready ATTRIBUTE_UNUSED)
23961 const_ix86_first_cycle_multipass_data_t data
23962 = (const_ix86_first_cycle_multipass_data_t) _data;
23963 unsigned int i = 0;
23964 sbitmap_iterator sbi;
23966 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23967 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23969 ready_try[i] = 0;
23973 /* Save the result of multipass lookahead scheduling for the next round. */
23974 static void
23975 core2i7_first_cycle_multipass_end (const void *_data)
23977 const_ix86_first_cycle_multipass_data_t data
23978 = (const_ix86_first_cycle_multipass_data_t) _data;
23979 ix86_first_cycle_multipass_data_t next_data
23980 = ix86_first_cycle_multipass_data;
23982 if (data != NULL)
23984 next_data->ifetch_block_len = data->ifetch_block_len;
23985 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23989 /* Deallocate target data. */
23990 static void
23991 core2i7_first_cycle_multipass_fini (void *_data)
23993 ix86_first_cycle_multipass_data_t data
23994 = (ix86_first_cycle_multipass_data_t) _data;
23996 if (data->ready_try_change)
23998 sbitmap_free (data->ready_try_change);
23999 data->ready_try_change = NULL;
24000 data->ready_try_change_size = 0;
24004 /* Prepare for scheduling pass. */
24005 static void
24006 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24007 int verbose ATTRIBUTE_UNUSED,
24008 int max_uid ATTRIBUTE_UNUSED)
24010 /* Install scheduling hooks for current CPU. Some of these hooks are used
24011 in time-critical parts of the scheduler, so we only set them up when
24012 they are actually used. */
24013 switch (ix86_tune)
24015 case PROCESSOR_CORE2_32:
24016 case PROCESSOR_CORE2_64:
24017 case PROCESSOR_COREI7_32:
24018 case PROCESSOR_COREI7_64:
24019 targetm.sched.dfa_post_advance_cycle
24020 = core2i7_dfa_post_advance_cycle;
24021 targetm.sched.first_cycle_multipass_init
24022 = core2i7_first_cycle_multipass_init;
24023 targetm.sched.first_cycle_multipass_begin
24024 = core2i7_first_cycle_multipass_begin;
24025 targetm.sched.first_cycle_multipass_issue
24026 = core2i7_first_cycle_multipass_issue;
24027 targetm.sched.first_cycle_multipass_backtrack
24028 = core2i7_first_cycle_multipass_backtrack;
24029 targetm.sched.first_cycle_multipass_end
24030 = core2i7_first_cycle_multipass_end;
24031 targetm.sched.first_cycle_multipass_fini
24032 = core2i7_first_cycle_multipass_fini;
24034 /* Set decoder parameters. */
24035 core2i7_secondary_decoder_max_insn_size = 8;
24036 core2i7_ifetch_block_size = 16;
24037 core2i7_ifetch_block_max_insns = 6;
24038 break;
24040 default:
24041 targetm.sched.dfa_post_advance_cycle = NULL;
24042 targetm.sched.first_cycle_multipass_init = NULL;
24043 targetm.sched.first_cycle_multipass_begin = NULL;
24044 targetm.sched.first_cycle_multipass_issue = NULL;
24045 targetm.sched.first_cycle_multipass_backtrack = NULL;
24046 targetm.sched.first_cycle_multipass_end = NULL;
24047 targetm.sched.first_cycle_multipass_fini = NULL;
24048 break;
24053 /* Compute the alignment given to a constant that is being placed in memory.
24054 EXP is the constant and ALIGN is the alignment that the object would
24055 ordinarily have.
24056 The value of this function is used instead of that alignment to align
24057 the object. */
24060 ix86_constant_alignment (tree exp, int align)
24062 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24063 || TREE_CODE (exp) == INTEGER_CST)
24065 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24066 return 64;
24067 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24068 return 128;
24070 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24071 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24072 return BITS_PER_WORD;
24074 return align;
24077 /* Compute the alignment for a static variable.
24078 TYPE is the data type, and ALIGN is the alignment that
24079 the object would ordinarily have. The value of this function is used
24080 instead of that alignment to align the object. */
24083 ix86_data_alignment (tree type, int align)
24085 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24087 if (AGGREGATE_TYPE_P (type)
24088 && TYPE_SIZE (type)
24089 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24090 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24091 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24092 && align < max_align)
24093 align = max_align;
24095 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
24096 to a 16-byte boundary. */
24097 if (TARGET_64BIT)
24099 if (AGGREGATE_TYPE_P (type)
24100 && TYPE_SIZE (type)
24101 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24102 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24103 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24104 return 128;
24107 if (TREE_CODE (type) == ARRAY_TYPE)
24109 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24110 return 64;
24111 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24112 return 128;
24114 else if (TREE_CODE (type) == COMPLEX_TYPE)
24117 if (TYPE_MODE (type) == DCmode && align < 64)
24118 return 64;
24119 if ((TYPE_MODE (type) == XCmode
24120 || TYPE_MODE (type) == TCmode) && align < 128)
24121 return 128;
24123 else if ((TREE_CODE (type) == RECORD_TYPE
24124 || TREE_CODE (type) == UNION_TYPE
24125 || TREE_CODE (type) == QUAL_UNION_TYPE)
24126 && TYPE_FIELDS (type))
24128 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24129 return 64;
24130 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24131 return 128;
24133 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24134 || TREE_CODE (type) == INTEGER_TYPE)
24136 if (TYPE_MODE (type) == DFmode && align < 64)
24137 return 64;
24138 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24139 return 128;
24142 return align;
24145 /* Compute the alignment for a local variable or a stack slot. EXP is
24146 the data type or decl itself, MODE is the widest mode available and
24147 ALIGN is the alignment that the object would ordinarily have. The
24148 value of this macro is used instead of that alignment to align the
24149 object. */
24151 unsigned int
24152 ix86_local_alignment (tree exp, enum machine_mode mode,
24153 unsigned int align)
24155 tree type, decl;
24157 if (exp && DECL_P (exp))
24159 type = TREE_TYPE (exp);
24160 decl = exp;
24162 else
24164 type = exp;
24165 decl = NULL;
24168 /* Don't do dynamic stack realignment for long long objects with
24169 -mpreferred-stack-boundary=2. */
24170 if (!TARGET_64BIT
24171 && align == 64
24172 && ix86_preferred_stack_boundary < 64
24173 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24174 && (!type || !TYPE_USER_ALIGN (type))
24175 && (!decl || !DECL_USER_ALIGN (decl)))
24176 align = 32;
24178 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24179 register in MODE. We will return the largest alignment of XF
24180 and DF. */
24181 if (!type)
24183 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24184 align = GET_MODE_ALIGNMENT (DFmode);
24185 return align;
24188 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
24189 to a 16-byte boundary. The exact wording is:
24191 An array uses the same alignment as its elements, except that a local or
24192 global array variable of length at least 16 bytes or
24193 a C99 variable-length array variable always has alignment of at least 16 bytes.
24195 This was added to allow the use of aligned SSE instructions on arrays. The
24196 rule is meant for static storage (where the compiler cannot do the analysis
24197 by itself). We follow it for automatic variables only when convenient.
24198 We fully control everything in the function being compiled, and functions
24199 from other units cannot rely on the alignment.
24201 Exclude the va_list type. It is the common case of a local array where
24202 we cannot benefit from the alignment. */
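/* For example (illustrative): a local "int buf[8]" (32 bytes) in a function
   optimized for speed is bumped from its natural alignment to 128 bits here,
   so the vectorizer can use aligned SSE loads and stores on it.  */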
24203 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24204 && TARGET_SSE)
24206 if (AGGREGATE_TYPE_P (type)
24207 && (va_list_type_node == NULL_TREE
24208 || (TYPE_MAIN_VARIANT (type)
24209 != TYPE_MAIN_VARIANT (va_list_type_node)))
24210 && TYPE_SIZE (type)
24211 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24212 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24213 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24214 return 128;
24216 if (TREE_CODE (type) == ARRAY_TYPE)
24218 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24219 return 64;
24220 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24221 return 128;
24223 else if (TREE_CODE (type) == COMPLEX_TYPE)
24225 if (TYPE_MODE (type) == DCmode && align < 64)
24226 return 64;
24227 if ((TYPE_MODE (type) == XCmode
24228 || TYPE_MODE (type) == TCmode) && align < 128)
24229 return 128;
24231 else if ((TREE_CODE (type) == RECORD_TYPE
24232 || TREE_CODE (type) == UNION_TYPE
24233 || TREE_CODE (type) == QUAL_UNION_TYPE)
24234 && TYPE_FIELDS (type))
24236 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24237 return 64;
24238 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24239 return 128;
24241 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24242 || TREE_CODE (type) == INTEGER_TYPE)
24245 if (TYPE_MODE (type) == DFmode && align < 64)
24246 return 64;
24247 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24248 return 128;
24250 return align;
24253 /* Compute the minimum required alignment for dynamic stack realignment
24254 purposes for a local variable, parameter or a stack slot. EXP is
24255 the data type or decl itself, MODE is its mode and ALIGN is the
24256 alignment that the object would ordinarily have. */
24258 unsigned int
24259 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24260 unsigned int align)
24262 tree type, decl;
24264 if (exp && DECL_P (exp))
24266 type = TREE_TYPE (exp);
24267 decl = exp;
24269 else
24271 type = exp;
24272 decl = NULL;
24275 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24276 return align;
24278 /* Don't do dynamic stack realignment for long long objects with
24279 -mpreferred-stack-boundary=2. */
24280 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24281 && (!type || !TYPE_USER_ALIGN (type))
24282 && (!decl || !DECL_USER_ALIGN (decl)))
24283 return 32;
24285 return align;
24288 /* Find a location for the static chain incoming to a nested function.
24289 This is a register, unless all free registers are used by arguments. */
24291 static rtx
24292 ix86_static_chain (const_tree fndecl, bool incoming_p)
24294 unsigned regno;
24296 if (!DECL_STATIC_CHAIN (fndecl))
24297 return NULL;
24299 if (TARGET_64BIT)
24301 /* We always use R10 in 64-bit mode. */
24302 regno = R10_REG;
24304 else
24306 tree fntype;
24307 unsigned int ccvt;
24309 /* By default in 32-bit mode we use ECX to pass the static chain. */
24310 regno = CX_REG;
24312 fntype = TREE_TYPE (fndecl);
24313 ccvt = ix86_get_callcvt (fntype);
24314 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24316 /* Fastcall functions use ecx/edx for arguments, which leaves
24317 us with EAX for the static chain.
24318 Thiscall functions use ecx for arguments, which also
24319 leaves us with EAX for the static chain. */
24320 regno = AX_REG;
24322 else if (ix86_function_regparm (fntype, fndecl) == 3)
24324 /* For regparm 3, we have no free call-clobbered registers in
24325 which to store the static chain. In order to implement this,
24326 we have the trampoline push the static chain to the stack.
24327 However, we can't push a value below the return address when
24328 we call the nested function directly, so we have to use an
24329 alternate entry point. For this we use ESI, and have the
24330 alternate entry point push ESI, so that things appear the
24331 same once we're executing the nested function. */
24332 if (incoming_p)
24334 if (fndecl == current_function_decl)
24335 ix86_static_chain_on_stack = true;
24336 return gen_frame_mem (SImode,
24337 plus_constant (arg_pointer_rtx, -8));
24339 regno = SI_REG;
24343 return gen_rtx_REG (Pmode, regno);
24346 /* Emit RTL insns to initialize the variable parts of a trampoline.
24347 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24348 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24349 to be passed to the target function. */
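/* Illustratively, the 64-bit trampoline written below is the byte sequence
     49 bb <imm64>   movabs $<fnaddr>, %r11
     49 ba <imm64>   movabs $<chain>,  %r10
     49 ff e3        jmp    *%r11
     90              nop (pads the final write to a full 32-bit store)
   with the shorter movl forms (41 bb / 41 ba plus imm32) used instead when
   ptr_mode == SImode or the function address fits a zero-extended 32-bit
   immediate.  */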
24351 static void
24352 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24354 rtx mem, fnaddr;
24355 int opcode;
24356 int offset = 0;
24358 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24360 if (TARGET_64BIT)
24362 int size;
24364 /* Load the function address into r11. Try to load the address using
24365 the shorter movl instead of movabs. We may want to support
24366 movq for kernel mode, but the kernel does not use trampolines at
24367 the moment. FNADDR is a 32bit address and may not be in
24368 DImode when ptr_mode == SImode. Always use movl in this
24369 case. */
24370 if (ptr_mode == SImode
24371 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24373 fnaddr = copy_addr_to_reg (fnaddr);
24375 mem = adjust_address (m_tramp, HImode, offset);
24376 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24378 mem = adjust_address (m_tramp, SImode, offset + 2);
24379 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24380 offset += 6;
24382 else
24384 mem = adjust_address (m_tramp, HImode, offset);
24385 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24387 mem = adjust_address (m_tramp, DImode, offset + 2);
24388 emit_move_insn (mem, fnaddr);
24389 offset += 10;
24392 /* Load static chain using movabs to r10. Use the shorter movl
24393 instead of movabs when ptr_mode == SImode. */
24394 if (ptr_mode == SImode)
24396 opcode = 0xba41;
24397 size = 6;
24399 else
24401 opcode = 0xba49;
24402 size = 10;
24405 mem = adjust_address (m_tramp, HImode, offset);
24406 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24408 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24409 emit_move_insn (mem, chain_value);
24410 offset += size;
24412 /* Jump to r11; the last (unused) byte is a nop, only there to
24413 pad the write out to a single 32-bit store. */
24414 mem = adjust_address (m_tramp, SImode, offset);
24415 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24416 offset += 4;
24418 else
24420 rtx disp, chain;
24422 /* Depending on the static chain location, either load a register
24423 with a constant, or push the constant to the stack. All of the
24424 instructions are the same size. */
24425 chain = ix86_static_chain (fndecl, true);
24426 if (REG_P (chain))
24428 switch (REGNO (chain))
24430 case AX_REG:
24431 opcode = 0xb8; break;
24432 case CX_REG:
24433 opcode = 0xb9; break;
24434 default:
24435 gcc_unreachable ();
24438 else
24439 opcode = 0x68;
24441 mem = adjust_address (m_tramp, QImode, offset);
24442 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24444 mem = adjust_address (m_tramp, SImode, offset + 1);
24445 emit_move_insn (mem, chain_value);
24446 offset += 5;
24448 mem = adjust_address (m_tramp, QImode, offset);
24449 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24451 mem = adjust_address (m_tramp, SImode, offset + 1);
24453 /* Compute offset from the end of the jmp to the target function.
24454 In the case in which the trampoline stores the static chain on
24455 the stack, we need to skip the first insn which pushes the
24456 (call-saved) register static chain; this push is 1 byte. */
24457 offset += 5;
24458 disp = expand_binop (SImode, sub_optab, fnaddr,
24459 plus_constant (XEXP (m_tramp, 0),
24460 offset - (MEM_P (chain) ? 1 : 0)),
24461 NULL_RTX, 1, OPTAB_DIRECT);
24462 emit_move_insn (mem, disp);
24465 gcc_assert (offset <= TRAMPOLINE_SIZE);
24467 #ifdef HAVE_ENABLE_EXECUTE_STACK
24468 #ifdef CHECK_EXECUTE_STACK_ENABLED
24469 if (CHECK_EXECUTE_STACK_ENABLED)
24470 #endif
24471 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24472 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24473 #endif
24476 /* The following file contains several enumerations and data structures
24477 built from the definitions in i386-builtin-types.def. */
24479 #include "i386-builtin-types.inc"
24481 /* Table for the ix86 builtin non-function types. */
24482 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24484 /* Retrieve an element from the above table, building some of
24485 the types lazily. */
24487 static tree
24488 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24490 unsigned int index;
24491 tree type, itype;
24493 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24495 type = ix86_builtin_type_tab[(int) tcode];
24496 if (type != NULL)
24497 return type;
24499 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24500 if (tcode <= IX86_BT_LAST_VECT)
24502 enum machine_mode mode;
24504 index = tcode - IX86_BT_LAST_PRIM - 1;
24505 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24506 mode = ix86_builtin_type_vect_mode[index];
24508 type = build_vector_type_for_mode (itype, mode);
24510 else
24512 int quals;
24514 index = tcode - IX86_BT_LAST_VECT - 1;
24515 if (tcode <= IX86_BT_LAST_PTR)
24516 quals = TYPE_UNQUALIFIED;
24517 else
24518 quals = TYPE_QUAL_CONST;
24520 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24521 if (quals != TYPE_UNQUALIFIED)
24522 itype = build_qualified_type (itype, quals);
24524 type = build_pointer_type (itype);
24527 ix86_builtin_type_tab[(int) tcode] = type;
24528 return type;
24531 /* Table for the ix86 builtin function types. */
24532 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24534 /* Retrieve an element from the above table, building some of
24535 the types lazily. */
24537 static tree
24538 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24540 tree type;
24542 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24544 type = ix86_builtin_func_type_tab[(int) tcode];
24545 if (type != NULL)
24546 return type;
24548 if (tcode <= IX86_BT_LAST_FUNC)
24550 unsigned start = ix86_builtin_func_start[(int) tcode];
24551 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24552 tree rtype, atype, args = void_list_node;
24553 unsigned i;
24555 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24556 for (i = after - 1; i > start; --i)
24558 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24559 args = tree_cons (NULL, atype, args);
24562 type = build_function_type (rtype, args);
24564 else
24566 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24567 enum ix86_builtin_func_type icode;
24569 icode = ix86_builtin_func_alias_base[index];
24570 type = ix86_get_builtin_func_type (icode);
24573 ix86_builtin_func_type_tab[(int) tcode] = type;
24574 return type;
24578 /* Codes for all the SSE/MMX builtins. */
24579 enum ix86_builtins
24581 IX86_BUILTIN_ADDPS,
24582 IX86_BUILTIN_ADDSS,
24583 IX86_BUILTIN_DIVPS,
24584 IX86_BUILTIN_DIVSS,
24585 IX86_BUILTIN_MULPS,
24586 IX86_BUILTIN_MULSS,
24587 IX86_BUILTIN_SUBPS,
24588 IX86_BUILTIN_SUBSS,
24590 IX86_BUILTIN_CMPEQPS,
24591 IX86_BUILTIN_CMPLTPS,
24592 IX86_BUILTIN_CMPLEPS,
24593 IX86_BUILTIN_CMPGTPS,
24594 IX86_BUILTIN_CMPGEPS,
24595 IX86_BUILTIN_CMPNEQPS,
24596 IX86_BUILTIN_CMPNLTPS,
24597 IX86_BUILTIN_CMPNLEPS,
24598 IX86_BUILTIN_CMPNGTPS,
24599 IX86_BUILTIN_CMPNGEPS,
24600 IX86_BUILTIN_CMPORDPS,
24601 IX86_BUILTIN_CMPUNORDPS,
24602 IX86_BUILTIN_CMPEQSS,
24603 IX86_BUILTIN_CMPLTSS,
24604 IX86_BUILTIN_CMPLESS,
24605 IX86_BUILTIN_CMPNEQSS,
24606 IX86_BUILTIN_CMPNLTSS,
24607 IX86_BUILTIN_CMPNLESS,
24608 IX86_BUILTIN_CMPNGTSS,
24609 IX86_BUILTIN_CMPNGESS,
24610 IX86_BUILTIN_CMPORDSS,
24611 IX86_BUILTIN_CMPUNORDSS,
24613 IX86_BUILTIN_COMIEQSS,
24614 IX86_BUILTIN_COMILTSS,
24615 IX86_BUILTIN_COMILESS,
24616 IX86_BUILTIN_COMIGTSS,
24617 IX86_BUILTIN_COMIGESS,
24618 IX86_BUILTIN_COMINEQSS,
24619 IX86_BUILTIN_UCOMIEQSS,
24620 IX86_BUILTIN_UCOMILTSS,
24621 IX86_BUILTIN_UCOMILESS,
24622 IX86_BUILTIN_UCOMIGTSS,
24623 IX86_BUILTIN_UCOMIGESS,
24624 IX86_BUILTIN_UCOMINEQSS,
24626 IX86_BUILTIN_CVTPI2PS,
24627 IX86_BUILTIN_CVTPS2PI,
24628 IX86_BUILTIN_CVTSI2SS,
24629 IX86_BUILTIN_CVTSI642SS,
24630 IX86_BUILTIN_CVTSS2SI,
24631 IX86_BUILTIN_CVTSS2SI64,
24632 IX86_BUILTIN_CVTTPS2PI,
24633 IX86_BUILTIN_CVTTSS2SI,
24634 IX86_BUILTIN_CVTTSS2SI64,
24636 IX86_BUILTIN_MAXPS,
24637 IX86_BUILTIN_MAXSS,
24638 IX86_BUILTIN_MINPS,
24639 IX86_BUILTIN_MINSS,
24641 IX86_BUILTIN_LOADUPS,
24642 IX86_BUILTIN_STOREUPS,
24643 IX86_BUILTIN_MOVSS,
24645 IX86_BUILTIN_MOVHLPS,
24646 IX86_BUILTIN_MOVLHPS,
24647 IX86_BUILTIN_LOADHPS,
24648 IX86_BUILTIN_LOADLPS,
24649 IX86_BUILTIN_STOREHPS,
24650 IX86_BUILTIN_STORELPS,
24652 IX86_BUILTIN_MASKMOVQ,
24653 IX86_BUILTIN_MOVMSKPS,
24654 IX86_BUILTIN_PMOVMSKB,
24656 IX86_BUILTIN_MOVNTPS,
24657 IX86_BUILTIN_MOVNTQ,
24659 IX86_BUILTIN_LOADDQU,
24660 IX86_BUILTIN_STOREDQU,
24662 IX86_BUILTIN_PACKSSWB,
24663 IX86_BUILTIN_PACKSSDW,
24664 IX86_BUILTIN_PACKUSWB,
24666 IX86_BUILTIN_PADDB,
24667 IX86_BUILTIN_PADDW,
24668 IX86_BUILTIN_PADDD,
24669 IX86_BUILTIN_PADDQ,
24670 IX86_BUILTIN_PADDSB,
24671 IX86_BUILTIN_PADDSW,
24672 IX86_BUILTIN_PADDUSB,
24673 IX86_BUILTIN_PADDUSW,
24674 IX86_BUILTIN_PSUBB,
24675 IX86_BUILTIN_PSUBW,
24676 IX86_BUILTIN_PSUBD,
24677 IX86_BUILTIN_PSUBQ,
24678 IX86_BUILTIN_PSUBSB,
24679 IX86_BUILTIN_PSUBSW,
24680 IX86_BUILTIN_PSUBUSB,
24681 IX86_BUILTIN_PSUBUSW,
24683 IX86_BUILTIN_PAND,
24684 IX86_BUILTIN_PANDN,
24685 IX86_BUILTIN_POR,
24686 IX86_BUILTIN_PXOR,
24688 IX86_BUILTIN_PAVGB,
24689 IX86_BUILTIN_PAVGW,
24691 IX86_BUILTIN_PCMPEQB,
24692 IX86_BUILTIN_PCMPEQW,
24693 IX86_BUILTIN_PCMPEQD,
24694 IX86_BUILTIN_PCMPGTB,
24695 IX86_BUILTIN_PCMPGTW,
24696 IX86_BUILTIN_PCMPGTD,
24698 IX86_BUILTIN_PMADDWD,
24700 IX86_BUILTIN_PMAXSW,
24701 IX86_BUILTIN_PMAXUB,
24702 IX86_BUILTIN_PMINSW,
24703 IX86_BUILTIN_PMINUB,
24705 IX86_BUILTIN_PMULHUW,
24706 IX86_BUILTIN_PMULHW,
24707 IX86_BUILTIN_PMULLW,
24709 IX86_BUILTIN_PSADBW,
24710 IX86_BUILTIN_PSHUFW,
24712 IX86_BUILTIN_PSLLW,
24713 IX86_BUILTIN_PSLLD,
24714 IX86_BUILTIN_PSLLQ,
24715 IX86_BUILTIN_PSRAW,
24716 IX86_BUILTIN_PSRAD,
24717 IX86_BUILTIN_PSRLW,
24718 IX86_BUILTIN_PSRLD,
24719 IX86_BUILTIN_PSRLQ,
24720 IX86_BUILTIN_PSLLWI,
24721 IX86_BUILTIN_PSLLDI,
24722 IX86_BUILTIN_PSLLQI,
24723 IX86_BUILTIN_PSRAWI,
24724 IX86_BUILTIN_PSRADI,
24725 IX86_BUILTIN_PSRLWI,
24726 IX86_BUILTIN_PSRLDI,
24727 IX86_BUILTIN_PSRLQI,
24729 IX86_BUILTIN_PUNPCKHBW,
24730 IX86_BUILTIN_PUNPCKHWD,
24731 IX86_BUILTIN_PUNPCKHDQ,
24732 IX86_BUILTIN_PUNPCKLBW,
24733 IX86_BUILTIN_PUNPCKLWD,
24734 IX86_BUILTIN_PUNPCKLDQ,
24736 IX86_BUILTIN_SHUFPS,
24738 IX86_BUILTIN_RCPPS,
24739 IX86_BUILTIN_RCPSS,
24740 IX86_BUILTIN_RSQRTPS,
24741 IX86_BUILTIN_RSQRTPS_NR,
24742 IX86_BUILTIN_RSQRTSS,
24743 IX86_BUILTIN_RSQRTF,
24744 IX86_BUILTIN_SQRTPS,
24745 IX86_BUILTIN_SQRTPS_NR,
24746 IX86_BUILTIN_SQRTSS,
24748 IX86_BUILTIN_UNPCKHPS,
24749 IX86_BUILTIN_UNPCKLPS,
24751 IX86_BUILTIN_ANDPS,
24752 IX86_BUILTIN_ANDNPS,
24753 IX86_BUILTIN_ORPS,
24754 IX86_BUILTIN_XORPS,
24756 IX86_BUILTIN_EMMS,
24757 IX86_BUILTIN_LDMXCSR,
24758 IX86_BUILTIN_STMXCSR,
24759 IX86_BUILTIN_SFENCE,
24761 /* 3DNow! Original */
24762 IX86_BUILTIN_FEMMS,
24763 IX86_BUILTIN_PAVGUSB,
24764 IX86_BUILTIN_PF2ID,
24765 IX86_BUILTIN_PFACC,
24766 IX86_BUILTIN_PFADD,
24767 IX86_BUILTIN_PFCMPEQ,
24768 IX86_BUILTIN_PFCMPGE,
24769 IX86_BUILTIN_PFCMPGT,
24770 IX86_BUILTIN_PFMAX,
24771 IX86_BUILTIN_PFMIN,
24772 IX86_BUILTIN_PFMUL,
24773 IX86_BUILTIN_PFRCP,
24774 IX86_BUILTIN_PFRCPIT1,
24775 IX86_BUILTIN_PFRCPIT2,
24776 IX86_BUILTIN_PFRSQIT1,
24777 IX86_BUILTIN_PFRSQRT,
24778 IX86_BUILTIN_PFSUB,
24779 IX86_BUILTIN_PFSUBR,
24780 IX86_BUILTIN_PI2FD,
24781 IX86_BUILTIN_PMULHRW,
24783 /* 3DNow! Athlon Extensions */
24784 IX86_BUILTIN_PF2IW,
24785 IX86_BUILTIN_PFNACC,
24786 IX86_BUILTIN_PFPNACC,
24787 IX86_BUILTIN_PI2FW,
24788 IX86_BUILTIN_PSWAPDSI,
24789 IX86_BUILTIN_PSWAPDSF,
24791 /* SSE2 */
24792 IX86_BUILTIN_ADDPD,
24793 IX86_BUILTIN_ADDSD,
24794 IX86_BUILTIN_DIVPD,
24795 IX86_BUILTIN_DIVSD,
24796 IX86_BUILTIN_MULPD,
24797 IX86_BUILTIN_MULSD,
24798 IX86_BUILTIN_SUBPD,
24799 IX86_BUILTIN_SUBSD,
24801 IX86_BUILTIN_CMPEQPD,
24802 IX86_BUILTIN_CMPLTPD,
24803 IX86_BUILTIN_CMPLEPD,
24804 IX86_BUILTIN_CMPGTPD,
24805 IX86_BUILTIN_CMPGEPD,
24806 IX86_BUILTIN_CMPNEQPD,
24807 IX86_BUILTIN_CMPNLTPD,
24808 IX86_BUILTIN_CMPNLEPD,
24809 IX86_BUILTIN_CMPNGTPD,
24810 IX86_BUILTIN_CMPNGEPD,
24811 IX86_BUILTIN_CMPORDPD,
24812 IX86_BUILTIN_CMPUNORDPD,
24813 IX86_BUILTIN_CMPEQSD,
24814 IX86_BUILTIN_CMPLTSD,
24815 IX86_BUILTIN_CMPLESD,
24816 IX86_BUILTIN_CMPNEQSD,
24817 IX86_BUILTIN_CMPNLTSD,
24818 IX86_BUILTIN_CMPNLESD,
24819 IX86_BUILTIN_CMPORDSD,
24820 IX86_BUILTIN_CMPUNORDSD,
24822 IX86_BUILTIN_COMIEQSD,
24823 IX86_BUILTIN_COMILTSD,
24824 IX86_BUILTIN_COMILESD,
24825 IX86_BUILTIN_COMIGTSD,
24826 IX86_BUILTIN_COMIGESD,
24827 IX86_BUILTIN_COMINEQSD,
24828 IX86_BUILTIN_UCOMIEQSD,
24829 IX86_BUILTIN_UCOMILTSD,
24830 IX86_BUILTIN_UCOMILESD,
24831 IX86_BUILTIN_UCOMIGTSD,
24832 IX86_BUILTIN_UCOMIGESD,
24833 IX86_BUILTIN_UCOMINEQSD,
24835 IX86_BUILTIN_MAXPD,
24836 IX86_BUILTIN_MAXSD,
24837 IX86_BUILTIN_MINPD,
24838 IX86_BUILTIN_MINSD,
24840 IX86_BUILTIN_ANDPD,
24841 IX86_BUILTIN_ANDNPD,
24842 IX86_BUILTIN_ORPD,
24843 IX86_BUILTIN_XORPD,
24845 IX86_BUILTIN_SQRTPD,
24846 IX86_BUILTIN_SQRTSD,
24848 IX86_BUILTIN_UNPCKHPD,
24849 IX86_BUILTIN_UNPCKLPD,
24851 IX86_BUILTIN_SHUFPD,
24853 IX86_BUILTIN_LOADUPD,
24854 IX86_BUILTIN_STOREUPD,
24855 IX86_BUILTIN_MOVSD,
24857 IX86_BUILTIN_LOADHPD,
24858 IX86_BUILTIN_LOADLPD,
24860 IX86_BUILTIN_CVTDQ2PD,
24861 IX86_BUILTIN_CVTDQ2PS,
24863 IX86_BUILTIN_CVTPD2DQ,
24864 IX86_BUILTIN_CVTPD2PI,
24865 IX86_BUILTIN_CVTPD2PS,
24866 IX86_BUILTIN_CVTTPD2DQ,
24867 IX86_BUILTIN_CVTTPD2PI,
24869 IX86_BUILTIN_CVTPI2PD,
24870 IX86_BUILTIN_CVTSI2SD,
24871 IX86_BUILTIN_CVTSI642SD,
24873 IX86_BUILTIN_CVTSD2SI,
24874 IX86_BUILTIN_CVTSD2SI64,
24875 IX86_BUILTIN_CVTSD2SS,
24876 IX86_BUILTIN_CVTSS2SD,
24877 IX86_BUILTIN_CVTTSD2SI,
24878 IX86_BUILTIN_CVTTSD2SI64,
24880 IX86_BUILTIN_CVTPS2DQ,
24881 IX86_BUILTIN_CVTPS2PD,
24882 IX86_BUILTIN_CVTTPS2DQ,
24884 IX86_BUILTIN_MOVNTI,
24885 IX86_BUILTIN_MOVNTI64,
24886 IX86_BUILTIN_MOVNTPD,
24887 IX86_BUILTIN_MOVNTDQ,
24889 IX86_BUILTIN_MOVQ128,
24891 /* SSE2 MMX */
24892 IX86_BUILTIN_MASKMOVDQU,
24893 IX86_BUILTIN_MOVMSKPD,
24894 IX86_BUILTIN_PMOVMSKB128,
24896 IX86_BUILTIN_PACKSSWB128,
24897 IX86_BUILTIN_PACKSSDW128,
24898 IX86_BUILTIN_PACKUSWB128,
24900 IX86_BUILTIN_PADDB128,
24901 IX86_BUILTIN_PADDW128,
24902 IX86_BUILTIN_PADDD128,
24903 IX86_BUILTIN_PADDQ128,
24904 IX86_BUILTIN_PADDSB128,
24905 IX86_BUILTIN_PADDSW128,
24906 IX86_BUILTIN_PADDUSB128,
24907 IX86_BUILTIN_PADDUSW128,
24908 IX86_BUILTIN_PSUBB128,
24909 IX86_BUILTIN_PSUBW128,
24910 IX86_BUILTIN_PSUBD128,
24911 IX86_BUILTIN_PSUBQ128,
24912 IX86_BUILTIN_PSUBSB128,
24913 IX86_BUILTIN_PSUBSW128,
24914 IX86_BUILTIN_PSUBUSB128,
24915 IX86_BUILTIN_PSUBUSW128,
24917 IX86_BUILTIN_PAND128,
24918 IX86_BUILTIN_PANDN128,
24919 IX86_BUILTIN_POR128,
24920 IX86_BUILTIN_PXOR128,
24922 IX86_BUILTIN_PAVGB128,
24923 IX86_BUILTIN_PAVGW128,
24925 IX86_BUILTIN_PCMPEQB128,
24926 IX86_BUILTIN_PCMPEQW128,
24927 IX86_BUILTIN_PCMPEQD128,
24928 IX86_BUILTIN_PCMPGTB128,
24929 IX86_BUILTIN_PCMPGTW128,
24930 IX86_BUILTIN_PCMPGTD128,
24932 IX86_BUILTIN_PMADDWD128,
24934 IX86_BUILTIN_PMAXSW128,
24935 IX86_BUILTIN_PMAXUB128,
24936 IX86_BUILTIN_PMINSW128,
24937 IX86_BUILTIN_PMINUB128,
24939 IX86_BUILTIN_PMULUDQ,
24940 IX86_BUILTIN_PMULUDQ128,
24941 IX86_BUILTIN_PMULHUW128,
24942 IX86_BUILTIN_PMULHW128,
24943 IX86_BUILTIN_PMULLW128,
24945 IX86_BUILTIN_PSADBW128,
24946 IX86_BUILTIN_PSHUFHW,
24947 IX86_BUILTIN_PSHUFLW,
24948 IX86_BUILTIN_PSHUFD,
24950 IX86_BUILTIN_PSLLDQI128,
24951 IX86_BUILTIN_PSLLWI128,
24952 IX86_BUILTIN_PSLLDI128,
24953 IX86_BUILTIN_PSLLQI128,
24954 IX86_BUILTIN_PSRAWI128,
24955 IX86_BUILTIN_PSRADI128,
24956 IX86_BUILTIN_PSRLDQI128,
24957 IX86_BUILTIN_PSRLWI128,
24958 IX86_BUILTIN_PSRLDI128,
24959 IX86_BUILTIN_PSRLQI128,
24961 IX86_BUILTIN_PSLLDQ128,
24962 IX86_BUILTIN_PSLLW128,
24963 IX86_BUILTIN_PSLLD128,
24964 IX86_BUILTIN_PSLLQ128,
24965 IX86_BUILTIN_PSRAW128,
24966 IX86_BUILTIN_PSRAD128,
24967 IX86_BUILTIN_PSRLW128,
24968 IX86_BUILTIN_PSRLD128,
24969 IX86_BUILTIN_PSRLQ128,
24971 IX86_BUILTIN_PUNPCKHBW128,
24972 IX86_BUILTIN_PUNPCKHWD128,
24973 IX86_BUILTIN_PUNPCKHDQ128,
24974 IX86_BUILTIN_PUNPCKHQDQ128,
24975 IX86_BUILTIN_PUNPCKLBW128,
24976 IX86_BUILTIN_PUNPCKLWD128,
24977 IX86_BUILTIN_PUNPCKLDQ128,
24978 IX86_BUILTIN_PUNPCKLQDQ128,
24980 IX86_BUILTIN_CLFLUSH,
24981 IX86_BUILTIN_MFENCE,
24982 IX86_BUILTIN_LFENCE,
24983 IX86_BUILTIN_PAUSE,
24985 IX86_BUILTIN_BSRSI,
24986 IX86_BUILTIN_BSRDI,
24987 IX86_BUILTIN_RDPMC,
24988 IX86_BUILTIN_RDTSC,
24989 IX86_BUILTIN_RDTSCP,
24990 IX86_BUILTIN_ROLQI,
24991 IX86_BUILTIN_ROLHI,
24992 IX86_BUILTIN_RORQI,
24993 IX86_BUILTIN_RORHI,
24995 /* SSE3. */
24996 IX86_BUILTIN_ADDSUBPS,
24997 IX86_BUILTIN_HADDPS,
24998 IX86_BUILTIN_HSUBPS,
24999 IX86_BUILTIN_MOVSHDUP,
25000 IX86_BUILTIN_MOVSLDUP,
25001 IX86_BUILTIN_ADDSUBPD,
25002 IX86_BUILTIN_HADDPD,
25003 IX86_BUILTIN_HSUBPD,
25004 IX86_BUILTIN_LDDQU,
25006 IX86_BUILTIN_MONITOR,
25007 IX86_BUILTIN_MWAIT,
25009 /* SSSE3. */
25010 IX86_BUILTIN_PHADDW,
25011 IX86_BUILTIN_PHADDD,
25012 IX86_BUILTIN_PHADDSW,
25013 IX86_BUILTIN_PHSUBW,
25014 IX86_BUILTIN_PHSUBD,
25015 IX86_BUILTIN_PHSUBSW,
25016 IX86_BUILTIN_PMADDUBSW,
25017 IX86_BUILTIN_PMULHRSW,
25018 IX86_BUILTIN_PSHUFB,
25019 IX86_BUILTIN_PSIGNB,
25020 IX86_BUILTIN_PSIGNW,
25021 IX86_BUILTIN_PSIGND,
25022 IX86_BUILTIN_PALIGNR,
25023 IX86_BUILTIN_PABSB,
25024 IX86_BUILTIN_PABSW,
25025 IX86_BUILTIN_PABSD,
25027 IX86_BUILTIN_PHADDW128,
25028 IX86_BUILTIN_PHADDD128,
25029 IX86_BUILTIN_PHADDSW128,
25030 IX86_BUILTIN_PHSUBW128,
25031 IX86_BUILTIN_PHSUBD128,
25032 IX86_BUILTIN_PHSUBSW128,
25033 IX86_BUILTIN_PMADDUBSW128,
25034 IX86_BUILTIN_PMULHRSW128,
25035 IX86_BUILTIN_PSHUFB128,
25036 IX86_BUILTIN_PSIGNB128,
25037 IX86_BUILTIN_PSIGNW128,
25038 IX86_BUILTIN_PSIGND128,
25039 IX86_BUILTIN_PALIGNR128,
25040 IX86_BUILTIN_PABSB128,
25041 IX86_BUILTIN_PABSW128,
25042 IX86_BUILTIN_PABSD128,
25044 /* AMDFAM10 - SSE4A New Instructions. */
25045 IX86_BUILTIN_MOVNTSD,
25046 IX86_BUILTIN_MOVNTSS,
25047 IX86_BUILTIN_EXTRQI,
25048 IX86_BUILTIN_EXTRQ,
25049 IX86_BUILTIN_INSERTQI,
25050 IX86_BUILTIN_INSERTQ,
25052 /* SSE4.1. */
25053 IX86_BUILTIN_BLENDPD,
25054 IX86_BUILTIN_BLENDPS,
25055 IX86_BUILTIN_BLENDVPD,
25056 IX86_BUILTIN_BLENDVPS,
25057 IX86_BUILTIN_PBLENDVB128,
25058 IX86_BUILTIN_PBLENDW128,
25060 IX86_BUILTIN_DPPD,
25061 IX86_BUILTIN_DPPS,
25063 IX86_BUILTIN_INSERTPS128,
25065 IX86_BUILTIN_MOVNTDQA,
25066 IX86_BUILTIN_MPSADBW128,
25067 IX86_BUILTIN_PACKUSDW128,
25068 IX86_BUILTIN_PCMPEQQ,
25069 IX86_BUILTIN_PHMINPOSUW128,
25071 IX86_BUILTIN_PMAXSB128,
25072 IX86_BUILTIN_PMAXSD128,
25073 IX86_BUILTIN_PMAXUD128,
25074 IX86_BUILTIN_PMAXUW128,
25076 IX86_BUILTIN_PMINSB128,
25077 IX86_BUILTIN_PMINSD128,
25078 IX86_BUILTIN_PMINUD128,
25079 IX86_BUILTIN_PMINUW128,
25081 IX86_BUILTIN_PMOVSXBW128,
25082 IX86_BUILTIN_PMOVSXBD128,
25083 IX86_BUILTIN_PMOVSXBQ128,
25084 IX86_BUILTIN_PMOVSXWD128,
25085 IX86_BUILTIN_PMOVSXWQ128,
25086 IX86_BUILTIN_PMOVSXDQ128,
25088 IX86_BUILTIN_PMOVZXBW128,
25089 IX86_BUILTIN_PMOVZXBD128,
25090 IX86_BUILTIN_PMOVZXBQ128,
25091 IX86_BUILTIN_PMOVZXWD128,
25092 IX86_BUILTIN_PMOVZXWQ128,
25093 IX86_BUILTIN_PMOVZXDQ128,
25095 IX86_BUILTIN_PMULDQ128,
25096 IX86_BUILTIN_PMULLD128,
25098 IX86_BUILTIN_ROUNDSD,
25099 IX86_BUILTIN_ROUNDSS,
25101 IX86_BUILTIN_ROUNDPD,
25102 IX86_BUILTIN_ROUNDPS,
25104 IX86_BUILTIN_FLOORPD,
25105 IX86_BUILTIN_CEILPD,
25106 IX86_BUILTIN_TRUNCPD,
25107 IX86_BUILTIN_RINTPD,
25108 IX86_BUILTIN_ROUNDPD_AZ,
25110 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25111 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25112 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25114 IX86_BUILTIN_FLOORPS,
25115 IX86_BUILTIN_CEILPS,
25116 IX86_BUILTIN_TRUNCPS,
25117 IX86_BUILTIN_RINTPS,
25118 IX86_BUILTIN_ROUNDPS_AZ,
25120 IX86_BUILTIN_FLOORPS_SFIX,
25121 IX86_BUILTIN_CEILPS_SFIX,
25122 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25124 IX86_BUILTIN_PTESTZ,
25125 IX86_BUILTIN_PTESTC,
25126 IX86_BUILTIN_PTESTNZC,
25128 IX86_BUILTIN_VEC_INIT_V2SI,
25129 IX86_BUILTIN_VEC_INIT_V4HI,
25130 IX86_BUILTIN_VEC_INIT_V8QI,
25131 IX86_BUILTIN_VEC_EXT_V2DF,
25132 IX86_BUILTIN_VEC_EXT_V2DI,
25133 IX86_BUILTIN_VEC_EXT_V4SF,
25134 IX86_BUILTIN_VEC_EXT_V4SI,
25135 IX86_BUILTIN_VEC_EXT_V8HI,
25136 IX86_BUILTIN_VEC_EXT_V2SI,
25137 IX86_BUILTIN_VEC_EXT_V4HI,
25138 IX86_BUILTIN_VEC_EXT_V16QI,
25139 IX86_BUILTIN_VEC_SET_V2DI,
25140 IX86_BUILTIN_VEC_SET_V4SF,
25141 IX86_BUILTIN_VEC_SET_V4SI,
25142 IX86_BUILTIN_VEC_SET_V8HI,
25143 IX86_BUILTIN_VEC_SET_V4HI,
25144 IX86_BUILTIN_VEC_SET_V16QI,
25146 IX86_BUILTIN_VEC_PACK_SFIX,
25147 IX86_BUILTIN_VEC_PACK_SFIX256,
25149 /* SSE4.2. */
25150 IX86_BUILTIN_CRC32QI,
25151 IX86_BUILTIN_CRC32HI,
25152 IX86_BUILTIN_CRC32SI,
25153 IX86_BUILTIN_CRC32DI,
25155 IX86_BUILTIN_PCMPESTRI128,
25156 IX86_BUILTIN_PCMPESTRM128,
25157 IX86_BUILTIN_PCMPESTRA128,
25158 IX86_BUILTIN_PCMPESTRC128,
25159 IX86_BUILTIN_PCMPESTRO128,
25160 IX86_BUILTIN_PCMPESTRS128,
25161 IX86_BUILTIN_PCMPESTRZ128,
25162 IX86_BUILTIN_PCMPISTRI128,
25163 IX86_BUILTIN_PCMPISTRM128,
25164 IX86_BUILTIN_PCMPISTRA128,
25165 IX86_BUILTIN_PCMPISTRC128,
25166 IX86_BUILTIN_PCMPISTRO128,
25167 IX86_BUILTIN_PCMPISTRS128,
25168 IX86_BUILTIN_PCMPISTRZ128,
25170 IX86_BUILTIN_PCMPGTQ,
25172 /* AES instructions */
25173 IX86_BUILTIN_AESENC128,
25174 IX86_BUILTIN_AESENCLAST128,
25175 IX86_BUILTIN_AESDEC128,
25176 IX86_BUILTIN_AESDECLAST128,
25177 IX86_BUILTIN_AESIMC128,
25178 IX86_BUILTIN_AESKEYGENASSIST128,
25180 /* PCLMUL instruction */
25181 IX86_BUILTIN_PCLMULQDQ128,
25183 /* AVX */
25184 IX86_BUILTIN_ADDPD256,
25185 IX86_BUILTIN_ADDPS256,
25186 IX86_BUILTIN_ADDSUBPD256,
25187 IX86_BUILTIN_ADDSUBPS256,
25188 IX86_BUILTIN_ANDPD256,
25189 IX86_BUILTIN_ANDPS256,
25190 IX86_BUILTIN_ANDNPD256,
25191 IX86_BUILTIN_ANDNPS256,
25192 IX86_BUILTIN_BLENDPD256,
25193 IX86_BUILTIN_BLENDPS256,
25194 IX86_BUILTIN_BLENDVPD256,
25195 IX86_BUILTIN_BLENDVPS256,
25196 IX86_BUILTIN_DIVPD256,
25197 IX86_BUILTIN_DIVPS256,
25198 IX86_BUILTIN_DPPS256,
25199 IX86_BUILTIN_HADDPD256,
25200 IX86_BUILTIN_HADDPS256,
25201 IX86_BUILTIN_HSUBPD256,
25202 IX86_BUILTIN_HSUBPS256,
25203 IX86_BUILTIN_MAXPD256,
25204 IX86_BUILTIN_MAXPS256,
25205 IX86_BUILTIN_MINPD256,
25206 IX86_BUILTIN_MINPS256,
25207 IX86_BUILTIN_MULPD256,
25208 IX86_BUILTIN_MULPS256,
25209 IX86_BUILTIN_ORPD256,
25210 IX86_BUILTIN_ORPS256,
25211 IX86_BUILTIN_SHUFPD256,
25212 IX86_BUILTIN_SHUFPS256,
25213 IX86_BUILTIN_SUBPD256,
25214 IX86_BUILTIN_SUBPS256,
25215 IX86_BUILTIN_XORPD256,
25216 IX86_BUILTIN_XORPS256,
25217 IX86_BUILTIN_CMPSD,
25218 IX86_BUILTIN_CMPSS,
25219 IX86_BUILTIN_CMPPD,
25220 IX86_BUILTIN_CMPPS,
25221 IX86_BUILTIN_CMPPD256,
25222 IX86_BUILTIN_CMPPS256,
25223 IX86_BUILTIN_CVTDQ2PD256,
25224 IX86_BUILTIN_CVTDQ2PS256,
25225 IX86_BUILTIN_CVTPD2PS256,
25226 IX86_BUILTIN_CVTPS2DQ256,
25227 IX86_BUILTIN_CVTPS2PD256,
25228 IX86_BUILTIN_CVTTPD2DQ256,
25229 IX86_BUILTIN_CVTPD2DQ256,
25230 IX86_BUILTIN_CVTTPS2DQ256,
25231 IX86_BUILTIN_EXTRACTF128PD256,
25232 IX86_BUILTIN_EXTRACTF128PS256,
25233 IX86_BUILTIN_EXTRACTF128SI256,
25234 IX86_BUILTIN_VZEROALL,
25235 IX86_BUILTIN_VZEROUPPER,
25236 IX86_BUILTIN_VPERMILVARPD,
25237 IX86_BUILTIN_VPERMILVARPS,
25238 IX86_BUILTIN_VPERMILVARPD256,
25239 IX86_BUILTIN_VPERMILVARPS256,
25240 IX86_BUILTIN_VPERMILPD,
25241 IX86_BUILTIN_VPERMILPS,
25242 IX86_BUILTIN_VPERMILPD256,
25243 IX86_BUILTIN_VPERMILPS256,
25244 IX86_BUILTIN_VPERMIL2PD,
25245 IX86_BUILTIN_VPERMIL2PS,
25246 IX86_BUILTIN_VPERMIL2PD256,
25247 IX86_BUILTIN_VPERMIL2PS256,
25248 IX86_BUILTIN_VPERM2F128PD256,
25249 IX86_BUILTIN_VPERM2F128PS256,
25250 IX86_BUILTIN_VPERM2F128SI256,
25251 IX86_BUILTIN_VBROADCASTSS,
25252 IX86_BUILTIN_VBROADCASTSD256,
25253 IX86_BUILTIN_VBROADCASTSS256,
25254 IX86_BUILTIN_VBROADCASTPD256,
25255 IX86_BUILTIN_VBROADCASTPS256,
25256 IX86_BUILTIN_VINSERTF128PD256,
25257 IX86_BUILTIN_VINSERTF128PS256,
25258 IX86_BUILTIN_VINSERTF128SI256,
25259 IX86_BUILTIN_LOADUPD256,
25260 IX86_BUILTIN_LOADUPS256,
25261 IX86_BUILTIN_STOREUPD256,
25262 IX86_BUILTIN_STOREUPS256,
25263 IX86_BUILTIN_LDDQU256,
25264 IX86_BUILTIN_MOVNTDQ256,
25265 IX86_BUILTIN_MOVNTPD256,
25266 IX86_BUILTIN_MOVNTPS256,
25267 IX86_BUILTIN_LOADDQU256,
25268 IX86_BUILTIN_STOREDQU256,
25269 IX86_BUILTIN_MASKLOADPD,
25270 IX86_BUILTIN_MASKLOADPS,
25271 IX86_BUILTIN_MASKSTOREPD,
25272 IX86_BUILTIN_MASKSTOREPS,
25273 IX86_BUILTIN_MASKLOADPD256,
25274 IX86_BUILTIN_MASKLOADPS256,
25275 IX86_BUILTIN_MASKSTOREPD256,
25276 IX86_BUILTIN_MASKSTOREPS256,
25277 IX86_BUILTIN_MOVSHDUP256,
25278 IX86_BUILTIN_MOVSLDUP256,
25279 IX86_BUILTIN_MOVDDUP256,
25281 IX86_BUILTIN_SQRTPD256,
25282 IX86_BUILTIN_SQRTPS256,
25283 IX86_BUILTIN_SQRTPS_NR256,
25284 IX86_BUILTIN_RSQRTPS256,
25285 IX86_BUILTIN_RSQRTPS_NR256,
25287 IX86_BUILTIN_RCPPS256,
25289 IX86_BUILTIN_ROUNDPD256,
25290 IX86_BUILTIN_ROUNDPS256,
25292 IX86_BUILTIN_FLOORPD256,
25293 IX86_BUILTIN_CEILPD256,
25294 IX86_BUILTIN_TRUNCPD256,
25295 IX86_BUILTIN_RINTPD256,
25296 IX86_BUILTIN_ROUNDPD_AZ256,
25298 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25299 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25300 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25302 IX86_BUILTIN_FLOORPS256,
25303 IX86_BUILTIN_CEILPS256,
25304 IX86_BUILTIN_TRUNCPS256,
25305 IX86_BUILTIN_RINTPS256,
25306 IX86_BUILTIN_ROUNDPS_AZ256,
25308 IX86_BUILTIN_FLOORPS_SFIX256,
25309 IX86_BUILTIN_CEILPS_SFIX256,
25310 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25312 IX86_BUILTIN_UNPCKHPD256,
25313 IX86_BUILTIN_UNPCKLPD256,
25314 IX86_BUILTIN_UNPCKHPS256,
25315 IX86_BUILTIN_UNPCKLPS256,
25317 IX86_BUILTIN_SI256_SI,
25318 IX86_BUILTIN_PS256_PS,
25319 IX86_BUILTIN_PD256_PD,
25320 IX86_BUILTIN_SI_SI256,
25321 IX86_BUILTIN_PS_PS256,
25322 IX86_BUILTIN_PD_PD256,
25324 IX86_BUILTIN_VTESTZPD,
25325 IX86_BUILTIN_VTESTCPD,
25326 IX86_BUILTIN_VTESTNZCPD,
25327 IX86_BUILTIN_VTESTZPS,
25328 IX86_BUILTIN_VTESTCPS,
25329 IX86_BUILTIN_VTESTNZCPS,
25330 IX86_BUILTIN_VTESTZPD256,
25331 IX86_BUILTIN_VTESTCPD256,
25332 IX86_BUILTIN_VTESTNZCPD256,
25333 IX86_BUILTIN_VTESTZPS256,
25334 IX86_BUILTIN_VTESTCPS256,
25335 IX86_BUILTIN_VTESTNZCPS256,
25336 IX86_BUILTIN_PTESTZ256,
25337 IX86_BUILTIN_PTESTC256,
25338 IX86_BUILTIN_PTESTNZC256,
25340 IX86_BUILTIN_MOVMSKPD256,
25341 IX86_BUILTIN_MOVMSKPS256,
25343 /* AVX2 */
25344 IX86_BUILTIN_MPSADBW256,
25345 IX86_BUILTIN_PABSB256,
25346 IX86_BUILTIN_PABSW256,
25347 IX86_BUILTIN_PABSD256,
25348 IX86_BUILTIN_PACKSSDW256,
25349 IX86_BUILTIN_PACKSSWB256,
25350 IX86_BUILTIN_PACKUSDW256,
25351 IX86_BUILTIN_PACKUSWB256,
25352 IX86_BUILTIN_PADDB256,
25353 IX86_BUILTIN_PADDW256,
25354 IX86_BUILTIN_PADDD256,
25355 IX86_BUILTIN_PADDQ256,
25356 IX86_BUILTIN_PADDSB256,
25357 IX86_BUILTIN_PADDSW256,
25358 IX86_BUILTIN_PADDUSB256,
25359 IX86_BUILTIN_PADDUSW256,
25360 IX86_BUILTIN_PALIGNR256,
25361 IX86_BUILTIN_AND256I,
25362 IX86_BUILTIN_ANDNOT256I,
25363 IX86_BUILTIN_PAVGB256,
25364 IX86_BUILTIN_PAVGW256,
25365 IX86_BUILTIN_PBLENDVB256,
25366 IX86_BUILTIN_PBLENDVW256,
25367 IX86_BUILTIN_PCMPEQB256,
25368 IX86_BUILTIN_PCMPEQW256,
25369 IX86_BUILTIN_PCMPEQD256,
25370 IX86_BUILTIN_PCMPEQQ256,
25371 IX86_BUILTIN_PCMPGTB256,
25372 IX86_BUILTIN_PCMPGTW256,
25373 IX86_BUILTIN_PCMPGTD256,
25374 IX86_BUILTIN_PCMPGTQ256,
25375 IX86_BUILTIN_PHADDW256,
25376 IX86_BUILTIN_PHADDD256,
25377 IX86_BUILTIN_PHADDSW256,
25378 IX86_BUILTIN_PHSUBW256,
25379 IX86_BUILTIN_PHSUBD256,
25380 IX86_BUILTIN_PHSUBSW256,
25381 IX86_BUILTIN_PMADDUBSW256,
25382 IX86_BUILTIN_PMADDWD256,
25383 IX86_BUILTIN_PMAXSB256,
25384 IX86_BUILTIN_PMAXSW256,
25385 IX86_BUILTIN_PMAXSD256,
25386 IX86_BUILTIN_PMAXUB256,
25387 IX86_BUILTIN_PMAXUW256,
25388 IX86_BUILTIN_PMAXUD256,
25389 IX86_BUILTIN_PMINSB256,
25390 IX86_BUILTIN_PMINSW256,
25391 IX86_BUILTIN_PMINSD256,
25392 IX86_BUILTIN_PMINUB256,
25393 IX86_BUILTIN_PMINUW256,
25394 IX86_BUILTIN_PMINUD256,
25395 IX86_BUILTIN_PMOVMSKB256,
25396 IX86_BUILTIN_PMOVSXBW256,
25397 IX86_BUILTIN_PMOVSXBD256,
25398 IX86_BUILTIN_PMOVSXBQ256,
25399 IX86_BUILTIN_PMOVSXWD256,
25400 IX86_BUILTIN_PMOVSXWQ256,
25401 IX86_BUILTIN_PMOVSXDQ256,
25402 IX86_BUILTIN_PMOVZXBW256,
25403 IX86_BUILTIN_PMOVZXBD256,
25404 IX86_BUILTIN_PMOVZXBQ256,
25405 IX86_BUILTIN_PMOVZXWD256,
25406 IX86_BUILTIN_PMOVZXWQ256,
25407 IX86_BUILTIN_PMOVZXDQ256,
25408 IX86_BUILTIN_PMULDQ256,
25409 IX86_BUILTIN_PMULHRSW256,
25410 IX86_BUILTIN_PMULHUW256,
25411 IX86_BUILTIN_PMULHW256,
25412 IX86_BUILTIN_PMULLW256,
25413 IX86_BUILTIN_PMULLD256,
25414 IX86_BUILTIN_PMULUDQ256,
25415 IX86_BUILTIN_POR256,
25416 IX86_BUILTIN_PSADBW256,
25417 IX86_BUILTIN_PSHUFB256,
25418 IX86_BUILTIN_PSHUFD256,
25419 IX86_BUILTIN_PSHUFHW256,
25420 IX86_BUILTIN_PSHUFLW256,
25421 IX86_BUILTIN_PSIGNB256,
25422 IX86_BUILTIN_PSIGNW256,
25423 IX86_BUILTIN_PSIGND256,
25424 IX86_BUILTIN_PSLLDQI256,
25425 IX86_BUILTIN_PSLLWI256,
25426 IX86_BUILTIN_PSLLW256,
25427 IX86_BUILTIN_PSLLDI256,
25428 IX86_BUILTIN_PSLLD256,
25429 IX86_BUILTIN_PSLLQI256,
25430 IX86_BUILTIN_PSLLQ256,
25431 IX86_BUILTIN_PSRAWI256,
25432 IX86_BUILTIN_PSRAW256,
25433 IX86_BUILTIN_PSRADI256,
25434 IX86_BUILTIN_PSRAD256,
25435 IX86_BUILTIN_PSRLDQI256,
25436 IX86_BUILTIN_PSRLWI256,
25437 IX86_BUILTIN_PSRLW256,
25438 IX86_BUILTIN_PSRLDI256,
25439 IX86_BUILTIN_PSRLD256,
25440 IX86_BUILTIN_PSRLQI256,
25441 IX86_BUILTIN_PSRLQ256,
25442 IX86_BUILTIN_PSUBB256,
25443 IX86_BUILTIN_PSUBW256,
25444 IX86_BUILTIN_PSUBD256,
25445 IX86_BUILTIN_PSUBQ256,
25446 IX86_BUILTIN_PSUBSB256,
25447 IX86_BUILTIN_PSUBSW256,
25448 IX86_BUILTIN_PSUBUSB256,
25449 IX86_BUILTIN_PSUBUSW256,
25450 IX86_BUILTIN_PUNPCKHBW256,
25451 IX86_BUILTIN_PUNPCKHWD256,
25452 IX86_BUILTIN_PUNPCKHDQ256,
25453 IX86_BUILTIN_PUNPCKHQDQ256,
25454 IX86_BUILTIN_PUNPCKLBW256,
25455 IX86_BUILTIN_PUNPCKLWD256,
25456 IX86_BUILTIN_PUNPCKLDQ256,
25457 IX86_BUILTIN_PUNPCKLQDQ256,
25458 IX86_BUILTIN_PXOR256,
25459 IX86_BUILTIN_MOVNTDQA256,
25460 IX86_BUILTIN_VBROADCASTSS_PS,
25461 IX86_BUILTIN_VBROADCASTSS_PS256,
25462 IX86_BUILTIN_VBROADCASTSD_PD256,
25463 IX86_BUILTIN_VBROADCASTSI256,
25464 IX86_BUILTIN_PBLENDD256,
25465 IX86_BUILTIN_PBLENDD128,
25466 IX86_BUILTIN_PBROADCASTB256,
25467 IX86_BUILTIN_PBROADCASTW256,
25468 IX86_BUILTIN_PBROADCASTD256,
25469 IX86_BUILTIN_PBROADCASTQ256,
25470 IX86_BUILTIN_PBROADCASTB128,
25471 IX86_BUILTIN_PBROADCASTW128,
25472 IX86_BUILTIN_PBROADCASTD128,
25473 IX86_BUILTIN_PBROADCASTQ128,
25474 IX86_BUILTIN_VPERMVARSI256,
25475 IX86_BUILTIN_VPERMDF256,
25476 IX86_BUILTIN_VPERMVARSF256,
25477 IX86_BUILTIN_VPERMDI256,
25478 IX86_BUILTIN_VPERMTI256,
25479 IX86_BUILTIN_VEXTRACT128I256,
25480 IX86_BUILTIN_VINSERT128I256,
25481 IX86_BUILTIN_MASKLOADD,
25482 IX86_BUILTIN_MASKLOADQ,
25483 IX86_BUILTIN_MASKLOADD256,
25484 IX86_BUILTIN_MASKLOADQ256,
25485 IX86_BUILTIN_MASKSTORED,
25486 IX86_BUILTIN_MASKSTOREQ,
25487 IX86_BUILTIN_MASKSTORED256,
25488 IX86_BUILTIN_MASKSTOREQ256,
25489 IX86_BUILTIN_PSLLVV4DI,
25490 IX86_BUILTIN_PSLLVV2DI,
25491 IX86_BUILTIN_PSLLVV8SI,
25492 IX86_BUILTIN_PSLLVV4SI,
25493 IX86_BUILTIN_PSRAVV8SI,
25494 IX86_BUILTIN_PSRAVV4SI,
25495 IX86_BUILTIN_PSRLVV4DI,
25496 IX86_BUILTIN_PSRLVV2DI,
25497 IX86_BUILTIN_PSRLVV8SI,
25498 IX86_BUILTIN_PSRLVV4SI,
25500 IX86_BUILTIN_GATHERSIV2DF,
25501 IX86_BUILTIN_GATHERSIV4DF,
25502 IX86_BUILTIN_GATHERDIV2DF,
25503 IX86_BUILTIN_GATHERDIV4DF,
25504 IX86_BUILTIN_GATHERSIV4SF,
25505 IX86_BUILTIN_GATHERSIV8SF,
25506 IX86_BUILTIN_GATHERDIV4SF,
25507 IX86_BUILTIN_GATHERDIV8SF,
25508 IX86_BUILTIN_GATHERSIV2DI,
25509 IX86_BUILTIN_GATHERSIV4DI,
25510 IX86_BUILTIN_GATHERDIV2DI,
25511 IX86_BUILTIN_GATHERDIV4DI,
25512 IX86_BUILTIN_GATHERSIV4SI,
25513 IX86_BUILTIN_GATHERSIV8SI,
25514 IX86_BUILTIN_GATHERDIV4SI,
25515 IX86_BUILTIN_GATHERDIV8SI,
25517 /* Alternate 4 element gather for the vectorizer where
25518 all operands are 32-byte wide. */
25519 IX86_BUILTIN_GATHERALTSIV4DF,
25520 IX86_BUILTIN_GATHERALTDIV8SF,
25521 IX86_BUILTIN_GATHERALTSIV4DI,
25522 IX86_BUILTIN_GATHERALTDIV8SI,
25524 /* TFmode support builtins. */
25525 IX86_BUILTIN_INFQ,
25526 IX86_BUILTIN_HUGE_VALQ,
25527 IX86_BUILTIN_FABSQ,
25528 IX86_BUILTIN_COPYSIGNQ,
25530 /* Vectorizer support builtins. */
25531 IX86_BUILTIN_CPYSGNPS,
25532 IX86_BUILTIN_CPYSGNPD,
25533 IX86_BUILTIN_CPYSGNPS256,
25534 IX86_BUILTIN_CPYSGNPD256,
25536 /* FMA4 instructions. */
25537 IX86_BUILTIN_VFMADDSS,
25538 IX86_BUILTIN_VFMADDSD,
25539 IX86_BUILTIN_VFMADDPS,
25540 IX86_BUILTIN_VFMADDPD,
25541 IX86_BUILTIN_VFMADDPS256,
25542 IX86_BUILTIN_VFMADDPD256,
25543 IX86_BUILTIN_VFMADDSUBPS,
25544 IX86_BUILTIN_VFMADDSUBPD,
25545 IX86_BUILTIN_VFMADDSUBPS256,
25546 IX86_BUILTIN_VFMADDSUBPD256,
25548 /* FMA3 instructions. */
25549 IX86_BUILTIN_VFMADDSS3,
25550 IX86_BUILTIN_VFMADDSD3,
25552 /* XOP instructions. */
25553 IX86_BUILTIN_VPCMOV,
25554 IX86_BUILTIN_VPCMOV_V2DI,
25555 IX86_BUILTIN_VPCMOV_V4SI,
25556 IX86_BUILTIN_VPCMOV_V8HI,
25557 IX86_BUILTIN_VPCMOV_V16QI,
25558 IX86_BUILTIN_VPCMOV_V4SF,
25559 IX86_BUILTIN_VPCMOV_V2DF,
25560 IX86_BUILTIN_VPCMOV256,
25561 IX86_BUILTIN_VPCMOV_V4DI256,
25562 IX86_BUILTIN_VPCMOV_V8SI256,
25563 IX86_BUILTIN_VPCMOV_V16HI256,
25564 IX86_BUILTIN_VPCMOV_V32QI256,
25565 IX86_BUILTIN_VPCMOV_V8SF256,
25566 IX86_BUILTIN_VPCMOV_V4DF256,
25568 IX86_BUILTIN_VPPERM,
25570 IX86_BUILTIN_VPMACSSWW,
25571 IX86_BUILTIN_VPMACSWW,
25572 IX86_BUILTIN_VPMACSSWD,
25573 IX86_BUILTIN_VPMACSWD,
25574 IX86_BUILTIN_VPMACSSDD,
25575 IX86_BUILTIN_VPMACSDD,
25576 IX86_BUILTIN_VPMACSSDQL,
25577 IX86_BUILTIN_VPMACSSDQH,
25578 IX86_BUILTIN_VPMACSDQL,
25579 IX86_BUILTIN_VPMACSDQH,
25580 IX86_BUILTIN_VPMADCSSWD,
25581 IX86_BUILTIN_VPMADCSWD,
25583 IX86_BUILTIN_VPHADDBW,
25584 IX86_BUILTIN_VPHADDBD,
25585 IX86_BUILTIN_VPHADDBQ,
25586 IX86_BUILTIN_VPHADDWD,
25587 IX86_BUILTIN_VPHADDWQ,
25588 IX86_BUILTIN_VPHADDDQ,
25589 IX86_BUILTIN_VPHADDUBW,
25590 IX86_BUILTIN_VPHADDUBD,
25591 IX86_BUILTIN_VPHADDUBQ,
25592 IX86_BUILTIN_VPHADDUWD,
25593 IX86_BUILTIN_VPHADDUWQ,
25594 IX86_BUILTIN_VPHADDUDQ,
25595 IX86_BUILTIN_VPHSUBBW,
25596 IX86_BUILTIN_VPHSUBWD,
25597 IX86_BUILTIN_VPHSUBDQ,
25599 IX86_BUILTIN_VPROTB,
25600 IX86_BUILTIN_VPROTW,
25601 IX86_BUILTIN_VPROTD,
25602 IX86_BUILTIN_VPROTQ,
25603 IX86_BUILTIN_VPROTB_IMM,
25604 IX86_BUILTIN_VPROTW_IMM,
25605 IX86_BUILTIN_VPROTD_IMM,
25606 IX86_BUILTIN_VPROTQ_IMM,
25608 IX86_BUILTIN_VPSHLB,
25609 IX86_BUILTIN_VPSHLW,
25610 IX86_BUILTIN_VPSHLD,
25611 IX86_BUILTIN_VPSHLQ,
25612 IX86_BUILTIN_VPSHAB,
25613 IX86_BUILTIN_VPSHAW,
25614 IX86_BUILTIN_VPSHAD,
25615 IX86_BUILTIN_VPSHAQ,
25617 IX86_BUILTIN_VFRCZSS,
25618 IX86_BUILTIN_VFRCZSD,
25619 IX86_BUILTIN_VFRCZPS,
25620 IX86_BUILTIN_VFRCZPD,
25621 IX86_BUILTIN_VFRCZPS256,
25622 IX86_BUILTIN_VFRCZPD256,
25624 IX86_BUILTIN_VPCOMEQUB,
25625 IX86_BUILTIN_VPCOMNEUB,
25626 IX86_BUILTIN_VPCOMLTUB,
25627 IX86_BUILTIN_VPCOMLEUB,
25628 IX86_BUILTIN_VPCOMGTUB,
25629 IX86_BUILTIN_VPCOMGEUB,
25630 IX86_BUILTIN_VPCOMFALSEUB,
25631 IX86_BUILTIN_VPCOMTRUEUB,
25633 IX86_BUILTIN_VPCOMEQUW,
25634 IX86_BUILTIN_VPCOMNEUW,
25635 IX86_BUILTIN_VPCOMLTUW,
25636 IX86_BUILTIN_VPCOMLEUW,
25637 IX86_BUILTIN_VPCOMGTUW,
25638 IX86_BUILTIN_VPCOMGEUW,
25639 IX86_BUILTIN_VPCOMFALSEUW,
25640 IX86_BUILTIN_VPCOMTRUEUW,
25642 IX86_BUILTIN_VPCOMEQUD,
25643 IX86_BUILTIN_VPCOMNEUD,
25644 IX86_BUILTIN_VPCOMLTUD,
25645 IX86_BUILTIN_VPCOMLEUD,
25646 IX86_BUILTIN_VPCOMGTUD,
25647 IX86_BUILTIN_VPCOMGEUD,
25648 IX86_BUILTIN_VPCOMFALSEUD,
25649 IX86_BUILTIN_VPCOMTRUEUD,
25651 IX86_BUILTIN_VPCOMEQUQ,
25652 IX86_BUILTIN_VPCOMNEUQ,
25653 IX86_BUILTIN_VPCOMLTUQ,
25654 IX86_BUILTIN_VPCOMLEUQ,
25655 IX86_BUILTIN_VPCOMGTUQ,
25656 IX86_BUILTIN_VPCOMGEUQ,
25657 IX86_BUILTIN_VPCOMFALSEUQ,
25658 IX86_BUILTIN_VPCOMTRUEUQ,
25660 IX86_BUILTIN_VPCOMEQB,
25661 IX86_BUILTIN_VPCOMNEB,
25662 IX86_BUILTIN_VPCOMLTB,
25663 IX86_BUILTIN_VPCOMLEB,
25664 IX86_BUILTIN_VPCOMGTB,
25665 IX86_BUILTIN_VPCOMGEB,
25666 IX86_BUILTIN_VPCOMFALSEB,
25667 IX86_BUILTIN_VPCOMTRUEB,
25669 IX86_BUILTIN_VPCOMEQW,
25670 IX86_BUILTIN_VPCOMNEW,
25671 IX86_BUILTIN_VPCOMLTW,
25672 IX86_BUILTIN_VPCOMLEW,
25673 IX86_BUILTIN_VPCOMGTW,
25674 IX86_BUILTIN_VPCOMGEW,
25675 IX86_BUILTIN_VPCOMFALSEW,
25676 IX86_BUILTIN_VPCOMTRUEW,
25678 IX86_BUILTIN_VPCOMEQD,
25679 IX86_BUILTIN_VPCOMNED,
25680 IX86_BUILTIN_VPCOMLTD,
25681 IX86_BUILTIN_VPCOMLED,
25682 IX86_BUILTIN_VPCOMGTD,
25683 IX86_BUILTIN_VPCOMGED,
25684 IX86_BUILTIN_VPCOMFALSED,
25685 IX86_BUILTIN_VPCOMTRUED,
25687 IX86_BUILTIN_VPCOMEQQ,
25688 IX86_BUILTIN_VPCOMNEQ,
25689 IX86_BUILTIN_VPCOMLTQ,
25690 IX86_BUILTIN_VPCOMLEQ,
25691 IX86_BUILTIN_VPCOMGTQ,
25692 IX86_BUILTIN_VPCOMGEQ,
25693 IX86_BUILTIN_VPCOMFALSEQ,
25694 IX86_BUILTIN_VPCOMTRUEQ,
25696 /* LWP instructions. */
25697 IX86_BUILTIN_LLWPCB,
25698 IX86_BUILTIN_SLWPCB,
25699 IX86_BUILTIN_LWPVAL32,
25700 IX86_BUILTIN_LWPVAL64,
25701 IX86_BUILTIN_LWPINS32,
25702 IX86_BUILTIN_LWPINS64,
25704 IX86_BUILTIN_CLZS,
25706 /* RTM */
25707 IX86_BUILTIN_XBEGIN,
25708 IX86_BUILTIN_XEND,
25709 IX86_BUILTIN_XABORT,
25710 IX86_BUILTIN_XTEST,
25712 /* BMI instructions. */
25713 IX86_BUILTIN_BEXTR32,
25714 IX86_BUILTIN_BEXTR64,
25715 IX86_BUILTIN_CTZS,
25717 /* TBM instructions. */
25718 IX86_BUILTIN_BEXTRI32,
25719 IX86_BUILTIN_BEXTRI64,
25721 /* BMI2 instructions. */
25722 IX86_BUILTIN_BZHI32,
25723 IX86_BUILTIN_BZHI64,
25724 IX86_BUILTIN_PDEP32,
25725 IX86_BUILTIN_PDEP64,
25726 IX86_BUILTIN_PEXT32,
25727 IX86_BUILTIN_PEXT64,
25729 /* FSGSBASE instructions. */
25730 IX86_BUILTIN_RDFSBASE32,
25731 IX86_BUILTIN_RDFSBASE64,
25732 IX86_BUILTIN_RDGSBASE32,
25733 IX86_BUILTIN_RDGSBASE64,
25734 IX86_BUILTIN_WRFSBASE32,
25735 IX86_BUILTIN_WRFSBASE64,
25736 IX86_BUILTIN_WRGSBASE32,
25737 IX86_BUILTIN_WRGSBASE64,
25739 /* RDRND instructions. */
25740 IX86_BUILTIN_RDRAND16_STEP,
25741 IX86_BUILTIN_RDRAND32_STEP,
25742 IX86_BUILTIN_RDRAND64_STEP,
25744 /* F16C instructions. */
25745 IX86_BUILTIN_CVTPH2PS,
25746 IX86_BUILTIN_CVTPH2PS256,
25747 IX86_BUILTIN_CVTPS2PH,
25748 IX86_BUILTIN_CVTPS2PH256,
25750 /* CFString built-in for darwin. */
25751 IX86_BUILTIN_CFSTRING,
25753 IX86_BUILTIN_MAX
25756 /* Table for the ix86 builtin decls. */
25757 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25759 /* Table of all of the builtin functions that are possible with different ISAs
25760 but are waiting to be built until a function is declared to use that
25761 ISA. */
25762 struct builtin_isa {
25763 const char *name; /* function name */
25764 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25765 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25766 bool const_p; /* true if the declaration is constant */
25767 bool set_and_not_built_p; /* true if recorded here but not yet built */
25770 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
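/* Illustrative note (not part of the original sources): ix86_builtins and
   ix86_builtins_isa are parallel arrays, both indexed by the IX86_BUILTIN_*
   codes above.  For any CODE, either the decl has already been built:

     ix86_builtins[(int) CODE] != NULL_TREE

   or registration was deferred and

     ix86_builtins_isa[(int) CODE]

   still holds the name, type code and ISA mask, with set_and_not_built_p
   set, until ix86_add_new_builtins builds the decl.  */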
25773 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save MASK,
25774 the isa_flags the builtin requires, in the ix86_builtins_isa array. Store the
25775 function decl in the ix86_builtins array. Return the function decl,
25776 or NULL_TREE if the builtin was not added.
25778 If the front end has a special hook for builtin functions, delay adding
25779 builtin functions that aren't in the current ISA until the ISA is changed
25780 with function-specific optimization. Doing so can save about 300K for the
25781 default compiler. When the builtin is expanded, check at that time whether
25782 it is valid.
25784 If the front end doesn't have a special hook, record all builtins, even
25785 those that aren't in the current ISA, in case the user uses
25786 function-specific options for a different ISA, so that we don't get scope
25787 errors if a builtin is added in the middle of a function scope. */
25789 static inline tree
25790 def_builtin (HOST_WIDE_INT mask, const char *name,
25791 enum ix86_builtin_func_type tcode,
25792 enum ix86_builtins code)
25794 tree decl = NULL_TREE;
25796 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25798 ix86_builtins_isa[(int) code].isa = mask;
25800 mask &= ~OPTION_MASK_ISA_64BIT;
25801 if (mask == 0
25802 || (mask & ix86_isa_flags) != 0
25803 || (lang_hooks.builtin_function
25804 == lang_hooks.builtin_function_ext_scope))
25807 tree type = ix86_get_builtin_func_type (tcode);
25808 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25809 NULL, NULL_TREE);
25810 ix86_builtins[(int) code] = decl;
25811 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25813 else
25815 ix86_builtins[(int) code] = NULL_TREE;
25816 ix86_builtins_isa[(int) code].tcode = tcode;
25817 ix86_builtins_isa[(int) code].name = name;
25818 ix86_builtins_isa[(int) code].const_p = false;
25819 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25823 return decl;
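/* Illustrative sketch only; "__builtin_ia32_example" and IX86_BUILTIN_EXAMPLE
   are hypothetical names used to show the calling convention.  A
   registration made during builtin setup typically looks like

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                  V4SF_FTYPE_V4SF, IX86_BUILTIN_EXAMPLE);

   If OPTION_MASK_ISA_SSE2 is not in ix86_isa_flags and the front end uses
   its own builtin_function hook, no decl is built at this point; the call
   only records the name, type code and mask in ix86_builtins_isa so that
   ix86_add_new_builtins can build the decl once the ISA is enabled.  */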
25826 /* Like def_builtin, but also marks the function decl "const". */
25828 static inline tree
25829 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25830 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25832 tree decl = def_builtin (mask, name, tcode, code);
25833 if (decl)
25834 TREE_READONLY (decl) = 1;
25835 else
25836 ix86_builtins_isa[(int) code].const_p = true;
25838 return decl;
25841 /* Add any new builtin functions for a given ISA that may not have been
25842 declared. This saves a bit of space compared to adding all of the
25843 declarations to the tree up front, whether or not they are used. */
25845 static void
25846 ix86_add_new_builtins (HOST_WIDE_INT isa)
25848 int i;
25850 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25852 if ((ix86_builtins_isa[i].isa & isa) != 0
25853 && ix86_builtins_isa[i].set_and_not_built_p)
25855 tree decl, type;
25857 /* Don't define the builtin again. */
25858 ix86_builtins_isa[i].set_and_not_built_p = false;
25860 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25861 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25862 type, i, BUILT_IN_MD, NULL,
25863 NULL_TREE);
25865 ix86_builtins[i] = decl;
25866 if (ix86_builtins_isa[i].const_p)
25867 TREE_READONLY (decl) = 1;
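/* Illustrative sketch only: when the set of enabled ISAs grows, e.g. after a
   target attribute turns on additional instruction sets, a call such as

     ix86_add_new_builtins (ix86_isa_flags);

   walks the deferred entries, builds every builtin whose recorded mask is
   now satisfied, and re-applies TREE_READONLY to the ones that were
   registered through def_builtin_const.  */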
25872 /* Bits for builtin_description.flag. */
25874 /* Set when we don't support the comparison natively, and should
25875 swap the comparison operands in order to support it. */
25876 #define BUILTIN_DESC_SWAP_OPERANDS 1
25878 struct builtin_description
25880 const HOST_WIDE_INT mask;
25881 const enum insn_code icode;
25882 const char *const name;
25883 const enum ix86_builtins code;
25884 const enum rtx_code comparison;
25885 const int flag;
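/* Illustrative note: FLAG is overloaded per table.  In bdesc_comi it holds 0
   or BUILTIN_DESC_SWAP_OPERANDS; in bdesc_pcmpestr and bdesc_pcmpistr it
   carries the CC mode to test, e.g. (int) CCZmode; in bdesc_special_args and
   bdesc_args it encodes the prototype, e.g. (int) V4SF_FTYPE_V4SF_V4SF for a
   builtin taking two V4SF operands and returning V4SF.  A _SWAP suffix asks
   for swapped operands, which is how __builtin_ia32_cmpgtps below is
   expressed as LT with V4SF_FTYPE_V4SF_V4SF_SWAP.  */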
25888 static const struct builtin_description bdesc_comi[] =
25890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25893 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25896 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25897 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25898 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25899 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25900 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25901 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25906 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25916 static const struct builtin_description bdesc_pcmpestr[] =
25918 /* SSE4.2 */
25919 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25920 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25921 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25922 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25923 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25924 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25925 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25928 static const struct builtin_description bdesc_pcmpistr[] =
25930 /* SSE4.2 */
25931 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25932 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25933 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25934 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25935 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25936 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25937 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25941 /* Special builtins with a variable number of arguments. */
25941 static const struct builtin_description bdesc_special_args[] =
25943 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25944 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25945 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25947 /* MMX */
25948 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25950 /* 3DNow! */
25951 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25953 /* SSE */
25954 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25955 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25956 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25958 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25959 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25960 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25961 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25963 /* SSE or 3DNow!A */
25964 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25965 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25967 /* SSE2 */
25968 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25969 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25972 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25973 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25975 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25976 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25977 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25979 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25980 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25982 /* SSE3 */
25983 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25985 /* SSE4.1 */
25986 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25988 /* SSE4A */
25989 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25990 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25992 /* AVX */
25993 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25994 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25996 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25997 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25998 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25999 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26000 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26003 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26004 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26007 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26011 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26012 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26014 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26015 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26016 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26017 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26018 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26019 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26020 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26021 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26023 /* AVX2 */
26024 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26025 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26026 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26028 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26029 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26030 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26031 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26032 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26034 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26035 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26036 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26037 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26038 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26039 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26041 /* FSGSBASE */
26042 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26043 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26044 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26045 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26046 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26047 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26048 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26049 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26051 /* RTM */
26052 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26053 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26054 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26058 /* Builtins with a variable number of arguments. */
26058 static const struct builtin_description bdesc_args[] =
26060 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26061 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26062 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26063 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26064 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26065 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26066 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26068 /* MMX */
26069 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26070 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26071 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26072 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26073 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26074 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26076 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26077 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26078 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26079 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26080 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26081 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26082 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26083 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26085 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26086 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26088 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26089 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26090 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26091 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26093 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26094 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26095 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26096 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26097 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26098 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26100 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26101 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26102 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26103 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26104 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26105 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26107 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26108 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26109 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26111 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26113 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26114 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26115 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26117 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26124 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26129 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26132 /* 3DNow! */
26133 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26134 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26135 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26136 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26138 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26139 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26140 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26141 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26142 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26143 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26144 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26145 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26146 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26147 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26148 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26149 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26150 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26151 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26152 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26154 /* 3DNow!A */
26155 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26156 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26157 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26158 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26159 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26160 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26162 /* SSE */
26163 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26164 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26165 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26166 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26167 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26168 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26169 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26170 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26171 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26172 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26173 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26174 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26178 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26179 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26180 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26182 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26183 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26184 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26185 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26187 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26188 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26189 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26190 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26191 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26192 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26193 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26194 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26195 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26196 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26197 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26198 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26199 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26200 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26201 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26202 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26203 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26204 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26205 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26206 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26207 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26208 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26210 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26211 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26212 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26215 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26216 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26217 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26218 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26220 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26222 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26223 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26224 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26225 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26226 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26228 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26229 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26230 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26232 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26234 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26235 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26238 /* SSE MMX or 3Dnow!A */
26239 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26240 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26241 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26243 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26244 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26245 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26246 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26248 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26249 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26251 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26253 /* SSE2 */
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26260 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26265 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26272 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26273 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26280 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26281 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26289 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26291 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26310 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26314 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26316 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26317 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26319 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26322 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26323 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26325 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26327 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26328 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26329 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26330 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26331 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26332 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26333 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26346 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26348 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26364 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26368 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26369 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26370 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26371 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26372 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26373 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26374 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26375 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26377 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26378 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26379 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26381 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26382 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26384 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26389 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26390 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26396 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26398 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26399 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26400 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26403 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26404 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26405 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26406 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26407 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26408 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26410 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26411 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26412 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26413 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26421 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26426 /* SSE2 MMX */
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26430 /* SSE3 */
26431 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26432 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26434 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26435 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26436 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26437 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26438 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26439 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26441 /* SSSE3 */
26442 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26443 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26444 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26445 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26446 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26447 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26449 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26450 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26451 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26452 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26453 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26454 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26455 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26456 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26457 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26458 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26459 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26460 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26461 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26462 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26463 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26464 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26465 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26466 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26467 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26468 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26469 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26470 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26471 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26472 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26474 /* SSSE3. */
26475 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26476 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26478 /* SSE4.1 */
26479 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26480 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26481 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26482 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26483 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26484 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26485 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26486 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26487 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26488 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26490 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26491 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26492 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26493 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26494 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26495 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26496 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26497 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26498 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26499 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26500 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26501 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26502 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26504 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26505 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26506 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26507 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26508 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26509 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26510 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26511 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26512 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26513 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26514 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26515 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26517 /* SSE4.1 */
26518 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26519 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26520 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26521 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26523 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26524 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26525 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26526 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26528 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26529 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26531 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26532 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26534 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26535 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26536 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26537 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26539 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26540 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26542 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26543 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26545 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26546 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26547 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26549 /* SSE4.2 */
26550 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26551 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26552 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26553 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26554 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26556 /* SSE4A */
26557 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26558 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26559 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26560 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26562 /* AES */
26563 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26564 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26566 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26567 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26568 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26569 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26571 /* PCLMUL */
26572 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26574 /* AVX */
26575 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26579 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26583 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26589 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26590 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26591 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26592 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26593 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26594 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26595 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26596 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26597 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26598 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26599 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26600 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26623 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26624 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26628 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26630 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26642 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26646 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26647 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26648 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26649 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26650 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26654 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26655 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26662 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26663 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26676 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26677 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26687 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26688 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26689 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26692 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26693 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26695 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26700 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26705 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26710 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26711 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26713 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26715 /* AVX2 */
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26724 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26725 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26726 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26727 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26733 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26755 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26756 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26757 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26758 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26759 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26760 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26761 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26762 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26763 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26764 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26765 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26766 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26782 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26783 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26784 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26785 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26787 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26797 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26798 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26799 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26800 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26801 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26802 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26803 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26804 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26805 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26806 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26807 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26808 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26809 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26810 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26811 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26812 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26813 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26814 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26815 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26816 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26817 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26830 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26831 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26846 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26848 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26849 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26850 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26853 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26855 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26856 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26857 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26858 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
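  /* Illustrative sketch (editorial addition, not part of the original
     table): each AVX2 row above binds a user-visible builtin to a named
     insn pattern and a function-type code.  Assuming -mavx2, the paddb256
     row can be exercised directly from user code, e.g.:

	typedef char v32qi_sketch __attribute__ ((__vector_size__ (32)));

	v32qi_sketch
	add_bytes (v32qi_sketch a, v32qi_sketch b)
	{
	  return __builtin_ia32_paddb256 (a, b);
	}

     The type name v32qi_sketch and the wrapper are hypothetical; only the
     builtin name and its V32QI_FTYPE_V32QI_V32QI signature come from the
     row above.  */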
26863 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26865 /* BMI */
26866 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26867 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26868 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26870 /* TBM */
26871 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26872 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26874 /* F16C */
26875 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26876 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26877 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26878 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26880 /* BMI2 */
26881 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26882 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26883 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26884 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26885 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26886 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
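  /* Illustrative sketch (editorial addition): the BMI2 rows above are the
     raw builtins behind the bit-manipulation intrinsics.  Assuming -mbmi2,
     the bzhi_si row corresponds to code such as:

	unsigned int
	zero_high_bits (unsigned int x, unsigned int n)
	{
	  return __builtin_ia32_bzhi_si (x, n);
	}

     which clears the bits of X at positions N and above.  The wrapper name
     is hypothetical; the builtin name and its UINT_FTYPE_UINT_UINT
     signature come from the row above.  */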
26889 /* FMA4 and XOP. */
26890 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26891 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26892 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26893 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26894 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26895 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26896 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26897 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26898 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26899 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26900 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26901 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26902 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26903 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26904 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26905 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26906 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26907 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26908 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26909 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26910 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26911 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26912 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26913 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26914 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26915 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26916 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26917 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26918 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26919 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26920 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26921 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26922 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26923 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26924 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26925 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26926 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26927 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26928 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26929 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26930 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26931 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26932 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26933 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26934 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26935 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26936 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26937 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26938 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26939 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26940 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26941 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
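/* Illustrative note (editorial addition): each MULTI_ARG_* alias above is
   shorthand for an ix86_builtin_func_type of the form
   RESULT_FTYPE_OPERAND1_OPERAND2[...]; e.g. MULTI_ARG_3_SF, i.e.
   V4SF_FTYPE_V4SF_V4SF_V4SF, types a builtin that takes three V4SFmode
   vectors and returns one, which is how the FMA4 vfmaddss entry below is
   described.  Assuming -mfma4, that entry is reachable as, say:

      typedef float v4sf_sketch __attribute__ ((__vector_size__ (16)));

      v4sf_sketch
      fmadd_low (v4sf_sketch a, v4sf_sketch b, v4sf_sketch c)
      {
	return __builtin_ia32_vfmaddss (a, b, c);
      }

   The type and wrapper names are hypothetical; only the builtin name and
   its MULTI_ARG_3_SF typing come from the table below.  */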
26943 static const struct builtin_description bdesc_multi_arg[] =
26944 {
26945 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26946 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26947 UNKNOWN, (int)MULTI_ARG_3_SF },
26948 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26949 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26950 UNKNOWN, (int)MULTI_ARG_3_DF },
26952 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26953 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26954 UNKNOWN, (int)MULTI_ARG_3_SF },
26955 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26956 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26957 UNKNOWN, (int)MULTI_ARG_3_DF },
26959 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26960 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26961 UNKNOWN, (int)MULTI_ARG_3_SF },
26962 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26963 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26964 UNKNOWN, (int)MULTI_ARG_3_DF },
26965 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26966 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26967 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26968 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26969 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26970 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26972 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26973 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26974 UNKNOWN, (int)MULTI_ARG_3_SF },
26975 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26976 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26977 UNKNOWN, (int)MULTI_ARG_3_DF },
26978 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26979 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26980 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26981 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26982 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26983 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27145 /* TM vector builtins. */
27147 /* Reuse the existing x86-specific `struct builtin_description' because
27148    we're lazy.  Add casts to make them fit.  */
27149 static const struct builtin_description bdesc_tm[] =
27150 {
27151 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27152 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27153 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27154 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27155 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27156 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27157 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27159 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27161 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27162 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27163 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27164 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27165 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27167 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27168 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27169 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27170 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27171 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27172 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27173 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27175 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27176 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27177 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27180 /* TM callbacks. */
27182 /* Return the builtin decl needed to load a vector of TYPE. */
27184 static tree
27185 ix86_builtin_tm_load (tree type)
27187 if (TREE_CODE (type) == VECTOR_TYPE)
27189 switch (tree_low_cst (TYPE_SIZE (type), 1))
27191 case 64:
27192 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27193 case 128:
27194 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27195 case 256:
27196 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27199 return NULL_TREE;
27202 /* Return the builtin decl needed to store a vector of TYPE. */
27204 static tree
27205 ix86_builtin_tm_store (tree type)
27207 if (TREE_CODE (type) == VECTOR_TYPE)
27209 switch (tree_low_cst (TYPE_SIZE (type), 1))
27211 case 64:
27212 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27213 case 128:
27214 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27215 case 256:
27216 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27219 return NULL_TREE;
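/* Editorial note, not part of the original source: for a 128-bit vector
   type such as V4SF, tree_low_cst (TYPE_SIZE (type), 1) is 128, so the two
   helpers above return the decls registered for BUILT_IN_TM_LOAD_M128 and
   BUILT_IN_TM_STORE_M128 respectively; any other size (or a non-vector
   type) falls through to NULL_TREE.  */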
27222 /* Initialize the transactional memory vector load/store builtins. */
27224 static void
27225 ix86_init_tm_builtins (void)
27227 enum ix86_builtin_func_type ftype;
27228 const struct builtin_description *d;
27229 size_t i;
27230 tree decl;
27231 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27232 tree attrs_log, attrs_type_log;
27234 if (!flag_tm)
27235 return;
27237 /* If there are no builtins defined, we must be compiling in a
27238 language without trans-mem support. */
27239 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27240 return;
27242 /* Use whatever attributes a normal TM load has. */
27243 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27244 attrs_load = DECL_ATTRIBUTES (decl);
27245 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27246 /* Use whatever attributes a normal TM store has. */
27247 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27248 attrs_store = DECL_ATTRIBUTES (decl);
27249 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27250 /* Use whatever attributes a normal TM log has. */
27251 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27252 attrs_log = DECL_ATTRIBUTES (decl);
27253 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27255 for (i = 0, d = bdesc_tm;
27256 i < ARRAY_SIZE (bdesc_tm);
27257 i++, d++)
27259 if ((d->mask & ix86_isa_flags) != 0
27260 || (lang_hooks.builtin_function
27261 == lang_hooks.builtin_function_ext_scope))
27263 tree type, attrs, attrs_type;
27264 enum built_in_function code = (enum built_in_function) d->code;
27266 ftype = (enum ix86_builtin_func_type) d->flag;
27267 type = ix86_get_builtin_func_type (ftype);
27269 if (BUILTIN_TM_LOAD_P (code))
27271 attrs = attrs_load;
27272 attrs_type = attrs_type_load;
27274 else if (BUILTIN_TM_STORE_P (code))
27276 attrs = attrs_store;
27277 attrs_type = attrs_type_store;
27279 else
27281 attrs = attrs_log;
27282 attrs_type = attrs_type_log;
27284 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27285 /* The builtin name without the "__builtin_" prefix, used for
27286    calling it directly.  */
27287 d->name + strlen ("__builtin_"),
27288 attrs);
27289 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27290 set the TYPE_ATTRIBUTES. */
27291 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27293 set_builtin_decl (code, decl, false);
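/* Editorial sketch, not part of the original source: as an example of the
   loop above, the bdesc_tm row

     { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128",
       (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }

   is registered (when SSE is enabled) with the TM-load attributes, and
   because add_builtin_function is passed d->name + strlen ("__builtin_")
   as the library name, it is also callable directly as "_ITM_RM128".  */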
27298 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27299    not in the current target ISA, to allow the user to compile particular
27300    modules with target-specific options that differ from the command-line
27301    options.  */
27302 static void
27303 ix86_init_mmx_sse_builtins (void)
27305 const struct builtin_description * d;
27306 enum ix86_builtin_func_type ftype;
27307 size_t i;
27309 /* Add all special builtins with a variable number of operands.  */
27310 for (i = 0, d = bdesc_special_args;
27311 i < ARRAY_SIZE (bdesc_special_args);
27312 i++, d++)
27314 if (d->name == 0)
27315 continue;
27317 ftype = (enum ix86_builtin_func_type) d->flag;
27318 def_builtin (d->mask, d->name, ftype, d->code);
27321 /* Add all builtins with a variable number of operands.  */
27322 for (i = 0, d = bdesc_args;
27323 i < ARRAY_SIZE (bdesc_args);
27324 i++, d++)
27326 if (d->name == 0)
27327 continue;
27329 ftype = (enum ix86_builtin_func_type) d->flag;
27330 def_builtin_const (d->mask, d->name, ftype, d->code);
27333 /* pcmpestr[im] insns. */
27334 for (i = 0, d = bdesc_pcmpestr;
27335 i < ARRAY_SIZE (bdesc_pcmpestr);
27336 i++, d++)
27338 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27339 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27340 else
27341 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27342 def_builtin_const (d->mask, d->name, ftype, d->code);
27345 /* pcmpistr[im] insns. */
27346 for (i = 0, d = bdesc_pcmpistr;
27347 i < ARRAY_SIZE (bdesc_pcmpistr);
27348 i++, d++)
27350 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27351 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27352 else
27353 ftype = INT_FTYPE_V16QI_V16QI_INT;
27354 def_builtin_const (d->mask, d->name, ftype, d->code);
27357 /* comi/ucomi insns. */
27358 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27360 if (d->mask == OPTION_MASK_ISA_SSE2)
27361 ftype = INT_FTYPE_V2DF_V2DF;
27362 else
27363 ftype = INT_FTYPE_V4SF_V4SF;
27364 def_builtin_const (d->mask, d->name, ftype, d->code);
27367 /* SSE */
27368 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27369 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27370 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27371 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27373 /* SSE or 3DNow!A */
27374 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27375 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27376 IX86_BUILTIN_MASKMOVQ);
27378 /* SSE2 */
27379 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27380 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27382 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27383 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27384 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27385 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27387 /* SSE3. */
27388 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27389 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27390 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27391 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27393 /* AES */
27394 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27395 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27396 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27397 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27398 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27399 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27400 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27401 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27402 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27403 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27404 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27405 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27407 /* PCLMUL */
27408 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27409 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27411 /* RDRND */
27412 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27413 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27414 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27415 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27416 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27417 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27418 IX86_BUILTIN_RDRAND64_STEP);
27420 /* AVX2 */
27421 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27422 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27423 IX86_BUILTIN_GATHERSIV2DF);
27425 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27426 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27427 IX86_BUILTIN_GATHERSIV4DF);
27429 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27430 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27431 IX86_BUILTIN_GATHERDIV2DF);
27433 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27434 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27435 IX86_BUILTIN_GATHERDIV4DF);
27437 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27438 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27439 IX86_BUILTIN_GATHERSIV4SF);
27441 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27442 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27443 IX86_BUILTIN_GATHERSIV8SF);
27445 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27446 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27447 IX86_BUILTIN_GATHERDIV4SF);
27449 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27450 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27451 IX86_BUILTIN_GATHERDIV8SF);
27453 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27454 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27455 IX86_BUILTIN_GATHERSIV2DI);
27457 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27458 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27459 IX86_BUILTIN_GATHERSIV4DI);
27461 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27462 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27463 IX86_BUILTIN_GATHERDIV2DI);
27465 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27466 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27467 IX86_BUILTIN_GATHERDIV4DI);
27469 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27470 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27471 IX86_BUILTIN_GATHERSIV4SI);
27473 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27474 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27475 IX86_BUILTIN_GATHERSIV8SI);
27477 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27478 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27479 IX86_BUILTIN_GATHERDIV4SI);
27481 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27482 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27483 IX86_BUILTIN_GATHERDIV8SI);
27485 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27486 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27487 IX86_BUILTIN_GATHERALTSIV4DF);
27489 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27490 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27491 IX86_BUILTIN_GATHERALTDIV8SF);
27493 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27494 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27495 IX86_BUILTIN_GATHERALTSIV4DI);
27497 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27498 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27499 IX86_BUILTIN_GATHERALTDIV8SI);
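/* Editorial sketch, not part of the original source: the function-type
   names above imply the operand order (source, base pointer, index vector,
   mask, scale).  Assuming that order, a hypothetical direct call would
   look like

     __m128d r = __builtin_ia32_gathersiv2df (src, base, idx, mask, 8);

   with src/base/idx/mask being placeholder names; user code normally
   reaches these builtins through the avx2intrin.h gather intrinsics.  */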
27501 /* RTM. */
27502 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27503 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27505 /* MMX access to the vec_init patterns. */
27506 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27507 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27509 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27510 V4HI_FTYPE_HI_HI_HI_HI,
27511 IX86_BUILTIN_VEC_INIT_V4HI);
27513 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27514 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27515 IX86_BUILTIN_VEC_INIT_V8QI);
27517 /* Access to the vec_extract patterns. */
27518 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27519 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27520 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27521 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27522 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27523 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27524 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27525 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27526 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27527 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27529 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27530 "__builtin_ia32_vec_ext_v4hi",
27531 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27533 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27534 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27536 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27537 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27539 /* Access to the vec_set patterns. */
27540 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27541 "__builtin_ia32_vec_set_v2di",
27542 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27544 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27545 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27547 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27548 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27550 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27551 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27553 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27554 "__builtin_ia32_vec_set_v4hi",
27555 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27557 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27558 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27560 /* Add FMA4 multi-arg instructions.  */
27561 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27563 if (d->name == 0)
27564 continue;
27566 ftype = (enum ix86_builtin_func_type) d->flag;
27567 def_builtin_const (d->mask, d->name, ftype, d->code);
27571 /* Internal method for ix86_init_builtins. */
27573 static void
27574 ix86_init_builtins_va_builtins_abi (void)
27576 tree ms_va_ref, sysv_va_ref;
27577 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27578 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27579 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27580 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27582 if (!TARGET_64BIT)
27583 return;
27584 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27585 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27586 ms_va_ref = build_reference_type (ms_va_list_type_node);
27587 sysv_va_ref =
27588 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27590 fnvoid_va_end_ms =
27591 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27592 fnvoid_va_start_ms =
27593 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27594 fnvoid_va_end_sysv =
27595 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27596 fnvoid_va_start_sysv =
27597 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27598 NULL_TREE);
27599 fnvoid_va_copy_ms =
27600 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27601 NULL_TREE);
27602 fnvoid_va_copy_sysv =
27603 build_function_type_list (void_type_node, sysv_va_ref,
27604 sysv_va_ref, NULL_TREE);
27606 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27607 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27608 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27609 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27610 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27611 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27612 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27613 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27614 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27615 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27616 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27617 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
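/* Editorial sketch, not part of the original source: on a 64-bit target
   these builtins back the cross-ABI varargs support, roughly as

     void f (int n, ...) __attribute__ ((ms_abi));
     ...
     __builtin_ms_va_list ap;
     __builtin_ms_va_start (ap, n);
     ...
     __builtin_ms_va_end (ap);

   with the sysv_* variants playing the same role for the SysV ABI; the
   function and variable names here are hypothetical.  */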
27620 static void
27621 ix86_init_builtin_types (void)
27623 tree float128_type_node, float80_type_node;
27625 /* The __float80 type. */
27626 float80_type_node = long_double_type_node;
27627 if (TYPE_MODE (float80_type_node) != XFmode)
27629 /* The __float80 type. */
27630 float80_type_node = make_node (REAL_TYPE);
27632 TYPE_PRECISION (float80_type_node) = 80;
27633 layout_type (float80_type_node);
27635 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27637 /* The __float128 type. */
27638 float128_type_node = make_node (REAL_TYPE);
27639 TYPE_PRECISION (float128_type_node) = 128;
27640 layout_type (float128_type_node);
27641 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27643 /* This macro is built by i386-builtin-types.awk. */
27644 DEFINE_BUILTIN_PRIMITIVE_TYPES;
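/* Editorial note, not part of the original source: once registered, the
   two types are usable from C, e.g.

     __float80  a = 1.0L;
     __float128 b = 2.0;

   with __float80 simply being long double whenever long double already
   has XFmode.  */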
27647 static void
27648 ix86_init_builtins (void)
27650 tree t;
27652 ix86_init_builtin_types ();
27654 /* TFmode support builtins. */
27655 def_builtin_const (0, "__builtin_infq",
27656 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27657 def_builtin_const (0, "__builtin_huge_valq",
27658 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27660 /* We will expand them to a normal call if SSE2 isn't available, since
27661    they are used by libgcc.  */
27662 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27663 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27664 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27665 TREE_READONLY (t) = 1;
27666 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27668 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27669 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27670 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27671 TREE_READONLY (t) = 1;
27672 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27674 ix86_init_tm_builtins ();
27675 ix86_init_mmx_sse_builtins ();
27677 if (TARGET_LP64)
27678 ix86_init_builtins_va_builtins_abi ();
27680 #ifdef SUBTARGET_INIT_BUILTINS
27681 SUBTARGET_INIT_BUILTINS;
27682 #endif
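/* Editorial note, not part of the original source: the two TFmode helpers
   registered just above are usable as, e.g.,

     __float128 a = __builtin_fabsq (x);
     __float128 b = __builtin_copysignq (x, y);

   and, per the comment above, fall back to the __fabstf2 / __copysigntf3
   library routines when SSE2 is not available.  */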
27685 /* Return the ix86 builtin for CODE. */
27687 static tree
27688 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27690 if (code >= IX86_BUILTIN_MAX)
27691 return error_mark_node;
27693 return ix86_builtins[code];
27696 /* Errors in the source file can cause expand_expr to return const0_rtx
27697 where we expect a vector. To avoid crashing, use one of the vector
27698 clear instructions. */
27699 static rtx
27700 safe_vector_operand (rtx x, enum machine_mode mode)
27702 if (x == const0_rtx)
27703 x = CONST0_RTX (mode);
27704 return x;
27707 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27709 static rtx
27710 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27712 rtx pat;
27713 tree arg0 = CALL_EXPR_ARG (exp, 0);
27714 tree arg1 = CALL_EXPR_ARG (exp, 1);
27715 rtx op0 = expand_normal (arg0);
27716 rtx op1 = expand_normal (arg1);
27717 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27718 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27719 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27721 if (VECTOR_MODE_P (mode0))
27722 op0 = safe_vector_operand (op0, mode0);
27723 if (VECTOR_MODE_P (mode1))
27724 op1 = safe_vector_operand (op1, mode1);
27726 if (optimize || !target
27727 || GET_MODE (target) != tmode
27728 || !insn_data[icode].operand[0].predicate (target, tmode))
27729 target = gen_reg_rtx (tmode);
27731 if (GET_MODE (op1) == SImode && mode1 == TImode)
27733 rtx x = gen_reg_rtx (V4SImode);
27734 emit_insn (gen_sse2_loadd (x, op1));
27735 op1 = gen_lowpart (TImode, x);
27738 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27739 op0 = copy_to_mode_reg (mode0, op0);
27740 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27741 op1 = copy_to_mode_reg (mode1, op1);
27743 pat = GEN_FCN (icode) (target, op0, op1);
27744 if (! pat)
27745 return 0;
27747 emit_insn (pat);
27749 return target;
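/* Editorial note, not part of the original source: this binop expander is
   the common path taken by ix86_expand_args_builtin (below) for
   two-operand entries whose comparison field is UNKNOWN.  The SImode to
   TImode special case reloads a 32-bit count through sse2_loadd and takes
   the TImode lowpart, presumably so the operand satisfies the TImode
   predicate of shift-style patterns.  */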
27752 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27754 static rtx
27755 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27756 enum ix86_builtin_func_type m_type,
27757 enum rtx_code sub_code)
27759 rtx pat;
27760 int i;
27761 int nargs;
27762 bool comparison_p = false;
27763 bool tf_p = false;
27764 bool last_arg_constant = false;
27765 int num_memory = 0;
27766 struct {
27767 rtx op;
27768 enum machine_mode mode;
27769 } args[4];
27771 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27773 switch (m_type)
27775 case MULTI_ARG_4_DF2_DI_I:
27776 case MULTI_ARG_4_DF2_DI_I1:
27777 case MULTI_ARG_4_SF2_SI_I:
27778 case MULTI_ARG_4_SF2_SI_I1:
27779 nargs = 4;
27780 last_arg_constant = true;
27781 break;
27783 case MULTI_ARG_3_SF:
27784 case MULTI_ARG_3_DF:
27785 case MULTI_ARG_3_SF2:
27786 case MULTI_ARG_3_DF2:
27787 case MULTI_ARG_3_DI:
27788 case MULTI_ARG_3_SI:
27789 case MULTI_ARG_3_SI_DI:
27790 case MULTI_ARG_3_HI:
27791 case MULTI_ARG_3_HI_SI:
27792 case MULTI_ARG_3_QI:
27793 case MULTI_ARG_3_DI2:
27794 case MULTI_ARG_3_SI2:
27795 case MULTI_ARG_3_HI2:
27796 case MULTI_ARG_3_QI2:
27797 nargs = 3;
27798 break;
27800 case MULTI_ARG_2_SF:
27801 case MULTI_ARG_2_DF:
27802 case MULTI_ARG_2_DI:
27803 case MULTI_ARG_2_SI:
27804 case MULTI_ARG_2_HI:
27805 case MULTI_ARG_2_QI:
27806 nargs = 2;
27807 break;
27809 case MULTI_ARG_2_DI_IMM:
27810 case MULTI_ARG_2_SI_IMM:
27811 case MULTI_ARG_2_HI_IMM:
27812 case MULTI_ARG_2_QI_IMM:
27813 nargs = 2;
27814 last_arg_constant = true;
27815 break;
27817 case MULTI_ARG_1_SF:
27818 case MULTI_ARG_1_DF:
27819 case MULTI_ARG_1_SF2:
27820 case MULTI_ARG_1_DF2:
27821 case MULTI_ARG_1_DI:
27822 case MULTI_ARG_1_SI:
27823 case MULTI_ARG_1_HI:
27824 case MULTI_ARG_1_QI:
27825 case MULTI_ARG_1_SI_DI:
27826 case MULTI_ARG_1_HI_DI:
27827 case MULTI_ARG_1_HI_SI:
27828 case MULTI_ARG_1_QI_DI:
27829 case MULTI_ARG_1_QI_SI:
27830 case MULTI_ARG_1_QI_HI:
27831 nargs = 1;
27832 break;
27834 case MULTI_ARG_2_DI_CMP:
27835 case MULTI_ARG_2_SI_CMP:
27836 case MULTI_ARG_2_HI_CMP:
27837 case MULTI_ARG_2_QI_CMP:
27838 nargs = 2;
27839 comparison_p = true;
27840 break;
27842 case MULTI_ARG_2_SF_TF:
27843 case MULTI_ARG_2_DF_TF:
27844 case MULTI_ARG_2_DI_TF:
27845 case MULTI_ARG_2_SI_TF:
27846 case MULTI_ARG_2_HI_TF:
27847 case MULTI_ARG_2_QI_TF:
27848 nargs = 2;
27849 tf_p = true;
27850 break;
27852 default:
27853 gcc_unreachable ();
27856 if (optimize || !target
27857 || GET_MODE (target) != tmode
27858 || !insn_data[icode].operand[0].predicate (target, tmode))
27859 target = gen_reg_rtx (tmode);
27861 gcc_assert (nargs <= 4);
27863 for (i = 0; i < nargs; i++)
27865 tree arg = CALL_EXPR_ARG (exp, i);
27866 rtx op = expand_normal (arg);
27867 int adjust = (comparison_p) ? 1 : 0;
27868 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27870 if (last_arg_constant && i == nargs - 1)
27872 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27874 enum insn_code new_icode = icode;
27875 switch (icode)
27877 case CODE_FOR_xop_vpermil2v2df3:
27878 case CODE_FOR_xop_vpermil2v4sf3:
27879 case CODE_FOR_xop_vpermil2v4df3:
27880 case CODE_FOR_xop_vpermil2v8sf3:
27881 error ("the last argument must be a 2-bit immediate");
27882 return gen_reg_rtx (tmode);
27883 case CODE_FOR_xop_rotlv2di3:
27884 new_icode = CODE_FOR_rotlv2di3;
27885 goto xop_rotl;
27886 case CODE_FOR_xop_rotlv4si3:
27887 new_icode = CODE_FOR_rotlv4si3;
27888 goto xop_rotl;
27889 case CODE_FOR_xop_rotlv8hi3:
27890 new_icode = CODE_FOR_rotlv8hi3;
27891 goto xop_rotl;
27892 case CODE_FOR_xop_rotlv16qi3:
27893 new_icode = CODE_FOR_rotlv16qi3;
27894 xop_rotl:
27895 if (CONST_INT_P (op))
27897 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27898 op = GEN_INT (INTVAL (op) & mask);
27899 gcc_checking_assert
27900 (insn_data[icode].operand[i + 1].predicate (op, mode));
27902 else
27904 gcc_checking_assert
27905 (nargs == 2
27906 && insn_data[new_icode].operand[0].mode == tmode
27907 && insn_data[new_icode].operand[1].mode == tmode
27908 && insn_data[new_icode].operand[2].mode == mode
27909 && insn_data[new_icode].operand[0].predicate
27910 == insn_data[icode].operand[0].predicate
27911 && insn_data[new_icode].operand[1].predicate
27912 == insn_data[icode].operand[1].predicate);
27913 icode = new_icode;
27914 goto non_constant;
27916 break;
27917 default:
27918 gcc_unreachable ();
27922 else
27924 non_constant:
27925 if (VECTOR_MODE_P (mode))
27926 op = safe_vector_operand (op, mode);
27928 /* If we aren't optimizing, only allow one memory operand to be
27929 generated. */
27930 if (memory_operand (op, mode))
27931 num_memory++;
27933 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27935 if (optimize
27936 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27937 || num_memory > 1)
27938 op = force_reg (mode, op);
27941 args[i].op = op;
27942 args[i].mode = mode;
27945 switch (nargs)
27947 case 1:
27948 pat = GEN_FCN (icode) (target, args[0].op);
27949 break;
27951 case 2:
27952 if (tf_p)
27953 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27954 GEN_INT ((int)sub_code));
27955 else if (! comparison_p)
27956 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27957 else
27959 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27960 args[0].op,
27961 args[1].op);
27963 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27965 break;
27967 case 3:
27968 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27969 break;
27971 case 4:
27972 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27973 break;
27975 default:
27976 gcc_unreachable ();
27979 if (! pat)
27980 return 0;
27982 emit_insn (pat);
27983 return target;
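/* Editorial note, not part of the original source: for the two-operand
   "_TF" entries in bdesc_multi_arg (e.g. the __builtin_ia32_vpcomtrue*
   rows near the top of this section), tf_p is set above and the rtx_code
   stored in the table (PCOM_TRUE / PCOM_FALSE) is handed to the insn
   pattern as an extra immediate via GEN_INT ((int) sub_code).  */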
27986 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27987 insns with vec_merge. */
27989 static rtx
27990 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27991 rtx target)
27993 rtx pat;
27994 tree arg0 = CALL_EXPR_ARG (exp, 0);
27995 rtx op1, op0 = expand_normal (arg0);
27996 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27997 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27999 if (optimize || !target
28000 || GET_MODE (target) != tmode
28001 || !insn_data[icode].operand[0].predicate (target, tmode))
28002 target = gen_reg_rtx (tmode);
28004 if (VECTOR_MODE_P (mode0))
28005 op0 = safe_vector_operand (op0, mode0);
28007 if ((optimize && !register_operand (op0, mode0))
28008 || !insn_data[icode].operand[1].predicate (op0, mode0))
28009 op0 = copy_to_mode_reg (mode0, op0);
28011 op1 = op0;
28012 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28013 op1 = copy_to_mode_reg (mode0, op1);
28015 pat = GEN_FCN (icode) (target, op0, op1);
28016 if (! pat)
28017 return 0;
28018 emit_insn (pat);
28019 return target;
28022 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28024 static rtx
28025 ix86_expand_sse_compare (const struct builtin_description *d,
28026 tree exp, rtx target, bool swap)
28028 rtx pat;
28029 tree arg0 = CALL_EXPR_ARG (exp, 0);
28030 tree arg1 = CALL_EXPR_ARG (exp, 1);
28031 rtx op0 = expand_normal (arg0);
28032 rtx op1 = expand_normal (arg1);
28033 rtx op2;
28034 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28035 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28036 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28037 enum rtx_code comparison = d->comparison;
28039 if (VECTOR_MODE_P (mode0))
28040 op0 = safe_vector_operand (op0, mode0);
28041 if (VECTOR_MODE_P (mode1))
28042 op1 = safe_vector_operand (op1, mode1);
28044 /* Swap operands if we have a comparison that isn't available in
28045 hardware. */
28046 if (swap)
28048 rtx tmp = gen_reg_rtx (mode1);
28049 emit_move_insn (tmp, op1);
28050 op1 = op0;
28051 op0 = tmp;
28054 if (optimize || !target
28055 || GET_MODE (target) != tmode
28056 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28057 target = gen_reg_rtx (tmode);
28059 if ((optimize && !register_operand (op0, mode0))
28060 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28061 op0 = copy_to_mode_reg (mode0, op0);
28062 if ((optimize && !register_operand (op1, mode1))
28063 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28064 op1 = copy_to_mode_reg (mode1, op1);
28066 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28067 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28068 if (! pat)
28069 return 0;
28070 emit_insn (pat);
28071 return target;
28074 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28076 static rtx
28077 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28078 rtx target)
28080 rtx pat;
28081 tree arg0 = CALL_EXPR_ARG (exp, 0);
28082 tree arg1 = CALL_EXPR_ARG (exp, 1);
28083 rtx op0 = expand_normal (arg0);
28084 rtx op1 = expand_normal (arg1);
28085 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28086 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28087 enum rtx_code comparison = d->comparison;
28089 if (VECTOR_MODE_P (mode0))
28090 op0 = safe_vector_operand (op0, mode0);
28091 if (VECTOR_MODE_P (mode1))
28092 op1 = safe_vector_operand (op1, mode1);
28094 /* Swap operands if we have a comparison that isn't available in
28095 hardware. */
28096 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28098 rtx tmp = op1;
28099 op1 = op0;
28100 op0 = tmp;
28103 target = gen_reg_rtx (SImode);
28104 emit_move_insn (target, const0_rtx);
28105 target = gen_rtx_SUBREG (QImode, target, 0);
28107 if ((optimize && !register_operand (op0, mode0))
28108 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28109 op0 = copy_to_mode_reg (mode0, op0);
28110 if ((optimize && !register_operand (op1, mode1))
28111 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28112 op1 = copy_to_mode_reg (mode1, op1);
28114 pat = GEN_FCN (d->icode) (op0, op1);
28115 if (! pat)
28116 return 0;
28117 emit_insn (pat);
28118 emit_insn (gen_rtx_SET (VOIDmode,
28119 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28120 gen_rtx_fmt_ee (comparison, QImode,
28121 SET_DEST (pat),
28122 const0_rtx)));
28124 return SUBREG_REG (target);
28127 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28129 static rtx
28130 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28131 rtx target)
28133 rtx pat;
28134 tree arg0 = CALL_EXPR_ARG (exp, 0);
28135 rtx op1, op0 = expand_normal (arg0);
28136 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28137 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28139 if (optimize || target == 0
28140 || GET_MODE (target) != tmode
28141 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28142 target = gen_reg_rtx (tmode);
28144 if (VECTOR_MODE_P (mode0))
28145 op0 = safe_vector_operand (op0, mode0);
28147 if ((optimize && !register_operand (op0, mode0))
28148 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28149 op0 = copy_to_mode_reg (mode0, op0);
28151 op1 = GEN_INT (d->comparison);
28153 pat = GEN_FCN (d->icode) (target, op0, op1);
28154 if (! pat)
28155 return 0;
28156 emit_insn (pat);
28157 return target;
28160 static rtx
28161 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28162 tree exp, rtx target)
28164 rtx pat;
28165 tree arg0 = CALL_EXPR_ARG (exp, 0);
28166 tree arg1 = CALL_EXPR_ARG (exp, 1);
28167 rtx op0 = expand_normal (arg0);
28168 rtx op1 = expand_normal (arg1);
28169 rtx op2;
28170 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28171 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28172 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28174 if (optimize || target == 0
28175 || GET_MODE (target) != tmode
28176 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28177 target = gen_reg_rtx (tmode);
28179 op0 = safe_vector_operand (op0, mode0);
28180 op1 = safe_vector_operand (op1, mode1);
28182 if ((optimize && !register_operand (op0, mode0))
28183 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28184 op0 = copy_to_mode_reg (mode0, op0);
28185 if ((optimize && !register_operand (op1, mode1))
28186 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28187 op1 = copy_to_mode_reg (mode1, op1);
28189 op2 = GEN_INT (d->comparison);
28191 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28192 if (! pat)
28193 return 0;
28194 emit_insn (pat);
28195 return target;
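/* Editorial note, not part of the original source: in both round expanders
   above, d->comparison is reused as the rounding-mode immediate
   (GEN_INT (d->comparison)) rather than as a comparison code; the *_ROUND
   function types are dispatched here by ix86_expand_args_builtin.  */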
28198 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28200 static rtx
28201 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28202 rtx target)
28204 rtx pat;
28205 tree arg0 = CALL_EXPR_ARG (exp, 0);
28206 tree arg1 = CALL_EXPR_ARG (exp, 1);
28207 rtx op0 = expand_normal (arg0);
28208 rtx op1 = expand_normal (arg1);
28209 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28210 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28211 enum rtx_code comparison = d->comparison;
28213 if (VECTOR_MODE_P (mode0))
28214 op0 = safe_vector_operand (op0, mode0);
28215 if (VECTOR_MODE_P (mode1))
28216 op1 = safe_vector_operand (op1, mode1);
28218 target = gen_reg_rtx (SImode);
28219 emit_move_insn (target, const0_rtx);
28220 target = gen_rtx_SUBREG (QImode, target, 0);
28222 if ((optimize && !register_operand (op0, mode0))
28223 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28224 op0 = copy_to_mode_reg (mode0, op0);
28225 if ((optimize && !register_operand (op1, mode1))
28226 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28227 op1 = copy_to_mode_reg (mode1, op1);
28229 pat = GEN_FCN (d->icode) (op0, op1);
28230 if (! pat)
28231 return 0;
28232 emit_insn (pat);
28233 emit_insn (gen_rtx_SET (VOIDmode,
28234 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28235 gen_rtx_fmt_ee (comparison, QImode,
28236 SET_DEST (pat),
28237 const0_rtx)));
28239 return SUBREG_REG (target);
28242 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28244 static rtx
28245 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28246 tree exp, rtx target)
28248 rtx pat;
28249 tree arg0 = CALL_EXPR_ARG (exp, 0);
28250 tree arg1 = CALL_EXPR_ARG (exp, 1);
28251 tree arg2 = CALL_EXPR_ARG (exp, 2);
28252 tree arg3 = CALL_EXPR_ARG (exp, 3);
28253 tree arg4 = CALL_EXPR_ARG (exp, 4);
28254 rtx scratch0, scratch1;
28255 rtx op0 = expand_normal (arg0);
28256 rtx op1 = expand_normal (arg1);
28257 rtx op2 = expand_normal (arg2);
28258 rtx op3 = expand_normal (arg3);
28259 rtx op4 = expand_normal (arg4);
28260 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28262 tmode0 = insn_data[d->icode].operand[0].mode;
28263 tmode1 = insn_data[d->icode].operand[1].mode;
28264 modev2 = insn_data[d->icode].operand[2].mode;
28265 modei3 = insn_data[d->icode].operand[3].mode;
28266 modev4 = insn_data[d->icode].operand[4].mode;
28267 modei5 = insn_data[d->icode].operand[5].mode;
28268 modeimm = insn_data[d->icode].operand[6].mode;
28270 if (VECTOR_MODE_P (modev2))
28271 op0 = safe_vector_operand (op0, modev2);
28272 if (VECTOR_MODE_P (modev4))
28273 op2 = safe_vector_operand (op2, modev4);
28275 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28276 op0 = copy_to_mode_reg (modev2, op0);
28277 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28278 op1 = copy_to_mode_reg (modei3, op1);
28279 if ((optimize && !register_operand (op2, modev4))
28280 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28281 op2 = copy_to_mode_reg (modev4, op2);
28282 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28283 op3 = copy_to_mode_reg (modei5, op3);
28285 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28287 error ("the fifth argument must be an 8-bit immediate");
28288 return const0_rtx;
28291 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28293 if (optimize || !target
28294 || GET_MODE (target) != tmode0
28295 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28296 target = gen_reg_rtx (tmode0);
28298 scratch1 = gen_reg_rtx (tmode1);
28300 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28302 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28304 if (optimize || !target
28305 || GET_MODE (target) != tmode1
28306 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28307 target = gen_reg_rtx (tmode1);
28309 scratch0 = gen_reg_rtx (tmode0);
28311 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28313 else
28315 gcc_assert (d->flag);
28317 scratch0 = gen_reg_rtx (tmode0);
28318 scratch1 = gen_reg_rtx (tmode1);
28320 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28323 if (! pat)
28324 return 0;
28326 emit_insn (pat);
28328 if (d->flag)
28330 target = gen_reg_rtx (SImode);
28331 emit_move_insn (target, const0_rtx);
28332 target = gen_rtx_SUBREG (QImode, target, 0);
28334 emit_insn
28335 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28336 gen_rtx_fmt_ee (EQ, QImode,
28337 gen_rtx_REG ((enum machine_mode) d->flag,
28338 FLAGS_REG),
28339 const0_rtx)));
28340 return SUBREG_REG (target);
28342 else
28343 return target;
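/* Editorial note, not part of the original source: the expander above
   yields three result shapes.  IX86_BUILTIN_PCMPESTRI128 returns the index
   result (tmode0), IX86_BUILTIN_PCMPESTRM128 returns the mask result
   (tmode1), and the remaining flag variants (d->flag nonzero) materialize
   a single bit by comparing FLAGS_REG, in the CC mode stored in d->flag,
   against zero.  */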
28347 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28349 static rtx
28350 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28351 tree exp, rtx target)
28353 rtx pat;
28354 tree arg0 = CALL_EXPR_ARG (exp, 0);
28355 tree arg1 = CALL_EXPR_ARG (exp, 1);
28356 tree arg2 = CALL_EXPR_ARG (exp, 2);
28357 rtx scratch0, scratch1;
28358 rtx op0 = expand_normal (arg0);
28359 rtx op1 = expand_normal (arg1);
28360 rtx op2 = expand_normal (arg2);
28361 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28363 tmode0 = insn_data[d->icode].operand[0].mode;
28364 tmode1 = insn_data[d->icode].operand[1].mode;
28365 modev2 = insn_data[d->icode].operand[2].mode;
28366 modev3 = insn_data[d->icode].operand[3].mode;
28367 modeimm = insn_data[d->icode].operand[4].mode;
28369 if (VECTOR_MODE_P (modev2))
28370 op0 = safe_vector_operand (op0, modev2);
28371 if (VECTOR_MODE_P (modev3))
28372 op1 = safe_vector_operand (op1, modev3);
28374 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28375 op0 = copy_to_mode_reg (modev2, op0);
28376 if ((optimize && !register_operand (op1, modev3))
28377 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28378 op1 = copy_to_mode_reg (modev3, op1);
28380 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28382 error ("the third argument must be an 8-bit immediate");
28383 return const0_rtx;
28386 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28388 if (optimize || !target
28389 || GET_MODE (target) != tmode0
28390 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28391 target = gen_reg_rtx (tmode0);
28393 scratch1 = gen_reg_rtx (tmode1);
28395 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28397 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28399 if (optimize || !target
28400 || GET_MODE (target) != tmode1
28401 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28402 target = gen_reg_rtx (tmode1);
28404 scratch0 = gen_reg_rtx (tmode0);
28406 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28408 else
28410 gcc_assert (d->flag);
28412 scratch0 = gen_reg_rtx (tmode0);
28413 scratch1 = gen_reg_rtx (tmode1);
28415 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28418 if (! pat)
28419 return 0;
28421 emit_insn (pat);
28423 if (d->flag)
28425 target = gen_reg_rtx (SImode);
28426 emit_move_insn (target, const0_rtx);
28427 target = gen_rtx_SUBREG (QImode, target, 0);
28429 emit_insn
28430 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28431 gen_rtx_fmt_ee (EQ, QImode,
28432 gen_rtx_REG ((enum machine_mode) d->flag,
28433 FLAGS_REG),
28434 const0_rtx)));
28435 return SUBREG_REG (target);
28437 else
28438 return target;
28441 /* Subroutine of ix86_expand_builtin to take care of insns with a
28442    variable number of operands.  */
28444 static rtx
28445 ix86_expand_args_builtin (const struct builtin_description *d,
28446 tree exp, rtx target)
28448 rtx pat, real_target;
28449 unsigned int i, nargs;
28450 unsigned int nargs_constant = 0;
28451 int num_memory = 0;
28452 struct
28454 rtx op;
28455 enum machine_mode mode;
28456 } args[4];
28457 bool last_arg_count = false;
28458 enum insn_code icode = d->icode;
28459 const struct insn_data_d *insn_p = &insn_data[icode];
28460 enum machine_mode tmode = insn_p->operand[0].mode;
28461 enum machine_mode rmode = VOIDmode;
28462 bool swap = false;
28463 enum rtx_code comparison = d->comparison;
28465 switch ((enum ix86_builtin_func_type) d->flag)
28467 case V2DF_FTYPE_V2DF_ROUND:
28468 case V4DF_FTYPE_V4DF_ROUND:
28469 case V4SF_FTYPE_V4SF_ROUND:
28470 case V8SF_FTYPE_V8SF_ROUND:
28471 case V4SI_FTYPE_V4SF_ROUND:
28472 case V8SI_FTYPE_V8SF_ROUND:
28473 return ix86_expand_sse_round (d, exp, target);
28474 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28475 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28476 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28477 case INT_FTYPE_V8SF_V8SF_PTEST:
28478 case INT_FTYPE_V4DI_V4DI_PTEST:
28479 case INT_FTYPE_V4DF_V4DF_PTEST:
28480 case INT_FTYPE_V4SF_V4SF_PTEST:
28481 case INT_FTYPE_V2DI_V2DI_PTEST:
28482 case INT_FTYPE_V2DF_V2DF_PTEST:
28483 return ix86_expand_sse_ptest (d, exp, target);
28484 case FLOAT128_FTYPE_FLOAT128:
28485 case FLOAT_FTYPE_FLOAT:
28486 case INT_FTYPE_INT:
28487 case UINT64_FTYPE_INT:
28488 case UINT16_FTYPE_UINT16:
28489 case INT64_FTYPE_INT64:
28490 case INT64_FTYPE_V4SF:
28491 case INT64_FTYPE_V2DF:
28492 case INT_FTYPE_V16QI:
28493 case INT_FTYPE_V8QI:
28494 case INT_FTYPE_V8SF:
28495 case INT_FTYPE_V4DF:
28496 case INT_FTYPE_V4SF:
28497 case INT_FTYPE_V2DF:
28498 case INT_FTYPE_V32QI:
28499 case V16QI_FTYPE_V16QI:
28500 case V8SI_FTYPE_V8SF:
28501 case V8SI_FTYPE_V4SI:
28502 case V8HI_FTYPE_V8HI:
28503 case V8HI_FTYPE_V16QI:
28504 case V8QI_FTYPE_V8QI:
28505 case V8SF_FTYPE_V8SF:
28506 case V8SF_FTYPE_V8SI:
28507 case V8SF_FTYPE_V4SF:
28508 case V8SF_FTYPE_V8HI:
28509 case V4SI_FTYPE_V4SI:
28510 case V4SI_FTYPE_V16QI:
28511 case V4SI_FTYPE_V4SF:
28512 case V4SI_FTYPE_V8SI:
28513 case V4SI_FTYPE_V8HI:
28514 case V4SI_FTYPE_V4DF:
28515 case V4SI_FTYPE_V2DF:
28516 case V4HI_FTYPE_V4HI:
28517 case V4DF_FTYPE_V4DF:
28518 case V4DF_FTYPE_V4SI:
28519 case V4DF_FTYPE_V4SF:
28520 case V4DF_FTYPE_V2DF:
28521 case V4SF_FTYPE_V4SF:
28522 case V4SF_FTYPE_V4SI:
28523 case V4SF_FTYPE_V8SF:
28524 case V4SF_FTYPE_V4DF:
28525 case V4SF_FTYPE_V8HI:
28526 case V4SF_FTYPE_V2DF:
28527 case V2DI_FTYPE_V2DI:
28528 case V2DI_FTYPE_V16QI:
28529 case V2DI_FTYPE_V8HI:
28530 case V2DI_FTYPE_V4SI:
28531 case V2DF_FTYPE_V2DF:
28532 case V2DF_FTYPE_V4SI:
28533 case V2DF_FTYPE_V4DF:
28534 case V2DF_FTYPE_V4SF:
28535 case V2DF_FTYPE_V2SI:
28536 case V2SI_FTYPE_V2SI:
28537 case V2SI_FTYPE_V4SF:
28538 case V2SI_FTYPE_V2SF:
28539 case V2SI_FTYPE_V2DF:
28540 case V2SF_FTYPE_V2SF:
28541 case V2SF_FTYPE_V2SI:
28542 case V32QI_FTYPE_V32QI:
28543 case V32QI_FTYPE_V16QI:
28544 case V16HI_FTYPE_V16HI:
28545 case V16HI_FTYPE_V8HI:
28546 case V8SI_FTYPE_V8SI:
28547 case V16HI_FTYPE_V16QI:
28548 case V8SI_FTYPE_V16QI:
28549 case V4DI_FTYPE_V16QI:
28550 case V8SI_FTYPE_V8HI:
28551 case V4DI_FTYPE_V8HI:
28552 case V4DI_FTYPE_V4SI:
28553 case V4DI_FTYPE_V2DI:
28554 nargs = 1;
28555 break;
28556 case V4SF_FTYPE_V4SF_VEC_MERGE:
28557 case V2DF_FTYPE_V2DF_VEC_MERGE:
28558 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28559 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28560 case V16QI_FTYPE_V16QI_V16QI:
28561 case V16QI_FTYPE_V8HI_V8HI:
28562 case V8QI_FTYPE_V8QI_V8QI:
28563 case V8QI_FTYPE_V4HI_V4HI:
28564 case V8HI_FTYPE_V8HI_V8HI:
28565 case V8HI_FTYPE_V16QI_V16QI:
28566 case V8HI_FTYPE_V4SI_V4SI:
28567 case V8SF_FTYPE_V8SF_V8SF:
28568 case V8SF_FTYPE_V8SF_V8SI:
28569 case V4SI_FTYPE_V4SI_V4SI:
28570 case V4SI_FTYPE_V8HI_V8HI:
28571 case V4SI_FTYPE_V4SF_V4SF:
28572 case V4SI_FTYPE_V2DF_V2DF:
28573 case V4HI_FTYPE_V4HI_V4HI:
28574 case V4HI_FTYPE_V8QI_V8QI:
28575 case V4HI_FTYPE_V2SI_V2SI:
28576 case V4DF_FTYPE_V4DF_V4DF:
28577 case V4DF_FTYPE_V4DF_V4DI:
28578 case V4SF_FTYPE_V4SF_V4SF:
28579 case V4SF_FTYPE_V4SF_V4SI:
28580 case V4SF_FTYPE_V4SF_V2SI:
28581 case V4SF_FTYPE_V4SF_V2DF:
28582 case V4SF_FTYPE_V4SF_DI:
28583 case V4SF_FTYPE_V4SF_SI:
28584 case V2DI_FTYPE_V2DI_V2DI:
28585 case V2DI_FTYPE_V16QI_V16QI:
28586 case V2DI_FTYPE_V4SI_V4SI:
28587 case V2DI_FTYPE_V2DI_V16QI:
28588 case V2DI_FTYPE_V2DF_V2DF:
28589 case V2SI_FTYPE_V2SI_V2SI:
28590 case V2SI_FTYPE_V4HI_V4HI:
28591 case V2SI_FTYPE_V2SF_V2SF:
28592 case V2DF_FTYPE_V2DF_V2DF:
28593 case V2DF_FTYPE_V2DF_V4SF:
28594 case V2DF_FTYPE_V2DF_V2DI:
28595 case V2DF_FTYPE_V2DF_DI:
28596 case V2DF_FTYPE_V2DF_SI:
28597 case V2SF_FTYPE_V2SF_V2SF:
28598 case V1DI_FTYPE_V1DI_V1DI:
28599 case V1DI_FTYPE_V8QI_V8QI:
28600 case V1DI_FTYPE_V2SI_V2SI:
28601 case V32QI_FTYPE_V16HI_V16HI:
28602 case V16HI_FTYPE_V8SI_V8SI:
28603 case V32QI_FTYPE_V32QI_V32QI:
28604 case V16HI_FTYPE_V32QI_V32QI:
28605 case V16HI_FTYPE_V16HI_V16HI:
28606 case V8SI_FTYPE_V4DF_V4DF:
28607 case V8SI_FTYPE_V8SI_V8SI:
28608 case V8SI_FTYPE_V16HI_V16HI:
28609 case V4DI_FTYPE_V4DI_V4DI:
28610 case V4DI_FTYPE_V8SI_V8SI:
28611 if (comparison == UNKNOWN)
28612 return ix86_expand_binop_builtin (icode, exp, target);
28613 nargs = 2;
28614 break;
28615 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28616 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28617 gcc_assert (comparison != UNKNOWN);
28618 nargs = 2;
28619 swap = true;
28620 break;
28621 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28622 case V16HI_FTYPE_V16HI_SI_COUNT:
28623 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28624 case V8SI_FTYPE_V8SI_SI_COUNT:
28625 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28626 case V4DI_FTYPE_V4DI_INT_COUNT:
28627 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28628 case V8HI_FTYPE_V8HI_SI_COUNT:
28629 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28630 case V4SI_FTYPE_V4SI_SI_COUNT:
28631 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28632 case V4HI_FTYPE_V4HI_SI_COUNT:
28633 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28634 case V2DI_FTYPE_V2DI_SI_COUNT:
28635 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28636 case V2SI_FTYPE_V2SI_SI_COUNT:
28637 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28638 case V1DI_FTYPE_V1DI_SI_COUNT:
28639 nargs = 2;
28640 last_arg_count = true;
28641 break;
28642 case UINT64_FTYPE_UINT64_UINT64:
28643 case UINT_FTYPE_UINT_UINT:
28644 case UINT_FTYPE_UINT_USHORT:
28645 case UINT_FTYPE_UINT_UCHAR:
28646 case UINT16_FTYPE_UINT16_INT:
28647 case UINT8_FTYPE_UINT8_INT:
28648 nargs = 2;
28649 break;
28650 case V2DI_FTYPE_V2DI_INT_CONVERT:
28651 nargs = 2;
28652 rmode = V1TImode;
28653 nargs_constant = 1;
28654 break;
28655 case V4DI_FTYPE_V4DI_INT_CONVERT:
28656 nargs = 2;
28657 rmode = V2TImode;
28658 nargs_constant = 1;
28659 break;
28660 case V8HI_FTYPE_V8HI_INT:
28661 case V8HI_FTYPE_V8SF_INT:
28662 case V8HI_FTYPE_V4SF_INT:
28663 case V8SF_FTYPE_V8SF_INT:
28664 case V4SI_FTYPE_V4SI_INT:
28665 case V4SI_FTYPE_V8SI_INT:
28666 case V4HI_FTYPE_V4HI_INT:
28667 case V4DF_FTYPE_V4DF_INT:
28668 case V4SF_FTYPE_V4SF_INT:
28669 case V4SF_FTYPE_V8SF_INT:
28670 case V2DI_FTYPE_V2DI_INT:
28671 case V2DF_FTYPE_V2DF_INT:
28672 case V2DF_FTYPE_V4DF_INT:
28673 case V16HI_FTYPE_V16HI_INT:
28674 case V8SI_FTYPE_V8SI_INT:
28675 case V4DI_FTYPE_V4DI_INT:
28676 case V2DI_FTYPE_V4DI_INT:
28677 nargs = 2;
28678 nargs_constant = 1;
28679 break;
28680 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28681 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28682 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28683 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28684 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28685 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28686 nargs = 3;
28687 break;
28688 case V32QI_FTYPE_V32QI_V32QI_INT:
28689 case V16HI_FTYPE_V16HI_V16HI_INT:
28690 case V16QI_FTYPE_V16QI_V16QI_INT:
28691 case V4DI_FTYPE_V4DI_V4DI_INT:
28692 case V8HI_FTYPE_V8HI_V8HI_INT:
28693 case V8SI_FTYPE_V8SI_V8SI_INT:
28694 case V8SI_FTYPE_V8SI_V4SI_INT:
28695 case V8SF_FTYPE_V8SF_V8SF_INT:
28696 case V8SF_FTYPE_V8SF_V4SF_INT:
28697 case V4SI_FTYPE_V4SI_V4SI_INT:
28698 case V4DF_FTYPE_V4DF_V4DF_INT:
28699 case V4DF_FTYPE_V4DF_V2DF_INT:
28700 case V4SF_FTYPE_V4SF_V4SF_INT:
28701 case V2DI_FTYPE_V2DI_V2DI_INT:
28702 case V4DI_FTYPE_V4DI_V2DI_INT:
28703 case V2DF_FTYPE_V2DF_V2DF_INT:
28704 nargs = 3;
28705 nargs_constant = 1;
28706 break;
28707 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28708 nargs = 3;
28709 rmode = V4DImode;
28710 nargs_constant = 1;
28711 break;
28712 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28713 nargs = 3;
28714 rmode = V2DImode;
28715 nargs_constant = 1;
28716 break;
28717 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28718 nargs = 3;
28719 rmode = DImode;
28720 nargs_constant = 1;
28721 break;
28722 case V2DI_FTYPE_V2DI_UINT_UINT:
28723 nargs = 3;
28724 nargs_constant = 2;
28725 break;
28726 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28727 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28728 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28729 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28730 nargs = 4;
28731 nargs_constant = 1;
28732 break;
28733 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28734 nargs = 4;
28735 nargs_constant = 2;
28736 break;
28737 default:
28738 gcc_unreachable ();
28741 gcc_assert (nargs <= ARRAY_SIZE (args));
28743 if (comparison != UNKNOWN)
28745 gcc_assert (nargs == 2);
28746 return ix86_expand_sse_compare (d, exp, target, swap);
28749 if (rmode == VOIDmode || rmode == tmode)
28751 if (optimize
28752 || target == 0
28753 || GET_MODE (target) != tmode
28754 || !insn_p->operand[0].predicate (target, tmode))
28755 target = gen_reg_rtx (tmode);
28756 real_target = target;
28758 else
28760 target = gen_reg_rtx (rmode);
28761 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28764 for (i = 0; i < nargs; i++)
28766 tree arg = CALL_EXPR_ARG (exp, i);
28767 rtx op = expand_normal (arg);
28768 enum machine_mode mode = insn_p->operand[i + 1].mode;
28769 bool match = insn_p->operand[i + 1].predicate (op, mode);
28771 if (last_arg_count && (i + 1) == nargs)
28773 /* SIMD shift insns take either an 8-bit immediate or a
28774    register as the count.  But the builtin functions take an int as
28775    the count.  If the count doesn't match, we put it in a register.  */
28776 if (!match)
28778 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28779 if (!insn_p->operand[i + 1].predicate (op, mode))
28780 op = copy_to_reg (op);
28783 else if ((nargs - i) <= nargs_constant)
28785 if (!match)
28786 switch (icode)
28788 case CODE_FOR_avx2_inserti128:
28789 case CODE_FOR_avx2_extracti128:
28790 error ("the last argument must be an 1-bit immediate");
28791 return const0_rtx;
28793 case CODE_FOR_sse4_1_roundsd:
28794 case CODE_FOR_sse4_1_roundss:
28796 case CODE_FOR_sse4_1_roundpd:
28797 case CODE_FOR_sse4_1_roundps:
28798 case CODE_FOR_avx_roundpd256:
28799 case CODE_FOR_avx_roundps256:
28801 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28802 case CODE_FOR_sse4_1_roundps_sfix:
28803 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28804 case CODE_FOR_avx_roundps_sfix256:
28806 case CODE_FOR_sse4_1_blendps:
28807 case CODE_FOR_avx_blendpd256:
28808 case CODE_FOR_avx_vpermilv4df:
28809 error ("the last argument must be a 4-bit immediate");
28810 return const0_rtx;
28812 case CODE_FOR_sse4_1_blendpd:
28813 case CODE_FOR_avx_vpermilv2df:
28814 case CODE_FOR_xop_vpermil2v2df3:
28815 case CODE_FOR_xop_vpermil2v4sf3:
28816 case CODE_FOR_xop_vpermil2v4df3:
28817 case CODE_FOR_xop_vpermil2v8sf3:
28818 error ("the last argument must be a 2-bit immediate");
28819 return const0_rtx;
28821 case CODE_FOR_avx_vextractf128v4df:
28822 case CODE_FOR_avx_vextractf128v8sf:
28823 case CODE_FOR_avx_vextractf128v8si:
28824 case CODE_FOR_avx_vinsertf128v4df:
28825 case CODE_FOR_avx_vinsertf128v8sf:
28826 case CODE_FOR_avx_vinsertf128v8si:
28827 error ("the last argument must be a 1-bit immediate");
28828 return const0_rtx;
28830 case CODE_FOR_avx_vmcmpv2df3:
28831 case CODE_FOR_avx_vmcmpv4sf3:
28832 case CODE_FOR_avx_cmpv2df3:
28833 case CODE_FOR_avx_cmpv4sf3:
28834 case CODE_FOR_avx_cmpv4df3:
28835 case CODE_FOR_avx_cmpv8sf3:
28836 error ("the last argument must be a 5-bit immediate");
28837 return const0_rtx;
28839 default:
28840 switch (nargs_constant)
28842 case 2:
28843 if ((nargs - i) == nargs_constant)
28845 error ("the next to last argument must be an 8-bit immediate");
28846 break;
28848 case 1:
28849 error ("the last argument must be an 8-bit immediate");
28850 break;
28851 default:
28852 gcc_unreachable ();
28854 return const0_rtx;
28857 else
28859 if (VECTOR_MODE_P (mode))
28860 op = safe_vector_operand (op, mode);
28862 /* If we aren't optimizing, only allow one memory operand to
28863 be generated. */
28864 if (memory_operand (op, mode))
28865 num_memory++;
28867 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28869 if (optimize || !match || num_memory > 1)
28870 op = copy_to_mode_reg (mode, op);
28872 else
28874 op = copy_to_reg (op);
28875 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28879 args[i].op = op;
28880 args[i].mode = mode;
28883 switch (nargs)
28885 case 1:
28886 pat = GEN_FCN (icode) (real_target, args[0].op);
28887 break;
28888 case 2:
28889 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28890 break;
28891 case 3:
28892 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28893 args[2].op);
28894 break;
28895 case 4:
28896 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28897 args[2].op, args[3].op);
28898 break;
28899 default:
28900 gcc_unreachable ();
28903 if (! pat)
28904 return 0;
28906 emit_insn (pat);
28907 return target;
28910 /* Subroutine of ix86_expand_builtin to take care of special insns
28911    with a variable number of operands.  */
28913 static rtx
28914 ix86_expand_special_args_builtin (const struct builtin_description *d,
28915 tree exp, rtx target)
28917 tree arg;
28918 rtx pat, op;
28919 unsigned int i, nargs, arg_adjust, memory;
28920 struct
28922 rtx op;
28923 enum machine_mode mode;
28924 } args[3];
28925 enum insn_code icode = d->icode;
28926 bool last_arg_constant = false;
28927 const struct insn_data_d *insn_p = &insn_data[icode];
28928 enum machine_mode tmode = insn_p->operand[0].mode;
28929 enum { load, store } klass;
28931 switch ((enum ix86_builtin_func_type) d->flag)
28933 case VOID_FTYPE_VOID:
28934 if (icode == CODE_FOR_avx_vzeroupper)
28935 target = GEN_INT (vzeroupper_intrinsic);
28936 emit_insn (GEN_FCN (icode) (target));
28937 return 0;
28938 case VOID_FTYPE_UINT64:
28939 case VOID_FTYPE_UNSIGNED:
28940 nargs = 0;
28941 klass = store;
28942 memory = 0;
28943 break;
28945 case INT_FTYPE_VOID:
28946 case UINT64_FTYPE_VOID:
28947 case UNSIGNED_FTYPE_VOID:
28948 nargs = 0;
28949 klass = load;
28950 memory = 0;
28951 break;
28952 case UINT64_FTYPE_PUNSIGNED:
28953 case V2DI_FTYPE_PV2DI:
28954 case V4DI_FTYPE_PV4DI:
28955 case V32QI_FTYPE_PCCHAR:
28956 case V16QI_FTYPE_PCCHAR:
28957 case V8SF_FTYPE_PCV4SF:
28958 case V8SF_FTYPE_PCFLOAT:
28959 case V4SF_FTYPE_PCFLOAT:
28960 case V4DF_FTYPE_PCV2DF:
28961 case V4DF_FTYPE_PCDOUBLE:
28962 case V2DF_FTYPE_PCDOUBLE:
28963 case VOID_FTYPE_PVOID:
28964 nargs = 1;
28965 klass = load;
28966 memory = 0;
28967 break;
28968 case VOID_FTYPE_PV2SF_V4SF:
28969 case VOID_FTYPE_PV4DI_V4DI:
28970 case VOID_FTYPE_PV2DI_V2DI:
28971 case VOID_FTYPE_PCHAR_V32QI:
28972 case VOID_FTYPE_PCHAR_V16QI:
28973 case VOID_FTYPE_PFLOAT_V8SF:
28974 case VOID_FTYPE_PFLOAT_V4SF:
28975 case VOID_FTYPE_PDOUBLE_V4DF:
28976 case VOID_FTYPE_PDOUBLE_V2DF:
28977 case VOID_FTYPE_PLONGLONG_LONGLONG:
28978 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28979 case VOID_FTYPE_PINT_INT:
28980 nargs = 1;
28981 klass = store;
28982 /* Reserve memory operand for target. */
28983 memory = ARRAY_SIZE (args);
28984 break;
28985 case V4SF_FTYPE_V4SF_PCV2SF:
28986 case V2DF_FTYPE_V2DF_PCDOUBLE:
28987 nargs = 2;
28988 klass = load;
28989 memory = 1;
28990 break;
28991 case V8SF_FTYPE_PCV8SF_V8SI:
28992 case V4DF_FTYPE_PCV4DF_V4DI:
28993 case V4SF_FTYPE_PCV4SF_V4SI:
28994 case V2DF_FTYPE_PCV2DF_V2DI:
28995 case V8SI_FTYPE_PCV8SI_V8SI:
28996 case V4DI_FTYPE_PCV4DI_V4DI:
28997 case V4SI_FTYPE_PCV4SI_V4SI:
28998 case V2DI_FTYPE_PCV2DI_V2DI:
28999 nargs = 2;
29000 klass = load;
29001 memory = 0;
29002 break;
29003 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29004 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29005 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29006 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29007 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29008 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29009 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29010 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29011 nargs = 2;
29012 klass = store;
29013 /* Reserve memory operand for target. */
29014 memory = ARRAY_SIZE (args);
29015 break;
29016 case VOID_FTYPE_UINT_UINT_UINT:
29017 case VOID_FTYPE_UINT64_UINT_UINT:
29018 case UCHAR_FTYPE_UINT_UINT_UINT:
29019 case UCHAR_FTYPE_UINT64_UINT_UINT:
29020 nargs = 3;
29021 klass = load;
29022 memory = ARRAY_SIZE (args);
29023 last_arg_constant = true;
29024 break;
29025 default:
29026 gcc_unreachable ();
29029 gcc_assert (nargs <= ARRAY_SIZE (args));
29031 if (klass == store)
29033 arg = CALL_EXPR_ARG (exp, 0);
29034 op = expand_normal (arg);
29035 gcc_assert (target == 0);
29036 if (memory)
29038 if (GET_MODE (op) != Pmode)
29039 op = convert_to_mode (Pmode, op, 1);
29040 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29042 else
29043 target = force_reg (tmode, op);
29044 arg_adjust = 1;
29046 else
29048 arg_adjust = 0;
29049 if (optimize
29050 || target == 0
29051 || GET_MODE (target) != tmode
29052 || !insn_p->operand[0].predicate (target, tmode))
29053 target = gen_reg_rtx (tmode);
29056 for (i = 0; i < nargs; i++)
29058 enum machine_mode mode = insn_p->operand[i + 1].mode;
29059 bool match;
29061 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29062 op = expand_normal (arg);
29063 match = insn_p->operand[i + 1].predicate (op, mode);
29065 if (last_arg_constant && (i + 1) == nargs)
29067 if (!match)
29069 if (icode == CODE_FOR_lwp_lwpvalsi3
29070 || icode == CODE_FOR_lwp_lwpinssi3
29071 || icode == CODE_FOR_lwp_lwpvaldi3
29072 || icode == CODE_FOR_lwp_lwpinsdi3)
29073 error ("the last argument must be a 32-bit immediate");
29074 else
29075 error ("the last argument must be an 8-bit immediate");
29076 return const0_rtx;
29079 else
29081 if (i == memory)
29083 /* This must be the memory operand. */
29084 if (GET_MODE (op) != Pmode)
29085 op = convert_to_mode (Pmode, op, 1);
29086 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29087 gcc_assert (GET_MODE (op) == mode
29088 || GET_MODE (op) == VOIDmode);
29090 else
29092 /* This must be a register. */
29093 if (VECTOR_MODE_P (mode))
29094 op = safe_vector_operand (op, mode);
29096 gcc_assert (GET_MODE (op) == mode
29097 || GET_MODE (op) == VOIDmode);
29098 op = copy_to_mode_reg (mode, op);
29102 args[i].op = op;
29103 args[i].mode = mode;
29106 switch (nargs)
29108 case 0:
29109 pat = GEN_FCN (icode) (target);
29110 break;
29111 case 1:
29112 pat = GEN_FCN (icode) (target, args[0].op);
29113 break;
29114 case 2:
29115 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29116 break;
29117 case 3:
29118 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29119 break;
29120 default:
29121 gcc_unreachable ();
29124 if (! pat)
29125 return 0;
29126 emit_insn (pat);
29127 return klass == store ? 0 : target;
29130 /* Return the integer constant in ARG. Constrain it to be in the range
29131 of the subparts of VEC_TYPE; issue an error if not. */
29133 static int
29134 get_element_number (tree vec_type, tree arg)
29136 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29138 if (!host_integerp (arg, 1)
29139 || (elt = tree_low_cst (arg, 1), elt > max))
29141 error ("selector must be an integer constant in the range 0..%wi", max);
29142 return 0;
29145 return elt;
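/* Illustrative sketch, not part of the back end: the same range check as
   get_element_number, restated over a plain element count.  The helper
   name is hypothetical; the snippet is kept under #if 0.  */
#if 0
#include <stdio.h>

/* Return ELT if it is a valid selector for a vector with NELTS subparts,
   otherwise print the 0..NELTS-1 range and return 0, mirroring the
   diagnostic above.  */
static int
check_element_number (unsigned long nelts, unsigned long elt)
{
  unsigned long max = nelts - 1;
  if (elt > max)
    {
      fprintf (stderr,
               "selector must be an integer constant in the range 0..%lu\n",
               max);
      return 0;
    }
  return (int) elt;
}
#endif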
29148 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29149 ix86_expand_vector_init. We DO have language-level syntax for this, in
29150 the form of (type){ init-list }. Except that since we can't place emms
29151 instructions from inside the compiler, we can't allow the use of MMX
29152 registers unless the user explicitly asks for it. So we do *not* define
29153 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29154 we have builtins invoked by mmintrin.h that give us license to emit
29155 these sorts of instructions. */
29157 static rtx
29158 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29160 enum machine_mode tmode = TYPE_MODE (type);
29161 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29162 int i, n_elt = GET_MODE_NUNITS (tmode);
29163 rtvec v = rtvec_alloc (n_elt);
29165 gcc_assert (VECTOR_MODE_P (tmode));
29166 gcc_assert (call_expr_nargs (exp) == n_elt);
29168 for (i = 0; i < n_elt; ++i)
29170 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29171 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29174 if (!target || !register_operand (target, tmode))
29175 target = gen_reg_rtx (tmode);
29177 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29178 return target;
29181 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29182 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29183 had a language-level syntax for referencing vector elements. */
29185 static rtx
29186 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29188 enum machine_mode tmode, mode0;
29189 tree arg0, arg1;
29190 int elt;
29191 rtx op0;
29193 arg0 = CALL_EXPR_ARG (exp, 0);
29194 arg1 = CALL_EXPR_ARG (exp, 1);
29196 op0 = expand_normal (arg0);
29197 elt = get_element_number (TREE_TYPE (arg0), arg1);
29199 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29200 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29201 gcc_assert (VECTOR_MODE_P (mode0));
29203 op0 = force_reg (mode0, op0);
29205 if (optimize || !target || !register_operand (target, tmode))
29206 target = gen_reg_rtx (tmode);
29208 ix86_expand_vector_extract (true, target, op0, elt);
29210 return target;
29213 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29214 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29215 a language-level syntax for referencing vector elements. */
29217 static rtx
29218 ix86_expand_vec_set_builtin (tree exp)
29220 enum machine_mode tmode, mode1;
29221 tree arg0, arg1, arg2;
29222 int elt;
29223 rtx op0, op1, target;
29225 arg0 = CALL_EXPR_ARG (exp, 0);
29226 arg1 = CALL_EXPR_ARG (exp, 1);
29227 arg2 = CALL_EXPR_ARG (exp, 2);
29229 tmode = TYPE_MODE (TREE_TYPE (arg0));
29230 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29231 gcc_assert (VECTOR_MODE_P (tmode));
29233 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29234 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29235 elt = get_element_number (TREE_TYPE (arg0), arg2);
29237 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29238 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29240 op0 = force_reg (tmode, op0);
29241 op1 = force_reg (mode1, op1);
29243 /* OP0 is the source of these builtin functions and shouldn't be
29244 modified. Create a copy, use it and return it as target. */
29245 target = gen_reg_rtx (tmode);
29246 emit_move_insn (target, op0);
29247 ix86_expand_vector_set (true, target, op1, elt);
29249 return target;
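/* Illustrative user-level sketch, not compiler code: the vec_set builtins
   handled above are what intrinsics such as _mm_insert_epi16 expand to.
   Assuming an SSE2-capable target; kept under #if 0.  */
#if 0
#include <emmintrin.h>

/* Replace element 3 of an 8 x 16-bit vector.  The element index must be
   a compile-time constant, matching the get_element_number check.  */
static __m128i
set_fourth_halfword (__m128i v, short x)
{
  return _mm_insert_epi16 (v, x, 3);
}
#endif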
29252 /* Expand an expression EXP that calls a built-in function,
29253 with result going to TARGET if that's convenient
29254 (and in mode MODE if that's convenient).
29255 SUBTARGET may be used as the target for computing one of EXP's operands.
29256 IGNORE is nonzero if the value is to be ignored. */
29258 static rtx
29259 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29260 enum machine_mode mode ATTRIBUTE_UNUSED,
29261 int ignore ATTRIBUTE_UNUSED)
29263 const struct builtin_description *d;
29264 size_t i;
29265 enum insn_code icode;
29266 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29267 tree arg0, arg1, arg2, arg3, arg4;
29268 rtx op0, op1, op2, op3, op4, pat;
29269 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29270 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29272 /* Determine whether the builtin function is available under the current ISA.
29273 Originally the builtin was not created if it wasn't applicable to the
29274 current ISA based on the command line switches. With function specific
29275 options, we need to check in the context of the function making the call
29276 whether it is supported. */
29277 if (ix86_builtins_isa[fcode].isa
29278 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29280 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29281 NULL, (enum fpmath_unit) 0, false);
29283 if (!opts)
29284 error ("%qE needs unknown isa option", fndecl);
29285 else
29287 gcc_assert (opts != NULL);
29288 error ("%qE needs isa option %s", fndecl, opts);
29289 free (opts);
29291 return const0_rtx;
29294 switch (fcode)
29296 case IX86_BUILTIN_MASKMOVQ:
29297 case IX86_BUILTIN_MASKMOVDQU:
29298 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29299 ? CODE_FOR_mmx_maskmovq
29300 : CODE_FOR_sse2_maskmovdqu);
29301 /* Note the arg order is different from the operand order. */
29302 arg1 = CALL_EXPR_ARG (exp, 0);
29303 arg2 = CALL_EXPR_ARG (exp, 1);
29304 arg0 = CALL_EXPR_ARG (exp, 2);
29305 op0 = expand_normal (arg0);
29306 op1 = expand_normal (arg1);
29307 op2 = expand_normal (arg2);
29308 mode0 = insn_data[icode].operand[0].mode;
29309 mode1 = insn_data[icode].operand[1].mode;
29310 mode2 = insn_data[icode].operand[2].mode;
29312 if (GET_MODE (op0) != Pmode)
29313 op0 = convert_to_mode (Pmode, op0, 1);
29314 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29316 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29317 op0 = copy_to_mode_reg (mode0, op0);
29318 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29319 op1 = copy_to_mode_reg (mode1, op1);
29320 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29321 op2 = copy_to_mode_reg (mode2, op2);
29322 pat = GEN_FCN (icode) (op0, op1, op2);
29323 if (! pat)
29324 return 0;
29325 emit_insn (pat);
29326 return 0;
29328 case IX86_BUILTIN_LDMXCSR:
29329 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29330 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29331 emit_move_insn (target, op0);
29332 emit_insn (gen_sse_ldmxcsr (target));
29333 return 0;
29335 case IX86_BUILTIN_STMXCSR:
29336 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29337 emit_insn (gen_sse_stmxcsr (target));
29338 return copy_to_mode_reg (SImode, target);
29340 case IX86_BUILTIN_CLFLUSH:
29341 arg0 = CALL_EXPR_ARG (exp, 0);
29342 op0 = expand_normal (arg0);
29343 icode = CODE_FOR_sse2_clflush;
29344 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29346 if (GET_MODE (op0) != Pmode)
29347 op0 = convert_to_mode (Pmode, op0, 1);
29348 op0 = force_reg (Pmode, op0);
29351 emit_insn (gen_sse2_clflush (op0));
29352 return 0;
29354 case IX86_BUILTIN_MONITOR:
29355 arg0 = CALL_EXPR_ARG (exp, 0);
29356 arg1 = CALL_EXPR_ARG (exp, 1);
29357 arg2 = CALL_EXPR_ARG (exp, 2);
29358 op0 = expand_normal (arg0);
29359 op1 = expand_normal (arg1);
29360 op2 = expand_normal (arg2);
29361 if (!REG_P (op0))
29363 if (GET_MODE (op0) != Pmode)
29364 op0 = convert_to_mode (Pmode, op0, 1);
29365 op0 = force_reg (Pmode, op0);
29367 if (!REG_P (op1))
29368 op1 = copy_to_mode_reg (SImode, op1);
29369 if (!REG_P (op2))
29370 op2 = copy_to_mode_reg (SImode, op2);
29371 emit_insn (ix86_gen_monitor (op0, op1, op2));
29372 return 0;
29374 case IX86_BUILTIN_MWAIT:
29375 arg0 = CALL_EXPR_ARG (exp, 0);
29376 arg1 = CALL_EXPR_ARG (exp, 1);
29377 op0 = expand_normal (arg0);
29378 op1 = expand_normal (arg1);
29379 if (!REG_P (op0))
29380 op0 = copy_to_mode_reg (SImode, op0);
29381 if (!REG_P (op1))
29382 op1 = copy_to_mode_reg (SImode, op1);
29383 emit_insn (gen_sse3_mwait (op0, op1));
29384 return 0;
29386 case IX86_BUILTIN_VEC_INIT_V2SI:
29387 case IX86_BUILTIN_VEC_INIT_V4HI:
29388 case IX86_BUILTIN_VEC_INIT_V8QI:
29389 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29391 case IX86_BUILTIN_VEC_EXT_V2DF:
29392 case IX86_BUILTIN_VEC_EXT_V2DI:
29393 case IX86_BUILTIN_VEC_EXT_V4SF:
29394 case IX86_BUILTIN_VEC_EXT_V4SI:
29395 case IX86_BUILTIN_VEC_EXT_V8HI:
29396 case IX86_BUILTIN_VEC_EXT_V2SI:
29397 case IX86_BUILTIN_VEC_EXT_V4HI:
29398 case IX86_BUILTIN_VEC_EXT_V16QI:
29399 return ix86_expand_vec_ext_builtin (exp, target);
29401 case IX86_BUILTIN_VEC_SET_V2DI:
29402 case IX86_BUILTIN_VEC_SET_V4SF:
29403 case IX86_BUILTIN_VEC_SET_V4SI:
29404 case IX86_BUILTIN_VEC_SET_V8HI:
29405 case IX86_BUILTIN_VEC_SET_V4HI:
29406 case IX86_BUILTIN_VEC_SET_V16QI:
29407 return ix86_expand_vec_set_builtin (exp);
29409 case IX86_BUILTIN_INFQ:
29410 case IX86_BUILTIN_HUGE_VALQ:
29412 REAL_VALUE_TYPE inf;
29413 rtx tmp;
29415 real_inf (&inf);
29416 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29418 tmp = validize_mem (force_const_mem (mode, tmp));
29420 if (target == 0)
29421 target = gen_reg_rtx (mode);
29423 emit_move_insn (target, tmp);
29424 return target;
29427 case IX86_BUILTIN_LLWPCB:
29428 arg0 = CALL_EXPR_ARG (exp, 0);
29429 op0 = expand_normal (arg0);
29430 icode = CODE_FOR_lwp_llwpcb;
29431 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29433 if (GET_MODE (op0) != Pmode)
29434 op0 = convert_to_mode (Pmode, op0, 1);
29435 op0 = force_reg (Pmode, op0);
29437 emit_insn (gen_lwp_llwpcb (op0));
29438 return 0;
29440 case IX86_BUILTIN_SLWPCB:
29441 icode = CODE_FOR_lwp_slwpcb;
29442 if (!target
29443 || !insn_data[icode].operand[0].predicate (target, Pmode))
29444 target = gen_reg_rtx (Pmode);
29445 emit_insn (gen_lwp_slwpcb (target));
29446 return target;
29448 case IX86_BUILTIN_BEXTRI32:
29449 case IX86_BUILTIN_BEXTRI64:
29450 arg0 = CALL_EXPR_ARG (exp, 0);
29451 arg1 = CALL_EXPR_ARG (exp, 1);
29452 op0 = expand_normal (arg0);
29453 op1 = expand_normal (arg1);
29454 icode = (fcode == IX86_BUILTIN_BEXTRI32
29455 ? CODE_FOR_tbm_bextri_si
29456 : CODE_FOR_tbm_bextri_di);
29457 if (!CONST_INT_P (op1))
29459 error ("last argument must be an immediate");
29460 return const0_rtx;
29462 else
29464 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29465 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29466 op1 = GEN_INT (length);
29467 op2 = GEN_INT (lsb_index);
29468 pat = GEN_FCN (icode) (target, op0, op1, op2);
29469 if (pat)
29470 emit_insn (pat);
29471 return target;
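/* Illustrative sketch, not compiler code: how the single immediate of the
   BEXTRI builtins packs the two fields that the expansion above splits
   apart - bits [7:0] hold the starting (lsb) bit index and bits [15:8]
   hold the field length.  The helper name is hypothetical.  */
#if 0
static unsigned int
make_bextri_imm (unsigned char lsb_index, unsigned char length)
{
  /* Inverse of: length = (imm >> 8) & 0xFF;  lsb_index = imm & 0xFF.  */
  return ((unsigned int) length << 8) | lsb_index;
}
#endif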
29474 case IX86_BUILTIN_RDRAND16_STEP:
29475 icode = CODE_FOR_rdrandhi_1;
29476 mode0 = HImode;
29477 goto rdrand_step;
29479 case IX86_BUILTIN_RDRAND32_STEP:
29480 icode = CODE_FOR_rdrandsi_1;
29481 mode0 = SImode;
29482 goto rdrand_step;
29484 case IX86_BUILTIN_RDRAND64_STEP:
29485 icode = CODE_FOR_rdranddi_1;
29486 mode0 = DImode;
29488 rdrand_step:
29489 op0 = gen_reg_rtx (mode0);
29490 emit_insn (GEN_FCN (icode) (op0));
29492 arg0 = CALL_EXPR_ARG (exp, 0);
29493 op1 = expand_normal (arg0);
29494 if (!address_operand (op1, VOIDmode))
29496 op1 = convert_memory_address (Pmode, op1);
29497 op1 = copy_addr_to_reg (op1);
29499 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29501 op1 = gen_reg_rtx (SImode);
29502 emit_move_insn (op1, CONST1_RTX (SImode));
29504 /* Emit SImode conditional move. */
29505 if (mode0 == HImode)
29507 op2 = gen_reg_rtx (SImode);
29508 emit_insn (gen_zero_extendhisi2 (op2, op0));
29510 else if (mode0 == SImode)
29511 op2 = op0;
29512 else
29513 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29515 if (target == 0)
29516 target = gen_reg_rtx (SImode);
29518 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29519 const0_rtx);
29520 emit_insn (gen_rtx_SET (VOIDmode, target,
29521 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29522 return target;
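/* Illustrative user-level sketch, not compiler code: the RDRAND step
   builtins above implement the _rdrandNN_step intrinsics, which store the
   random value through the pointer argument and return 1 when the carry
   flag indicates success - exactly the conditional move emitted above.
   Assuming an -mrdrnd target; kept under #if 0.  */
#if 0
#include <immintrin.h>

/* Retry a few times, since RDRAND may transiently fail.  */
static int
get_random_u32 (unsigned int *out)
{
  int i;
  for (i = 0; i < 10; i++)
    if (_rdrand32_step (out))
      return 1;
  return 0;
}
#endif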
29524 case IX86_BUILTIN_GATHERSIV2DF:
29525 icode = CODE_FOR_avx2_gathersiv2df;
29526 goto gather_gen;
29527 case IX86_BUILTIN_GATHERSIV4DF:
29528 icode = CODE_FOR_avx2_gathersiv4df;
29529 goto gather_gen;
29530 case IX86_BUILTIN_GATHERDIV2DF:
29531 icode = CODE_FOR_avx2_gatherdiv2df;
29532 goto gather_gen;
29533 case IX86_BUILTIN_GATHERDIV4DF:
29534 icode = CODE_FOR_avx2_gatherdiv4df;
29535 goto gather_gen;
29536 case IX86_BUILTIN_GATHERSIV4SF:
29537 icode = CODE_FOR_avx2_gathersiv4sf;
29538 goto gather_gen;
29539 case IX86_BUILTIN_GATHERSIV8SF:
29540 icode = CODE_FOR_avx2_gathersiv8sf;
29541 goto gather_gen;
29542 case IX86_BUILTIN_GATHERDIV4SF:
29543 icode = CODE_FOR_avx2_gatherdiv4sf;
29544 goto gather_gen;
29545 case IX86_BUILTIN_GATHERDIV8SF:
29546 icode = CODE_FOR_avx2_gatherdiv8sf;
29547 goto gather_gen;
29548 case IX86_BUILTIN_GATHERSIV2DI:
29549 icode = CODE_FOR_avx2_gathersiv2di;
29550 goto gather_gen;
29551 case IX86_BUILTIN_GATHERSIV4DI:
29552 icode = CODE_FOR_avx2_gathersiv4di;
29553 goto gather_gen;
29554 case IX86_BUILTIN_GATHERDIV2DI:
29555 icode = CODE_FOR_avx2_gatherdiv2di;
29556 goto gather_gen;
29557 case IX86_BUILTIN_GATHERDIV4DI:
29558 icode = CODE_FOR_avx2_gatherdiv4di;
29559 goto gather_gen;
29560 case IX86_BUILTIN_GATHERSIV4SI:
29561 icode = CODE_FOR_avx2_gathersiv4si;
29562 goto gather_gen;
29563 case IX86_BUILTIN_GATHERSIV8SI:
29564 icode = CODE_FOR_avx2_gathersiv8si;
29565 goto gather_gen;
29566 case IX86_BUILTIN_GATHERDIV4SI:
29567 icode = CODE_FOR_avx2_gatherdiv4si;
29568 goto gather_gen;
29569 case IX86_BUILTIN_GATHERDIV8SI:
29570 icode = CODE_FOR_avx2_gatherdiv8si;
29571 goto gather_gen;
29572 case IX86_BUILTIN_GATHERALTSIV4DF:
29573 icode = CODE_FOR_avx2_gathersiv4df;
29574 goto gather_gen;
29575 case IX86_BUILTIN_GATHERALTDIV8SF:
29576 icode = CODE_FOR_avx2_gatherdiv8sf;
29577 goto gather_gen;
29578 case IX86_BUILTIN_GATHERALTSIV4DI:
29579 icode = CODE_FOR_avx2_gathersiv4di;
29580 goto gather_gen;
29581 case IX86_BUILTIN_GATHERALTDIV8SI:
29582 icode = CODE_FOR_avx2_gatherdiv8si;
29583 goto gather_gen;
29585 gather_gen:
29586 arg0 = CALL_EXPR_ARG (exp, 0);
29587 arg1 = CALL_EXPR_ARG (exp, 1);
29588 arg2 = CALL_EXPR_ARG (exp, 2);
29589 arg3 = CALL_EXPR_ARG (exp, 3);
29590 arg4 = CALL_EXPR_ARG (exp, 4);
29591 op0 = expand_normal (arg0);
29592 op1 = expand_normal (arg1);
29593 op2 = expand_normal (arg2);
29594 op3 = expand_normal (arg3);
29595 op4 = expand_normal (arg4);
29596 /* Note the arg order is different from the operand order. */
29597 mode0 = insn_data[icode].operand[1].mode;
29598 mode2 = insn_data[icode].operand[3].mode;
29599 mode3 = insn_data[icode].operand[4].mode;
29600 mode4 = insn_data[icode].operand[5].mode;
29602 if (target == NULL_RTX
29603 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29604 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29605 else
29606 subtarget = target;
29608 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29609 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29611 rtx half = gen_reg_rtx (V4SImode);
29612 if (!nonimmediate_operand (op2, V8SImode))
29613 op2 = copy_to_mode_reg (V8SImode, op2);
29614 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29615 op2 = half;
29617 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29618 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29620 rtx (*gen) (rtx, rtx);
29621 rtx half = gen_reg_rtx (mode0);
29622 if (mode0 == V4SFmode)
29623 gen = gen_vec_extract_lo_v8sf;
29624 else
29625 gen = gen_vec_extract_lo_v8si;
29626 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29627 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29628 emit_insn (gen (half, op0));
29629 op0 = half;
29630 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29631 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29632 emit_insn (gen (half, op3));
29633 op3 = half;
29636 /* Force the memory operand to use only a base register here. But we
29637 don't want to do that for the memory operands of other builtin
29638 functions. */
29639 if (GET_MODE (op1) != Pmode)
29640 op1 = convert_to_mode (Pmode, op1, 1);
29641 op1 = force_reg (Pmode, op1);
29643 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29644 op0 = copy_to_mode_reg (mode0, op0);
29645 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29646 op1 = copy_to_mode_reg (Pmode, op1);
29647 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29648 op2 = copy_to_mode_reg (mode2, op2);
29649 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29650 op3 = copy_to_mode_reg (mode3, op3);
29651 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29653 error ("last argument must be scale 1, 2, 4, 8");
29654 return const0_rtx;
29657 /* Optimize. If mask is known to have all high bits set,
29658 replace op0 with pc_rtx to signal that the instruction
29659 overwrites the whole destination and doesn't use its
29660 previous contents. */
29661 if (optimize)
29663 if (TREE_CODE (arg3) == VECTOR_CST)
29665 unsigned int negative = 0;
29666 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
29668 tree cst = VECTOR_CST_ELT (arg3, i);
29669 if (TREE_CODE (cst) == INTEGER_CST
29670 && tree_int_cst_sign_bit (cst))
29671 negative++;
29672 else if (TREE_CODE (cst) == REAL_CST
29673 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29674 negative++;
29676 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29677 op0 = pc_rtx;
29679 else if (TREE_CODE (arg3) == SSA_NAME)
29681 /* Recognize also when mask is like:
29682 __v2df src = _mm_setzero_pd ();
29683 __v2df mask = _mm_cmpeq_pd (src, src);
29685 __v8sf src = _mm256_setzero_ps ();
29686 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29687 as that is a cheaper way to load all ones into
29688 a register than having to load a constant from
29689 memory. */
29690 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29691 if (is_gimple_call (def_stmt))
29693 tree fndecl = gimple_call_fndecl (def_stmt);
29694 if (fndecl
29695 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29696 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29698 case IX86_BUILTIN_CMPPD:
29699 case IX86_BUILTIN_CMPPS:
29700 case IX86_BUILTIN_CMPPD256:
29701 case IX86_BUILTIN_CMPPS256:
29702 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29703 break;
29704 /* FALLTHRU */
29705 case IX86_BUILTIN_CMPEQPD:
29706 case IX86_BUILTIN_CMPEQPS:
29707 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29708 && initializer_zerop (gimple_call_arg (def_stmt,
29709 1)))
29710 op0 = pc_rtx;
29711 break;
29712 default:
29713 break;
29719 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29720 if (! pat)
29721 return const0_rtx;
29722 emit_insn (pat);
29724 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29725 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29727 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29728 ? V4SFmode : V4SImode;
29729 if (target == NULL_RTX)
29730 target = gen_reg_rtx (tmode);
29731 if (tmode == V4SFmode)
29732 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29733 else
29734 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29736 else
29737 target = subtarget;
29739 return target;
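/* Illustrative user-level sketch, not compiler code: the mask optimization
   above recognizes an all-ones mask produced either as a vector constant
   or by comparing a zero vector with itself, as in this AVX2 gather.
   Assuming an -mavx2 target; kept under #if 0.  */
#if 0
#include <immintrin.h>

static __m256d
gather_four_doubles (const double *base, __m128i idx)
{
  __m256d src = _mm256_setzero_pd ();
  __m256d mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ); /* all ones */
  /* Scale 8: the indices are in units of sizeof (double).  */
  return _mm256_mask_i32gather_pd (src, base, idx, mask, 8);
}
#endif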
29741 case IX86_BUILTIN_XABORT:
29742 icode = CODE_FOR_xabort;
29743 arg0 = CALL_EXPR_ARG (exp, 0);
29744 op0 = expand_normal (arg0);
29745 mode0 = insn_data[icode].operand[0].mode;
29746 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29748 error ("the xabort's argument must be an 8-bit immediate");
29749 return const0_rtx;
29751 emit_insn (gen_xabort (op0));
29752 return 0;
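/* Illustrative user-level sketch, not compiler code: the predicate check
   above rejects non-constant abort codes for the RTM intrinsic.  Assuming
   an -mrtm target; kept under #if 0.  */
#if 0
#include <immintrin.h>

static void
abort_transaction (void)
{
  /* The status must be a compile-time 8-bit constant.  */
  _xabort (0xff);
}
#endif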
29754 default:
29755 break;
29758 for (i = 0, d = bdesc_special_args;
29759 i < ARRAY_SIZE (bdesc_special_args);
29760 i++, d++)
29761 if (d->code == fcode)
29762 return ix86_expand_special_args_builtin (d, exp, target);
29764 for (i = 0, d = bdesc_args;
29765 i < ARRAY_SIZE (bdesc_args);
29766 i++, d++)
29767 if (d->code == fcode)
29768 switch (fcode)
29770 case IX86_BUILTIN_FABSQ:
29771 case IX86_BUILTIN_COPYSIGNQ:
29772 if (!TARGET_SSE2)
29773 /* Emit a normal call if SSE2 isn't available. */
29774 return expand_call (exp, target, ignore);
29775 default:
29776 return ix86_expand_args_builtin (d, exp, target);
29779 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29780 if (d->code == fcode)
29781 return ix86_expand_sse_comi (d, exp, target);
29783 for (i = 0, d = bdesc_pcmpestr;
29784 i < ARRAY_SIZE (bdesc_pcmpestr);
29785 i++, d++)
29786 if (d->code == fcode)
29787 return ix86_expand_sse_pcmpestr (d, exp, target);
29789 for (i = 0, d = bdesc_pcmpistr;
29790 i < ARRAY_SIZE (bdesc_pcmpistr);
29791 i++, d++)
29792 if (d->code == fcode)
29793 return ix86_expand_sse_pcmpistr (d, exp, target);
29795 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29796 if (d->code == fcode)
29797 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29798 (enum ix86_builtin_func_type)
29799 d->flag, d->comparison);
29801 gcc_unreachable ();
29804 /* Returns a function decl for a vectorized version of the builtin function
29805 with builtin function code FN and the result vector type TYPE, or NULL_TREE
29806 if it is not available. */
29808 static tree
29809 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29810 tree type_in)
29812 enum machine_mode in_mode, out_mode;
29813 int in_n, out_n;
29814 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29816 if (TREE_CODE (type_out) != VECTOR_TYPE
29817 || TREE_CODE (type_in) != VECTOR_TYPE
29818 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29819 return NULL_TREE;
29821 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29822 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29823 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29824 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29826 switch (fn)
29828 case BUILT_IN_SQRT:
29829 if (out_mode == DFmode && in_mode == DFmode)
29831 if (out_n == 2 && in_n == 2)
29832 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29833 else if (out_n == 4 && in_n == 4)
29834 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29836 break;
29838 case BUILT_IN_SQRTF:
29839 if (out_mode == SFmode && in_mode == SFmode)
29841 if (out_n == 4 && in_n == 4)
29842 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29843 else if (out_n == 8 && in_n == 8)
29844 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29846 break;
29848 case BUILT_IN_IFLOOR:
29849 case BUILT_IN_LFLOOR:
29850 case BUILT_IN_LLFLOOR:
29851 /* The round insn does not trap on denormals. */
29852 if (flag_trapping_math || !TARGET_ROUND)
29853 break;
29855 if (out_mode == SImode && in_mode == DFmode)
29857 if (out_n == 4 && in_n == 2)
29858 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29859 else if (out_n == 8 && in_n == 4)
29860 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29862 break;
29864 case BUILT_IN_IFLOORF:
29865 case BUILT_IN_LFLOORF:
29866 case BUILT_IN_LLFLOORF:
29867 /* The round insn does not trap on denormals. */
29868 if (flag_trapping_math || !TARGET_ROUND)
29869 break;
29871 if (out_mode == SImode && in_mode == SFmode)
29873 if (out_n == 4 && in_n == 4)
29874 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29875 else if (out_n == 8 && in_n == 8)
29876 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29878 break;
29880 case BUILT_IN_ICEIL:
29881 case BUILT_IN_LCEIL:
29882 case BUILT_IN_LLCEIL:
29883 /* The round insn does not trap on denormals. */
29884 if (flag_trapping_math || !TARGET_ROUND)
29885 break;
29887 if (out_mode == SImode && in_mode == DFmode)
29889 if (out_n == 4 && in_n == 2)
29890 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29891 else if (out_n == 8 && in_n == 4)
29892 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29894 break;
29896 case BUILT_IN_ICEILF:
29897 case BUILT_IN_LCEILF:
29898 case BUILT_IN_LLCEILF:
29899 /* The round insn does not trap on denormals. */
29900 if (flag_trapping_math || !TARGET_ROUND)
29901 break;
29903 if (out_mode == SImode && in_mode == SFmode)
29905 if (out_n == 4 && in_n == 4)
29906 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29907 else if (out_n == 8 && in_n == 8)
29908 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29910 break;
29912 case BUILT_IN_IRINT:
29913 case BUILT_IN_LRINT:
29914 case BUILT_IN_LLRINT:
29915 if (out_mode == SImode && in_mode == DFmode)
29917 if (out_n == 4 && in_n == 2)
29918 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29919 else if (out_n == 8 && in_n == 4)
29920 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29922 break;
29924 case BUILT_IN_IRINTF:
29925 case BUILT_IN_LRINTF:
29926 case BUILT_IN_LLRINTF:
29927 if (out_mode == SImode && in_mode == SFmode)
29929 if (out_n == 4 && in_n == 4)
29930 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29931 else if (out_n == 8 && in_n == 8)
29932 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29934 break;
29936 case BUILT_IN_IROUND:
29937 case BUILT_IN_LROUND:
29938 case BUILT_IN_LLROUND:
29939 /* The round insn does not trap on denormals. */
29940 if (flag_trapping_math || !TARGET_ROUND)
29941 break;
29943 if (out_mode == SImode && in_mode == DFmode)
29945 if (out_n == 4 && in_n == 2)
29946 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29947 else if (out_n == 8 && in_n == 4)
29948 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29950 break;
29952 case BUILT_IN_IROUNDF:
29953 case BUILT_IN_LROUNDF:
29954 case BUILT_IN_LLROUNDF:
29955 /* The round insn does not trap on denormals. */
29956 if (flag_trapping_math || !TARGET_ROUND)
29957 break;
29959 if (out_mode == SImode && in_mode == SFmode)
29961 if (out_n == 4 && in_n == 4)
29962 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29963 else if (out_n == 8 && in_n == 8)
29964 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29966 break;
29968 case BUILT_IN_COPYSIGN:
29969 if (out_mode == DFmode && in_mode == DFmode)
29971 if (out_n == 2 && in_n == 2)
29972 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29973 else if (out_n == 4 && in_n == 4)
29974 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29976 break;
29978 case BUILT_IN_COPYSIGNF:
29979 if (out_mode == SFmode && in_mode == SFmode)
29981 if (out_n == 4 && in_n == 4)
29982 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29983 else if (out_n == 8 && in_n == 8)
29984 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29986 break;
29988 case BUILT_IN_FLOOR:
29989 /* The round insn does not trap on denormals. */
29990 if (flag_trapping_math || !TARGET_ROUND)
29991 break;
29993 if (out_mode == DFmode && in_mode == DFmode)
29995 if (out_n == 2 && in_n == 2)
29996 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29997 else if (out_n == 4 && in_n == 4)
29998 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30000 break;
30002 case BUILT_IN_FLOORF:
30003 /* The round insn does not trap on denormals. */
30004 if (flag_trapping_math || !TARGET_ROUND)
30005 break;
30007 if (out_mode == SFmode && in_mode == SFmode)
30009 if (out_n == 4 && in_n == 4)
30010 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30011 else if (out_n == 8 && in_n == 8)
30012 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30014 break;
30016 case BUILT_IN_CEIL:
30017 /* The round insn does not trap on denormals. */
30018 if (flag_trapping_math || !TARGET_ROUND)
30019 break;
30021 if (out_mode == DFmode && in_mode == DFmode)
30023 if (out_n == 2 && in_n == 2)
30024 return ix86_builtins[IX86_BUILTIN_CEILPD];
30025 else if (out_n == 4 && in_n == 4)
30026 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30028 break;
30030 case BUILT_IN_CEILF:
30031 /* The round insn does not trap on denormals. */
30032 if (flag_trapping_math || !TARGET_ROUND)
30033 break;
30035 if (out_mode == SFmode && in_mode == SFmode)
30037 if (out_n == 4 && in_n == 4)
30038 return ix86_builtins[IX86_BUILTIN_CEILPS];
30039 else if (out_n == 8 && in_n == 8)
30040 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30042 break;
30044 case BUILT_IN_TRUNC:
30045 /* The round insn does not trap on denormals. */
30046 if (flag_trapping_math || !TARGET_ROUND)
30047 break;
30049 if (out_mode == DFmode && in_mode == DFmode)
30051 if (out_n == 2 && in_n == 2)
30052 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30053 else if (out_n == 4 && in_n == 4)
30054 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30056 break;
30058 case BUILT_IN_TRUNCF:
30059 /* The round insn does not trap on denormals. */
30060 if (flag_trapping_math || !TARGET_ROUND)
30061 break;
30063 if (out_mode == SFmode && in_mode == SFmode)
30065 if (out_n == 4 && in_n == 4)
30066 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30067 else if (out_n == 8 && in_n == 8)
30068 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30070 break;
30072 case BUILT_IN_RINT:
30073 /* The round insn does not trap on denormals. */
30074 if (flag_trapping_math || !TARGET_ROUND)
30075 break;
30077 if (out_mode == DFmode && in_mode == DFmode)
30079 if (out_n == 2 && in_n == 2)
30080 return ix86_builtins[IX86_BUILTIN_RINTPD];
30081 else if (out_n == 4 && in_n == 4)
30082 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30084 break;
30086 case BUILT_IN_RINTF:
30087 /* The round insn does not trap on denormals. */
30088 if (flag_trapping_math || !TARGET_ROUND)
30089 break;
30091 if (out_mode == SFmode && in_mode == SFmode)
30093 if (out_n == 4 && in_n == 4)
30094 return ix86_builtins[IX86_BUILTIN_RINTPS];
30095 else if (out_n == 8 && in_n == 8)
30096 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30098 break;
30100 case BUILT_IN_ROUND:
30101 /* The round insn does not trap on denormals. */
30102 if (flag_trapping_math || !TARGET_ROUND)
30103 break;
30105 if (out_mode == DFmode && in_mode == DFmode)
30107 if (out_n == 2 && in_n == 2)
30108 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30109 else if (out_n == 4 && in_n == 4)
30110 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30112 break;
30114 case BUILT_IN_ROUNDF:
30115 /* The round insn does not trap on denormals. */
30116 if (flag_trapping_math || !TARGET_ROUND)
30117 break;
30119 if (out_mode == SFmode && in_mode == SFmode)
30121 if (out_n == 4 && in_n == 4)
30122 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30123 else if (out_n == 8 && in_n == 8)
30124 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30126 break;
30128 case BUILT_IN_FMA:
30129 if (out_mode == DFmode && in_mode == DFmode)
30131 if (out_n == 2 && in_n == 2)
30132 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30133 if (out_n == 4 && in_n == 4)
30134 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30136 break;
30138 case BUILT_IN_FMAF:
30139 if (out_mode == SFmode && in_mode == SFmode)
30141 if (out_n == 4 && in_n == 4)
30142 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30143 if (out_n == 8 && in_n == 8)
30144 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30146 break;
30148 default:
30149 break;
30152 /* Dispatch to a handler for a vectorization library. */
30153 if (ix86_veclib_handler)
30154 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30155 type_in);
30157 return NULL_TREE;
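/* Illustrative user-level sketch, not compiler code: a loop the vectorizer
   can map through the hook above.  With options along the lines of
   -O3 -mavx -ffast-math, BUILT_IN_SQRT over 4 x DFmode is answered with
   IX86_BUILTIN_SQRTPD256, so the loop can use 256-bit vsqrtpd.  Kept
   under #if 0.  */
#if 0
#include <math.h>

void
sqrt_all (double *restrict out, const double *restrict in, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = sqrt (in[i]);
}
#endif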
30160 /* Handler for an SVML-style interface to
30161 a library with vectorized intrinsics. */
30163 static tree
30164 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30166 char name[20];
30167 tree fntype, new_fndecl, args;
30168 unsigned arity;
30169 const char *bname;
30170 enum machine_mode el_mode, in_mode;
30171 int n, in_n;
30173 /* The SVML library is suitable for unsafe math only. */
30174 if (!flag_unsafe_math_optimizations)
30175 return NULL_TREE;
30177 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30178 n = TYPE_VECTOR_SUBPARTS (type_out);
30179 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30180 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30181 if (el_mode != in_mode
30182 || n != in_n)
30183 return NULL_TREE;
30185 switch (fn)
30187 case BUILT_IN_EXP:
30188 case BUILT_IN_LOG:
30189 case BUILT_IN_LOG10:
30190 case BUILT_IN_POW:
30191 case BUILT_IN_TANH:
30192 case BUILT_IN_TAN:
30193 case BUILT_IN_ATAN:
30194 case BUILT_IN_ATAN2:
30195 case BUILT_IN_ATANH:
30196 case BUILT_IN_CBRT:
30197 case BUILT_IN_SINH:
30198 case BUILT_IN_SIN:
30199 case BUILT_IN_ASINH:
30200 case BUILT_IN_ASIN:
30201 case BUILT_IN_COSH:
30202 case BUILT_IN_COS:
30203 case BUILT_IN_ACOSH:
30204 case BUILT_IN_ACOS:
30205 if (el_mode != DFmode || n != 2)
30206 return NULL_TREE;
30207 break;
30209 case BUILT_IN_EXPF:
30210 case BUILT_IN_LOGF:
30211 case BUILT_IN_LOG10F:
30212 case BUILT_IN_POWF:
30213 case BUILT_IN_TANHF:
30214 case BUILT_IN_TANF:
30215 case BUILT_IN_ATANF:
30216 case BUILT_IN_ATAN2F:
30217 case BUILT_IN_ATANHF:
30218 case BUILT_IN_CBRTF:
30219 case BUILT_IN_SINHF:
30220 case BUILT_IN_SINF:
30221 case BUILT_IN_ASINHF:
30222 case BUILT_IN_ASINF:
30223 case BUILT_IN_COSHF:
30224 case BUILT_IN_COSF:
30225 case BUILT_IN_ACOSHF:
30226 case BUILT_IN_ACOSF:
30227 if (el_mode != SFmode || n != 4)
30228 return NULL_TREE;
30229 break;
30231 default:
30232 return NULL_TREE;
30235 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30237 if (fn == BUILT_IN_LOGF)
30238 strcpy (name, "vmlsLn4");
30239 else if (fn == BUILT_IN_LOG)
30240 strcpy (name, "vmldLn2");
30241 else if (n == 4)
30243 sprintf (name, "vmls%s", bname+10);
30244 name[strlen (name)-1] = '4';
30246 else
30247 sprintf (name, "vmld%s2", bname+10);
30249 /* Convert to uppercase. */
30250 name[4] &= ~0x20;
30252 arity = 0;
30253 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30254 args;
30255 args = TREE_CHAIN (args))
30256 arity++;
30258 if (arity == 1)
30259 fntype = build_function_type_list (type_out, type_in, NULL);
30260 else
30261 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30263 /* Build a function declaration for the vectorized function. */
30264 new_fndecl = build_decl (BUILTINS_LOCATION,
30265 FUNCTION_DECL, get_identifier (name), fntype);
30266 TREE_PUBLIC (new_fndecl) = 1;
30267 DECL_EXTERNAL (new_fndecl) = 1;
30268 DECL_IS_NOVOPS (new_fndecl) = 1;
30269 TREE_READONLY (new_fndecl) = 1;
30271 return new_fndecl;
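/* Illustrative sketch, not compiler code: the SVML name mangling performed
   above, reproduced standalone.  E.g. "__builtin_sinf" with 4 lanes maps
   to "vmlsSin4" and "__builtin_sin" with 2 lanes to "vmldSin2".  Kept
   under #if 0.  */
#if 0
#include <stdio.h>
#include <string.h>

static void
svml_name (char name[20], const char *bname, int n)
{
  if (strcmp (bname, "__builtin_logf") == 0)
    strcpy (name, "vmlsLn4");
  else if (strcmp (bname, "__builtin_log") == 0)
    strcpy (name, "vmldLn2");
  else if (n == 4)
    {
      sprintf (name, "vmls%s", bname + 10);     /* skip "__builtin_" */
      name[strlen (name) - 1] = '4';
    }
  else
    sprintf (name, "vmld%s2", bname + 10);
  name[4] &= ~0x20;     /* Uppercase the first letter of the math name.  */
}
#endif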
30274 /* Handler for an ACML-style interface to
30275 a library with vectorized intrinsics. */
30277 static tree
30278 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30280 char name[20] = "__vr.._";
30281 tree fntype, new_fndecl, args;
30282 unsigned arity;
30283 const char *bname;
30284 enum machine_mode el_mode, in_mode;
30285 int n, in_n;
30287 /* The ACML library is 64-bit only and suitable for unsafe math only,
30288 as it does not correctly support parts of IEEE arithmetic with the
30289 required precision, such as denormals. */
30290 if (!TARGET_64BIT
30291 || !flag_unsafe_math_optimizations)
30292 return NULL_TREE;
30294 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30295 n = TYPE_VECTOR_SUBPARTS (type_out);
30296 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30297 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30298 if (el_mode != in_mode
30299 || n != in_n)
30300 return NULL_TREE;
30302 switch (fn)
30304 case BUILT_IN_SIN:
30305 case BUILT_IN_COS:
30306 case BUILT_IN_EXP:
30307 case BUILT_IN_LOG:
30308 case BUILT_IN_LOG2:
30309 case BUILT_IN_LOG10:
30310 name[4] = 'd';
30311 name[5] = '2';
30312 if (el_mode != DFmode
30313 || n != 2)
30314 return NULL_TREE;
30315 break;
30317 case BUILT_IN_SINF:
30318 case BUILT_IN_COSF:
30319 case BUILT_IN_EXPF:
30320 case BUILT_IN_POWF:
30321 case BUILT_IN_LOGF:
30322 case BUILT_IN_LOG2F:
30323 case BUILT_IN_LOG10F:
30324 name[4] = 's';
30325 name[5] = '4';
30326 if (el_mode != SFmode
30327 || n != 4)
30328 return NULL_TREE;
30329 break;
30331 default:
30332 return NULL_TREE;
30335 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30336 sprintf (name + 7, "%s", bname+10);
30338 arity = 0;
30339 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30340 args;
30341 args = TREE_CHAIN (args))
30342 arity++;
30344 if (arity == 1)
30345 fntype = build_function_type_list (type_out, type_in, NULL);
30346 else
30347 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30349 /* Build a function declaration for the vectorized function. */
30350 new_fndecl = build_decl (BUILTINS_LOCATION,
30351 FUNCTION_DECL, get_identifier (name), fntype);
30352 TREE_PUBLIC (new_fndecl) = 1;
30353 DECL_EXTERNAL (new_fndecl) = 1;
30354 DECL_IS_NOVOPS (new_fndecl) = 1;
30355 TREE_READONLY (new_fndecl) = 1;
30357 return new_fndecl;
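/* Illustrative sketch, not compiler code: the ACML mangling above fills in
   the "__vr.._" template, e.g. BUILT_IN_SIN (2 x DFmode) yields
   "__vrd2_sin" and BUILT_IN_SINF (4 x SFmode) yields "__vrs4_sinf".
   Kept under #if 0.  */
#if 0
#include <stdio.h>

static void
acml_name (char name[20], const char *bname, int is_double)
{
  sprintf (name, "__vr%c%c_%s",
           is_double ? 'd' : 's',
           is_double ? '2' : '4',
           bname + 10);         /* skip "__builtin_" */
}
#endif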
30360 /* Returns a decl of a function that implements gather load with
30361 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30362 Return NULL_TREE if it is not available. */
30364 static tree
30365 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30366 const_tree index_type, int scale)
30368 bool si;
30369 enum ix86_builtins code;
30371 if (! TARGET_AVX2)
30372 return NULL_TREE;
30374 if ((TREE_CODE (index_type) != INTEGER_TYPE
30375 && !POINTER_TYPE_P (index_type))
30376 || (TYPE_MODE (index_type) != SImode
30377 && TYPE_MODE (index_type) != DImode))
30378 return NULL_TREE;
30380 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30381 return NULL_TREE;
30383 /* v*gather* insn sign extends index to pointer mode. */
30384 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30385 && TYPE_UNSIGNED (index_type))
30386 return NULL_TREE;
30388 if (scale <= 0
30389 || scale > 8
30390 || (scale & (scale - 1)) != 0)
30391 return NULL_TREE;
30393 si = TYPE_MODE (index_type) == SImode;
30394 switch (TYPE_MODE (mem_vectype))
30396 case V2DFmode:
30397 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30398 break;
30399 case V4DFmode:
30400 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30401 break;
30402 case V2DImode:
30403 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30404 break;
30405 case V4DImode:
30406 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30407 break;
30408 case V4SFmode:
30409 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30410 break;
30411 case V8SFmode:
30412 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30413 break;
30414 case V4SImode:
30415 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30416 break;
30417 case V8SImode:
30418 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30419 break;
30420 default:
30421 return NULL_TREE;
30424 return ix86_builtins[code];
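/* Illustrative sketch, not compiler code: the scale check above accepts
   exactly 1, 2, 4 and 8, i.e. a positive power of two no larger than 8.  */
#if 0
#include <stdbool.h>

static bool
valid_gather_scale (int scale)
{
  return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
}
#endif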
30427 /* Returns a decl of a target-specific builtin that implements the
30428 reciprocal of the function FN, or NULL_TREE if it is not available. */
30430 static tree
30431 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30432 bool sqrt ATTRIBUTE_UNUSED)
30434 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30435 && flag_finite_math_only && !flag_trapping_math
30436 && flag_unsafe_math_optimizations))
30437 return NULL_TREE;
30439 if (md_fn)
30440 /* Machine dependent builtins. */
30441 switch (fn)
30443 /* Vectorized version of sqrt to rsqrt conversion. */
30444 case IX86_BUILTIN_SQRTPS_NR:
30445 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30447 case IX86_BUILTIN_SQRTPS_NR256:
30448 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30450 default:
30451 return NULL_TREE;
30453 else
30454 /* Normal builtins. */
30455 switch (fn)
30457 /* Sqrt to rsqrt conversion. */
30458 case BUILT_IN_SQRTF:
30459 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30461 default:
30462 return NULL_TREE;
30466 /* Helper for avx_vpermilps256_operand et al. This is also used by
30467 the expansion functions to turn the parallel back into a mask.
30468 The return value is 0 for no match and the imm8+1 for a match. */
30470 int
30471 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30473 unsigned i, nelt = GET_MODE_NUNITS (mode);
30474 unsigned mask = 0;
30475 unsigned char ipar[8];
30477 if (XVECLEN (par, 0) != (int) nelt)
30478 return 0;
30480 /* Validate that all of the elements are constants, and not totally
30481 out of range. Copy the data into an integral array to make the
30482 subsequent checks easier. */
30483 for (i = 0; i < nelt; ++i)
30485 rtx er = XVECEXP (par, 0, i);
30486 unsigned HOST_WIDE_INT ei;
30488 if (!CONST_INT_P (er))
30489 return 0;
30490 ei = INTVAL (er);
30491 if (ei >= nelt)
30492 return 0;
30493 ipar[i] = ei;
30496 switch (mode)
30498 case V4DFmode:
30499 /* In the 256-bit DFmode case, we can only move elements within
30500 a 128-bit lane. */
30501 for (i = 0; i < 2; ++i)
30503 if (ipar[i] >= 2)
30504 return 0;
30505 mask |= ipar[i] << i;
30507 for (i = 2; i < 4; ++i)
30509 if (ipar[i] < 2)
30510 return 0;
30511 mask |= (ipar[i] - 2) << i;
30513 break;
30515 case V8SFmode:
30516 /* In the 256-bit SFmode case, we have full freedom of movement
30517 within the low 128-bit lane, but the high 128-bit lane must
30518 mirror the exact same pattern. */
30519 for (i = 0; i < 4; ++i)
30520 if (ipar[i] + 4 != ipar[i + 4])
30521 return 0;
30522 nelt = 4;
30523 /* FALLTHRU */
30525 case V2DFmode:
30526 case V4SFmode:
30527 /* In the 128-bit case, we have full freedom in the placement of
30528 the elements from the source operand. */
30529 for (i = 0; i < nelt; ++i)
30530 mask |= ipar[i] << (i * (nelt / 2));
30531 break;
30533 default:
30534 gcc_unreachable ();
30537 /* Make sure success has a non-zero value by adding one. */
30538 return mask + 1;
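/* Illustrative sketch, not compiler code: the imm8 reconstruction above
   for the 128-bit SFmode case, restated standalone.  E.g. the selector
   order {3,2,1,0} (a full reverse) encodes as 0x1b, the usual vpermilps
   control byte.  The helper name is hypothetical.  */
#if 0
static unsigned int
vpermilps_imm (const unsigned char sel[4])
{
  unsigned int mask = 0, i;
  for (i = 0; i < 4; i++)
    mask |= (unsigned int) (sel[i] & 3) << (i * 2);
  return mask;          /* avx_vpermilp_parallel returns mask + 1.  */
}
#endif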
30541 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30542 the expansion functions to turn the parallel back into a mask.
30543 The return value is 0 for no match and the imm8+1 for a match. */
30545 int
30546 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30548 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30549 unsigned mask = 0;
30550 unsigned char ipar[8];
30552 if (XVECLEN (par, 0) != (int) nelt)
30553 return 0;
30555 /* Validate that all of the elements are constants, and not totally
30556 out of range. Copy the data into an integral array to make the
30557 subsequent checks easier. */
30558 for (i = 0; i < nelt; ++i)
30560 rtx er = XVECEXP (par, 0, i);
30561 unsigned HOST_WIDE_INT ei;
30563 if (!CONST_INT_P (er))
30564 return 0;
30565 ei = INTVAL (er);
30566 if (ei >= 2 * nelt)
30567 return 0;
30568 ipar[i] = ei;
30571 /* Validate that each half of the permute selects consecutive elements. */
30572 for (i = 0; i < nelt2 - 1; ++i)
30573 if (ipar[i] + 1 != ipar[i + 1])
30574 return 0;
30575 for (i = nelt2; i < nelt - 1; ++i)
30576 if (ipar[i] + 1 != ipar[i + 1])
30577 return 0;
30579 /* Reconstruct the mask. */
30580 for (i = 0; i < 2; ++i)
30582 unsigned e = ipar[i * nelt2];
30583 if (e % nelt2)
30584 return 0;
30585 e /= nelt2;
30586 mask |= e << (i * 4);
30589 /* Make sure success has a non-zero value by adding one. */
30590 return mask + 1;
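/* Illustrative sketch, not compiler code: the imm8 reconstruction above,
   restated standalone.  Each nibble selects one 128-bit half of the two
   concatenated source operands; e.g. the V4DF parallel (2 3 0 1), which
   swaps the halves of a single operand, yields imm 0x01.  */
#if 0
static unsigned int
vperm2f128_imm (unsigned int lo_half_sel, unsigned int hi_half_sel)
{
  return (lo_half_sel & 3) | ((hi_half_sel & 3) << 4);
}
#endif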
30593 /* Store OPERAND to memory after reload has completed. This means
30594 that we can't easily use assign_stack_local. */
30595 rtx
30596 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30598 rtx result;
30600 gcc_assert (reload_completed);
30601 if (ix86_using_red_zone ())
30603 result = gen_rtx_MEM (mode,
30604 gen_rtx_PLUS (Pmode,
30605 stack_pointer_rtx,
30606 GEN_INT (-RED_ZONE_SIZE)));
30607 emit_move_insn (result, operand);
30609 else if (TARGET_64BIT)
30611 switch (mode)
30613 case HImode:
30614 case SImode:
30615 operand = gen_lowpart (DImode, operand);
30616 /* FALLTHRU */
30617 case DImode:
30618 emit_insn (
30619 gen_rtx_SET (VOIDmode,
30620 gen_rtx_MEM (DImode,
30621 gen_rtx_PRE_DEC (DImode,
30622 stack_pointer_rtx)),
30623 operand));
30624 break;
30625 default:
30626 gcc_unreachable ();
30628 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30630 else
30632 switch (mode)
30634 case DImode:
30636 rtx operands[2];
30637 split_double_mode (mode, &operand, 1, operands, operands + 1);
30638 emit_insn (
30639 gen_rtx_SET (VOIDmode,
30640 gen_rtx_MEM (SImode,
30641 gen_rtx_PRE_DEC (Pmode,
30642 stack_pointer_rtx)),
30643 operands[1]));
30644 emit_insn (
30645 gen_rtx_SET (VOIDmode,
30646 gen_rtx_MEM (SImode,
30647 gen_rtx_PRE_DEC (Pmode,
30648 stack_pointer_rtx)),
30649 operands[0]));
30651 break;
30652 case HImode:
30653 /* Store HImodes as SImodes. */
30654 operand = gen_lowpart (SImode, operand);
30655 /* FALLTHRU */
30656 case SImode:
30657 emit_insn (
30658 gen_rtx_SET (VOIDmode,
30659 gen_rtx_MEM (GET_MODE (operand),
30660 gen_rtx_PRE_DEC (SImode,
30661 stack_pointer_rtx)),
30662 operand));
30663 break;
30664 default:
30665 gcc_unreachable ();
30667 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30669 return result;
30672 /* Free the operand from memory. */
30673 void
30674 ix86_free_from_memory (enum machine_mode mode)
30676 if (!ix86_using_red_zone ())
30678 int size;
30680 if (mode == DImode || TARGET_64BIT)
30681 size = 8;
30682 else
30683 size = 4;
30684 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30685 to a pop or add instruction if registers are available. */
30686 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30687 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30688 GEN_INT (size))));
30692 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30694 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30695 QImode must go into class Q_REGS.
30696 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
30697 movdf to do mem-to-mem moves through integer regs. */
30699 static reg_class_t
30700 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30702 enum machine_mode mode = GET_MODE (x);
30704 /* We're only allowed to return a subclass of CLASS. Many of the
30705 following checks fail for NO_REGS, so eliminate that early. */
30706 if (regclass == NO_REGS)
30707 return NO_REGS;
30709 /* All classes can load zeros. */
30710 if (x == CONST0_RTX (mode))
30711 return regclass;
30713 /* Force constants into memory if we are loading a (nonzero) constant into
30714 an MMX or SSE register. This is because there are no MMX/SSE instructions
30715 to load from a constant. */
30716 if (CONSTANT_P (x)
30717 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30718 return NO_REGS;
30720 /* Prefer SSE regs only, if we can use them for math. */
30721 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30722 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30724 /* Floating-point constants need more complex checks. */
30725 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30727 /* General regs can load everything. */
30728 if (reg_class_subset_p (regclass, GENERAL_REGS))
30729 return regclass;
30731 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30732 zero above. We only want to wind up preferring 80387 registers if
30733 we plan on doing computation with them. */
30734 if (TARGET_80387
30735 && standard_80387_constant_p (x) > 0)
30737 /* Limit class to non-sse. */
30738 if (regclass == FLOAT_SSE_REGS)
30739 return FLOAT_REGS;
30740 if (regclass == FP_TOP_SSE_REGS)
30741 return FP_TOP_REG;
30742 if (regclass == FP_SECOND_SSE_REGS)
30743 return FP_SECOND_REG;
30744 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30745 return regclass;
30748 return NO_REGS;
30751 /* Generally when we see PLUS here, it's the function invariant
30752 (plus soft-fp const_int), which can only be computed into general
30753 regs. */
30754 if (GET_CODE (x) == PLUS)
30755 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30757 /* QImode constants are easy to load, but non-constant QImode data
30758 must go into Q_REGS. */
30759 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30761 if (reg_class_subset_p (regclass, Q_REGS))
30762 return regclass;
30763 if (reg_class_subset_p (Q_REGS, regclass))
30764 return Q_REGS;
30765 return NO_REGS;
30768 return regclass;
30771 /* Discourage putting floating-point values in SSE registers unless
30772 SSE math is being used, and likewise for the 387 registers. */
30773 static reg_class_t
30774 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30776 enum machine_mode mode = GET_MODE (x);
30778 /* Restrict the output reload class to the register bank that we are doing
30779 math on. If we would like not to return a subset of CLASS, reject this
30780 alternative: if reload cannot do this, it will still use its choice. */
30781 mode = GET_MODE (x);
30782 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30783 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30785 if (X87_FLOAT_MODE_P (mode))
30787 if (regclass == FP_TOP_SSE_REGS)
30788 return FP_TOP_REG;
30789 else if (regclass == FP_SECOND_SSE_REGS)
30790 return FP_SECOND_REG;
30791 else
30792 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30795 return regclass;
30798 static reg_class_t
30799 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30800 enum machine_mode mode, secondary_reload_info *sri)
30802 /* Double-word spills from general registers to non-offsettable memory
30803 references (zero-extended addresses) require special handling. */
30804 if (TARGET_64BIT
30805 && MEM_P (x)
30806 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30807 && rclass == GENERAL_REGS
30808 && !offsettable_memref_p (x))
30810 sri->icode = (in_p
30811 ? CODE_FOR_reload_noff_load
30812 : CODE_FOR_reload_noff_store);
30813 /* Add the cost of moving address to a temporary. */
30814 sri->extra_cost = 1;
30816 return NO_REGS;
30819 /* QImode spills from non-QI registers require
30820 an intermediate register on 32-bit targets. */
30821 if (!TARGET_64BIT
30822 && !in_p && mode == QImode
30823 && (rclass == GENERAL_REGS
30824 || rclass == LEGACY_REGS
30825 || rclass == INDEX_REGS))
30827 int regno;
30829 if (REG_P (x))
30830 regno = REGNO (x);
30831 else
30832 regno = -1;
30834 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30835 regno = true_regnum (x);
30837 /* Return Q_REGS if the operand is in memory. */
30838 if (regno == -1)
30839 return Q_REGS;
30842 /* This condition handles the corner case where an expression involving
30843 pointers gets vectorized. We're trying to use the address of a
30844 stack slot as a vector initializer.
30846 (set (reg:V2DI 74 [ vect_cst_.2 ])
30847 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30849 Eventually frame gets turned into sp+offset like this:
30851 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30852 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30853 (const_int 392 [0x188]))))
30855 That later gets turned into:
30857 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30858 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30859 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30861 We'll have the following reload recorded:
30863 Reload 0: reload_in (DI) =
30864 (plus:DI (reg/f:DI 7 sp)
30865 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30866 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30867 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30868 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30869 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30870 reload_reg_rtx: (reg:V2DI 22 xmm1)
30872 Which isn't going to work since SSE instructions can't handle scalar
30873 additions. Returning GENERAL_REGS forces the addition into an integer
30874 register and reload can handle subsequent reloads without problems. */
30876 if (in_p && GET_CODE (x) == PLUS
30877 && SSE_CLASS_P (rclass)
30878 && SCALAR_INT_MODE_P (mode))
30879 return GENERAL_REGS;
30881 return NO_REGS;
30884 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30886 static bool
30887 ix86_class_likely_spilled_p (reg_class_t rclass)
30889 switch (rclass)
30891 case AREG:
30892 case DREG:
30893 case CREG:
30894 case BREG:
30895 case AD_REGS:
30896 case SIREG:
30897 case DIREG:
30898 case SSE_FIRST_REG:
30899 case FP_TOP_REG:
30900 case FP_SECOND_REG:
30901 return true;
30903 default:
30904 break;
30907 return false;
30910 /* If we are copying between general and FP registers, we need a memory
30911 location. The same is true for SSE and MMX registers.
30913 To optimize register_move_cost performance, allow inline variant.
30915 The macro can't work reliably when one of the CLASSES is a class containing
30916 registers from multiple units (SSE, MMX, integer). We avoid this by never
30917 combining those units in a single alternative in the machine description.
30918 Ensure that this constraint holds to avoid unexpected surprises.
30920 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30921 enforce these sanity checks. */
30923 static inline bool
30924 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30925 enum machine_mode mode, int strict)
30927 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30928 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30929 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30930 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30931 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30932 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30934 gcc_assert (!strict);
30935 return true;
30938 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30939 return true;
30941 /* ??? This is a lie. We do have moves between mmx/general, and between
30942 mmx/sse2. But by saying we need secondary memory we discourage the
30943 register allocator from using the mmx registers unless needed. */
30944 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30945 return true;
30947 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30949 /* SSE1 doesn't have any direct moves from other classes. */
30950 if (!TARGET_SSE2)
30951 return true;
30953 /* If the target says that inter-unit moves are more expensive
30954 than moving through memory, then don't generate them. */
30955 if (!TARGET_INTER_UNIT_MOVES)
30956 return true;
30958 /* Between SSE and general, we have moves no larger than word size. */
30959 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30960 return true;
30963 return false;
30966 bool
30967 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30968 enum machine_mode mode, int strict)
30970 return inline_secondary_memory_needed (class1, class2, mode, strict);
30973 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30975 On the 80386, this is the size of MODE in words,
30976 except in the FP regs, where a single reg is always enough. */
30978 static unsigned char
30979 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30981 if (MAYBE_INTEGER_CLASS_P (rclass))
30983 if (mode == XFmode)
30984 return (TARGET_64BIT ? 2 : 3);
30985 else if (mode == XCmode)
30986 return (TARGET_64BIT ? 4 : 6);
30987 else
30988 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30990 else
30992 if (COMPLEX_MODE_P (mode))
30993 return 2;
30994 else
30995 return 1;
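/* As a worked example (illustration only, assuming a 32-bit target where
   UNITS_PER_WORD is 4): XFmode needs 3 regs in an integer class but only 1
   in FLOAT_REGS, and DImode needs 2 integer regs but only 1 SSE reg, since
   the non-integer branch above only distinguishes complex modes.  */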
30999 /* Return true if the registers in CLASS cannot represent the change from
31000 modes FROM to TO. */
31002 bool
31003 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31004 enum reg_class regclass)
31006 if (from == to)
31007 return false;
31009 /* x87 registers can't do subreg at all, as all values are reformatted
31010 to extended precision. */
31011 if (MAYBE_FLOAT_CLASS_P (regclass))
31012 return true;
31014 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31016 /* Vector registers do not support QI or HImode loads. If we don't
31017 disallow a change to these modes, reload will assume it's ok to
31018 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31019 the vec_dupv4hi pattern. */
31020 if (GET_MODE_SIZE (from) < 4)
31021 return true;
31023 /* Vector registers do not support subreg with nonzero offsets, which
31024 are otherwise valid for integer registers. Since we can't see
31025 whether we have a nonzero offset from here, prohibit all
31026 nonparadoxical subregs changing size. */
31027 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31028 return true;
31031 return false;
31034 /* Return the cost of moving data of mode M between a
31035 register and memory. A value of 2 is the default; this cost is
31036 relative to those in `REGISTER_MOVE_COST'.
31038 This function is used extensively by register_move_cost, which is used to
31039 build tables at startup. Make it inline in this case.
31040 When IN is 2, return maximum of in and out move cost.
31042 If moving between registers and memory is more expensive than
31043 between two registers, you should define this macro to express the
31044 relative cost.
31046 Also model the increased cost of moving QImode registers in non-Q_REGS
31047 classes. */
31049 static inline int
31050 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31051 int in)
31053 int cost;
31054 if (FLOAT_CLASS_P (regclass))
31056 int index;
31057 switch (mode)
31059 case SFmode:
31060 index = 0;
31061 break;
31062 case DFmode:
31063 index = 1;
31064 break;
31065 case XFmode:
31066 index = 2;
31067 break;
31068 default:
31069 return 100;
31071 if (in == 2)
31072 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31073 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31075 if (SSE_CLASS_P (regclass))
31077 int index;
31078 switch (GET_MODE_SIZE (mode))
31080 case 4:
31081 index = 0;
31082 break;
31083 case 8:
31084 index = 1;
31085 break;
31086 case 16:
31087 index = 2;
31088 break;
31089 default:
31090 return 100;
31092 if (in == 2)
31093 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31094 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31096 if (MMX_CLASS_P (regclass))
31098 int index;
31099 switch (GET_MODE_SIZE (mode))
31101 case 4:
31102 index = 0;
31103 break;
31104 case 8:
31105 index = 1;
31106 break;
31107 default:
31108 return 100;
31110 if (in == 2)
31111 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31112 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31114 switch (GET_MODE_SIZE (mode))
31116 case 1:
31117 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31119 if (!in)
31120 return ix86_cost->int_store[0];
31121 if (TARGET_PARTIAL_REG_DEPENDENCY
31122 && optimize_function_for_speed_p (cfun))
31123 cost = ix86_cost->movzbl_load;
31124 else
31125 cost = ix86_cost->int_load[0];
31126 if (in == 2)
31127 return MAX (cost, ix86_cost->int_store[0]);
31128 return cost;
31130 else
31132 if (in == 2)
31133 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31134 if (in)
31135 return ix86_cost->movzbl_load;
31136 else
31137 return ix86_cost->int_store[0] + 4;
31139 break;
31140 case 2:
31141 if (in == 2)
31142 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31143 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31144 default:
31145 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31146 if (mode == TFmode)
31147 mode = XFmode;
31148 if (in == 2)
31149 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31150 else if (in)
31151 cost = ix86_cost->int_load[2];
31152 else
31153 cost = ix86_cost->int_store[2];
31154 return (cost * (((int) GET_MODE_SIZE (mode)
31155 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
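/* As a worked example (illustration only): for DImode in GENERAL_REGS on a
   32-bit target the default case above is taken and, with UNITS_PER_WORD of
   4, the load/store cost is scaled by (8 + 4 - 1) / 4 == 2 words.  */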
31159 static int
31160 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31161 bool in)
31163 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31167 /* Return the cost of moving data from a register in class CLASS1 to
31168 one in class CLASS2.
31170 It is not required that the cost always equal 2 when FROM is the same as TO;
31171 on some machines it is expensive to move between registers if they are not
31172 general registers. */
31174 static int
31175 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31176 reg_class_t class2_i)
31178 enum reg_class class1 = (enum reg_class) class1_i;
31179 enum reg_class class2 = (enum reg_class) class2_i;
31181 /* In case we require secondary memory, compute cost of the store followed
31182 by load. In order to avoid bad register allocation choices, we need
31183 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31185 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31187 int cost = 1;
31189 cost += inline_memory_move_cost (mode, class1, 2);
31190 cost += inline_memory_move_cost (mode, class2, 2);
31192 /* When copying from a general-purpose register we may emit multiple
31193 stores followed by a single load, causing a memory size mismatch stall.
31194 Count this as an arbitrarily high cost of 20. */
31195 if (targetm.class_max_nregs (class1, mode)
31196 > targetm.class_max_nregs (class2, mode))
31197 cost += 20;
31199 /* In the case of FP/MMX moves, the registers actually overlap, and we
31200 have to switch modes in order to treat them differently. */
31201 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31202 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31203 cost += 20;
31205 return cost;
31208 /* Moves between SSE/MMX and integer unit are expensive. */
31209 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31210 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31212 /* ??? By keeping returned value relatively high, we limit the number
31213 of moves between integer and MMX/SSE registers for all targets.
31214 Additionally, high value prevents problem with x86_modes_tieable_p(),
31215 where integer modes in MMX/SSE registers are not tieable
31216 because of missing QImode and HImode moves to, from or between
31217 MMX/SSE registers. */
31218 return MAX (8, ix86_cost->mmxsse_to_integer);
31220 if (MAYBE_FLOAT_CLASS_P (class1))
31221 return ix86_cost->fp_move;
31222 if (MAYBE_SSE_CLASS_P (class1))
31223 return ix86_cost->sse_move;
31224 if (MAYBE_MMX_CLASS_P (class1))
31225 return ix86_cost->mmx_move;
31226 return 2;
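/* As a worked example (illustration only): moving DFmode between FLOAT_REGS
   and SSE_REGS requires secondary memory, so the cost computed above is
   roughly 1 + MAX (fp_load[1], fp_store[1]) + MAX (sse_load[1], sse_store[1]),
   plus 20 more if the source class needs more registers than the destination.  */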
31229 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31230 MODE. */
31232 bool
31233 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31235 /* Flags, and only flags, can hold CCmode values. */
31236 if (CC_REGNO_P (regno))
31237 return GET_MODE_CLASS (mode) == MODE_CC;
31238 if (GET_MODE_CLASS (mode) == MODE_CC
31239 || GET_MODE_CLASS (mode) == MODE_RANDOM
31240 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31241 return false;
31242 if (FP_REGNO_P (regno))
31243 return VALID_FP_MODE_P (mode);
31244 if (SSE_REGNO_P (regno))
31246 /* We implement the move patterns for all vector modes into and
31247 out of SSE registers, even when no operation instructions
31248 are available. OImode move is available only when AVX is
31249 enabled. */
31250 return ((TARGET_AVX && mode == OImode)
31251 || VALID_AVX256_REG_MODE (mode)
31252 || VALID_SSE_REG_MODE (mode)
31253 || VALID_SSE2_REG_MODE (mode)
31254 || VALID_MMX_REG_MODE (mode)
31255 || VALID_MMX_REG_MODE_3DNOW (mode));
31257 if (MMX_REGNO_P (regno))
31259 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31260 so if the register is available at all, then we can move data of
31261 the given mode into or out of it. */
31262 return (VALID_MMX_REG_MODE (mode)
31263 || VALID_MMX_REG_MODE_3DNOW (mode));
31266 if (mode == QImode)
31268 /* Take care with QImode values - they can live in non-QI regs,
31269 but then they do cause partial register stalls. */
31270 if (regno <= BX_REG || TARGET_64BIT)
31271 return true;
31272 if (!TARGET_PARTIAL_REG_STALL)
31273 return true;
31274 return !can_create_pseudo_p ();
31276 /* We handle both integers and floats in the general purpose registers. */
31277 else if (VALID_INT_MODE_P (mode))
31278 return true;
31279 else if (VALID_FP_MODE_P (mode))
31280 return true;
31281 else if (VALID_DFP_MODE_P (mode))
31282 return true;
31283 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31284 on to use that value in smaller contexts, this can easily force a
31285 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31286 supporting DImode, allow it. */
31287 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31288 return true;
31290 return false;
31293 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31294 tieable integer mode. */
31296 static bool
31297 ix86_tieable_integer_mode_p (enum machine_mode mode)
31299 switch (mode)
31301 case HImode:
31302 case SImode:
31303 return true;
31305 case QImode:
31306 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31308 case DImode:
31309 return TARGET_64BIT;
31311 default:
31312 return false;
31316 /* Return true if MODE1 is accessible in a register that can hold MODE2
31317 without copying. That is, all register classes that can hold MODE2
31318 can also hold MODE1. */
31320 bool
31321 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31323 if (mode1 == mode2)
31324 return true;
31326 if (ix86_tieable_integer_mode_p (mode1)
31327 && ix86_tieable_integer_mode_p (mode2))
31328 return true;
31330 /* MODE2 being XFmode implies fp stack or general regs, which means we
31331 can tie any smaller floating point modes to it. Note that we do not
31332 tie this with TFmode. */
31333 if (mode2 == XFmode)
31334 return mode1 == SFmode || mode1 == DFmode;
31336 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31337 that we can tie it with SFmode. */
31338 if (mode2 == DFmode)
31339 return mode1 == SFmode;
31341 /* If MODE2 is only appropriate for an SSE register, then tie with
31342 any other mode acceptable to SSE registers. */
31343 if (GET_MODE_SIZE (mode2) == 16
31344 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31345 return (GET_MODE_SIZE (mode1) == 16
31346 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31348 /* If MODE2 is appropriate for an MMX register, then tie
31349 with any other mode acceptable to MMX registers. */
31350 if (GET_MODE_SIZE (mode2) == 8
31351 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31352 return (GET_MODE_SIZE (mode1) == 8
31353 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31355 return false;
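/* For example (illustration only): V4SFmode and V2DImode tie, since both are
   16 bytes and both are valid in SSE registers, while DImode and V2DImode do
   not tie, because DImode is not a 16 byte mode.  */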
31358 /* Compute a (partial) cost for rtx X. Return true if the complete
31359 cost has been computed, and false if subexpressions should be
31360 scanned. In either case, *TOTAL contains the cost result. */
31362 static bool
31363 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31364 bool speed)
31366 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31367 enum machine_mode mode = GET_MODE (x);
31368 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31370 switch (code)
31372 case CONST_INT:
31373 case CONST:
31374 case LABEL_REF:
31375 case SYMBOL_REF:
31376 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31377 *total = 3;
31378 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31379 *total = 2;
31380 else if (flag_pic && SYMBOLIC_CONST (x)
31381 && (!TARGET_64BIT
31382 || (GET_CODE (x) != LABEL_REF
31383 && (GET_CODE (x) != SYMBOL_REF
31384 || !SYMBOL_REF_LOCAL_P (x)))))
31385 *total = 1;
31386 else
31387 *total = 0;
31388 return true;
31390 case CONST_DOUBLE:
31391 if (mode == VOIDmode)
31392 *total = 0;
31393 else
31394 switch (standard_80387_constant_p (x))
31396 case 1: /* 0.0 */
31397 *total = 1;
31398 break;
31399 default: /* Other constants */
31400 *total = 2;
31401 break;
31402 case 0:
31403 case -1:
31404 /* Start with (MEM (SYMBOL_REF)), since that's where
31405 it'll probably end up. Add a penalty for size. */
31406 *total = (COSTS_N_INSNS (1)
31407 + (flag_pic != 0 && !TARGET_64BIT)
31408 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31409 break;
31411 return true;
31413 case ZERO_EXTEND:
31414 /* The zero extension is often completely free on x86_64, so make
31415 it as cheap as possible. */
31416 if (TARGET_64BIT && mode == DImode
31417 && GET_MODE (XEXP (x, 0)) == SImode)
31418 *total = 1;
31419 else if (TARGET_ZERO_EXTEND_WITH_AND)
31420 *total = cost->add;
31421 else
31422 *total = cost->movzx;
31423 return false;
31425 case SIGN_EXTEND:
31426 *total = cost->movsx;
31427 return false;
31429 case ASHIFT:
31430 if (CONST_INT_P (XEXP (x, 1))
31431 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31433 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31434 if (value == 1)
31436 *total = cost->add;
31437 return false;
31439 if ((value == 2 || value == 3)
31440 && cost->lea <= cost->shift_const)
31442 *total = cost->lea;
31443 return false;
31446 /* FALLTHRU */
31448 case ROTATE:
31449 case ASHIFTRT:
31450 case LSHIFTRT:
31451 case ROTATERT:
31452 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31454 if (CONST_INT_P (XEXP (x, 1)))
31456 if (INTVAL (XEXP (x, 1)) > 32)
31457 *total = cost->shift_const + COSTS_N_INSNS (2);
31458 else
31459 *total = cost->shift_const * 2;
31461 else
31463 if (GET_CODE (XEXP (x, 1)) == AND)
31464 *total = cost->shift_var * 2;
31465 else
31466 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31469 else
31471 if (CONST_INT_P (XEXP (x, 1)))
31472 *total = cost->shift_const;
31473 else
31474 *total = cost->shift_var;
31476 return false;
31478 case FMA:
31480 rtx sub;
31482 gcc_assert (FLOAT_MODE_P (mode));
31483 gcc_assert (TARGET_FMA || TARGET_FMA4);
31485 /* ??? SSE scalar/vector cost should be used here. */
31486 /* ??? Bald assumption that fma has the same cost as fmul. */
31487 *total = cost->fmul;
31488 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31490 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31491 sub = XEXP (x, 0);
31492 if (GET_CODE (sub) == NEG)
31493 sub = XEXP (sub, 0);
31494 *total += rtx_cost (sub, FMA, 0, speed);
31496 sub = XEXP (x, 2);
31497 if (GET_CODE (sub) == NEG)
31498 sub = XEXP (sub, 0);
31499 *total += rtx_cost (sub, FMA, 2, speed);
31500 return true;
31503 case MULT:
31504 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31506 /* ??? SSE scalar cost should be used here. */
31507 *total = cost->fmul;
31508 return false;
31510 else if (X87_FLOAT_MODE_P (mode))
31512 *total = cost->fmul;
31513 return false;
31515 else if (FLOAT_MODE_P (mode))
31517 /* ??? SSE vector cost should be used here. */
31518 *total = cost->fmul;
31519 return false;
31521 else
31523 rtx op0 = XEXP (x, 0);
31524 rtx op1 = XEXP (x, 1);
31525 int nbits;
31526 if (CONST_INT_P (XEXP (x, 1)))
31528 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31529 for (nbits = 0; value != 0; value &= value - 1)
31530 nbits++;
31532 else
31533 /* This is arbitrary. */
31534 nbits = 7;
31536 /* Compute costs correctly for widening multiplication. */
31537 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31538 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31539 == GET_MODE_SIZE (mode))
31541 int is_mulwiden = 0;
31542 enum machine_mode inner_mode = GET_MODE (op0);
31544 if (GET_CODE (op0) == GET_CODE (op1))
31545 is_mulwiden = 1, op1 = XEXP (op1, 0);
31546 else if (CONST_INT_P (op1))
31548 if (GET_CODE (op0) == SIGN_EXTEND)
31549 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31550 == INTVAL (op1);
31551 else
31552 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31555 if (is_mulwiden)
31556 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31559 *total = (cost->mult_init[MODE_INDEX (mode)]
31560 + nbits * cost->mult_bit
31561 + rtx_cost (op0, outer_code, opno, speed)
31562 + rtx_cost (op1, outer_code, opno, speed));
31564 return true;
31567 case DIV:
31568 case UDIV:
31569 case MOD:
31570 case UMOD:
31571 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31572 /* ??? SSE cost should be used here. */
31573 *total = cost->fdiv;
31574 else if (X87_FLOAT_MODE_P (mode))
31575 *total = cost->fdiv;
31576 else if (FLOAT_MODE_P (mode))
31577 /* ??? SSE vector cost should be used here. */
31578 *total = cost->fdiv;
31579 else
31580 *total = cost->divide[MODE_INDEX (mode)];
31581 return false;
31583 case PLUS:
31584 if (GET_MODE_CLASS (mode) == MODE_INT
31585 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31587 if (GET_CODE (XEXP (x, 0)) == PLUS
31588 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31589 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31590 && CONSTANT_P (XEXP (x, 1)))
31592 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31593 if (val == 2 || val == 4 || val == 8)
31595 *total = cost->lea;
31596 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31597 outer_code, opno, speed);
31598 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31599 outer_code, opno, speed);
31600 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31601 return true;
31604 else if (GET_CODE (XEXP (x, 0)) == MULT
31605 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31607 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31608 if (val == 2 || val == 4 || val == 8)
31610 *total = cost->lea;
31611 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31612 outer_code, opno, speed);
31613 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31614 return true;
31617 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31619 *total = cost->lea;
31620 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31621 outer_code, opno, speed);
31622 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31623 outer_code, opno, speed);
31624 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31625 return true;
31628 /* FALLTHRU */
31630 case MINUS:
31631 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31633 /* ??? SSE cost should be used here. */
31634 *total = cost->fadd;
31635 return false;
31637 else if (X87_FLOAT_MODE_P (mode))
31639 *total = cost->fadd;
31640 return false;
31642 else if (FLOAT_MODE_P (mode))
31644 /* ??? SSE vector cost should be used here. */
31645 *total = cost->fadd;
31646 return false;
31648 /* FALLTHRU */
31650 case AND:
31651 case IOR:
31652 case XOR:
31653 if (!TARGET_64BIT && mode == DImode)
31655 *total = (cost->add * 2
31656 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31657 << (GET_MODE (XEXP (x, 0)) != DImode))
31658 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31659 << (GET_MODE (XEXP (x, 1)) != DImode)));
31660 return true;
31662 /* FALLTHRU */
31664 case NEG:
31665 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31667 /* ??? SSE cost should be used here. */
31668 *total = cost->fchs;
31669 return false;
31671 else if (X87_FLOAT_MODE_P (mode))
31673 *total = cost->fchs;
31674 return false;
31676 else if (FLOAT_MODE_P (mode))
31678 /* ??? SSE vector cost should be used here. */
31679 *total = cost->fchs;
31680 return false;
31682 /* FALLTHRU */
31684 case NOT:
31685 if (!TARGET_64BIT && mode == DImode)
31686 *total = cost->add * 2;
31687 else
31688 *total = cost->add;
31689 return false;
31691 case COMPARE:
31692 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31693 && XEXP (XEXP (x, 0), 1) == const1_rtx
31694 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31695 && XEXP (x, 1) == const0_rtx)
31697 /* This kind of construct is implemented using test[bwl].
31698 Treat it as if we had an AND. */
31699 *total = (cost->add
31700 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31701 + rtx_cost (const1_rtx, outer_code, opno, speed));
31702 return true;
31704 return false;
31706 case FLOAT_EXTEND:
31707 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31708 *total = 0;
31709 return false;
31711 case ABS:
31712 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31713 /* ??? SSE cost should be used here. */
31714 *total = cost->fabs;
31715 else if (X87_FLOAT_MODE_P (mode))
31716 *total = cost->fabs;
31717 else if (FLOAT_MODE_P (mode))
31718 /* ??? SSE vector cost should be used here. */
31719 *total = cost->fabs;
31720 return false;
31722 case SQRT:
31723 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31724 /* ??? SSE cost should be used here. */
31725 *total = cost->fsqrt;
31726 else if (X87_FLOAT_MODE_P (mode))
31727 *total = cost->fsqrt;
31728 else if (FLOAT_MODE_P (mode))
31729 /* ??? SSE vector cost should be used here. */
31730 *total = cost->fsqrt;
31731 return false;
31733 case UNSPEC:
31734 if (XINT (x, 1) == UNSPEC_TP)
31735 *total = 0;
31736 return false;
31738 case VEC_SELECT:
31739 case VEC_CONCAT:
31740 case VEC_MERGE:
31741 case VEC_DUPLICATE:
31742 /* ??? Assume all of these vector manipulation patterns are
31743 recognizable. In which case they all pretty much have the
31744 same cost. */
31745 *total = COSTS_N_INSNS (1);
31746 return true;
31748 default:
31749 return false;
31753 #if TARGET_MACHO
31755 static int current_machopic_label_num;
31757 /* Given a symbol name and its associated stub, write out the
31758 definition of the stub. */
31760 void
31761 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31763 unsigned int length;
31764 char *binder_name, *symbol_name, lazy_ptr_name[32];
31765 int label = ++current_machopic_label_num;
31767 /* For 64-bit we shouldn't get here. */
31768 gcc_assert (!TARGET_64BIT);
31770 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31771 symb = targetm.strip_name_encoding (symb);
31773 length = strlen (stub);
31774 binder_name = XALLOCAVEC (char, length + 32);
31775 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31777 length = strlen (symb);
31778 symbol_name = XALLOCAVEC (char, length + 32);
31779 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31781 sprintf (lazy_ptr_name, "L%d$lz", label);
31783 if (MACHOPIC_ATT_STUB)
31784 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31785 else if (MACHOPIC_PURE)
31786 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31787 else
31788 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31790 fprintf (file, "%s:\n", stub);
31791 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31793 if (MACHOPIC_ATT_STUB)
31795 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31797 else if (MACHOPIC_PURE)
31799 /* PIC stub. */
31800 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31801 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31802 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31803 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31804 label, lazy_ptr_name, label);
31805 fprintf (file, "\tjmp\t*%%ecx\n");
31807 else
31808 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31810 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31811 it needs no stub-binding-helper. */
31812 if (MACHOPIC_ATT_STUB)
31813 return;
31815 fprintf (file, "%s:\n", binder_name);
31817 if (MACHOPIC_PURE)
31819 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31820 fprintf (file, "\tpushl\t%%ecx\n");
31822 else
31823 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31825 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31827 /* N.B. Keep the correspondence of these
31828 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31829 old-pic/new-pic/non-pic stubs; altering this will break
31830 compatibility with existing dylibs. */
31831 if (MACHOPIC_PURE)
31833 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31834 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31836 else
31837 /* 16-byte -mdynamic-no-pic stub. */
31838 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31840 fprintf (file, "%s:\n", lazy_ptr_name);
31841 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31842 fprintf (file, ASM_LONG "%s\n", binder_name);
31844 #endif /* TARGET_MACHO */
31846 /* Order the registers for the register allocator. */
31848 void
31849 x86_order_regs_for_local_alloc (void)
31851 int pos = 0;
31852 int i;
31854 /* First allocate the local general purpose registers. */
31855 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31856 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31857 reg_alloc_order [pos++] = i;
31859 /* Global general purpose registers. */
31860 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31861 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31862 reg_alloc_order [pos++] = i;
31864 /* x87 registers come first in case we are doing FP math
31865 using them. */
31866 if (!TARGET_SSE_MATH)
31867 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31868 reg_alloc_order [pos++] = i;
31870 /* SSE registers. */
31871 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31872 reg_alloc_order [pos++] = i;
31873 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31874 reg_alloc_order [pos++] = i;
31876 /* x87 registers. */
31877 if (TARGET_SSE_MATH)
31878 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31879 reg_alloc_order [pos++] = i;
31881 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31882 reg_alloc_order [pos++] = i;
31884 /* Initialize the rest of the array, as we do not allocate some registers
31885 at all. */
31886 while (pos < FIRST_PSEUDO_REGISTER)
31887 reg_alloc_order [pos++] = 0;
31890 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31891 in struct attribute_spec handler. */
31892 static tree
31893 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31894 tree args,
31895 int flags ATTRIBUTE_UNUSED,
31896 bool *no_add_attrs)
31898 if (TREE_CODE (*node) != FUNCTION_TYPE
31899 && TREE_CODE (*node) != METHOD_TYPE
31900 && TREE_CODE (*node) != FIELD_DECL
31901 && TREE_CODE (*node) != TYPE_DECL)
31903 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31904 name);
31905 *no_add_attrs = true;
31906 return NULL_TREE;
31908 if (TARGET_64BIT)
31910 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31911 name);
31912 *no_add_attrs = true;
31913 return NULL_TREE;
31915 if (is_attribute_p ("callee_pop_aggregate_return", name))
31917 tree cst;
31919 cst = TREE_VALUE (args);
31920 if (TREE_CODE (cst) != INTEGER_CST)
31922 warning (OPT_Wattributes,
31923 "%qE attribute requires an integer constant argument",
31924 name);
31925 *no_add_attrs = true;
31927 else if (compare_tree_int (cst, 0) != 0
31928 && compare_tree_int (cst, 1) != 0)
31930 warning (OPT_Wattributes,
31931 "argument to %qE attribute is neither zero, nor one",
31932 name);
31933 *no_add_attrs = true;
31936 return NULL_TREE;
31939 return NULL_TREE;
31942 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
31943 struct attribute_spec.handler. */
31944 static tree
31945 ix86_handle_abi_attribute (tree *node, tree name,
31946 tree args ATTRIBUTE_UNUSED,
31947 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31949 if (TREE_CODE (*node) != FUNCTION_TYPE
31950 && TREE_CODE (*node) != METHOD_TYPE
31951 && TREE_CODE (*node) != FIELD_DECL
31952 && TREE_CODE (*node) != TYPE_DECL)
31954 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31955 name);
31956 *no_add_attrs = true;
31957 return NULL_TREE;
31960 /* Can combine regparm with all attributes but fastcall. */
31961 if (is_attribute_p ("ms_abi", name))
31963 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31965 error ("ms_abi and sysv_abi attributes are not compatible");
31968 return NULL_TREE;
31970 else if (is_attribute_p ("sysv_abi", name))
31972 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31974 error ("ms_abi and sysv_abi attributes are not compatible");
31977 return NULL_TREE;
31980 return NULL_TREE;
31983 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31984 struct attribute_spec.handler. */
31985 static tree
31986 ix86_handle_struct_attribute (tree *node, tree name,
31987 tree args ATTRIBUTE_UNUSED,
31988 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31990 tree *type = NULL;
31991 if (DECL_P (*node))
31993 if (TREE_CODE (*node) == TYPE_DECL)
31994 type = &TREE_TYPE (*node);
31996 else
31997 type = node;
31999 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
32000 || TREE_CODE (*type) == UNION_TYPE)))
32002 warning (OPT_Wattributes, "%qE attribute ignored",
32003 name);
32004 *no_add_attrs = true;
32007 else if ((is_attribute_p ("ms_struct", name)
32008 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32009 || ((is_attribute_p ("gcc_struct", name)
32010 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32012 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32013 name);
32014 *no_add_attrs = true;
32017 return NULL_TREE;
32020 static tree
32021 ix86_handle_fndecl_attribute (tree *node, tree name,
32022 tree args ATTRIBUTE_UNUSED,
32023 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32025 if (TREE_CODE (*node) != FUNCTION_DECL)
32027 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32028 name);
32029 *no_add_attrs = true;
32031 return NULL_TREE;
32034 static bool
32035 ix86_ms_bitfield_layout_p (const_tree record_type)
32037 return ((TARGET_MS_BITFIELD_LAYOUT
32038 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32039 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32042 /* Returns an expression indicating where the this parameter is
32043 located on entry to the FUNCTION. */
32045 static rtx
32046 x86_this_parameter (tree function)
32048 tree type = TREE_TYPE (function);
32049 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32050 int nregs;
32052 if (TARGET_64BIT)
32054 const int *parm_regs;
32056 if (ix86_function_type_abi (type) == MS_ABI)
32057 parm_regs = x86_64_ms_abi_int_parameter_registers;
32058 else
32059 parm_regs = x86_64_int_parameter_registers;
32060 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32063 nregs = ix86_function_regparm (type, function);
32065 if (nregs > 0 && !stdarg_p (type))
32067 int regno;
32068 unsigned int ccvt = ix86_get_callcvt (type);
32070 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32071 regno = aggr ? DX_REG : CX_REG;
32072 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32074 regno = CX_REG;
32075 if (aggr)
32076 return gen_rtx_MEM (SImode,
32077 plus_constant (stack_pointer_rtx, 4));
32079 else
32081 regno = AX_REG;
32082 if (aggr)
32084 regno = DX_REG;
32085 if (nregs == 1)
32086 return gen_rtx_MEM (SImode,
32087 plus_constant (stack_pointer_rtx, 4));
32090 return gen_rtx_REG (SImode, regno);
32093 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32096 /* Determine whether x86_output_mi_thunk can succeed. */
32098 static bool
32099 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32100 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32101 HOST_WIDE_INT vcall_offset, const_tree function)
32103 /* 64-bit can handle anything. */
32104 if (TARGET_64BIT)
32105 return true;
32107 /* For 32-bit, everything's fine if we have one free register. */
32108 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32109 return true;
32111 /* Need a free register for vcall_offset. */
32112 if (vcall_offset)
32113 return false;
32115 /* Need a free register for GOT references. */
32116 if (flag_pic && !targetm.binds_local_p (function))
32117 return false;
32119 /* Otherwise ok. */
32120 return true;
32123 /* Output the assembler code for a thunk function. THUNK_DECL is the
32124 declaration for the thunk function itself, FUNCTION is the decl for
32125 the target function. DELTA is an immediate constant offset to be
32126 added to THIS. If VCALL_OFFSET is nonzero, the word at
32127 *(*this + vcall_offset) should be added to THIS. */
32129 static void
32130 x86_output_mi_thunk (FILE *file,
32131 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32132 HOST_WIDE_INT vcall_offset, tree function)
32134 rtx this_param = x86_this_parameter (function);
32135 rtx this_reg, tmp, fnaddr;
32137 emit_note (NOTE_INSN_PROLOGUE_END);
32139 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32140 pull it in now and let DELTA benefit. */
32141 if (REG_P (this_param))
32142 this_reg = this_param;
32143 else if (vcall_offset)
32145 /* Put the this parameter into %eax. */
32146 this_reg = gen_rtx_REG (Pmode, AX_REG);
32147 emit_move_insn (this_reg, this_param);
32149 else
32150 this_reg = NULL_RTX;
32152 /* Adjust the this parameter by a fixed constant. */
32153 if (delta)
32155 rtx delta_rtx = GEN_INT (delta);
32156 rtx delta_dst = this_reg ? this_reg : this_param;
32158 if (TARGET_64BIT)
32160 if (!x86_64_general_operand (delta_rtx, Pmode))
32162 tmp = gen_rtx_REG (Pmode, R10_REG);
32163 emit_move_insn (tmp, delta_rtx);
32164 delta_rtx = tmp;
32168 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32171 /* Adjust the this parameter by a value stored in the vtable. */
32172 if (vcall_offset)
32174 rtx vcall_addr, vcall_mem, this_mem;
32175 unsigned int tmp_regno;
32177 if (TARGET_64BIT)
32178 tmp_regno = R10_REG;
32179 else
32181 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32182 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32183 tmp_regno = AX_REG;
32184 else
32185 tmp_regno = CX_REG;
32187 tmp = gen_rtx_REG (Pmode, tmp_regno);
32189 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32190 if (Pmode != ptr_mode)
32191 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32192 emit_move_insn (tmp, this_mem);
32194 /* Adjust the this parameter. */
32195 vcall_addr = plus_constant (tmp, vcall_offset);
32196 if (TARGET_64BIT
32197 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32199 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32200 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32201 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32204 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32205 if (Pmode != ptr_mode)
32206 emit_insn (gen_addsi_1_zext (this_reg,
32207 gen_rtx_REG (ptr_mode,
32208 REGNO (this_reg)),
32209 vcall_mem));
32210 else
32211 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32214 /* If necessary, drop THIS back to its stack slot. */
32215 if (this_reg && this_reg != this_param)
32216 emit_move_insn (this_param, this_reg);
32218 fnaddr = XEXP (DECL_RTL (function), 0);
32219 if (TARGET_64BIT)
32221 if (!flag_pic || targetm.binds_local_p (function)
32222 || cfun->machine->call_abi == MS_ABI)
32224 else
32226 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32227 tmp = gen_rtx_CONST (Pmode, tmp);
32228 fnaddr = gen_rtx_MEM (Pmode, tmp);
32231 else
32233 if (!flag_pic || targetm.binds_local_p (function))
32235 #if TARGET_MACHO
32236 else if (TARGET_MACHO)
32238 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32239 fnaddr = XEXP (fnaddr, 0);
32241 #endif /* TARGET_MACHO */
32242 else
32244 tmp = gen_rtx_REG (Pmode, CX_REG);
32245 output_set_got (tmp, NULL_RTX);
32247 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32248 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32249 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32253 /* Our sibling call patterns do not allow memories, because we have no
32254 predicate that can distinguish between frame and non-frame memory.
32255 For our purposes here, we can get away with (ab)using a jump pattern,
32256 because we're going to do no optimization. */
32257 if (MEM_P (fnaddr))
32258 emit_jump_insn (gen_indirect_jump (fnaddr));
32259 else
32261 tmp = gen_rtx_MEM (QImode, fnaddr);
32262 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32263 tmp = emit_call_insn (tmp);
32264 SIBLING_CALL_P (tmp) = 1;
32266 emit_barrier ();
32268 /* Emit just enough of rest_of_compilation to get the insns emitted.
32269 Note that use_thunk calls assemble_start_function et al. */
32270 tmp = get_insns ();
32271 insn_locators_alloc ();
32272 shorten_branches (tmp);
32273 final_start_function (tmp, file, 1);
32274 final (tmp, file, 1);
32275 final_end_function ();
32278 static void
32279 x86_file_start (void)
32281 default_file_start ();
32282 #if TARGET_MACHO
32283 darwin_file_start ();
32284 #endif
32285 if (X86_FILE_START_VERSION_DIRECTIVE)
32286 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32287 if (X86_FILE_START_FLTUSED)
32288 fputs ("\t.global\t__fltused\n", asm_out_file);
32289 if (ix86_asm_dialect == ASM_INTEL)
32290 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32294 x86_field_alignment (tree field, int computed)
32296 enum machine_mode mode;
32297 tree type = TREE_TYPE (field);
32299 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32300 return computed;
32301 mode = TYPE_MODE (strip_array_types (type));
32302 if (mode == DFmode || mode == DCmode
32303 || GET_MODE_CLASS (mode) == MODE_INT
32304 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32305 return MIN (32, computed);
32306 return computed;
32309 /* Output assembler code to FILE to increment profiler label # LABELNO
32310 for profiling a function entry. */
32311 void
32312 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32314 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32315 : MCOUNT_NAME);
32317 if (TARGET_64BIT)
32319 #ifndef NO_PROFILE_COUNTERS
32320 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32321 #endif
32323 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32324 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32325 else
32326 fprintf (file, "\tcall\t%s\n", mcount_name);
32328 else if (flag_pic)
32330 #ifndef NO_PROFILE_COUNTERS
32331 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32332 LPREFIX, labelno);
32333 #endif
32334 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32336 else
32338 #ifndef NO_PROFILE_COUNTERS
32339 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32340 LPREFIX, labelno);
32341 #endif
32342 fprintf (file, "\tcall\t%s\n", mcount_name);
32346 /* We don't have exact information about the insn sizes, but we may assume
32347 quite safely that we are informed about all 1 byte insns and memory
32348 address sizes. This is enough to eliminate unnecessary padding in
32349 99% of cases. */
32351 static int
32352 min_insn_size (rtx insn)
32354 int l = 0, len;
32356 if (!INSN_P (insn) || !active_insn_p (insn))
32357 return 0;
32359 /* Discard alignments we've emitted, as well as jump table data. */
32360 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32361 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32362 return 0;
32363 if (JUMP_TABLE_DATA_P (insn))
32364 return 0;
32366 /* Important case - calls are always 5 bytes.
32367 It is common to have many calls in a row. */
32368 if (CALL_P (insn)
32369 && symbolic_reference_mentioned_p (PATTERN (insn))
32370 && !SIBLING_CALL_P (insn))
32371 return 5;
32372 len = get_attr_length (insn);
32373 if (len <= 1)
32374 return 1;
32376 /* For normal instructions we rely on get_attr_length being exact,
32377 with a few exceptions. */
32378 if (!JUMP_P (insn))
32380 enum attr_type type = get_attr_type (insn);
32382 switch (type)
32384 case TYPE_MULTI:
32385 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32386 || asm_noperands (PATTERN (insn)) >= 0)
32387 return 0;
32388 break;
32389 case TYPE_OTHER:
32390 case TYPE_FCMP:
32391 break;
32392 default:
32393 /* Otherwise trust get_attr_length. */
32394 return len;
32397 l = get_attr_length_address (insn);
32398 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32399 l = 4;
32401 if (l)
32402 return 1+l;
32403 else
32404 return 2;
32407 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32409 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
32410 window. */
32412 static void
32413 ix86_avoid_jump_mispredicts (void)
32415 rtx insn, start = get_insns ();
32416 int nbytes = 0, njumps = 0;
32417 int isjump = 0;
32419 /* Look for all minimal intervals of instructions containing 4 jumps.
32420 The intervals are bounded by START and INSN. NBYTES is the total
32421 size of instructions in the interval including INSN and not including
32422 START. When the NBYTES is smaller than 16 bytes, it is possible
32423 that the end of START and INSN ends up in the same 16byte page.
32425 The smallest offset in the page INSN can start is the case where START
32426 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
32427 We add a p2align to the 16 byte window with a max skip of 15 - NBYTES + sizeof (INSN). */
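/* For instance (illustration only): if the three preceding jumps plus the
   instructions between them occupy NBYTES == 12 including a 2-byte INSN,
   the loop below emits a pad of 15 - 12 + 2 == 5 bytes, which keeps INSN
   from sharing one 16 byte window with all three earlier jumps.  */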
32429 for (insn = start; insn; insn = NEXT_INSN (insn))
32431 int min_size;
32433 if (LABEL_P (insn))
32435 int align = label_to_alignment (insn);
32436 int max_skip = label_to_max_skip (insn);
32438 if (max_skip > 15)
32439 max_skip = 15;
32440 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32441 already in the current 16 byte page, because otherwise
32442 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32443 bytes to reach 16 byte boundary. */
32444 if (align <= 0
32445 || (align <= 3 && max_skip != (1 << align) - 1))
32446 max_skip = 0;
32447 if (dump_file)
32448 fprintf (dump_file, "Label %i with max_skip %i\n",
32449 INSN_UID (insn), max_skip);
32450 if (max_skip)
32452 while (nbytes + max_skip >= 16)
32454 start = NEXT_INSN (start);
32455 if ((JUMP_P (start)
32456 && GET_CODE (PATTERN (start)) != ADDR_VEC
32457 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32458 || CALL_P (start))
32459 njumps--, isjump = 1;
32460 else
32461 isjump = 0;
32462 nbytes -= min_insn_size (start);
32465 continue;
32468 min_size = min_insn_size (insn);
32469 nbytes += min_size;
32470 if (dump_file)
32471 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32472 INSN_UID (insn), min_size);
32473 if ((JUMP_P (insn)
32474 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32475 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32476 || CALL_P (insn))
32477 njumps++;
32478 else
32479 continue;
32481 while (njumps > 3)
32483 start = NEXT_INSN (start);
32484 if ((JUMP_P (start)
32485 && GET_CODE (PATTERN (start)) != ADDR_VEC
32486 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32487 || CALL_P (start))
32488 njumps--, isjump = 1;
32489 else
32490 isjump = 0;
32491 nbytes -= min_insn_size (start);
32493 gcc_assert (njumps >= 0);
32494 if (dump_file)
32495 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32496 INSN_UID (start), INSN_UID (insn), nbytes);
32498 if (njumps == 3 && isjump && nbytes < 16)
32500 int padsize = 15 - nbytes + min_insn_size (insn);
32502 if (dump_file)
32503 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32504 INSN_UID (insn), padsize);
32505 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32509 #endif
32511 /* AMD Athlon works faster
32512 when RET is not the destination of a conditional jump or directly preceded
32513 by another jump instruction. We avoid the penalty by inserting a NOP just
32514 before the RET instruction in such cases. */
32515 static void
32516 ix86_pad_returns (void)
32518 edge e;
32519 edge_iterator ei;
32521 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32523 basic_block bb = e->src;
32524 rtx ret = BB_END (bb);
32525 rtx prev;
32526 bool replace = false;
32528 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32529 || optimize_bb_for_size_p (bb))
32530 continue;
32531 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32532 if (active_insn_p (prev) || LABEL_P (prev))
32533 break;
32534 if (prev && LABEL_P (prev))
32536 edge e;
32537 edge_iterator ei;
32539 FOR_EACH_EDGE (e, ei, bb->preds)
32540 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32541 && !(e->flags & EDGE_FALLTHRU))
32542 replace = true;
32544 if (!replace)
32546 prev = prev_active_insn (ret);
32547 if (prev
32548 && ((JUMP_P (prev) && any_condjump_p (prev))
32549 || CALL_P (prev)))
32550 replace = true;
32551 /* Empty functions get branch mispredict even when
32552 the jump destination is not visible to us. */
32553 if (!prev && !optimize_function_for_size_p (cfun))
32554 replace = true;
32556 if (replace)
32558 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32559 delete_insn (ret);
32564 /* Count the minimum number of instructions in BB. Return 4 if the
32565 number of instructions >= 4. */
32567 static int
32568 ix86_count_insn_bb (basic_block bb)
32570 rtx insn;
32571 int insn_count = 0;
32573 /* Count number of instructions in this block. Return 4 if the number
32574 of instructions >= 4. */
32575 FOR_BB_INSNS (bb, insn)
32577 /* This only happens in exit blocks. */
32578 if (JUMP_P (insn)
32579 && ANY_RETURN_P (PATTERN (insn)))
32580 break;
32582 if (NONDEBUG_INSN_P (insn)
32583 && GET_CODE (PATTERN (insn)) != USE
32584 && GET_CODE (PATTERN (insn)) != CLOBBER)
32586 insn_count++;
32587 if (insn_count >= 4)
32588 return insn_count;
32592 return insn_count;
32596 /* Count the minimum number of instructions in code path in BB.
32597 Return 4 if the number of instructions >= 4. */
32599 static int
32600 ix86_count_insn (basic_block bb)
32602 edge e;
32603 edge_iterator ei;
32604 int min_prev_count;
32606 /* Only bother counting instructions along paths with no
32607 more than 2 basic blocks between entry and exit. Given
32608 that BB has an edge to exit, determine if a predecessor
32609 of BB has an edge from entry. If so, compute the number
32610 of instructions in the predecessor block. If there
32611 happen to be multiple such blocks, compute the minimum. */
32612 min_prev_count = 4;
32613 FOR_EACH_EDGE (e, ei, bb->preds)
32615 edge prev_e;
32616 edge_iterator prev_ei;
32618 if (e->src == ENTRY_BLOCK_PTR)
32620 min_prev_count = 0;
32621 break;
32623 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32625 if (prev_e->src == ENTRY_BLOCK_PTR)
32627 int count = ix86_count_insn_bb (e->src);
32628 if (count < min_prev_count)
32629 min_prev_count = count;
32630 break;
32635 if (min_prev_count < 4)
32636 min_prev_count += ix86_count_insn_bb (bb);
32638 return min_prev_count;
32641 /* Pad short function to 4 instructions. */
32643 static void
32644 ix86_pad_short_function (void)
32646 edge e;
32647 edge_iterator ei;
32649 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32651 rtx ret = BB_END (e->src);
32652 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32654 int insn_count = ix86_count_insn (e->src);
32656 /* Pad short function. */
32657 if (insn_count < 4)
32659 rtx insn = ret;
32661 /* Find epilogue. */
32662 while (insn
32663 && (!NOTE_P (insn)
32664 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32665 insn = PREV_INSN (insn);
32667 if (!insn)
32668 insn = ret;
32670 /* Two NOPs count as one instruction. */
32671 insn_count = 2 * (4 - insn_count);
32672 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32678 /* Implement machine specific optimizations. We implement padding of returns
32679 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
32680 static void
32681 ix86_reorg (void)
32683 /* We are freeing block_for_insn in the toplev to keep compatibility
32684 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32685 compute_bb_for_insn ();
32687 /* Run the vzeroupper optimization if needed. */
32688 if (TARGET_VZEROUPPER)
32689 move_or_delete_vzeroupper ();
32691 if (optimize && optimize_function_for_speed_p (cfun))
32693 if (TARGET_PAD_SHORT_FUNCTION)
32694 ix86_pad_short_function ();
32695 else if (TARGET_PAD_RETURNS)
32696 ix86_pad_returns ();
32697 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32698 if (TARGET_FOUR_JUMP_LIMIT)
32699 ix86_avoid_jump_mispredicts ();
32700 #endif
32704 /* Return nonzero when a QImode register that must be represented via a REX
32705 prefix is used. */
32706 bool
32707 x86_extended_QIreg_mentioned_p (rtx insn)
32709 int i;
32710 extract_insn_cached (insn);
32711 for (i = 0; i < recog_data.n_operands; i++)
32712 if (REG_P (recog_data.operand[i])
32713 && REGNO (recog_data.operand[i]) > BX_REG)
32714 return true;
32715 return false;
32718 /* Return nonzero when P points to register encoded via REX prefix.
32719 Called via for_each_rtx. */
32720 static int
32721 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32723 unsigned int regno;
32724 if (!REG_P (*p))
32725 return 0;
32726 regno = REGNO (*p);
32727 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32730 /* Return true when INSN mentions register that must be encoded using REX
32731 prefix. */
32732 bool
32733 x86_extended_reg_mentioned_p (rtx insn)
32735 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32736 extended_reg_mentioned_1, NULL);
32739 /* If profitable, negate (without causing overflow) integer constant
32740 of mode MODE at location LOC. Return true in this case. */
32741 bool
32742 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32744 HOST_WIDE_INT val;
32746 if (!CONST_INT_P (*loc))
32747 return false;
32749 switch (mode)
32751 case DImode:
32752 /* DImode x86_64 constants must fit in 32 bits. */
32753 gcc_assert (x86_64_immediate_operand (*loc, mode));
32755 mode = SImode;
32756 break;
32758 case SImode:
32759 case HImode:
32760 case QImode:
32761 break;
32763 default:
32764 gcc_unreachable ();
32767 /* Avoid overflows. */
32768 if (mode_signbit_p (mode, *loc))
32769 return false;
32771 val = INTVAL (*loc);
32773 /* Make things pretty and use `subl $4,%eax' rather than `addl $-4,%eax'.
32774 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32775 if ((val < 0 && val != -128)
32776 || val == 128)
32778 *loc = GEN_INT (-val);
32779 return true;
32782 return false;
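/* As a worked example (illustration only): addl $-4,%eax becomes subl $4,%eax,
   and addl $128,%eax becomes subl $-128,%eax, because -128 fits in a
   sign-extended 8-bit immediate while +128 does not.  */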
32785 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32786 optabs would emit if we didn't have TFmode patterns. */
32788 void
32789 x86_emit_floatuns (rtx operands[2])
32791 rtx neglab, donelab, i0, i1, f0, in, out;
32792 enum machine_mode mode, inmode;
32794 inmode = GET_MODE (operands[1]);
32795 gcc_assert (inmode == SImode || inmode == DImode);
32797 out = operands[0];
32798 in = force_reg (inmode, operands[1]);
32799 mode = GET_MODE (out);
32800 neglab = gen_label_rtx ();
32801 donelab = gen_label_rtx ();
32802 f0 = gen_reg_rtx (mode);
32804 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32806 expand_float (out, in, 0);
32808 emit_jump_insn (gen_jump (donelab));
32809 emit_barrier ();
32811 emit_label (neglab);
32813 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32814 1, OPTAB_DIRECT);
32815 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32816 1, OPTAB_DIRECT);
32817 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32819 expand_float (f0, i0, 0);
32821 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32823 emit_label (donelab);
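/* A minimal scalar sketch (illustration only, not used by the compiler; the
   helper name and the use of HOST_WIDE_INT for the operand width are
   hypothetical) of the arithmetic the expander above emits when the sign bit
   is set: halve the input while folding the shifted-out bit back in so the
   final doubling rounds correctly, convert, then add the result to itself.  */

static double
ix86_floatuns_sketch (unsigned HOST_WIDE_INT x)
{
  unsigned HOST_WIDE_INT half;
  double d;

  if ((HOST_WIDE_INT) x >= 0)
    /* Sign bit clear: an ordinary signed conversion is exact.  */
    return (double) x;

  /* Sign bit set: (x >> 1) | (x & 1), convert, then double.  */
  half = (x >> 1) | (x & 1);
  d = (double) half;
  return d + d;
}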
32826 /* AVX2 does support 32-byte integer vector operations,
32827 thus the longest vector we are faced with is V32QImode. */
32828 #define MAX_VECT_LEN 32
32830 struct expand_vec_perm_d
32832 rtx target, op0, op1;
32833 unsigned char perm[MAX_VECT_LEN];
32834 enum machine_mode vmode;
32835 unsigned char nelt;
32836 bool testing_p;
32839 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32840 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32842 /* Get a vector mode of the same size as the original but with elements
32843 twice as wide. This is only guaranteed to apply to integral vectors. */
32845 static inline enum machine_mode
32846 get_mode_wider_vector (enum machine_mode o)
32848 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32849 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32850 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32851 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32852 return n;
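/* For example (illustration only): V16QImode maps to V8HImode and V8HImode
   to V4SImode; each keeps the 16 byte size while doubling the element width.  */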
32855 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32856 with all elements equal to VAR. Return true if successful. */
32858 static bool
32859 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32860 rtx target, rtx val)
32862 bool ok;
32864 switch (mode)
32866 case V2SImode:
32867 case V2SFmode:
32868 if (!mmx_ok)
32869 return false;
32870 /* FALLTHRU */
32872 case V4DFmode:
32873 case V4DImode:
32874 case V8SFmode:
32875 case V8SImode:
32876 case V2DFmode:
32877 case V2DImode:
32878 case V4SFmode:
32879 case V4SImode:
32881 rtx insn, dup;
32883 /* First attempt to recognize VAL as-is. */
32884 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32885 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32886 if (recog_memoized (insn) < 0)
32888 rtx seq;
32889 /* If that fails, force VAL into a register. */
32891 start_sequence ();
32892 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32893 seq = get_insns ();
32894 end_sequence ();
32895 if (seq)
32896 emit_insn_before (seq, insn);
32898 ok = recog_memoized (insn) >= 0;
32899 gcc_assert (ok);
32902 return true;
32904 case V4HImode:
32905 if (!mmx_ok)
32906 return false;
32907 if (TARGET_SSE || TARGET_3DNOW_A)
32909 rtx x;
32911 val = gen_lowpart (SImode, val);
32912 x = gen_rtx_TRUNCATE (HImode, val);
32913 x = gen_rtx_VEC_DUPLICATE (mode, x);
32914 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32915 return true;
32917 goto widen;
32919 case V8QImode:
32920 if (!mmx_ok)
32921 return false;
32922 goto widen;
32924 case V8HImode:
32925 if (TARGET_SSE2)
32927 struct expand_vec_perm_d dperm;
32928 rtx tmp1, tmp2;
32930 permute:
32931 memset (&dperm, 0, sizeof (dperm));
32932 dperm.target = target;
32933 dperm.vmode = mode;
32934 dperm.nelt = GET_MODE_NUNITS (mode);
32935 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32937 /* Extend to SImode using a paradoxical SUBREG. */
32938 tmp1 = gen_reg_rtx (SImode);
32939 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32941 /* Insert the SImode value as low element of a V4SImode vector. */
32942 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32943 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32945 ok = (expand_vec_perm_1 (&dperm)
32946 || expand_vec_perm_broadcast_1 (&dperm));
32947 gcc_assert (ok);
32948 return ok;
32950 goto widen;
32952 case V16QImode:
32953 if (TARGET_SSE2)
32954 goto permute;
32955 goto widen;
32957 widen:
32958 /* Replicate the value once into the next wider mode and recurse. */
32960 enum machine_mode smode, wsmode, wvmode;
32961 rtx x;
32963 smode = GET_MODE_INNER (mode);
32964 wvmode = get_mode_wider_vector (mode);
32965 wsmode = GET_MODE_INNER (wvmode);
32967 val = convert_modes (wsmode, smode, val, true);
32968 x = expand_simple_binop (wsmode, ASHIFT, val,
32969 GEN_INT (GET_MODE_BITSIZE (smode)),
32970 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32971 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32973 x = gen_lowpart (wvmode, target);
32974 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32975 gcc_assert (ok);
32976 return ok;
32979 case V16HImode:
32980 case V32QImode:
32982 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32983 rtx x = gen_reg_rtx (hvmode);
32985 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32986 gcc_assert (ok);
32988 x = gen_rtx_VEC_CONCAT (mode, x, x);
32989 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32991 return true;
32993 default:
32994 return false;
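/* Editor's note: a standalone C sketch of the "widen" strategy used by
   ix86_expand_vector_init_duplicate above: glue two copies of the narrow
   element into one element of twice the width, then broadcast that wider
   element instead.  Shown for an 8-bit element widened to 16 bits; the
   helper name is hypothetical:

     #include <stdint.h>

     static uint16_t
     widen_duplicate_sketch (uint8_t val)
     {
       uint16_t wide = val;               // zero-extend the element
       wide |= (uint16_t) (wide << 8);    // two copies side by side
       return wide;                       // broadcast this wider element instead
     }
*/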
32998 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32999 whose ONE_VAR element is VAR, and other elements are zero. Return true
33000 if successful. */
33002 static bool
33003 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33004 rtx target, rtx var, int one_var)
33006 enum machine_mode vsimode;
33007 rtx new_target;
33008 rtx x, tmp;
33009 bool use_vector_set = false;
33011 switch (mode)
33013 case V2DImode:
33014 /* For SSE4.1, we normally use vector set. But if the second
33015 element is zero and inter-unit moves are OK, we use movq
33016 instead. */
33017 use_vector_set = (TARGET_64BIT
33018 && TARGET_SSE4_1
33019 && !(TARGET_INTER_UNIT_MOVES
33020 && one_var == 0));
33021 break;
33022 case V16QImode:
33023 case V4SImode:
33024 case V4SFmode:
33025 use_vector_set = TARGET_SSE4_1;
33026 break;
33027 case V8HImode:
33028 use_vector_set = TARGET_SSE2;
33029 break;
33030 case V4HImode:
33031 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33032 break;
33033 case V32QImode:
33034 case V16HImode:
33035 case V8SImode:
33036 case V8SFmode:
33037 case V4DFmode:
33038 use_vector_set = TARGET_AVX;
33039 break;
33040 case V4DImode:
33041 /* Use ix86_expand_vector_set in 64bit mode only. */
33042 use_vector_set = TARGET_AVX && TARGET_64BIT;
33043 break;
33044 default:
33045 break;
33048 if (use_vector_set)
33050 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33051 var = force_reg (GET_MODE_INNER (mode), var);
33052 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33053 return true;
33056 switch (mode)
33058 case V2SFmode:
33059 case V2SImode:
33060 if (!mmx_ok)
33061 return false;
33062 /* FALLTHRU */
33064 case V2DFmode:
33065 case V2DImode:
33066 if (one_var != 0)
33067 return false;
33068 var = force_reg (GET_MODE_INNER (mode), var);
33069 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33070 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33071 return true;
33073 case V4SFmode:
33074 case V4SImode:
33075 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33076 new_target = gen_reg_rtx (mode);
33077 else
33078 new_target = target;
33079 var = force_reg (GET_MODE_INNER (mode), var);
33080 x = gen_rtx_VEC_DUPLICATE (mode, var);
33081 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33082 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33083 if (one_var != 0)
33085 /* We need to shuffle the value to the correct position, so
33086 create a new pseudo to store the intermediate result. */
33088 /* With SSE2, we can use the integer shuffle insns. */
33089 if (mode != V4SFmode && TARGET_SSE2)
33091 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33092 const1_rtx,
33093 GEN_INT (one_var == 1 ? 0 : 1),
33094 GEN_INT (one_var == 2 ? 0 : 1),
33095 GEN_INT (one_var == 3 ? 0 : 1)));
33096 if (target != new_target)
33097 emit_move_insn (target, new_target);
33098 return true;
33101 /* Otherwise convert the intermediate result to V4SFmode and
33102 use the SSE1 shuffle instructions. */
33103 if (mode != V4SFmode)
33105 tmp = gen_reg_rtx (V4SFmode);
33106 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33108 else
33109 tmp = new_target;
33111 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33112 const1_rtx,
33113 GEN_INT (one_var == 1 ? 0 : 1),
33114 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33115 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33117 if (mode != V4SFmode)
33118 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33119 else if (tmp != target)
33120 emit_move_insn (target, tmp);
33122 else if (target != new_target)
33123 emit_move_insn (target, new_target);
33124 return true;
33126 case V8HImode:
33127 case V16QImode:
33128 vsimode = V4SImode;
33129 goto widen;
33130 case V4HImode:
33131 case V8QImode:
33132 if (!mmx_ok)
33133 return false;
33134 vsimode = V2SImode;
33135 goto widen;
33136 widen:
33137 if (one_var != 0)
33138 return false;
33140 /* Zero extend the variable element to SImode and recurse. */
33141 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33143 x = gen_reg_rtx (vsimode);
33144 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33145 var, one_var))
33146 gcc_unreachable ();
33148 emit_move_insn (target, gen_lowpart (mode, x));
33149 return true;
33151 default:
33152 return false;
33156 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33157 consisting of the values in VALS. It is known that all elements
33158 except ONE_VAR are constants. Return true if successful. */
33160 static bool
33161 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33162 rtx target, rtx vals, int one_var)
33164 rtx var = XVECEXP (vals, 0, one_var);
33165 enum machine_mode wmode;
33166 rtx const_vec, x;
33168 const_vec = copy_rtx (vals);
33169 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33170 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33172 switch (mode)
33174 case V2DFmode:
33175 case V2DImode:
33176 case V2SFmode:
33177 case V2SImode:
33178 /* For the two element vectors, it's just as easy to use
33179 the general case. */
33180 return false;
33182 case V4DImode:
33183 /* Use ix86_expand_vector_set in 64bit mode only. */
33184 if (!TARGET_64BIT)
33185 return false;
33186 case V4DFmode:
33187 case V8SFmode:
33188 case V8SImode:
33189 case V16HImode:
33190 case V32QImode:
33191 case V4SFmode:
33192 case V4SImode:
33193 case V8HImode:
33194 case V4HImode:
33195 break;
33197 case V16QImode:
33198 if (TARGET_SSE4_1)
33199 break;
33200 wmode = V8HImode;
33201 goto widen;
33202 case V8QImode:
33203 wmode = V4HImode;
33204 goto widen;
33205 widen:
33206 /* There's no way to set one QImode entry easily. Combine
33207 the variable value with its adjacent constant value, and
33208 promote to an HImode set. */
33209 x = XVECEXP (vals, 0, one_var ^ 1);
33210 if (one_var & 1)
33212 var = convert_modes (HImode, QImode, var, true);
33213 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33214 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33215 x = GEN_INT (INTVAL (x) & 0xff);
33217 else
33219 var = convert_modes (HImode, QImode, var, true);
33220 x = gen_int_mode (INTVAL (x) << 8, HImode);
33222 if (x != const0_rtx)
33223 var = expand_simple_binop (HImode, IOR, var, x, var,
33224 1, OPTAB_LIB_WIDEN);
33226 x = gen_reg_rtx (wmode);
33227 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33228 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33230 emit_move_insn (target, gen_lowpart (mode, x));
33231 return true;
33233 default:
33234 return false;
33237 emit_move_insn (target, const_vec);
33238 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33239 return true;
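/* Editor's note: a standalone C sketch of the QImode "widen" path in
   ix86_expand_vector_init_one_var above.  There is no cheap single-byte
   vector insert, so the variable byte is merged with its neighbouring
   constant byte into one 16-bit value, which is then inserted as an HImode
   element.  Assumes x86's little-endian lane layout (element 0 in the low
   byte); the helper name is hypothetical:

     #include <stdint.h>

     static uint16_t
     combine_byte_pair_sketch (uint8_t var, uint8_t neighbour, int var_is_odd)
     {
       if (var_is_odd)
         return (uint16_t) ((var << 8) | neighbour);   // variable byte in the high half
       return (uint16_t) (var | (neighbour << 8));     // variable byte in the low half
     }
*/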
33242 /* A subroutine of ix86_expand_vector_init_general. Use vector
33243 concatenate to handle the most general case: all values variable,
33244 and none identical. */
33246 static void
33247 ix86_expand_vector_init_concat (enum machine_mode mode,
33248 rtx target, rtx *ops, int n)
33250 enum machine_mode cmode, hmode = VOIDmode;
33251 rtx first[8], second[4];
33252 rtvec v;
33253 int i, j;
33255 switch (n)
33257 case 2:
33258 switch (mode)
33260 case V8SImode:
33261 cmode = V4SImode;
33262 break;
33263 case V8SFmode:
33264 cmode = V4SFmode;
33265 break;
33266 case V4DImode:
33267 cmode = V2DImode;
33268 break;
33269 case V4DFmode:
33270 cmode = V2DFmode;
33271 break;
33272 case V4SImode:
33273 cmode = V2SImode;
33274 break;
33275 case V4SFmode:
33276 cmode = V2SFmode;
33277 break;
33278 case V2DImode:
33279 cmode = DImode;
33280 break;
33281 case V2SImode:
33282 cmode = SImode;
33283 break;
33284 case V2DFmode:
33285 cmode = DFmode;
33286 break;
33287 case V2SFmode:
33288 cmode = SFmode;
33289 break;
33290 default:
33291 gcc_unreachable ();
33294 if (!register_operand (ops[1], cmode))
33295 ops[1] = force_reg (cmode, ops[1]);
33296 if (!register_operand (ops[0], cmode))
33297 ops[0] = force_reg (cmode, ops[0]);
33298 emit_insn (gen_rtx_SET (VOIDmode, target,
33299 gen_rtx_VEC_CONCAT (mode, ops[0],
33300 ops[1])));
33301 break;
33303 case 4:
33304 switch (mode)
33306 case V4DImode:
33307 cmode = V2DImode;
33308 break;
33309 case V4DFmode:
33310 cmode = V2DFmode;
33311 break;
33312 case V4SImode:
33313 cmode = V2SImode;
33314 break;
33315 case V4SFmode:
33316 cmode = V2SFmode;
33317 break;
33318 default:
33319 gcc_unreachable ();
33321 goto half;
33323 case 8:
33324 switch (mode)
33326 case V8SImode:
33327 cmode = V2SImode;
33328 hmode = V4SImode;
33329 break;
33330 case V8SFmode:
33331 cmode = V2SFmode;
33332 hmode = V4SFmode;
33333 break;
33334 default:
33335 gcc_unreachable ();
33337 goto half;
33339 half:
33340 /* FIXME: We process inputs backward to help RA. PR 36222. */
33341 i = n - 1;
33342 j = (n >> 1) - 1;
33343 for (; i > 0; i -= 2, j--)
33345 first[j] = gen_reg_rtx (cmode);
33346 v = gen_rtvec (2, ops[i - 1], ops[i]);
33347 ix86_expand_vector_init (false, first[j],
33348 gen_rtx_PARALLEL (cmode, v));
33351 n >>= 1;
33352 if (n > 2)
33354 gcc_assert (hmode != VOIDmode);
33355 for (i = j = 0; i < n; i += 2, j++)
33357 second[j] = gen_reg_rtx (hmode);
33358 ix86_expand_vector_init_concat (hmode, second [j],
33359 &first [i], 2);
33361 n >>= 1;
33362 ix86_expand_vector_init_concat (mode, target, second, n);
33364 else
33365 ix86_expand_vector_init_concat (mode, target, first, n);
33366 break;
33368 default:
33369 gcc_unreachable ();
33373 /* A subroutine of ix86_expand_vector_init_general. Use vector
33374 interleave to handle the most general case: all values variable,
33375 and none identical. */
33377 static void
33378 ix86_expand_vector_init_interleave (enum machine_mode mode,
33379 rtx target, rtx *ops, int n)
33381 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33382 int i, j;
33383 rtx op0, op1;
33384 rtx (*gen_load_even) (rtx, rtx, rtx);
33385 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33386 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33388 switch (mode)
33390 case V8HImode:
33391 gen_load_even = gen_vec_setv8hi;
33392 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33393 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33394 inner_mode = HImode;
33395 first_imode = V4SImode;
33396 second_imode = V2DImode;
33397 third_imode = VOIDmode;
33398 break;
33399 case V16QImode:
33400 gen_load_even = gen_vec_setv16qi;
33401 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33402 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33403 inner_mode = QImode;
33404 first_imode = V8HImode;
33405 second_imode = V4SImode;
33406 third_imode = V2DImode;
33407 break;
33408 default:
33409 gcc_unreachable ();
33412 for (i = 0; i < n; i++)
33414 /* Extend the odd element to SImode using a paradoxical SUBREG. */

33415 op0 = gen_reg_rtx (SImode);
33416 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33418 /* Insert the SImode value as low element of V4SImode vector. */
33419 op1 = gen_reg_rtx (V4SImode);
33420 op0 = gen_rtx_VEC_MERGE (V4SImode,
33421 gen_rtx_VEC_DUPLICATE (V4SImode,
33422 op0),
33423 CONST0_RTX (V4SImode),
33424 const1_rtx);
33425 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33427 /* Cast the V4SImode vector back to a vector in the original mode. */
33428 op0 = gen_reg_rtx (mode);
33429 emit_move_insn (op0, gen_lowpart (mode, op1));
33431 /* Load even elements into the second position. */
33432 emit_insn (gen_load_even (op0,
33433 force_reg (inner_mode,
33434 ops [i + i + 1]),
33435 const1_rtx));
33437 /* Cast vector to FIRST_IMODE vector. */
33438 ops[i] = gen_reg_rtx (first_imode);
33439 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33442 /* Interleave low FIRST_IMODE vectors. */
33443 for (i = j = 0; i < n; i += 2, j++)
33445 op0 = gen_reg_rtx (first_imode);
33446 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33448 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33449 ops[j] = gen_reg_rtx (second_imode);
33450 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33453 /* Interleave low SECOND_IMODE vectors. */
33454 switch (second_imode)
33456 case V4SImode:
33457 for (i = j = 0; i < n / 2; i += 2, j++)
33459 op0 = gen_reg_rtx (second_imode);
33460 emit_insn (gen_interleave_second_low (op0, ops[i],
33461 ops[i + 1]));
33463 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33464 vector. */
33465 ops[j] = gen_reg_rtx (third_imode);
33466 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33468 second_imode = V2DImode;
33469 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33470 /* FALLTHRU */
33472 case V2DImode:
33473 op0 = gen_reg_rtx (second_imode);
33474 emit_insn (gen_interleave_second_low (op0, ops[0],
33475 ops[1]));
33477 /* Cast the SECOND_IMODE vector back to a vector in the original
33478 mode. */
33479 emit_insn (gen_rtx_SET (VOIDmode, target,
33480 gen_lowpart (mode, op0)));
33481 break;
33483 default:
33484 gcc_unreachable ();
33488 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33489 all values variable, and none identical. */
33491 static void
33492 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33493 rtx target, rtx vals)
33495 rtx ops[32], op0, op1;
33496 enum machine_mode half_mode = VOIDmode;
33497 int n, i;
33499 switch (mode)
33501 case V2SFmode:
33502 case V2SImode:
33503 if (!mmx_ok && !TARGET_SSE)
33504 break;
33505 /* FALLTHRU */
33507 case V8SFmode:
33508 case V8SImode:
33509 case V4DFmode:
33510 case V4DImode:
33511 case V4SFmode:
33512 case V4SImode:
33513 case V2DFmode:
33514 case V2DImode:
33515 n = GET_MODE_NUNITS (mode);
33516 for (i = 0; i < n; i++)
33517 ops[i] = XVECEXP (vals, 0, i);
33518 ix86_expand_vector_init_concat (mode, target, ops, n);
33519 return;
33521 case V32QImode:
33522 half_mode = V16QImode;
33523 goto half;
33525 case V16HImode:
33526 half_mode = V8HImode;
33527 goto half;
33529 half:
33530 n = GET_MODE_NUNITS (mode);
33531 for (i = 0; i < n; i++)
33532 ops[i] = XVECEXP (vals, 0, i);
33533 op0 = gen_reg_rtx (half_mode);
33534 op1 = gen_reg_rtx (half_mode);
33535 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33536 n >> 2);
33537 ix86_expand_vector_init_interleave (half_mode, op1,
33538 &ops [n >> 1], n >> 2);
33539 emit_insn (gen_rtx_SET (VOIDmode, target,
33540 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33541 return;
33543 case V16QImode:
33544 if (!TARGET_SSE4_1)
33545 break;
33546 /* FALLTHRU */
33548 case V8HImode:
33549 if (!TARGET_SSE2)
33550 break;
33552 /* Don't use ix86_expand_vector_init_interleave if we can't
33553 move from GPR to SSE register directly. */
33554 if (!TARGET_INTER_UNIT_MOVES)
33555 break;
33557 n = GET_MODE_NUNITS (mode);
33558 for (i = 0; i < n; i++)
33559 ops[i] = XVECEXP (vals, 0, i);
33560 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33561 return;
33563 case V4HImode:
33564 case V8QImode:
33565 break;
33567 default:
33568 gcc_unreachable ();
33572 int i, j, n_elts, n_words, n_elt_per_word;
33573 enum machine_mode inner_mode;
33574 rtx words[4], shift;
33576 inner_mode = GET_MODE_INNER (mode);
33577 n_elts = GET_MODE_NUNITS (mode);
33578 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33579 n_elt_per_word = n_elts / n_words;
33580 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33582 for (i = 0; i < n_words; ++i)
33584 rtx word = NULL_RTX;
33586 for (j = 0; j < n_elt_per_word; ++j)
33588 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33589 elt = convert_modes (word_mode, inner_mode, elt, true);
33591 if (j == 0)
33592 word = elt;
33593 else
33595 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33596 word, 1, OPTAB_LIB_WIDEN);
33597 word = expand_simple_binop (word_mode, IOR, word, elt,
33598 word, 1, OPTAB_LIB_WIDEN);
33602 words[i] = word;
33605 if (n_words == 1)
33606 emit_move_insn (target, gen_lowpart (mode, words[0]));
33607 else if (n_words == 2)
33609 rtx tmp = gen_reg_rtx (mode);
33610 emit_clobber (tmp);
33611 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33612 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33613 emit_move_insn (target, tmp);
33615 else if (n_words == 4)
33617 rtx tmp = gen_reg_rtx (V4SImode);
33618 gcc_assert (word_mode == SImode);
33619 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33620 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33621 emit_move_insn (target, gen_lowpart (mode, tmp));
33623 else
33624 gcc_unreachable ();
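/* Editor's note: a standalone C sketch of the generic fallback above in
   ix86_expand_vector_init_general, which packs several narrow elements into
   one word-sized integer with shifts and ORs, walking the elements from last
   to first so that element 0 ends up in the low bits.  Shown packing four
   16-bit elements into a 64-bit word; the helper name is hypothetical:

     #include <stdint.h>

     static uint64_t
     pack_word_sketch (const uint16_t elt[4])
     {
       uint64_t word = elt[3];             // start with the last element
       for (int j = 2; j >= 0; j--)
         word = (word << 16) | elt[j];     // shift up, OR in the next one
       return word;                        // element 0 now sits in bits 0..15
     }
*/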
33628 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33629 instructions unless MMX_OK is true. */
33631 void
33632 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33634 enum machine_mode mode = GET_MODE (target);
33635 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33636 int n_elts = GET_MODE_NUNITS (mode);
33637 int n_var = 0, one_var = -1;
33638 bool all_same = true, all_const_zero = true;
33639 int i;
33640 rtx x;
33642 for (i = 0; i < n_elts; ++i)
33644 x = XVECEXP (vals, 0, i);
33645 if (!(CONST_INT_P (x)
33646 || GET_CODE (x) == CONST_DOUBLE
33647 || GET_CODE (x) == CONST_FIXED))
33648 n_var++, one_var = i;
33649 else if (x != CONST0_RTX (inner_mode))
33650 all_const_zero = false;
33651 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33652 all_same = false;
33655 /* Constants are best loaded from the constant pool. */
33656 if (n_var == 0)
33658 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33659 return;
33662 /* If all values are identical, broadcast the value. */
33663 if (all_same
33664 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33665 XVECEXP (vals, 0, 0)))
33666 return;
33668 /* Values where only one field is non-constant are best loaded from
33669 the pool and overwritten via move later. */
33670 if (n_var == 1)
33672 if (all_const_zero
33673 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33674 XVECEXP (vals, 0, one_var),
33675 one_var))
33676 return;
33678 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33679 return;
33682 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33685 void
33686 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33688 enum machine_mode mode = GET_MODE (target);
33689 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33690 enum machine_mode half_mode;
33691 bool use_vec_merge = false;
33692 rtx tmp;
33693 static rtx (*gen_extract[6][2]) (rtx, rtx)
33695 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33696 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33697 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33698 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33699 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33700 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33702 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33704 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33705 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33706 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33707 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33708 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33709 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33711 int i, j, n;
33713 switch (mode)
33715 case V2SFmode:
33716 case V2SImode:
33717 if (mmx_ok)
33719 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33720 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33721 if (elt == 0)
33722 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33723 else
33724 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33725 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33726 return;
33728 break;
33730 case V2DImode:
33731 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33732 if (use_vec_merge)
33733 break;
33735 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33736 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33737 if (elt == 0)
33738 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33739 else
33740 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33741 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33742 return;
33744 case V2DFmode:
33746 rtx op0, op1;
33748 /* For the two element vectors, we implement a VEC_CONCAT with
33749 the extraction of the other element. */
33751 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33752 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33754 if (elt == 0)
33755 op0 = val, op1 = tmp;
33756 else
33757 op0 = tmp, op1 = val;
33759 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33760 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33762 return;
33764 case V4SFmode:
33765 use_vec_merge = TARGET_SSE4_1;
33766 if (use_vec_merge)
33767 break;
33769 switch (elt)
33771 case 0:
33772 use_vec_merge = true;
33773 break;
33775 case 1:
33776 /* tmp = target = A B C D */
33777 tmp = copy_to_reg (target);
33778 /* target = A A B B */
33779 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33780 /* target = X A B B */
33781 ix86_expand_vector_set (false, target, val, 0);
33782 /* target = A X C D */
33783 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33784 const1_rtx, const0_rtx,
33785 GEN_INT (2+4), GEN_INT (3+4)));
33786 return;
33788 case 2:
33789 /* tmp = target = A B C D */
33790 tmp = copy_to_reg (target);
33791 /* tmp = X B C D */
33792 ix86_expand_vector_set (false, tmp, val, 0);
33793 /* target = A B X D */
33794 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33795 const0_rtx, const1_rtx,
33796 GEN_INT (0+4), GEN_INT (3+4)));
33797 return;
33799 case 3:
33800 /* tmp = target = A B C D */
33801 tmp = copy_to_reg (target);
33802 /* tmp = X B C D */
33803 ix86_expand_vector_set (false, tmp, val, 0);
33805 /* target = A B C X */
33805 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33806 const0_rtx, const1_rtx,
33807 GEN_INT (2+4), GEN_INT (0+4)));
33808 return;
33810 default:
33811 gcc_unreachable ();
33813 break;
33815 case V4SImode:
33816 use_vec_merge = TARGET_SSE4_1;
33817 if (use_vec_merge)
33818 break;
33820 /* Element 0 handled by vec_merge below. */
33821 if (elt == 0)
33823 use_vec_merge = true;
33824 break;
33827 if (TARGET_SSE2)
33829 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33830 store into element 0, then shuffle them back. */
33832 rtx order[4];
33834 order[0] = GEN_INT (elt);
33835 order[1] = const1_rtx;
33836 order[2] = const2_rtx;
33837 order[3] = GEN_INT (3);
33838 order[elt] = const0_rtx;
33840 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33841 order[1], order[2], order[3]));
33843 ix86_expand_vector_set (false, target, val, 0);
33845 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33846 order[1], order[2], order[3]));
33848 else
33850 /* For SSE1, we have to reuse the V4SF code. */
33851 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33852 gen_lowpart (SFmode, val), elt);
33854 return;
33856 case V8HImode:
33857 use_vec_merge = TARGET_SSE2;
33858 break;
33859 case V4HImode:
33860 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33861 break;
33863 case V16QImode:
33864 use_vec_merge = TARGET_SSE4_1;
33865 break;
33867 case V8QImode:
33868 break;
33870 case V32QImode:
33871 half_mode = V16QImode;
33872 j = 0;
33873 n = 16;
33874 goto half;
33876 case V16HImode:
33877 half_mode = V8HImode;
33878 j = 1;
33879 n = 8;
33880 goto half;
33882 case V8SImode:
33883 half_mode = V4SImode;
33884 j = 2;
33885 n = 4;
33886 goto half;
33888 case V4DImode:
33889 half_mode = V2DImode;
33890 j = 3;
33891 n = 2;
33892 goto half;
33894 case V8SFmode:
33895 half_mode = V4SFmode;
33896 j = 4;
33897 n = 4;
33898 goto half;
33900 case V4DFmode:
33901 half_mode = V2DFmode;
33902 j = 5;
33903 n = 2;
33904 goto half;
33906 half:
33907 /* Compute offset. */
33908 i = elt / n;
33909 elt %= n;
33911 gcc_assert (i <= 1);
33913 /* Extract the half. */
33914 tmp = gen_reg_rtx (half_mode);
33915 emit_insn (gen_extract[j][i] (tmp, target));
33917 /* Put val in tmp at elt. */
33918 ix86_expand_vector_set (false, tmp, val, elt);
33920 /* Put it back. */
33921 emit_insn (gen_insert[j][i] (target, target, tmp));
33922 return;
33924 default:
33925 break;
33928 if (use_vec_merge)
33930 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33931 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33932 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33934 else
33936 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33938 emit_move_insn (mem, target);
33940 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33941 emit_move_insn (tmp, val);
33943 emit_move_insn (target, mem);
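/* Editor's note: the fallback in ix86_expand_vector_set above simply spills
   the whole vector to a stack temporary, overwrites the selected element in
   memory, and reloads the vector.  A standalone C sketch of the same idea;
   the type and function names are hypothetical:

     #include <string.h>

     typedef struct { float e[4]; } v4sf_sketch;

     static v4sf_sketch
     set_element_sketch (v4sf_sketch vec, float val, int elt)
     {
       float mem[4];
       memcpy (mem, vec.e, sizeof mem);   // store the vector to memory
       mem[elt] = val;                    // overwrite one element there
       memcpy (vec.e, mem, sizeof mem);   // reload the whole vector
       return vec;
     }
*/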
33947 void
33948 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33950 enum machine_mode mode = GET_MODE (vec);
33951 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33952 bool use_vec_extr = false;
33953 rtx tmp;
33955 switch (mode)
33957 case V2SImode:
33958 case V2SFmode:
33959 if (!mmx_ok)
33960 break;
33961 /* FALLTHRU */
33963 case V2DFmode:
33964 case V2DImode:
33965 use_vec_extr = true;
33966 break;
33968 case V4SFmode:
33969 use_vec_extr = TARGET_SSE4_1;
33970 if (use_vec_extr)
33971 break;
33973 switch (elt)
33975 case 0:
33976 tmp = vec;
33977 break;
33979 case 1:
33980 case 3:
33981 tmp = gen_reg_rtx (mode);
33982 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33983 GEN_INT (elt), GEN_INT (elt),
33984 GEN_INT (elt+4), GEN_INT (elt+4)));
33985 break;
33987 case 2:
33988 tmp = gen_reg_rtx (mode);
33989 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33990 break;
33992 default:
33993 gcc_unreachable ();
33995 vec = tmp;
33996 use_vec_extr = true;
33997 elt = 0;
33998 break;
34000 case V4SImode:
34001 use_vec_extr = TARGET_SSE4_1;
34002 if (use_vec_extr)
34003 break;
34005 if (TARGET_SSE2)
34007 switch (elt)
34009 case 0:
34010 tmp = vec;
34011 break;
34013 case 1:
34014 case 3:
34015 tmp = gen_reg_rtx (mode);
34016 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34017 GEN_INT (elt), GEN_INT (elt),
34018 GEN_INT (elt), GEN_INT (elt)));
34019 break;
34021 case 2:
34022 tmp = gen_reg_rtx (mode);
34023 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34024 break;
34026 default:
34027 gcc_unreachable ();
34029 vec = tmp;
34030 use_vec_extr = true;
34031 elt = 0;
34033 else
34035 /* For SSE1, we have to reuse the V4SF code. */
34036 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34037 gen_lowpart (V4SFmode, vec), elt);
34038 return;
34040 break;
34042 case V8HImode:
34043 use_vec_extr = TARGET_SSE2;
34044 break;
34045 case V4HImode:
34046 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34047 break;
34049 case V16QImode:
34050 use_vec_extr = TARGET_SSE4_1;
34051 break;
34053 case V8SFmode:
34054 if (TARGET_AVX)
34056 tmp = gen_reg_rtx (V4SFmode);
34057 if (elt < 4)
34058 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34059 else
34060 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34061 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34062 return;
34064 break;
34066 case V4DFmode:
34067 if (TARGET_AVX)
34069 tmp = gen_reg_rtx (V2DFmode);
34070 if (elt < 2)
34071 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34072 else
34073 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34074 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34075 return;
34077 break;
34079 case V32QImode:
34080 if (TARGET_AVX)
34082 tmp = gen_reg_rtx (V16QImode);
34083 if (elt < 16)
34084 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34085 else
34086 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34087 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34088 return;
34090 break;
34092 case V16HImode:
34093 if (TARGET_AVX)
34095 tmp = gen_reg_rtx (V8HImode);
34096 if (elt < 8)
34097 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34098 else
34099 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34100 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34101 return;
34103 break;
34105 case V8SImode:
34106 if (TARGET_AVX)
34108 tmp = gen_reg_rtx (V4SImode);
34109 if (elt < 4)
34110 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34111 else
34112 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34113 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34114 return;
34116 break;
34118 case V4DImode:
34119 if (TARGET_AVX)
34121 tmp = gen_reg_rtx (V2DImode);
34122 if (elt < 2)
34123 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34124 else
34125 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34126 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34127 return;
34129 break;
34131 case V8QImode:
34132 /* ??? Could extract the appropriate HImode element and shift. */
34133 default:
34134 break;
34137 if (use_vec_extr)
34139 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34140 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34142 /* Let the rtl optimizers know about the zero extension performed. */
34143 if (inner_mode == QImode || inner_mode == HImode)
34145 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34146 target = gen_lowpart (SImode, target);
34149 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34151 else
34153 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34155 emit_move_insn (mem, vec);
34157 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34158 emit_move_insn (target, tmp);
34162 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34163 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34164 The upper bits of DEST are undefined, though they shouldn't cause
34165 exceptions (some bits from src or all zeros are ok). */
34167 static void
34168 emit_reduc_half (rtx dest, rtx src, int i)
34170 rtx tem;
34171 switch (GET_MODE (src))
34173 case V4SFmode:
34174 if (i == 128)
34175 tem = gen_sse_movhlps (dest, src, src);
34176 else
34177 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34178 GEN_INT (1 + 4), GEN_INT (1 + 4));
34179 break;
34180 case V2DFmode:
34181 tem = gen_vec_interleave_highv2df (dest, src, src);
34182 break;
34183 case V16QImode:
34184 case V8HImode:
34185 case V4SImode:
34186 case V2DImode:
34187 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34188 gen_lowpart (V1TImode, src),
34189 GEN_INT (i / 2));
34190 break;
34191 case V8SFmode:
34192 if (i == 256)
34193 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34194 else
34195 tem = gen_avx_shufps256 (dest, src, src,
34196 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34197 break;
34198 case V4DFmode:
34199 if (i == 256)
34200 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34201 else
34202 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34203 break;
34204 case V32QImode:
34205 case V16HImode:
34206 case V8SImode:
34207 case V4DImode:
34208 if (i == 256)
34209 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34210 gen_lowpart (V4DImode, src),
34211 gen_lowpart (V4DImode, src),
34212 const1_rtx);
34213 else
34214 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34215 gen_lowpart (V2TImode, src),
34216 GEN_INT (i / 2));
34217 break;
34218 default:
34219 gcc_unreachable ();
34221 emit_insn (tem);
34224 /* Expand a vector reduction. FN is the binary pattern to reduce;
34225 DEST is the destination; IN is the input vector. */
34227 void
34228 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34230 rtx half, dst, vec = in;
34231 enum machine_mode mode = GET_MODE (in);
34232 int i;
34234 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34235 if (TARGET_SSE4_1
34236 && mode == V8HImode
34237 && fn == gen_uminv8hi3)
34239 emit_insn (gen_sse4_1_phminposuw (dest, in));
34240 return;
34243 for (i = GET_MODE_BITSIZE (mode);
34244 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34245 i >>= 1)
34247 half = gen_reg_rtx (mode);
34248 emit_reduc_half (half, vec, i);
34249 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34250 dst = dest;
34251 else
34252 dst = gen_reg_rtx (mode);
34253 emit_insn (fn (dst, half, vec));
34254 vec = dst;
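/* Editor's note: ix86_expand_reduc above performs a log2-depth tree
   reduction: at each step the upper half of the vector is shifted down next
   to the lower half and the two halves are combined with the reduction
   operation.  A standalone C sketch for a 4-element maximum, chosen only as
   an illustration; the helper name is hypothetical:

     static float
     reduce_max4_sketch (const float v[4])
     {
       float a[4] = { v[0], v[1], v[2], v[3] };
       // Step 1: fold the upper two lanes onto the lower two.
       a[0] = a[0] > a[2] ? a[0] : a[2];
       a[1] = a[1] > a[3] ? a[1] : a[3];
       // Step 2: fold the remaining pair; lane 0 now holds the result.
       a[0] = a[0] > a[1] ? a[0] : a[1];
       return a[0];
     }
*/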
34258 /* Target hook for scalar_mode_supported_p. */
34259 static bool
34260 ix86_scalar_mode_supported_p (enum machine_mode mode)
34262 if (DECIMAL_FLOAT_MODE_P (mode))
34263 return default_decimal_float_supported_p ();
34264 else if (mode == TFmode)
34265 return true;
34266 else
34267 return default_scalar_mode_supported_p (mode);
34270 /* Implements target hook vector_mode_supported_p. */
34271 static bool
34272 ix86_vector_mode_supported_p (enum machine_mode mode)
34274 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34275 return true;
34276 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34277 return true;
34278 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34279 return true;
34280 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34281 return true;
34282 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34283 return true;
34284 return false;
34287 /* Target hook for c_mode_for_suffix. */
34288 static enum machine_mode
34289 ix86_c_mode_for_suffix (char suffix)
34291 if (suffix == 'q')
34292 return TFmode;
34293 if (suffix == 'w')
34294 return XFmode;
34296 return VOIDmode;
34299 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34301 We do this in the new i386 backend to maintain source compatibility
34302 with the old cc0-based compiler. */
34304 static tree
34305 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34306 tree inputs ATTRIBUTE_UNUSED,
34307 tree clobbers)
34309 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34310 clobbers);
34311 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34312 clobbers);
34313 return clobbers;
34316 /* Implements target vector targetm.asm.encode_section_info. */
34318 static void ATTRIBUTE_UNUSED
34319 ix86_encode_section_info (tree decl, rtx rtl, int first)
34321 default_encode_section_info (decl, rtl, first);
34323 if (TREE_CODE (decl) == VAR_DECL
34324 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34325 && ix86_in_large_data_p (decl))
34326 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34329 /* Worker function for REVERSE_CONDITION. */
34331 enum rtx_code
34332 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34334 return (mode != CCFPmode && mode != CCFPUmode
34335 ? reverse_condition (code)
34336 : reverse_condition_maybe_unordered (code));
34339 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34340 to OPERANDS[0]. */
34342 const char *
34343 output_387_reg_move (rtx insn, rtx *operands)
34345 if (REG_P (operands[0]))
34347 if (REG_P (operands[1])
34348 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34350 if (REGNO (operands[0]) == FIRST_STACK_REG)
34351 return output_387_ffreep (operands, 0);
34352 return "fstp\t%y0";
34354 if (STACK_TOP_P (operands[0]))
34355 return "fld%Z1\t%y1";
34356 return "fst\t%y0";
34358 else if (MEM_P (operands[0]))
34360 gcc_assert (REG_P (operands[1]));
34361 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34362 return "fstp%Z0\t%y0";
34363 else
34365 /* There is no non-popping store to memory for XFmode.
34366 So if we need one, follow the store with a load. */
34367 if (GET_MODE (operands[0]) == XFmode)
34368 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34369 else
34370 return "fst%Z0\t%y0";
34373 else
34374 gcc_unreachable();
34377 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34378 the FP status register is set. */
34380 void
34381 ix86_emit_fp_unordered_jump (rtx label)
34383 rtx reg = gen_reg_rtx (HImode);
34384 rtx temp;
34386 emit_insn (gen_x86_fnstsw_1 (reg));
34388 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34390 emit_insn (gen_x86_sahf_1 (reg));
34392 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34393 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34395 else
34397 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34399 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34400 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34403 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34404 gen_rtx_LABEL_REF (VOIDmode, label),
34405 pc_rtx);
34406 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34408 emit_jump_insn (temp);
34409 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34412 /* Output code to perform a log1p XFmode calculation. */
34414 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34416 rtx label1 = gen_label_rtx ();
34417 rtx label2 = gen_label_rtx ();
34419 rtx tmp = gen_reg_rtx (XFmode);
34420 rtx tmp2 = gen_reg_rtx (XFmode);
34421 rtx test;
34423 emit_insn (gen_absxf2 (tmp, op1));
34424 test = gen_rtx_GE (VOIDmode, tmp,
34425 CONST_DOUBLE_FROM_REAL_VALUE (
34426 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34427 XFmode));
34428 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34430 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34431 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34432 emit_jump (label2);
34434 emit_label (label1);
34435 emit_move_insn (tmp, CONST1_RTX (XFmode));
34436 emit_insn (gen_addxf3 (tmp, op1, tmp));
34437 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34438 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34440 emit_label (label2);
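/* Editor's note: a standalone C sketch of the branch emitted above by
   ix86_emit_i387_log1p.  Both x87 paths compute ln(1 + x) as
   ln2 * log2 (1 + x); fyl2xp1 takes x itself and is more accurate for small
   |x| (the threshold 0.2928... is 1 - sqrt(2)/2), while fyl2x takes the
   already-formed 1 + x.  The helper name is hypothetical:

     #include <math.h>

     static long double
     log1p_sketch (long double x)
     {
       const long double ln2 = 0.6931471805599453094L;   // the fldln2 constant
       if (fabsl (x) < 0.29289321881345247561810596348408353L)
         return ln2 * log2l (1.0L + x);   // stands in for fyl2xp1 (x, ln2)
       return ln2 * log2l (1.0L + x);     // stands in for fyl2x (1 + x, ln2)
     }
*/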
34443 /* Emit code for round calculation. */
34444 void ix86_emit_i387_round (rtx op0, rtx op1)
34446 enum machine_mode inmode = GET_MODE (op1);
34447 enum machine_mode outmode = GET_MODE (op0);
34448 rtx e1, e2, res, tmp, tmp1, half;
34449 rtx scratch = gen_reg_rtx (HImode);
34450 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34451 rtx jump_label = gen_label_rtx ();
34452 rtx insn;
34453 rtx (*gen_abs) (rtx, rtx);
34454 rtx (*gen_neg) (rtx, rtx);
34456 switch (inmode)
34458 case SFmode:
34459 gen_abs = gen_abssf2;
34460 break;
34461 case DFmode:
34462 gen_abs = gen_absdf2;
34463 break;
34464 case XFmode:
34465 gen_abs = gen_absxf2;
34466 break;
34467 default:
34468 gcc_unreachable ();
34471 switch (outmode)
34473 case SFmode:
34474 gen_neg = gen_negsf2;
34475 break;
34476 case DFmode:
34477 gen_neg = gen_negdf2;
34478 break;
34479 case XFmode:
34480 gen_neg = gen_negxf2;
34481 break;
34482 case HImode:
34483 gen_neg = gen_neghi2;
34484 break;
34485 case SImode:
34486 gen_neg = gen_negsi2;
34487 break;
34488 case DImode:
34489 gen_neg = gen_negdi2;
34490 break;
34491 default:
34492 gcc_unreachable ();
34495 e1 = gen_reg_rtx (inmode);
34496 e2 = gen_reg_rtx (inmode);
34497 res = gen_reg_rtx (outmode);
34499 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34501 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34503 /* scratch = fxam(op1) */
34504 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34505 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34506 UNSPEC_FXAM)));
34507 /* e1 = fabs(op1) */
34508 emit_insn (gen_abs (e1, op1));
34510 /* e2 = e1 + 0.5 */
34511 half = force_reg (inmode, half);
34512 emit_insn (gen_rtx_SET (VOIDmode, e2,
34513 gen_rtx_PLUS (inmode, e1, half)));
34515 /* res = floor(e2) */
34516 if (inmode != XFmode)
34518 tmp1 = gen_reg_rtx (XFmode);
34520 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34521 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34523 else
34524 tmp1 = e2;
34526 switch (outmode)
34528 case SFmode:
34529 case DFmode:
34531 rtx tmp0 = gen_reg_rtx (XFmode);
34533 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34535 emit_insn (gen_rtx_SET (VOIDmode, res,
34536 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34537 UNSPEC_TRUNC_NOOP)));
34539 break;
34540 case XFmode:
34541 emit_insn (gen_frndintxf2_floor (res, tmp1));
34542 break;
34543 case HImode:
34544 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34545 break;
34546 case SImode:
34547 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34548 break;
34549 case DImode:
34550 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34551 break;
34552 default:
34553 gcc_unreachable ();
34556 /* flags = signbit(a) */
34557 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34559 /* if (flags) then res = -res */
34560 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34561 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34562 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34563 pc_rtx);
34564 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34565 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34566 JUMP_LABEL (insn) = jump_label;
34568 emit_insn (gen_neg (res, res));
34570 emit_label (jump_label);
34571 LABEL_NUSES (jump_label) = 1;
34573 emit_move_insn (op0, res);
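/* Editor's note: a standalone C sketch of the sequence emitted above by
   ix86_emit_i387_round, which implements round-half-away-from-zero as
   sgn (a) * floor (fabs (a) + 0.5), using fxam to recover the sign.  The
   helper name is hypothetical:

     #include <math.h>

     static double
     round_sketch (double a)
     {
       double r = floor (fabs (a) + 0.5);   // magnitude, rounded half away from zero
       return signbit (a) ? -r : r;         // the fxam sign-bit test and negate
     }
*/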
34576 /* Output code to perform a Newton-Raphson approximation of a single precision
34577 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34579 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34581 rtx x0, x1, e0, e1;
34583 x0 = gen_reg_rtx (mode);
34584 e0 = gen_reg_rtx (mode);
34585 e1 = gen_reg_rtx (mode);
34586 x1 = gen_reg_rtx (mode);
34588 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
34590 b = force_reg (mode, b);
34592 /* x0 = rcp(b) estimate */
34593 emit_insn (gen_rtx_SET (VOIDmode, x0,
34594 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34595 UNSPEC_RCP)));
34596 /* e0 = x0 * b */
34597 emit_insn (gen_rtx_SET (VOIDmode, e0,
34598 gen_rtx_MULT (mode, x0, b)));
34600 /* e0 = x0 * e0 */
34601 emit_insn (gen_rtx_SET (VOIDmode, e0,
34602 gen_rtx_MULT (mode, x0, e0)));
34604 /* e1 = x0 + x0 */
34605 emit_insn (gen_rtx_SET (VOIDmode, e1,
34606 gen_rtx_PLUS (mode, x0, x0)));
34608 /* x1 = e1 - e0 */
34609 emit_insn (gen_rtx_SET (VOIDmode, x1,
34610 gen_rtx_MINUS (mode, e1, e0)));
34612 /* res = a * x1 */
34613 emit_insn (gen_rtx_SET (VOIDmode, res,
34614 gen_rtx_MULT (mode, a, x1)));
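/* Editor's note: a standalone C sketch of the refinement above.  One
   Newton-Raphson step improves the hardware reciprocal estimate:
   x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, and the quotient is then a * x1.
   rcp_estimate_sketch stands in for the RCPSS/RCPPS approximation and is
   hypothetical:

     static float
     swdiv_sketch (float a, float b, float (*rcp_estimate_sketch) (float))
     {
       float x0 = rcp_estimate_sketch (b);   // low-precision reciprocal estimate
       float e0 = x0 * b;
       e0 = x0 * e0;                         // b * x0 * x0
       float e1 = x0 + x0;                   // 2 * x0
       float x1 = e1 - e0;                   // refined 1/b
       return a * x1;
     }
*/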
34617 /* Output code to perform a Newton-Raphson approximation of a
34618 single precision floating point [reciprocal] square root. */
34620 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34621 bool recip)
34623 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34624 REAL_VALUE_TYPE r;
34626 x0 = gen_reg_rtx (mode);
34627 e0 = gen_reg_rtx (mode);
34628 e1 = gen_reg_rtx (mode);
34629 e2 = gen_reg_rtx (mode);
34630 e3 = gen_reg_rtx (mode);
34632 real_from_integer (&r, VOIDmode, -3, -1, 0);
34633 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34635 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34636 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34638 if (VECTOR_MODE_P (mode))
34640 mthree = ix86_build_const_vector (mode, true, mthree);
34641 mhalf = ix86_build_const_vector (mode, true, mhalf);
34644 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34645 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
34647 a = force_reg (mode, a);
34649 /* x0 = rsqrt(a) estimate */
34650 emit_insn (gen_rtx_SET (VOIDmode, x0,
34651 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34652 UNSPEC_RSQRT)));
34654 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
34655 if (!recip)
34657 rtx zero, mask;
34659 zero = gen_reg_rtx (mode);
34660 mask = gen_reg_rtx (mode);
34662 zero = force_reg (mode, CONST0_RTX(mode));
34663 emit_insn (gen_rtx_SET (VOIDmode, mask,
34664 gen_rtx_NE (mode, zero, a)));
34666 emit_insn (gen_rtx_SET (VOIDmode, x0,
34667 gen_rtx_AND (mode, x0, mask)));
34670 /* e0 = x0 * a */
34671 emit_insn (gen_rtx_SET (VOIDmode, e0,
34672 gen_rtx_MULT (mode, x0, a)));
34673 /* e1 = e0 * x0 */
34674 emit_insn (gen_rtx_SET (VOIDmode, e1,
34675 gen_rtx_MULT (mode, e0, x0)));
34677 /* e2 = e1 - 3. */
34678 mthree = force_reg (mode, mthree);
34679 emit_insn (gen_rtx_SET (VOIDmode, e2,
34680 gen_rtx_PLUS (mode, e1, mthree)));
34682 mhalf = force_reg (mode, mhalf);
34683 if (recip)
34684 /* e3 = -.5 * x0 */
34685 emit_insn (gen_rtx_SET (VOIDmode, e3,
34686 gen_rtx_MULT (mode, x0, mhalf)));
34687 else
34688 /* e3 = -.5 * e0 */
34689 emit_insn (gen_rtx_SET (VOIDmode, e3,
34690 gen_rtx_MULT (mode, e0, mhalf)));
34691 /* ret = e2 * e3 */
34692 emit_insn (gen_rtx_SET (VOIDmode, res,
34693 gen_rtx_MULT (mode, e2, e3)));
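/* Editor's note: a standalone C sketch of the refinement above.  One
   Newton-Raphson step on the hardware reciprocal square root estimate gives
   rsqrt (a) ~= -0.5 * x0 * (a*x0*x0 - 3.0); using a*x0 instead of x0 in the
   final factor yields sqrt (a).  rsqrt_estimate_sketch stands in for
   RSQRTSS/RSQRTPS and is hypothetical:

     static float
     swsqrt_sketch (float a, int recip, float (*rsqrt_estimate_sketch) (float))
     {
       float x0 = rsqrt_estimate_sketch (a);
       if (!recip && a == 0.0f)
         x0 = 0.0f;                         // avoid inf * 0 = NaN for sqrt (0.0)
       float e0 = x0 * a;
       float e1 = e0 * x0;                  // a * x0 * x0
       float e2 = e1 - 3.0f;
       float e3 = (recip ? x0 : e0) * -0.5f;
       return e2 * e3;
     }
*/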
34696 #ifdef TARGET_SOLARIS
34697 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34699 static void
34700 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34701 tree decl)
34703 /* With Binutils 2.15, the "@unwind" marker must be specified on
34704 every occurrence of the ".eh_frame" section, not just the first
34705 one. */
34706 if (TARGET_64BIT
34707 && strcmp (name, ".eh_frame") == 0)
34709 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34710 flags & SECTION_WRITE ? "aw" : "a");
34711 return;
34714 #ifndef USE_GAS
34715 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34717 solaris_elf_asm_comdat_section (name, flags, decl);
34718 return;
34720 #endif
34722 default_elf_asm_named_section (name, flags, decl);
34724 #endif /* TARGET_SOLARIS */
34726 /* Return the mangling of TYPE if it is an extended fundamental type. */
34728 static const char *
34729 ix86_mangle_type (const_tree type)
34731 type = TYPE_MAIN_VARIANT (type);
34733 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34734 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34735 return NULL;
34737 switch (TYPE_MODE (type))
34739 case TFmode:
34740 /* __float128 is "g". */
34741 return "g";
34742 case XFmode:
34743 /* "long double" or __float80 is "e". */
34744 return "e";
34745 default:
34746 return NULL;
34750 /* For 32-bit code we can save PIC register setup by using the
34751 __stack_chk_fail_local hidden function instead of calling
34752 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34753 register, so it is better to call __stack_chk_fail directly. */
34755 static tree ATTRIBUTE_UNUSED
34756 ix86_stack_protect_fail (void)
34758 return TARGET_64BIT
34759 ? default_external_stack_protect_fail ()
34760 : default_hidden_stack_protect_fail ();
34763 /* Select a format to encode pointers in exception handling data. CODE
34764 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34765 true if the symbol may be affected by dynamic relocations.
34767 ??? All x86 object file formats are capable of representing this.
34768 After all, the relocation needed is the same as for the call insn.
34769 Whether or not a particular assembler allows us to enter such, I
34770 guess we'll have to see. */
34772 asm_preferred_eh_data_format (int code, int global)
34774 if (flag_pic)
34776 int type = DW_EH_PE_sdata8;
34777 if (!TARGET_64BIT
34778 || ix86_cmodel == CM_SMALL_PIC
34779 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34780 type = DW_EH_PE_sdata4;
34781 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34783 if (ix86_cmodel == CM_SMALL
34784 || (ix86_cmodel == CM_MEDIUM && code))
34785 return DW_EH_PE_udata4;
34786 return DW_EH_PE_absptr;
34789 /* Expand copysign from SIGN to the positive value ABS_VALUE
34790 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34791 the sign-bit. */
34792 static void
34793 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34795 enum machine_mode mode = GET_MODE (sign);
34796 rtx sgn = gen_reg_rtx (mode);
34797 if (mask == NULL_RTX)
34799 enum machine_mode vmode;
34801 if (mode == SFmode)
34802 vmode = V4SFmode;
34803 else if (mode == DFmode)
34804 vmode = V2DFmode;
34805 else
34806 vmode = mode;
34808 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34809 if (!VECTOR_MODE_P (mode))
34811 /* We need to generate a scalar mode mask in this case. */
34812 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34813 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34814 mask = gen_reg_rtx (mode);
34815 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34818 else
34819 mask = gen_rtx_NOT (mode, mask);
34820 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34821 gen_rtx_AND (mode, mask, sign)));
34822 emit_insn (gen_rtx_SET (VOIDmode, result,
34823 gen_rtx_IOR (mode, abs_value, sgn)));
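/* Editor's note: a standalone C sketch of the mask arithmetic above: extract
   just the sign bit of SIGN and OR it into the already-positive ABS_VALUE.
   Shown with 32-bit float bit manipulation; the helper name is hypothetical:

     #include <stdint.h>
     #include <string.h>

     static float
     copysign_to_positive_sketch (float abs_value, float sign)
     {
       uint32_t a, s;
       memcpy (&a, &abs_value, sizeof a);
       memcpy (&s, &sign, sizeof s);
       a |= s & 0x80000000u;                // OR in the sign bit of SIGN
       memcpy (&abs_value, &a, sizeof a);
       return abs_value;
     }
*/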
34826 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34827 mask for masking out the sign-bit is stored in *SMASK, if that is
34828 non-null. */
34829 static rtx
34830 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34832 enum machine_mode vmode, mode = GET_MODE (op0);
34833 rtx xa, mask;
34835 xa = gen_reg_rtx (mode);
34836 if (mode == SFmode)
34837 vmode = V4SFmode;
34838 else if (mode == DFmode)
34839 vmode = V2DFmode;
34840 else
34841 vmode = mode;
34842 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34843 if (!VECTOR_MODE_P (mode))
34845 /* We need to generate a scalar mode mask in this case. */
34846 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34847 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34848 mask = gen_reg_rtx (mode);
34849 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34851 emit_insn (gen_rtx_SET (VOIDmode, xa,
34852 gen_rtx_AND (mode, op0, mask)));
34854 if (smask)
34855 *smask = mask;
34857 return xa;
34860 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34861 swapping the operands if SWAP_OPERANDS is true. The expanded
34862 code is a forward jump to a newly created label in case the
34863 comparison is true. The generated label rtx is returned. */
34864 static rtx
34865 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34866 bool swap_operands)
34868 rtx label, tmp;
34870 if (swap_operands)
34872 tmp = op0;
34873 op0 = op1;
34874 op1 = tmp;
34877 label = gen_label_rtx ();
34878 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34879 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34880 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34881 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34882 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34883 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34884 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34885 JUMP_LABEL (tmp) = label;
34887 return label;
34890 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34891 using comparison code CODE. Operands are swapped for the comparison if
34892 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34893 static rtx
34894 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34895 bool swap_operands)
34897 rtx (*insn)(rtx, rtx, rtx, rtx);
34898 enum machine_mode mode = GET_MODE (op0);
34899 rtx mask = gen_reg_rtx (mode);
34901 if (swap_operands)
34903 rtx tmp = op0;
34904 op0 = op1;
34905 op1 = tmp;
34908 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34910 emit_insn (insn (mask, op0, op1,
34911 gen_rtx_fmt_ee (code, mode, op0, op1)));
34912 return mask;
34915 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34916 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34917 static rtx
34918 ix86_gen_TWO52 (enum machine_mode mode)
34920 REAL_VALUE_TYPE TWO52r;
34921 rtx TWO52;
34923 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34924 TWO52 = const_double_from_real_value (TWO52r, mode);
34925 TWO52 = force_reg (mode, TWO52);
34927 return TWO52;
34930 /* Expand SSE sequence for computing lround from OP1 storing
34931 into OP0. */
34932 void
34933 ix86_expand_lround (rtx op0, rtx op1)
34935 /* C code for the stuff we're doing below:
34936 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34937 return (long)tmp;
34939 enum machine_mode mode = GET_MODE (op1);
34940 const struct real_format *fmt;
34941 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34942 rtx adj;
34944 /* load nextafter (0.5, 0.0) */
34945 fmt = REAL_MODE_FORMAT (mode);
34946 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34947 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34949 /* adj = copysign (0.5, op1) */
34950 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34951 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34953 /* adj = op1 + adj */
34954 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34956 /* op0 = (imode)adj */
34957 expand_fix (op0, adj, 0);
34960 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
34961 storing into OPERAND0. */
34962 void
34963 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34965 /* C code for the stuff we're doing below (for do_floor):
34966 xi = (long)op1;
34967 xi -= (double)xi > op1 ? 1 : 0;
34968 return xi;
34970 enum machine_mode fmode = GET_MODE (op1);
34971 enum machine_mode imode = GET_MODE (op0);
34972 rtx ireg, freg, label, tmp;
34974 /* reg = (long)op1 */
34975 ireg = gen_reg_rtx (imode);
34976 expand_fix (ireg, op1, 0);
34978 /* freg = (double)reg */
34979 freg = gen_reg_rtx (fmode);
34980 expand_float (freg, ireg, 0);
34982 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34983 label = ix86_expand_sse_compare_and_jump (UNLE,
34984 freg, op1, !do_floor);
34985 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34986 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34987 emit_move_insn (ireg, tmp);
34989 emit_label (label);
34990 LABEL_NUSES (label) = 1;
34992 emit_move_insn (op0, ireg);
34995 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34996 result in OPERAND0. */
34997 void
34998 ix86_expand_rint (rtx operand0, rtx operand1)
35000 /* C code for the stuff we're doing below:
35001 xa = fabs (operand1);
35002 if (!isless (xa, 2**52))
35003 return operand1;
35004 xa = xa + 2**52 - 2**52;
35005 return copysign (xa, operand1);
35007 enum machine_mode mode = GET_MODE (operand0);
35008 rtx res, xa, label, TWO52, mask;
35010 res = gen_reg_rtx (mode);
35011 emit_move_insn (res, operand1);
35013 /* xa = abs (operand1) */
35014 xa = ix86_expand_sse_fabs (res, &mask);
35016 /* if (!isless (xa, TWO52)) goto label; */
35017 TWO52 = ix86_gen_TWO52 (mode);
35018 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35020 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35021 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35023 ix86_sse_copysign_to_positive (res, xa, res, mask);
35025 emit_label (label);
35026 LABEL_NUSES (label) = 1;
35028 emit_move_insn (operand0, res);
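/* Editor's note: a standalone C sketch of the rounding trick above.  Every
   double with magnitude >= 2**52 is already an integer, and for smaller
   magnitudes adding and then subtracting 2**52 forces the FPU to round the
   value to an integer in the current rounding mode.  The helper name is
   hypothetical; "volatile" keeps a compiler from folding the add/subtract
   pair away:

     #include <math.h>

     static double
     rint_sketch (double x)
     {
       const double two52 = 4503599627370496.0;   // 2**52
       double xa = fabs (x);
       if (!(xa < two52))
         return x;                                // already integral (or NaN)
       volatile double t = xa + two52;
       xa = t - two52;                            // rounded magnitude
       return copysign (xa, x);                   // restore the original sign
     }
*/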
35031 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35032 into OPERAND0. */
35033 void
35034 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35036 /* C code for the stuff we expand below.
35037 double xa = fabs (x), x2;
35038 if (!isless (xa, TWO52))
35039 return x;
35040 xa = xa + TWO52 - TWO52;
35041 x2 = copysign (xa, x);
35042 Compensate. Floor:
35043 if (x2 > x)
35044 x2 -= 1;
35045 Compensate. Ceil:
35046 if (x2 < x)
35047 x2 -= -1;
35048 return x2;
35050 enum machine_mode mode = GET_MODE (operand0);
35051 rtx xa, TWO52, tmp, label, one, res, mask;
35053 TWO52 = ix86_gen_TWO52 (mode);
35055 /* Temporary for holding the result, initialized to the input
35056 operand to ease control flow. */
35057 res = gen_reg_rtx (mode);
35058 emit_move_insn (res, operand1);
35060 /* xa = abs (operand1) */
35061 xa = ix86_expand_sse_fabs (res, &mask);
35063 /* if (!isless (xa, TWO52)) goto label; */
35064 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35066 /* xa = xa + TWO52 - TWO52; */
35067 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35068 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35070 /* xa = copysign (xa, operand1) */
35071 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35073 /* generate 1.0 or -1.0 */
35074 one = force_reg (mode,
35075 const_double_from_real_value (do_floor
35076 ? dconst1 : dconstm1, mode));
35078 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35079 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35080 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35081 gen_rtx_AND (mode, one, tmp)));
35082 /* We always need to subtract here to preserve signed zero. */
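/* For ceil the constant in ONE above is -1.0, so the compensation is
   still a subtraction.  When no adjustment is needed the masked value
   is +0.0 and we compute x2 - 0.0 rather than x2 + 0.0; only the
   former leaves a -0.0 result unchanged.  */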
35083 tmp = expand_simple_binop (mode, MINUS,
35084 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35085 emit_move_insn (res, tmp);
35087 emit_label (label);
35088 LABEL_NUSES (label) = 1;
35090 emit_move_insn (operand0, res);
35093 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35094 into OPERAND0. */
35095 void
35096 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35098 /* C code for the stuff we expand below.
35099 double xa = fabs (x), x2;
35100 if (!isless (xa, TWO52))
35101 return x;
35102 x2 = (double)(long)x;
35103 Compensate. Floor:
35104 if (x2 > x)
35105 x2 -= 1;
35106 Compensate. Ceil:
35107 if (x2 < x)
35108 x2 += 1;
35109 if (HONOR_SIGNED_ZEROS (mode))
35110 return copysign (x2, x);
35111 return x2;
35113 enum machine_mode mode = GET_MODE (operand0);
35114 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35116 TWO52 = ix86_gen_TWO52 (mode);
35118 /* Temporary for holding the result, initialized to the input
35119 operand to ease control flow. */
35120 res = gen_reg_rtx (mode);
35121 emit_move_insn (res, operand1);
35123 /* xa = abs (operand1) */
35124 xa = ix86_expand_sse_fabs (res, &mask);
35126 /* if (!isless (xa, TWO52)) goto label; */
35127 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35129 /* xa = (double)(long)x */
35130 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35131 expand_fix (xi, res, 0);
35132 expand_float (xa, xi, 0);
35134 /* generate 1.0 */
35135 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35137 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35138 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35139 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35140 gen_rtx_AND (mode, one, tmp)));
35141 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35142 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35143 emit_move_insn (res, tmp);
35145 if (HONOR_SIGNED_ZEROS (mode))
35146 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35148 emit_label (label);
35149 LABEL_NUSES (label) = 1;
35151 emit_move_insn (operand0, res);
35154 /* Expand SSE sequence for computing round from OPERAND1 storing
35155 into OPERAND0. The sequence works without relying on DImode truncation
35156 via cvttsd2siq, which is only available on 64-bit targets. */
35157 void
35158 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35160 /* C code for the stuff we expand below.
35161 double xa = fabs (x), xa2, x2;
35162 if (!isless (xa, TWO52))
35163 return x;
35164 Using the absolute value and copying back sign makes
35165 -0.0 -> -0.0 correct.
35166 xa2 = xa + TWO52 - TWO52;
35167 Compensate.
35168 dxa = xa2 - xa;
35169 if (dxa <= -0.5)
35170 xa2 += 1;
35171 else if (dxa > 0.5)
35172 xa2 -= 1;
35173 x2 = copysign (xa2, x);
35174 return x2;
35176 enum machine_mode mode = GET_MODE (operand0);
35177 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35179 TWO52 = ix86_gen_TWO52 (mode);
35181 /* Temporary for holding the result, initialized to the input
35182 operand to ease control flow. */
35183 res = gen_reg_rtx (mode);
35184 emit_move_insn (res, operand1);
35186 /* xa = abs (operand1) */
35187 xa = ix86_expand_sse_fabs (res, &mask);
35189 /* if (!isless (xa, TWO52)) goto label; */
35190 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35192 /* xa2 = xa + TWO52 - TWO52; */
35193 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35194 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35196 /* dxa = xa2 - xa; */
35197 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35199 /* generate 0.5, 1.0 and -0.5 */
35200 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35201 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35202 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35203 0, OPTAB_DIRECT);
35205 /* Compensate. */
35206 tmp = gen_reg_rtx (mode);
35207 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35208 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35209 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35210 gen_rtx_AND (mode, one, tmp)));
35211 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35212 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35213 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35214 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35215 gen_rtx_AND (mode, one, tmp)));
35216 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35218 /* res = copysign (xa2, operand1) */
35219 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35221 emit_label (label);
35222 LABEL_NUSES (label) = 1;
35224 emit_move_insn (operand0, res);
35227 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35228 into OPERAND0. */
35229 void
35230 ix86_expand_trunc (rtx operand0, rtx operand1)
35232 /* C code for SSE variant we expand below.
35233 double xa = fabs (x), x2;
35234 if (!isless (xa, TWO52))
35235 return x;
35236 x2 = (double)(long)x;
35237 if (HONOR_SIGNED_ZEROS (mode))
35238 return copysign (x2, x);
35239 return x2;
35241 enum machine_mode mode = GET_MODE (operand0);
35242 rtx xa, xi, TWO52, label, res, mask;
35244 TWO52 = ix86_gen_TWO52 (mode);
35246 /* Temporary for holding the result, initialized to the input
35247 operand to ease control flow. */
35248 res = gen_reg_rtx (mode);
35249 emit_move_insn (res, operand1);
35251 /* xa = abs (operand1) */
35252 xa = ix86_expand_sse_fabs (res, &mask);
35254 /* if (!isless (xa, TWO52)) goto label; */
35255 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35257 /* x = (double)(long)x */
35258 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35259 expand_fix (xi, res, 0);
35260 expand_float (res, xi, 0);
35262 if (HONOR_SIGNED_ZEROS (mode))
35263 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35265 emit_label (label);
35266 LABEL_NUSES (label) = 1;
35268 emit_move_insn (operand0, res);
35271 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35272 into OPERAND0. */
35273 void
35274 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35276 enum machine_mode mode = GET_MODE (operand0);
35277 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35279 /* C code for SSE variant we expand below.
35280 double xa = fabs (x), xa2, x2;
35281 if (!isless (xa, TWO52))
35282 return x;
35283 xa2 = xa + TWO52 - TWO52;
35284 Compensate:
35285 if (xa2 > xa)
35286 xa2 -= 1.0;
35287 x2 = copysign (xa2, x);
35288 return x2;
35291 TWO52 = ix86_gen_TWO52 (mode);
35293 /* Temporary for holding the result, initialized to the input
35294 operand to ease control flow. */
35295 res = gen_reg_rtx (mode);
35296 emit_move_insn (res, operand1);
35298 /* xa = abs (operand1) */
35299 xa = ix86_expand_sse_fabs (res, &smask);
35301 /* if (!isless (xa, TWO52)) goto label; */
35302 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35304 /* res = xa + TWO52 - TWO52; */
35305 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35306 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35307 emit_move_insn (res, tmp);
35309 /* generate 1.0 */
35310 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35312 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35313 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35314 emit_insn (gen_rtx_SET (VOIDmode, mask,
35315 gen_rtx_AND (mode, mask, one)));
35316 tmp = expand_simple_binop (mode, MINUS,
35317 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35318 emit_move_insn (res, tmp);
35320 /* res = copysign (res, operand1) */
35321 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35323 emit_label (label);
35324 LABEL_NUSES (label) = 1;
35326 emit_move_insn (operand0, res);
35329 /* Expand SSE sequence for computing round from OPERAND1 storing
35330 into OPERAND0. */
35331 void
35332 ix86_expand_round (rtx operand0, rtx operand1)
35334 /* C code for the stuff we're doing below:
35335 double xa = fabs (x);
35336 if (!isless (xa, TWO52))
35337 return x;
35338 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35339 return copysign (xa, x);
35341 enum machine_mode mode = GET_MODE (operand0);
35342 rtx res, TWO52, xa, label, xi, half, mask;
35343 const struct real_format *fmt;
35344 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35346 /* Temporary for holding the result, initialized to the input
35347 operand to ease control flow. */
35348 res = gen_reg_rtx (mode);
35349 emit_move_insn (res, operand1);
35351 TWO52 = ix86_gen_TWO52 (mode);
35352 xa = ix86_expand_sse_fabs (res, &mask);
35353 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35355 /* load nextafter (0.5, 0.0) */
35356 fmt = REAL_MODE_FORMAT (mode);
35357 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35358 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
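/* PRED_HALF is the largest representable value below 0.5, i.e.
   0.5 - 2**(-p-1) for precision p.  Adding a full 0.5 would misround
   the largest value below 0.5: for DFmode, 0.49999999999999994 + 0.5
   rounds up to 1.0 and truncation would then yield 1 instead of 0.
   With PRED_HALF the sum stays below 1.0 and truncation yields 0.  */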
35360 /* xa = xa + 0.5 */
35361 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35362 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35364 /* xa = (double)(int64_t)xa */
35365 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35366 expand_fix (xi, xa, 0);
35367 expand_float (xa, xi, 0);
35369 /* res = copysign (xa, operand1) */
35370 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35372 emit_label (label);
35373 LABEL_NUSES (label) = 1;
35375 emit_move_insn (operand0, res);
35378 /* Expand SSE sequence for computing round
35379 from OP1 storing into OP0 using sse4 round insn. */
35380 void
35381 ix86_expand_round_sse4 (rtx op0, rtx op1)
35383 enum machine_mode mode = GET_MODE (op0);
35384 rtx e1, e2, res, half;
35385 const struct real_format *fmt;
35386 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35387 rtx (*gen_copysign) (rtx, rtx, rtx);
35388 rtx (*gen_round) (rtx, rtx, rtx);
35390 switch (mode)
35392 case SFmode:
35393 gen_copysign = gen_copysignsf3;
35394 gen_round = gen_sse4_1_roundsf2;
35395 break;
35396 case DFmode:
35397 gen_copysign = gen_copysigndf3;
35398 gen_round = gen_sse4_1_rounddf2;
35399 break;
35400 default:
35401 gcc_unreachable ();
35404 /* round (a) = trunc (a + copysign (0.5, a)) */
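/* E.g. round (-2.5) = trunc (-2.5 + copysign (0.5, -2.5))
   = trunc (-3.0) = -3, so halfway cases round away from zero.  As in
   ix86_expand_round above, the constant actually loaded below is the
   predecessor of 0.5, so that values just below 0.5 in magnitude are
   not dragged past the next integer by rounding in the addition.  */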
35406 /* load nextafter (0.5, 0.0) */
35407 fmt = REAL_MODE_FORMAT (mode);
35408 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35409 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35410 half = const_double_from_real_value (pred_half, mode);
35412 /* e1 = copysign (0.5, op1) */
35413 e1 = gen_reg_rtx (mode);
35414 emit_insn (gen_copysign (e1, half, op1));
35416 /* e2 = op1 + e1 */
35417 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35419 /* res = trunc (e2) */
35420 res = gen_reg_rtx (mode);
35421 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35423 emit_move_insn (op0, res);
35427 /* Table of valid machine attributes. */
35428 static const struct attribute_spec ix86_attribute_table[] =
35430 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35431 affects_type_identity } */
35432 /* Stdcall attribute says callee is responsible for popping arguments
35433 if they are not variable. */
35434 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35435 true },
35436 /* Fastcall attribute says callee is responsible for popping arguments
35437 if they are not variable. */
35438 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35439 true },
35440 /* Thiscall attribute says callee is responsible for popping arguments
35441 if they are not variable. */
35442 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35443 true },
35444 /* Cdecl attribute says the callee is a normal C declaration */
35445 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35446 true },
35447 /* Regparm attribute specifies how many integer arguments are to be
35448 passed in registers. */
35449 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35450 true },
35451 /* Sseregparm attribute says we are using x86_64 calling conventions
35452 for FP arguments. */
35453 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35454 true },
35455 /* The transactional memory builtins are implicitly regparm or fastcall
35456 depending on the ABI. Override the generic do-nothing attribute that
35457 these builtins were declared with. */
35458 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35459 true },
35460 /* force_align_arg_pointer says this function realigns the stack at entry. */
35461 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35462 false, true, true, ix86_handle_cconv_attribute, false },
35463 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35464 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35465 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35466 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35467 false },
35468 #endif
35469 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35470 false },
35471 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35472 false },
35473 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35474 SUBTARGET_ATTRIBUTE_TABLE,
35475 #endif
35476 /* ms_abi and sysv_abi calling convention function attributes. */
35477 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35478 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35479 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35480 false },
35481 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35482 ix86_handle_callee_pop_aggregate_return, true },
35483 /* End element. */
35484 { NULL, 0, 0, false, false, false, NULL, false }
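/* Illustrative uses of the attributes above:
     int __attribute__ ((regparm (3))) f (int, int, int);
	passes the first three integer arguments in registers;
     int __attribute__ ((stdcall)) g (int);
	makes the callee pop its arguments;
     int __attribute__ ((ms_abi)) h (void);
	uses the Microsoft x86-64 calling convention.  */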
35487 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35488 static int
35489 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35490 tree vectype ATTRIBUTE_UNUSED,
35491 int misalign ATTRIBUTE_UNUSED)
35493 switch (type_of_cost)
35495 case scalar_stmt:
35496 return ix86_cost->scalar_stmt_cost;
35498 case scalar_load:
35499 return ix86_cost->scalar_load_cost;
35501 case scalar_store:
35502 return ix86_cost->scalar_store_cost;
35504 case vector_stmt:
35505 return ix86_cost->vec_stmt_cost;
35507 case vector_load:
35508 return ix86_cost->vec_align_load_cost;
35510 case vector_store:
35511 return ix86_cost->vec_store_cost;
35513 case vec_to_scalar:
35514 return ix86_cost->vec_to_scalar_cost;
35516 case scalar_to_vec:
35517 return ix86_cost->scalar_to_vec_cost;
35519 case unaligned_load:
35520 case unaligned_store:
35521 return ix86_cost->vec_unalign_load_cost;
35523 case cond_branch_taken:
35524 return ix86_cost->cond_taken_branch_cost;
35526 case cond_branch_not_taken:
35527 return ix86_cost->cond_not_taken_branch_cost;
35529 case vec_perm:
35530 case vec_promote_demote:
35531 return ix86_cost->vec_stmt_cost;
35533 default:
35534 gcc_unreachable ();
35538 /* Construct (set target (vec_select op0 (parallel perm))) and
35539 return true if that's a valid instruction in the active ISA. */
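/* For example, with V4SImode and PERM = { 2, 3, 0, 1 } this emits
     (set target (vec_select:V4SI op0 (parallel [2 3 0 1])))
   which recog can match as an SSE2 pshufd with immediate 0x4e; if no
   pattern matches, the tentative insn is removed again and false is
   returned.  */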
35541 static bool
35542 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35544 rtx rperm[MAX_VECT_LEN], x;
35545 unsigned i;
35547 for (i = 0; i < nelt; ++i)
35548 rperm[i] = GEN_INT (perm[i]);
35550 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35551 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35552 x = gen_rtx_SET (VOIDmode, target, x);
35554 x = emit_insn (x);
35555 if (recog_memoized (x) < 0)
35557 remove_insn (x);
35558 return false;
35560 return true;
35563 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35565 static bool
35566 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35567 const unsigned char *perm, unsigned nelt)
35569 enum machine_mode v2mode;
35570 rtx x;
35572 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35573 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35574 return expand_vselect (target, x, perm, nelt);
35577 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35578 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35580 static bool
35581 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35583 enum machine_mode vmode = d->vmode;
35584 unsigned i, mask, nelt = d->nelt;
35585 rtx target, op0, op1, x;
35586 rtx rperm[32], vperm;
35588 if (d->op0 == d->op1)
35589 return false;
35590 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35592 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35594 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35596 else
35597 return false;
35599 /* This is a blend, not a permute. Elements must stay in their
35600 respective lanes. */
35601 for (i = 0; i < nelt; ++i)
35603 unsigned e = d->perm[i];
35604 if (!(e == i || e == i + nelt))
35605 return false;
35608 if (d->testing_p)
35609 return true;
35611 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35612 decision should be extracted elsewhere, so that we only try that
35613 sequence once all budget==3 options have been tried. */
35614 target = d->target;
35615 op0 = d->op0;
35616 op1 = d->op1;
35617 mask = 0;
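/* In the immediates built below, bit I of MASK set means element I of
   the result is taken from OP1; a clear bit keeps the corresponding
   element of OP0.  Modes without a blend instruction at their own
   element width are retargeted to a neighbouring width via
   do_subreg.  */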
35619 switch (vmode)
35621 case V4DFmode:
35622 case V8SFmode:
35623 case V2DFmode:
35624 case V4SFmode:
35625 case V8HImode:
35626 case V8SImode:
35627 for (i = 0; i < nelt; ++i)
35628 mask |= (d->perm[i] >= nelt) << i;
35629 break;
35631 case V2DImode:
35632 for (i = 0; i < 2; ++i)
35633 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35634 vmode = V8HImode;
35635 goto do_subreg;
35637 case V4SImode:
35638 for (i = 0; i < 4; ++i)
35639 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35640 vmode = V8HImode;
35641 goto do_subreg;
35643 case V16QImode:
35644 /* See if bytes move in pairs so we can use pblendw with
35645 an immediate argument, rather than pblendvb with a vector
35646 argument. */
35647 for (i = 0; i < 16; i += 2)
35648 if (d->perm[i] + 1 != d->perm[i + 1])
35650 use_pblendvb:
35651 for (i = 0; i < nelt; ++i)
35652 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35654 finish_pblendvb:
35655 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35656 vperm = force_reg (vmode, vperm);
35658 if (GET_MODE_SIZE (vmode) == 16)
35659 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35660 else
35661 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35662 return true;
35665 for (i = 0; i < 8; ++i)
35666 mask |= (d->perm[i * 2] >= 16) << i;
35667 vmode = V8HImode;
35668 /* FALLTHRU */
35670 do_subreg:
35671 target = gen_lowpart (vmode, target);
35672 op0 = gen_lowpart (vmode, op0);
35673 op1 = gen_lowpart (vmode, op1);
35674 break;
35676 case V32QImode:
35677 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35678 for (i = 0; i < 32; i += 2)
35679 if (d->perm[i] + 1 != d->perm[i + 1])
35680 goto use_pblendvb;
35681 /* See if bytes move in quadruplets. If yes, vpblendd
35682 with immediate can be used. */
35683 for (i = 0; i < 32; i += 4)
35684 if (d->perm[i] + 2 != d->perm[i + 2])
35685 break;
35686 if (i < 32)
35688 /* See if bytes move the same in both lanes. If yes,
35689 vpblendw with immediate can be used. */
35690 for (i = 0; i < 16; i += 2)
35691 if (d->perm[i] + 16 != d->perm[i + 16])
35692 goto use_pblendvb;
35694 /* Use vpblendw. */
35695 for (i = 0; i < 16; ++i)
35696 mask |= (d->perm[i * 2] >= 32) << i;
35697 vmode = V16HImode;
35698 goto do_subreg;
35701 /* Use vpblendd. */
35702 for (i = 0; i < 8; ++i)
35703 mask |= (d->perm[i * 4] >= 32) << i;
35704 vmode = V8SImode;
35705 goto do_subreg;
35707 case V16HImode:
35708 /* See if words move in pairs. If yes, vpblendd can be used. */
35709 for (i = 0; i < 16; i += 2)
35710 if (d->perm[i] + 1 != d->perm[i + 1])
35711 break;
35712 if (i < 16)
35714 /* See if words move the same in both lanes. If not,
35715 vpblendvb must be used. */
35716 for (i = 0; i < 8; i++)
35717 if (d->perm[i] + 8 != d->perm[i + 8])
35719 /* Use vpblendvb. */
35720 for (i = 0; i < 32; ++i)
35721 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35723 vmode = V32QImode;
35724 nelt = 32;
35725 target = gen_lowpart (vmode, target);
35726 op0 = gen_lowpart (vmode, op0);
35727 op1 = gen_lowpart (vmode, op1);
35728 goto finish_pblendvb;
35731 /* Use vpblendw. */
35732 for (i = 0; i < 16; ++i)
35733 mask |= (d->perm[i] >= 16) << i;
35734 break;
35737 /* Use vpblendd. */
35738 for (i = 0; i < 8; ++i)
35739 mask |= (d->perm[i * 2] >= 16) << i;
35740 vmode = V8SImode;
35741 goto do_subreg;
35743 case V4DImode:
35744 /* Use vpblendd. */
35745 for (i = 0; i < 4; ++i)
35746 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35747 vmode = V8SImode;
35748 goto do_subreg;
35750 default:
35751 gcc_unreachable ();
35754 /* This matches five different patterns with the different modes. */
35755 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35756 x = gen_rtx_SET (VOIDmode, target, x);
35757 emit_insn (x);
35759 return true;
35762 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35763 in terms of the variable form of vpermilps.
35765 Note that we will have already failed the immediate input vpermilps,
35766 which requires that the high and low part shuffle be identical; the
35767 variable form doesn't require that. */
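/* In the variable form each control element selects a float from
   within the same 128-bit lane of the single source (only the low two
   bits of each control dword matter), so the loop below renumbers the
   two-operand indices accordingly, relying on op0 and op1 being the
   same register.  */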
35769 static bool
35770 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35772 rtx rperm[8], vperm;
35773 unsigned i;
35775 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35776 return false;
35778 /* We can only permute within the 128-bit lane. */
35779 for (i = 0; i < 8; ++i)
35781 unsigned e = d->perm[i];
35782 if (i < 4 ? e >= 4 : e < 4)
35783 return false;
35786 if (d->testing_p)
35787 return true;
35789 for (i = 0; i < 8; ++i)
35791 unsigned e = d->perm[i];
35793 /* Within each 128-bit lane, the elements of op0 are numbered
35794 from 0 and the elements of op1 are numbered from 4. */
35795 if (e >= 8 + 4)
35796 e -= 8;
35797 else if (e >= 4)
35798 e -= 4;
35800 rperm[i] = GEN_INT (e);
35803 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35804 vperm = force_reg (V8SImode, vperm);
35805 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35807 return true;
35810 /* Return true if permutation D can be performed as VMODE permutation
35811 instead. */
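/* For example, a V32QImode permutation that moves bytes in aligned,
   consecutive groups of eight is also expressible as a V4DImode
   permutation of the same registers, which the callers can then map
   onto vpermq or vperm2i128.  */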
35813 static bool
35814 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35816 unsigned int i, j, chunk;
35818 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35819 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35820 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35821 return false;
35823 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35824 return true;
35826 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35827 for (i = 0; i < d->nelt; i += chunk)
35828 if (d->perm[i] & (chunk - 1))
35829 return false;
35830 else
35831 for (j = 1; j < chunk; ++j)
35832 if (d->perm[i] + j != d->perm[i + j])
35833 return false;
35835 return true;
35838 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35839 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
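/* pshufb fills each destination byte with the source byte selected by
   the index in the corresponding control byte, or with zero when bit 7
   of the control byte is set; the 256-bit vpshufb applies that rule
   separately within each 128-bit lane.  XOP's vpperm is the two-source
   generalisation used below when d->op0 != d->op1.  */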
35841 static bool
35842 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35844 unsigned i, nelt, eltsz, mask;
35845 unsigned char perm[32];
35846 enum machine_mode vmode = V16QImode;
35847 rtx rperm[32], vperm, target, op0, op1;
35849 nelt = d->nelt;
35851 if (d->op0 != d->op1)
35853 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35855 if (TARGET_AVX2
35856 && valid_perm_using_mode_p (V2TImode, d))
35858 if (d->testing_p)
35859 return true;
35861 /* Use vperm2i128 insn. The pattern uses
35862 V4DImode instead of V2TImode. */
35863 target = gen_lowpart (V4DImode, d->target);
35864 op0 = gen_lowpart (V4DImode, d->op0);
35865 op1 = gen_lowpart (V4DImode, d->op1);
35866 rperm[0]
35867 = GEN_INT ((d->perm[0] / (nelt / 2))
35868 | ((d->perm[nelt / 2] / (nelt / 2)) << 4));
35869 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35870 return true;
35872 return false;
35875 else
35877 if (GET_MODE_SIZE (d->vmode) == 16)
35879 if (!TARGET_SSSE3)
35880 return false;
35882 else if (GET_MODE_SIZE (d->vmode) == 32)
35884 if (!TARGET_AVX2)
35885 return false;
35887 /* V4DImode should be already handled through
35888 expand_vselect by vpermq instruction. */
35889 gcc_assert (d->vmode != V4DImode);
35891 vmode = V32QImode;
35892 if (d->vmode == V8SImode
35893 || d->vmode == V16HImode
35894 || d->vmode == V32QImode)
35896 /* First see if vpermq can be used for
35897 V8SImode/V16HImode/V32QImode. */
35898 if (valid_perm_using_mode_p (V4DImode, d))
35900 for (i = 0; i < 4; i++)
35901 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35902 if (d->testing_p)
35903 return true;
35904 return expand_vselect (gen_lowpart (V4DImode, d->target),
35905 gen_lowpart (V4DImode, d->op0),
35906 perm, 4);
35909 /* Next see if vpermd can be used. */
35910 if (valid_perm_using_mode_p (V8SImode, d))
35911 vmode = V8SImode;
35914 if (vmode == V32QImode)
35916 /* vpshufb only works within 128-bit lanes; it is not
35917 possible to shuffle bytes in between the lanes. */
35918 for (i = 0; i < nelt; ++i)
35919 if ((d->perm[i] ^ i) & (nelt / 2))
35920 return false;
35923 else
35924 return false;
35927 if (d->testing_p)
35928 return true;
35930 if (vmode == V8SImode)
35931 for (i = 0; i < 8; ++i)
35932 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35933 else
35935 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35936 if (d->op0 != d->op1)
35937 mask = 2 * nelt - 1;
35938 else if (vmode == V16QImode)
35939 mask = nelt - 1;
35940 else
35941 mask = nelt / 2 - 1;
35943 for (i = 0; i < nelt; ++i)
35945 unsigned j, e = d->perm[i] & mask;
35946 for (j = 0; j < eltsz; ++j)
35947 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35951 vperm = gen_rtx_CONST_VECTOR (vmode,
35952 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35953 vperm = force_reg (vmode, vperm);
35955 target = gen_lowpart (vmode, d->target);
35956 op0 = gen_lowpart (vmode, d->op0);
35957 if (d->op0 == d->op1)
35959 if (vmode == V16QImode)
35960 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35961 else if (vmode == V32QImode)
35962 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35963 else
35964 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35966 else
35968 op1 = gen_lowpart (vmode, d->op1);
35969 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35972 return true;
35975 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35976 in a single instruction. */
35978 static bool
35979 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35981 unsigned i, nelt = d->nelt;
35982 unsigned char perm2[MAX_VECT_LEN];
35984 /* Check plain VEC_SELECT first, because AVX has instructions that could
35985 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35986 input where SEL+CONCAT may not. */
35987 if (d->op0 == d->op1)
35989 int mask = nelt - 1;
35990 bool identity_perm = true;
35991 bool broadcast_perm = true;
35993 for (i = 0; i < nelt; i++)
35995 perm2[i] = d->perm[i] & mask;
35996 if (perm2[i] != i)
35997 identity_perm = false;
35998 if (perm2[i])
35999 broadcast_perm = false;
36002 if (identity_perm)
36004 if (!d->testing_p)
36005 emit_move_insn (d->target, d->op0);
36006 return true;
36008 else if (broadcast_perm && TARGET_AVX2)
36010 /* Use vpbroadcast{b,w,d}. */
36011 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
36012 switch (d->vmode)
36014 case V32QImode:
36015 op = gen_lowpart (V16QImode, op);
36016 gen = gen_avx2_pbroadcastv32qi;
36017 break;
36018 case V16HImode:
36019 op = gen_lowpart (V8HImode, op);
36020 gen = gen_avx2_pbroadcastv16hi;
36021 break;
36022 case V8SImode:
36023 op = gen_lowpart (V4SImode, op);
36024 gen = gen_avx2_pbroadcastv8si;
36025 break;
36026 case V16QImode:
36027 gen = gen_avx2_pbroadcastv16qi;
36028 break;
36029 case V8HImode:
36030 gen = gen_avx2_pbroadcastv8hi;
36031 break;
36032 /* For other modes prefer other shuffles this function creates. */
36033 default: break;
36035 if (gen != NULL)
36037 if (!d->testing_p)
36038 emit_insn (gen (d->target, op));
36039 return true;
36043 if (expand_vselect (d->target, d->op0, perm2, nelt))
36044 return true;
36046 /* There are plenty of patterns in sse.md that are written for
36047 SEL+CONCAT and are not replicated for a single op. Perhaps
36048 that should be changed, to avoid the nastiness here. */
36050 /* Recognize interleave style patterns, which means incrementing
36051 every other permutation operand. */
36052 for (i = 0; i < nelt; i += 2)
36054 perm2[i] = d->perm[i] & mask;
36055 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36057 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36058 return true;
36060 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36061 if (nelt >= 4)
36063 for (i = 0; i < nelt; i += 4)
36065 perm2[i + 0] = d->perm[i + 0] & mask;
36066 perm2[i + 1] = d->perm[i + 1] & mask;
36067 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36068 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36071 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36072 return true;
36076 /* Finally, try the fully general two operand permute. */
36077 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
36078 return true;
36080 /* Recognize interleave style patterns with reversed operands. */
36081 if (d->op0 != d->op1)
36083 for (i = 0; i < nelt; ++i)
36085 unsigned e = d->perm[i];
36086 if (e >= nelt)
36087 e -= nelt;
36088 else
36089 e += nelt;
36090 perm2[i] = e;
36093 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36094 return true;
36097 /* Try the SSE4.1 blend variable merge instructions. */
36098 if (expand_vec_perm_blend (d))
36099 return true;
36101 /* Try one of the AVX vpermil variable permutations. */
36102 if (expand_vec_perm_vpermil (d))
36103 return true;
36105 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36106 vpshufb, vpermd or vpermq variable permutation. */
36107 if (expand_vec_perm_pshufb (d))
36108 return true;
36110 return false;
36113 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36114 in terms of a pair of pshuflw + pshufhw instructions. */
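/* This applies when the permutation keeps indices 0-3 within the low
   quadword and 4-7 within the high one, e.g. { 1, 0, 3, 2, 6, 7, 4, 5 }:
   pshuflw reorders the low four words while copying the high half
   unchanged, and pshufhw then reorders the high four words.  */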
36116 static bool
36117 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36119 unsigned char perm2[MAX_VECT_LEN];
36120 unsigned i;
36121 bool ok;
36123 if (d->vmode != V8HImode || d->op0 != d->op1)
36124 return false;
36126 /* The two permutations only operate in 64-bit lanes. */
36127 for (i = 0; i < 4; ++i)
36128 if (d->perm[i] >= 4)
36129 return false;
36130 for (i = 4; i < 8; ++i)
36131 if (d->perm[i] < 4)
36132 return false;
36134 if (d->testing_p)
36135 return true;
36137 /* Emit the pshuflw. */
36138 memcpy (perm2, d->perm, 4);
36139 for (i = 4; i < 8; ++i)
36140 perm2[i] = i;
36141 ok = expand_vselect (d->target, d->op0, perm2, 8);
36142 gcc_assert (ok);
36144 /* Emit the pshufhw. */
36145 memcpy (perm2 + 4, d->perm + 4, 4);
36146 for (i = 0; i < 4; ++i)
36147 perm2[i] = i;
36148 ok = expand_vselect (d->target, d->target, perm2, 8);
36149 gcc_assert (ok);
36151 return true;
36154 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36155 the permutation using the SSSE3 palignr instruction. This succeeds
36156 when all of the elements in PERM fit within one vector and we merely
36157 need to shift them down so that a single vector permutation has a
36158 chance to succeed. */
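/* For instance, a V16QImode permutation whose indices all lie in
   [5, 20] becomes a palignr that shifts the concatenated operands down
   by five bytes, after which the remaining indices lie in [0, 15] and
   the single-operand permutation is finished by expand_vec_perm_1
   (with SSSE3 that is at worst a single pshufb).  */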
36160 static bool
36161 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36163 unsigned i, nelt = d->nelt;
36164 unsigned min, max;
36165 bool in_order, ok;
36166 rtx shift;
36168 /* Even with AVX, palignr only operates on 128-bit vectors. */
36169 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36170 return false;
36172 min = nelt, max = 0;
36173 for (i = 0; i < nelt; ++i)
36175 unsigned e = d->perm[i];
36176 if (e < min)
36177 min = e;
36178 if (e > max)
36179 max = e;
36181 if (min == 0 || max - min >= nelt)
36182 return false;
36184 /* Given that we have SSSE3, we know we'll be able to implement the
36185 single operand permutation after the palignr with pshufb. */
36186 if (d->testing_p)
36187 return true;
36189 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36190 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36191 gen_lowpart (TImode, d->op1),
36192 gen_lowpart (TImode, d->op0), shift));
36194 d->op0 = d->op1 = d->target;
36196 in_order = true;
36197 for (i = 0; i < nelt; ++i)
36199 unsigned e = d->perm[i] - min;
36200 if (e != i)
36201 in_order = false;
36202 d->perm[i] = e;
36205 /* Test for the degenerate case where the alignment by itself
36206 produces the desired permutation. */
36207 if (in_order)
36208 return true;
36210 ok = expand_vec_perm_1 (d);
36211 gcc_assert (ok);
36213 return ok;
36216 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36218 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36219 a two vector permutation into a single vector permutation by using
36220 an interleave operation to merge the vectors. */
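/* The idea: when the selected elements come from only two of the input
   halves (or, for 32-byte modes, from few enough 128-bit lanes), one
   interleave-style instruction can gather them into a single register;
   REMAP records where each original element lands after that step and
   DFINAL applies the remaining one-register permutation.  */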
36222 static bool
36223 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36225 struct expand_vec_perm_d dremap, dfinal;
36226 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36227 unsigned HOST_WIDE_INT contents;
36228 unsigned char remap[2 * MAX_VECT_LEN];
36229 rtx seq;
36230 bool ok, same_halves = false;
36232 if (GET_MODE_SIZE (d->vmode) == 16)
36234 if (d->op0 == d->op1)
36235 return false;
36237 else if (GET_MODE_SIZE (d->vmode) == 32)
36239 if (!TARGET_AVX)
36240 return false;
36241 /* For 32-byte modes allow even d->op0 == d->op1.
36242 The lack of cross-lane shuffling in some instructions
36243 might prevent a single insn shuffle. */
36244 dfinal = *d;
36245 dfinal.testing_p = true;
36246 /* If expand_vec_perm_interleave3 can expand this into
36247 a 3 insn sequence, give up and let it be expanded as
36248 3 insn sequence. While that is one insn longer,
36249 it doesn't need a memory operand and in the common
36250 case that both interleave low and high permutations
36251 with the same operands are adjacent needs 4 insns
36252 for both after CSE. */
36253 if (expand_vec_perm_interleave3 (&dfinal))
36254 return false;
36256 else
36257 return false;
36259 /* Examine from whence the elements come. */
36260 contents = 0;
36261 for (i = 0; i < nelt; ++i)
36262 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36264 memset (remap, 0xff, sizeof (remap));
36265 dremap = *d;
36267 if (GET_MODE_SIZE (d->vmode) == 16)
36269 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36271 /* Split the two input vectors into 4 halves. */
36272 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36273 h2 = h1 << nelt2;
36274 h3 = h2 << nelt2;
36275 h4 = h3 << nelt2;
36277 /* If the elements are from the low halves, use interleave low; similarly
36278 for interleave high. If the elements are from mis-matched halves, we
36279 can use shufps for V4SF/V4SI or do a DImode shuffle. */
36280 if ((contents & (h1 | h3)) == contents)
36282 /* punpckl* */
36283 for (i = 0; i < nelt2; ++i)
36285 remap[i] = i * 2;
36286 remap[i + nelt] = i * 2 + 1;
36287 dremap.perm[i * 2] = i;
36288 dremap.perm[i * 2 + 1] = i + nelt;
36290 if (!TARGET_SSE2 && d->vmode == V4SImode)
36291 dremap.vmode = V4SFmode;
36293 else if ((contents & (h2 | h4)) == contents)
36295 /* punpckh* */
36296 for (i = 0; i < nelt2; ++i)
36298 remap[i + nelt2] = i * 2;
36299 remap[i + nelt + nelt2] = i * 2 + 1;
36300 dremap.perm[i * 2] = i + nelt2;
36301 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36303 if (!TARGET_SSE2 && d->vmode == V4SImode)
36304 dremap.vmode = V4SFmode;
36306 else if ((contents & (h1 | h4)) == contents)
36308 /* shufps */
36309 for (i = 0; i < nelt2; ++i)
36311 remap[i] = i;
36312 remap[i + nelt + nelt2] = i + nelt2;
36313 dremap.perm[i] = i;
36314 dremap.perm[i + nelt2] = i + nelt + nelt2;
36316 if (nelt != 4)
36318 /* shufpd */
36319 dremap.vmode = V2DImode;
36320 dremap.nelt = 2;
36321 dremap.perm[0] = 0;
36322 dremap.perm[1] = 3;
36325 else if ((contents & (h2 | h3)) == contents)
36327 /* shufps */
36328 for (i = 0; i < nelt2; ++i)
36330 remap[i + nelt2] = i;
36331 remap[i + nelt] = i + nelt2;
36332 dremap.perm[i] = i + nelt2;
36333 dremap.perm[i + nelt2] = i + nelt;
36335 if (nelt != 4)
36337 /* shufpd */
36338 dremap.vmode = V2DImode;
36339 dremap.nelt = 2;
36340 dremap.perm[0] = 1;
36341 dremap.perm[1] = 2;
36344 else
36345 return false;
36347 else
36349 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36350 unsigned HOST_WIDE_INT q[8];
36351 unsigned int nonzero_halves[4];
36353 /* Split the two input vectors into 8 quarters. */
36354 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36355 for (i = 1; i < 8; ++i)
36356 q[i] = q[0] << (nelt4 * i);
36357 for (i = 0; i < 4; ++i)
36358 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36360 nonzero_halves[nzcnt] = i;
36361 ++nzcnt;
36364 if (nzcnt == 1)
36366 gcc_assert (d->op0 == d->op1);
36367 nonzero_halves[1] = nonzero_halves[0];
36368 same_halves = true;
36370 else if (d->op0 == d->op1)
36372 gcc_assert (nonzero_halves[0] == 0);
36373 gcc_assert (nonzero_halves[1] == 1);
36376 if (nzcnt <= 2)
36378 if (d->perm[0] / nelt2 == nonzero_halves[1])
36380 /* Attempt to increase the likelihood that dfinal
36381 shuffle will be intra-lane. */
36382 char tmph = nonzero_halves[0];
36383 nonzero_halves[0] = nonzero_halves[1];
36384 nonzero_halves[1] = tmph;
36387 /* vperm2f128 or vperm2i128. */
36388 for (i = 0; i < nelt2; ++i)
36390 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36391 remap[i + nonzero_halves[0] * nelt2] = i;
36392 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36393 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36396 if (d->vmode != V8SFmode
36397 && d->vmode != V4DFmode
36398 && d->vmode != V8SImode)
36400 dremap.vmode = V8SImode;
36401 dremap.nelt = 8;
36402 for (i = 0; i < 4; ++i)
36404 dremap.perm[i] = i + nonzero_halves[0] * 4;
36405 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36409 else if (d->op0 == d->op1)
36410 return false;
36411 else if (TARGET_AVX2
36412 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36414 /* vpunpckl* */
36415 for (i = 0; i < nelt4; ++i)
36417 remap[i] = i * 2;
36418 remap[i + nelt] = i * 2 + 1;
36419 remap[i + nelt2] = i * 2 + nelt2;
36420 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36421 dremap.perm[i * 2] = i;
36422 dremap.perm[i * 2 + 1] = i + nelt;
36423 dremap.perm[i * 2 + nelt2] = i + nelt2;
36424 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36427 else if (TARGET_AVX2
36428 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36430 /* vpunpckh* */
36431 for (i = 0; i < nelt4; ++i)
36433 remap[i + nelt4] = i * 2;
36434 remap[i + nelt + nelt4] = i * 2 + 1;
36435 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36436 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36437 dremap.perm[i * 2] = i + nelt4;
36438 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36439 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36440 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36443 else
36444 return false;
36447 /* Use the remapping array set up above to move the elements from their
36448 swizzled locations into their final destinations. */
36449 dfinal = *d;
36450 for (i = 0; i < nelt; ++i)
36452 unsigned e = remap[d->perm[i]];
36453 gcc_assert (e < nelt);
36454 /* If same_halves is true, both halves of the remapped vector are the
36455 same. Avoid cross-lane accesses if possible. */
36456 if (same_halves && i >= nelt2)
36458 gcc_assert (e < nelt2);
36459 dfinal.perm[i] = e + nelt2;
36461 else
36462 dfinal.perm[i] = e;
36464 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36465 dfinal.op1 = dfinal.op0;
36466 dremap.target = dfinal.op0;
36468 /* Test if the final remap can be done with a single insn. For V4SFmode or
36469 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36470 start_sequence ();
36471 ok = expand_vec_perm_1 (&dfinal);
36472 seq = get_insns ();
36473 end_sequence ();
36475 if (!ok)
36476 return false;
36478 if (d->testing_p)
36479 return true;
36481 if (dremap.vmode != dfinal.vmode)
36483 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36484 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36485 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36488 ok = expand_vec_perm_1 (&dremap);
36489 gcc_assert (ok);
36491 emit_insn (seq);
36492 return true;
36495 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36496 a single vector cross-lane permutation into vpermq followed
36497 by any of the single insn permutations. */
36499 static bool
36500 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36502 struct expand_vec_perm_d dremap, dfinal;
36503 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36504 unsigned contents[2];
36505 bool ok;
36507 if (!(TARGET_AVX2
36508 && (d->vmode == V32QImode || d->vmode == V16HImode)
36509 && d->op0 == d->op1))
36510 return false;
36512 contents[0] = 0;
36513 contents[1] = 0;
36514 for (i = 0; i < nelt2; ++i)
36516 contents[0] |= 1u << (d->perm[i] / nelt4);
36517 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36520 for (i = 0; i < 2; ++i)
36522 unsigned int cnt = 0;
36523 for (j = 0; j < 4; ++j)
36524 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36525 return false;
36528 if (d->testing_p)
36529 return true;
36531 dremap = *d;
36532 dremap.vmode = V4DImode;
36533 dremap.nelt = 4;
36534 dremap.target = gen_reg_rtx (V4DImode);
36535 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36536 dremap.op1 = dremap.op0;
36537 for (i = 0; i < 2; ++i)
36539 unsigned int cnt = 0;
36540 for (j = 0; j < 4; ++j)
36541 if ((contents[i] & (1u << j)) != 0)
36542 dremap.perm[2 * i + cnt++] = j;
36543 for (; cnt < 2; ++cnt)
36544 dremap.perm[2 * i + cnt] = 0;
36547 dfinal = *d;
36548 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36549 dfinal.op1 = dfinal.op0;
36550 for (i = 0, j = 0; i < nelt; ++i)
36552 if (i == nelt2)
36553 j = 2;
36554 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36555 if ((d->perm[i] / nelt4) == dremap.perm[j])
36557 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36558 dfinal.perm[i] |= nelt4;
36559 else
36560 gcc_unreachable ();
36563 ok = expand_vec_perm_1 (&dremap);
36564 gcc_assert (ok);
36566 ok = expand_vec_perm_1 (&dfinal);
36567 gcc_assert (ok);
36569 return true;
36572 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36573 a two vector permutation using 2 intra-lane interleave insns
36574 and cross-lane shuffle for 32-byte vectors. */
36576 static bool
36577 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36579 unsigned i, nelt;
36580 rtx (*gen) (rtx, rtx, rtx);
36582 if (d->op0 == d->op1)
36583 return false;
36584 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36586 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36588 else
36589 return false;
36591 nelt = d->nelt;
36592 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36593 return false;
36594 for (i = 0; i < nelt; i += 2)
36595 if (d->perm[i] != d->perm[0] + i / 2
36596 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36597 return false;
36599 if (d->testing_p)
36600 return true;
36602 switch (d->vmode)
36604 case V32QImode:
36605 if (d->perm[0])
36606 gen = gen_vec_interleave_highv32qi;
36607 else
36608 gen = gen_vec_interleave_lowv32qi;
36609 break;
36610 case V16HImode:
36611 if (d->perm[0])
36612 gen = gen_vec_interleave_highv16hi;
36613 else
36614 gen = gen_vec_interleave_lowv16hi;
36615 break;
36616 case V8SImode:
36617 if (d->perm[0])
36618 gen = gen_vec_interleave_highv8si;
36619 else
36620 gen = gen_vec_interleave_lowv8si;
36621 break;
36622 case V4DImode:
36623 if (d->perm[0])
36624 gen = gen_vec_interleave_highv4di;
36625 else
36626 gen = gen_vec_interleave_lowv4di;
36627 break;
36628 case V8SFmode:
36629 if (d->perm[0])
36630 gen = gen_vec_interleave_highv8sf;
36631 else
36632 gen = gen_vec_interleave_lowv8sf;
36633 break;
36634 case V4DFmode:
36635 if (d->perm[0])
36636 gen = gen_vec_interleave_highv4df;
36637 else
36638 gen = gen_vec_interleave_lowv4df;
36639 break;
36640 default:
36641 gcc_unreachable ();
36644 emit_insn (gen (d->target, d->op0, d->op1));
36645 return true;
36648 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
36649 a single vector permutation using a single intra-lane vector
36650 permutation, vperm2f128 swapping the lanes and vblend* insn blending
36651 the non-swapped and swapped vectors together. */
36653 static bool
36654 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
36656 struct expand_vec_perm_d dfirst, dsecond;
36657 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
36658 rtx seq;
36659 bool ok;
36660 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
36662 if (!TARGET_AVX
36663 || TARGET_AVX2
36664 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
36665 || d->op0 != d->op1)
36666 return false;
36668 dfirst = *d;
36669 for (i = 0; i < nelt; i++)
36670 dfirst.perm[i] = 0xff;
36671 for (i = 0, msk = 0; i < nelt; i++)
36673 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
36674 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
36675 return false;
36676 dfirst.perm[j] = d->perm[i];
36677 if (j != i)
36678 msk |= (1 << i);
36680 for (i = 0; i < nelt; i++)
36681 if (dfirst.perm[i] == 0xff)
36682 dfirst.perm[i] = i;
36684 if (!d->testing_p)
36685 dfirst.target = gen_reg_rtx (dfirst.vmode);
36687 start_sequence ();
36688 ok = expand_vec_perm_1 (&dfirst);
36689 seq = get_insns ();
36690 end_sequence ();
36692 if (!ok)
36693 return false;
36695 if (d->testing_p)
36696 return true;
36698 emit_insn (seq);
36700 dsecond = *d;
36701 dsecond.op0 = dfirst.target;
36702 dsecond.op1 = dfirst.target;
36703 dsecond.target = gen_reg_rtx (dsecond.vmode);
36704 for (i = 0; i < nelt; i++)
36705 dsecond.perm[i] = i ^ nelt2;
36707 ok = expand_vec_perm_1 (&dsecond);
36708 gcc_assert (ok);
36710 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
36711 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
36712 return true;
36715 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36716 permutation with two pshufb insns and an ior. We should have already
36717 failed all two instruction sequences. */
36719 static bool
36720 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36722 rtx rperm[2][16], vperm, l, h, op, m128;
36723 unsigned int i, nelt, eltsz;
36725 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36726 return false;
36727 gcc_assert (d->op0 != d->op1);
36729 nelt = d->nelt;
36730 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36732 /* Generate two permutation masks. If the required element is within
36733 the given vector it is shuffled into the proper lane. If the required
36734 element is in the other vector, force a zero into the lane by setting
36735 bit 7 in the permutation mask. */
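/* E.g. for the extract-even V16QImode permutation { 0, 2, ..., 30 }
   the first mask is { 0, 2, ..., 14, -128 x 8 }, placing the even
   bytes of op0 in the low half of the first pshufb result, and the
   second mask is { -128 x 8, 0, 2, ..., 14 }, placing the even bytes
   of op1 in the high half of the second; the final ior merges them.  */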
36736 m128 = GEN_INT (-128);
36737 for (i = 0; i < nelt; ++i)
36739 unsigned j, e = d->perm[i];
36740 unsigned which = (e >= nelt);
36741 if (e >= nelt)
36742 e -= nelt;
36744 for (j = 0; j < eltsz; ++j)
36746 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36747 rperm[1-which][i*eltsz + j] = m128;
36751 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36752 vperm = force_reg (V16QImode, vperm);
36754 l = gen_reg_rtx (V16QImode);
36755 op = gen_lowpart (V16QImode, d->op0);
36756 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36758 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36759 vperm = force_reg (V16QImode, vperm);
36761 h = gen_reg_rtx (V16QImode);
36762 op = gen_lowpart (V16QImode, d->op1);
36763 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36765 op = gen_lowpart (V16QImode, d->target);
36766 emit_insn (gen_iorv16qi3 (op, l, h));
36768 return true;
36771 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36772 with two vpshufb insns, vpermq and vpor. We should have already failed
36773 all two or three instruction sequences. */
36775 static bool
36776 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36778 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36779 unsigned int i, nelt, eltsz;
36781 if (!TARGET_AVX2
36782 || d->op0 != d->op1
36783 || (d->vmode != V32QImode && d->vmode != V16HImode))
36784 return false;
36786 if (d->testing_p)
36787 return true;
36789 nelt = d->nelt;
36790 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36792 /* Generate two permutation masks. If the required element is within
36793 the same lane, it is shuffled in. If the required element is from the
36794 other lane, force a zero by setting bit 7 in the permutation mask.
36795 The other mask has a non-negative element wherever an element is
36796 requested from the other lane; that element is also moved to the
36797 other lane, so that the result of vpshufb can have its two V2TImode
36798 halves swapped. */
36799 m128 = GEN_INT (-128);
36800 for (i = 0; i < nelt; ++i)
36802 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36803 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36805 for (j = 0; j < eltsz; ++j)
36807 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36808 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36812 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36813 vperm = force_reg (V32QImode, vperm);
36815 h = gen_reg_rtx (V32QImode);
36816 op = gen_lowpart (V32QImode, d->op0);
36817 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36819 /* Swap the 128-bit lanes of h into hp. */
36820 hp = gen_reg_rtx (V4DImode);
36821 op = gen_lowpart (V4DImode, h);
36822 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36823 const1_rtx));
36825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36826 vperm = force_reg (V32QImode, vperm);
36828 l = gen_reg_rtx (V32QImode);
36829 op = gen_lowpart (V32QImode, d->op0);
36830 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36832 op = gen_lowpart (V32QImode, d->target);
36833 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36835 return true;
36838 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36839 and extract-odd permutations of two V32QImode or V16HImode operands
36840 with two vpshufb insns, vpor and vpermq. We should have already
36841 failed all two or three instruction sequences. */
36843 static bool
36844 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36846 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36847 unsigned int i, nelt, eltsz;
36849 if (!TARGET_AVX2
36850 || d->op0 == d->op1
36851 || (d->vmode != V32QImode && d->vmode != V16HImode))
36852 return false;
36854 for (i = 0; i < d->nelt; ++i)
36855 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36856 return false;
36858 if (d->testing_p)
36859 return true;
36861 nelt = d->nelt;
36862 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36864 /* Generate two permutation masks. In the first permutation mask
36865 the first quarter will contain indexes for the first half
36866 of the op0, the second quarter will contain bit 7 set, third quarter
36867 will contain indexes for the second half of the op0 and the
36868 last quarter bit 7 set. In the second permutation mask
36869 the first quarter will contain bit 7 set, the second quarter
36870 indexes for the first half of the op1, the third quarter bit 7 set
36871 and last quarter indexes for the second half of the op1.
36872 I.e. the first mask e.g. for V32QImode extract even will be:
36873 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36874 (all values masked with 0xf except for -128) and second mask
36875 for extract even will be
36876 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36877 m128 = GEN_INT (-128);
36878 for (i = 0; i < nelt; ++i)
36880 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36881 unsigned which = d->perm[i] >= nelt;
36882 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36884 for (j = 0; j < eltsz; ++j)
36886 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36887 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36891 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36892 vperm = force_reg (V32QImode, vperm);
36894 l = gen_reg_rtx (V32QImode);
36895 op = gen_lowpart (V32QImode, d->op0);
36896 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36898 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36899 vperm = force_reg (V32QImode, vperm);
36901 h = gen_reg_rtx (V32QImode);
36902 op = gen_lowpart (V32QImode, d->op1);
36903 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36905 ior = gen_reg_rtx (V32QImode);
36906 emit_insn (gen_iorv32qi3 (ior, l, h));
36908 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36909 op = gen_lowpart (V4DImode, d->target);
36910 ior = gen_lowpart (V4DImode, ior);
36911 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36912 const1_rtx, GEN_INT (3)));
36914 return true;
36917 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36918 and extract-odd permutations. */
36920 static bool
36921 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36923 rtx t1, t2, t3;
36925 switch (d->vmode)
36927 case V4DFmode:
36928 t1 = gen_reg_rtx (V4DFmode);
36929 t2 = gen_reg_rtx (V4DFmode);
36931 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36932 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36933 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36935 /* Now an unpck[lh]pd will produce the result required. */
36936 if (odd)
36937 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36938 else
36939 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36940 emit_insn (t3);
36941 break;
36943 case V8SFmode:
36945 int mask = odd ? 0xdd : 0x88;
36947 t1 = gen_reg_rtx (V8SFmode);
36948 t2 = gen_reg_rtx (V8SFmode);
36949 t3 = gen_reg_rtx (V8SFmode);
36951 /* Shuffle within the 128-bit lanes to produce:
36952 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36953 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36954 GEN_INT (mask)));
36956 /* Shuffle the lanes around to produce:
36957 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36958 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36959 GEN_INT (0x3)));
36961 /* Shuffle within the 128-bit lanes to produce:
36962 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36963 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36965 /* Shuffle within the 128-bit lanes to produce:
36966 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36967 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36969 /* Shuffle the lanes around to produce:
36970 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36971 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36972 GEN_INT (0x20)));
36974 break;
36976 case V2DFmode:
36977 case V4SFmode:
36978 case V2DImode:
36979 case V4SImode:
36980 /* These are always directly implementable by expand_vec_perm_1. */
36981 gcc_unreachable ();
36983 case V8HImode:
36984 if (TARGET_SSSE3)
36985 return expand_vec_perm_pshufb2 (d);
36986 else
36988 /* We need 2*log2(N)-1 operations to achieve odd/even
36989 with interleave. */
36990 t1 = gen_reg_rtx (V8HImode);
36991 t2 = gen_reg_rtx (V8HImode);
36992 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36993 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36994 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36995 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36996 if (odd)
36997 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36998 else
36999 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37000 emit_insn (t3);
37002 break;
37004 case V16QImode:
37005 if (TARGET_SSSE3)
37006 return expand_vec_perm_pshufb2 (d);
37007 else
37009 t1 = gen_reg_rtx (V16QImode);
37010 t2 = gen_reg_rtx (V16QImode);
37011 t3 = gen_reg_rtx (V16QImode);
37012 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37013 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37014 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37015 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37016 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37017 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37018 if (odd)
37019 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37020 else
37021 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37022 emit_insn (t3);
37024 break;
37026 case V16HImode:
37027 case V32QImode:
37028 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37030 case V4DImode:
37031 if (!TARGET_AVX2)
37033 struct expand_vec_perm_d d_copy = *d;
37034 d_copy.vmode = V4DFmode;
37035 d_copy.target = gen_lowpart (V4DFmode, d->target);
37036 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37037 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37038 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37041 t1 = gen_reg_rtx (V4DImode);
37042 t2 = gen_reg_rtx (V4DImode);
37044 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37045 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37046 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37048         /* Now a vpunpck[lh]qdq will produce the result required.  */
37049 if (odd)
37050 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37051 else
37052 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37053 emit_insn (t3);
37054 break;
37056 case V8SImode:
37057 if (!TARGET_AVX2)
37059 struct expand_vec_perm_d d_copy = *d;
37060 d_copy.vmode = V8SFmode;
37061 d_copy.target = gen_lowpart (V8SFmode, d->target);
37062 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37063 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37064 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37067 t1 = gen_reg_rtx (V8SImode);
37068 t2 = gen_reg_rtx (V8SImode);
37070 /* Shuffle the lanes around into
37071 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37072 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37073 gen_lowpart (V4DImode, d->op0),
37074 gen_lowpart (V4DImode, d->op1),
37075 GEN_INT (0x20)));
37076 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37077 gen_lowpart (V4DImode, d->op0),
37078 gen_lowpart (V4DImode, d->op1),
37079 GEN_INT (0x31)));
37081 /* Swap the 2nd and 3rd position in each lane into
37082 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37083 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37084 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37085 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37086 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37088       /* Now a vpunpck[lh]qdq will produce
37089 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37090 if (odd)
37091 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37092 gen_lowpart (V4DImode, t1),
37093 gen_lowpart (V4DImode, t2));
37094 else
37095 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37096 gen_lowpart (V4DImode, t1),
37097 gen_lowpart (V4DImode, t2));
37098 emit_insn (t3);
37099 break;
37101 default:
37102 gcc_unreachable ();
37105 return true;
37108 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37109 extract-even and extract-odd permutations. */
37111 static bool
37112 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37114 unsigned i, odd, nelt = d->nelt;
37116 odd = d->perm[0];
37117 if (odd != 0 && odd != 1)
37118 return false;
37120 for (i = 1; i < nelt; ++i)
37121 if (d->perm[i] != 2 * i + odd)
37122 return false;
37124 return expand_vec_perm_even_odd_1 (d, odd);
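     /* For illustration: with V8SImode the extract-even permutation accepted
        here is perm = { 0 2 4 6 8 10 12 14 } (odd == 0) and the extract-odd
        one is perm = { 1 3 5 7 9 11 13 15 } (odd == 1); any other leading
        element or stride is rejected by the loop above.  */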
37127 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37128 permutations. We assume that expand_vec_perm_1 has already failed. */
37130 static bool
37131 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37133 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37134 enum machine_mode vmode = d->vmode;
37135 unsigned char perm2[4];
37136 rtx op0 = d->op0;
37137 bool ok;
37139 switch (vmode)
37141 case V4DFmode:
37142 case V8SFmode:
37143 /* These are special-cased in sse.md so that we can optionally
37144 use the vbroadcast instruction. They expand to two insns
37145 if the input happens to be in a register. */
37146 gcc_unreachable ();
37148 case V2DFmode:
37149 case V2DImode:
37150 case V4SFmode:
37151 case V4SImode:
37152 /* These are always implementable using standard shuffle patterns. */
37153 gcc_unreachable ();
37155 case V8HImode:
37156 case V16QImode:
37157 /* These can be implemented via interleave. We save one insn by
37158 stopping once we have promoted to V4SImode and then use pshufd. */
37161 rtx dest;
37162 rtx (*gen) (rtx, rtx, rtx)
37163 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37164 : gen_vec_interleave_lowv8hi;
37166 if (elt >= nelt2)
37168 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37169 : gen_vec_interleave_highv8hi;
37170 elt -= nelt2;
37172 nelt2 /= 2;
37174 dest = gen_reg_rtx (vmode);
37175 emit_insn (gen (dest, op0, op0));
37176 vmode = get_mode_wider_vector (vmode);
37177 op0 = gen_lowpart (vmode, dest);
37179 while (vmode != V4SImode);
37181 memset (perm2, elt, 4);
37182 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37183 gcc_assert (ok);
37184 return true;
37186 case V32QImode:
37187 case V16HImode:
37188 case V8SImode:
37189 case V4DImode:
37190 /* For AVX2 broadcasts of the first element vpbroadcast* or
37191 vpermq should be used by expand_vec_perm_1. */
37192 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37193 return false;
37195 default:
37196 gcc_unreachable ();
37200 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37201 broadcast permutations. */
37203 static bool
37204 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37206 unsigned i, elt, nelt = d->nelt;
37208 if (d->op0 != d->op1)
37209 return false;
37211 elt = d->perm[0];
37212 for (i = 1; i < nelt; ++i)
37213 if (d->perm[i] != elt)
37214 return false;
37216 return expand_vec_perm_broadcast_1 (d);
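     /* For illustration: a V8HImode broadcast of element 5 is
        perm = { 5 5 5 5 5 5 5 5 } with d->op0 == d->op1.  In
        expand_vec_perm_broadcast_1 this takes one interleave-high
        (5 >= nelt2 == 4, so elt becomes 1), a widening to V4SImode, and a
        final pshufd of { 1 1 1 1 }.  */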
37219 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37220 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37221 all the shorter instruction sequences. */
37223 static bool
37224 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37226 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37227 unsigned int i, nelt, eltsz;
37228 bool used[4];
37230 if (!TARGET_AVX2
37231 || d->op0 == d->op1
37232 || (d->vmode != V32QImode && d->vmode != V16HImode))
37233 return false;
37235 if (d->testing_p)
37236 return true;
37238 nelt = d->nelt;
37239 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37241 /* Generate 4 permutation masks. If the required element is within
37242      the same lane, it is shuffled in.  If the required element is from the
37243      other lane, force a zero by setting bit 7 in the permutation mask.
37244      In the other mask, the mask has non-negative elements if the element
37245      is requested from the other lane, but also moved to the other lane,
37246 so that the result of vpshufb can have the two V2TImode halves
37247 swapped. */
37248 m128 = GEN_INT (-128);
37249 for (i = 0; i < 32; ++i)
37251 rperm[0][i] = m128;
37252 rperm[1][i] = m128;
37253 rperm[2][i] = m128;
37254 rperm[3][i] = m128;
37256 used[0] = false;
37257 used[1] = false;
37258 used[2] = false;
37259 used[3] = false;
37260 for (i = 0; i < nelt; ++i)
37262 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37263 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37264 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37266 for (j = 0; j < eltsz; ++j)
37267 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37268 used[which] = true;
37271 for (i = 0; i < 2; ++i)
37273 if (!used[2 * i + 1])
37275 h[i] = NULL_RTX;
37276 continue;
37278 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37279 gen_rtvec_v (32, rperm[2 * i + 1]));
37280 vperm = force_reg (V32QImode, vperm);
37281 h[i] = gen_reg_rtx (V32QImode);
37282 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37283 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37286   /* Swap the 128-bit lanes of h[X].  */
37287 for (i = 0; i < 2; ++i)
37289 if (h[i] == NULL_RTX)
37290 continue;
37291 op = gen_reg_rtx (V4DImode);
37292 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37293 const2_rtx, GEN_INT (3), const0_rtx,
37294 const1_rtx));
37295 h[i] = gen_lowpart (V32QImode, op);
37298 for (i = 0; i < 2; ++i)
37300 if (!used[2 * i])
37302 l[i] = NULL_RTX;
37303 continue;
37305 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37306 vperm = force_reg (V32QImode, vperm);
37307 l[i] = gen_reg_rtx (V32QImode);
37308 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37309 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37312 for (i = 0; i < 2; ++i)
37314 if (h[i] && l[i])
37316 op = gen_reg_rtx (V32QImode);
37317 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37318 l[i] = op;
37320 else if (h[i])
37321 l[i] = h[i];
37324 gcc_assert (l[0] && l[1]);
37325 op = gen_lowpart (V32QImode, d->target);
37326 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37327 return true;
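     /* For illustration: the four masks built above are indexed by
        which = (element taken from op1 ? 2 : 0) + (crosses a 128-bit lane ? 1 : 0),
        so rperm[0]/rperm[2] handle same-lane bytes of op0/op1 while
        rperm[1]/rperm[3] handle cross-lane bytes; only the latter results
        (h[0], h[1]) need the vpermq lane swap before all pieces are merged
        with vpor.  */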
37330 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37331 With all of the interface bits taken care of, perform the expansion
37332 in D and return true on success. */
37334 static bool
37335 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37337 /* Try a single instruction expansion. */
37338 if (expand_vec_perm_1 (d))
37339 return true;
37341 /* Try sequences of two instructions. */
37343 if (expand_vec_perm_pshuflw_pshufhw (d))
37344 return true;
37346 if (expand_vec_perm_palignr (d))
37347 return true;
37349 if (expand_vec_perm_interleave2 (d))
37350 return true;
37352 if (expand_vec_perm_broadcast (d))
37353 return true;
37355 if (expand_vec_perm_vpermq_perm_1 (d))
37356 return true;
37358 /* Try sequences of three instructions. */
37360 if (expand_vec_perm_pshufb2 (d))
37361 return true;
37363 if (expand_vec_perm_interleave3 (d))
37364 return true;
37366 if (expand_vec_perm_vperm2f128_vblend (d))
37367 return true;
37369 /* Try sequences of four instructions. */
37371 if (expand_vec_perm_vpshufb2_vpermq (d))
37372 return true;
37374 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37375 return true;
37377 /* ??? Look for narrow permutations whose element orderings would
37378 allow the promotion to a wider mode. */
37380 /* ??? Look for sequences of interleave or a wider permute that place
37381 the data into the correct lanes for a half-vector shuffle like
37382 pshuf[lh]w or vpermilps. */
37384 /* ??? Look for sequences of interleave that produce the desired results.
37385 The combinatorics of punpck[lh] get pretty ugly... */
37387 if (expand_vec_perm_even_odd (d))
37388 return true;
37390 /* Even longer sequences. */
37391 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37392 return true;
37394 return false;
37397 bool
37398 ix86_expand_vec_perm_const (rtx operands[4])
37400 struct expand_vec_perm_d d;
37401 unsigned char perm[MAX_VECT_LEN];
37402 int i, nelt, which;
37403 rtx sel;
37405 d.target = operands[0];
37406 d.op0 = operands[1];
37407 d.op1 = operands[2];
37408 sel = operands[3];
37410 d.vmode = GET_MODE (d.target);
37411 gcc_assert (VECTOR_MODE_P (d.vmode));
37412 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37413 d.testing_p = false;
37415 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37416 gcc_assert (XVECLEN (sel, 0) == nelt);
37417 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37419 for (i = which = 0; i < nelt; ++i)
37421 rtx e = XVECEXP (sel, 0, i);
37422 int ei = INTVAL (e) & (2 * nelt - 1);
37424 which |= (ei < nelt ? 1 : 2);
37425 d.perm[i] = ei;
37426 perm[i] = ei;
37429 switch (which)
37431 default:
37432 gcc_unreachable();
37434 case 3:
37435 if (!rtx_equal_p (d.op0, d.op1))
37436 break;
37438 /* The elements of PERM do not suggest that only the first operand
37439 is used, but both operands are identical. Allow easier matching
37440 of the permutation by folding the permutation into the single
37441 input vector. */
37442 for (i = 0; i < nelt; ++i)
37443 if (d.perm[i] >= nelt)
37444 d.perm[i] -= nelt;
37445 /* FALLTHRU */
37447 case 1:
37448 d.op1 = d.op0;
37449 break;
37451 case 2:
37452 for (i = 0; i < nelt; ++i)
37453 d.perm[i] -= nelt;
37454 d.op0 = d.op1;
37455 break;
37458 if (ix86_expand_vec_perm_const_1 (&d))
37459 return true;
37461 /* If the mask says both arguments are needed, but they are the same,
37462 the above tried to expand with d.op0 == d.op1. If that didn't work,
37463 retry with d.op0 != d.op1 as that is what testing has been done with. */
37464 if (which == 3 && d.op0 == d.op1)
37466 rtx seq;
37467 bool ok;
37469 memcpy (d.perm, perm, sizeof (perm));
37470 d.op1 = gen_reg_rtx (d.vmode);
37471 start_sequence ();
37472 ok = ix86_expand_vec_perm_const_1 (&d);
37473 seq = get_insns ();
37474 end_sequence ();
37475 if (ok)
37477 emit_move_insn (d.op1, d.op0);
37478 emit_insn (seq);
37479 return true;
37483 return false;
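     /* For illustration: a V4SImode selector CONST_VECTOR { 0, 4, 1, 5 }
        yields which == 3 (both operands referenced), d.perm stays
        { 0 4 1 5 }, and ix86_expand_vec_perm_const_1 can satisfy it with a
        single punpckldq.  A selector { 4 5 6 7 } yields which == 2, so the
        permutation is folded to { 0 1 2 3 } and d.op0 is replaced by d.op1.  */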
37486 /* Implement targetm.vectorize.vec_perm_const_ok. */
37488 static bool
37489 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37490 const unsigned char *sel)
37492 struct expand_vec_perm_d d;
37493 unsigned int i, nelt, which;
37494 bool ret, one_vec;
37496 d.vmode = vmode;
37497 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37498 d.testing_p = true;
37500 /* Given sufficient ISA support we can just return true here
37501 for selected vector modes. */
37502 if (GET_MODE_SIZE (d.vmode) == 16)
37504 /* All implementable with a single vpperm insn. */
37505 if (TARGET_XOP)
37506 return true;
37507 /* All implementable with 2 pshufb + 1 ior. */
37508 if (TARGET_SSSE3)
37509 return true;
37510 /* All implementable with shufpd or unpck[lh]pd. */
37511 if (d.nelt == 2)
37512 return true;
37515 /* Extract the values from the vector CST into the permutation
37516 array in D. */
37517 memcpy (d.perm, sel, nelt);
37518 for (i = which = 0; i < nelt; ++i)
37520 unsigned char e = d.perm[i];
37521 gcc_assert (e < 2 * nelt);
37522 which |= (e < nelt ? 1 : 2);
37525   /* For all elements from the second vector, fold them to the first.  */
37526 if (which == 2)
37527 for (i = 0; i < nelt; ++i)
37528 d.perm[i] -= nelt;
37530 /* Check whether the mask can be applied to the vector type. */
37531 one_vec = (which != 3);
37533 /* Implementable with shufps or pshufd. */
37534 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37535 return true;
37537 /* Otherwise we have to go through the motions and see if we can
37538 figure out how to generate the requested permutation. */
37539 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37540 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37541 if (!one_vec)
37542 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37544 start_sequence ();
37545 ret = ix86_expand_vec_perm_const_1 (&d);
37546 end_sequence ();
37548 return ret;
37551 void
37552 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37554 struct expand_vec_perm_d d;
37555 unsigned i, nelt;
37557 d.target = targ;
37558 d.op0 = op0;
37559 d.op1 = op1;
37560 d.vmode = GET_MODE (targ);
37561 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37562 d.testing_p = false;
37564 for (i = 0; i < nelt; ++i)
37565 d.perm[i] = i * 2 + odd;
37567 /* We'll either be able to implement the permutation directly... */
37568 if (expand_vec_perm_1 (&d))
37569 return;
37571 /* ... or we use the special-case patterns. */
37572 expand_vec_perm_even_odd_1 (&d, odd);
37575 /* Expand an insert into a vector register through pinsr insn.
37576 Return true if successful. */
37578 bool
37579 ix86_expand_pinsr (rtx *operands)
37581 rtx dst = operands[0];
37582 rtx src = operands[3];
37584 unsigned int size = INTVAL (operands[1]);
37585 unsigned int pos = INTVAL (operands[2]);
37587 if (GET_CODE (dst) == SUBREG)
37589 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37590 dst = SUBREG_REG (dst);
37593 if (GET_CODE (src) == SUBREG)
37594 src = SUBREG_REG (src);
37596 switch (GET_MODE (dst))
37598 case V16QImode:
37599 case V8HImode:
37600 case V4SImode:
37601 case V2DImode:
37603 enum machine_mode srcmode, dstmode;
37604 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37606 srcmode = mode_for_size (size, MODE_INT, 0);
37608 switch (srcmode)
37610 case QImode:
37611 if (!TARGET_SSE4_1)
37612 return false;
37613 dstmode = V16QImode;
37614 pinsr = gen_sse4_1_pinsrb;
37615 break;
37617 case HImode:
37618 if (!TARGET_SSE2)
37619 return false;
37620 dstmode = V8HImode;
37621 pinsr = gen_sse2_pinsrw;
37622 break;
37624 case SImode:
37625 if (!TARGET_SSE4_1)
37626 return false;
37627 dstmode = V4SImode;
37628 pinsr = gen_sse4_1_pinsrd;
37629 break;
37631 case DImode:
37632 gcc_assert (TARGET_64BIT);
37633 if (!TARGET_SSE4_1)
37634 return false;
37635 dstmode = V2DImode;
37636 pinsr = gen_sse4_1_pinsrq;
37637 break;
37639 default:
37640 return false;
37643 dst = gen_lowpart (dstmode, dst);
37644 src = gen_lowpart (srcmode, src);
37646 pos /= size;
37648 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37649 return true;
37652 default:
37653 return false;
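     /* For illustration, the element selector passed to the pinsr patterns
        above is plain integer arithmetic on POS and SIZE.  A minimal sketch
        of that computation (pinsr_selector is a hypothetical helper, not a
        GCC API):  */
     #if 0 /* illustration only */
     static unsigned int
     pinsr_selector (unsigned int size, unsigned int pos)
     {
       /* POS and SIZE are in bits; element POS / SIZE is replaced and the
          immediate is a one-hot mask: size == 16, pos == 32 gives 1 << 2,
          i.e. element 2 of a V8HImode destination.  */
       return 1u << (pos / size);
     }
     #endif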
37657 /* This function returns the calling-ABI-specific va_list type node.
37658    It returns the FNDECL-specific va_list type.  */
37660 static tree
37661 ix86_fn_abi_va_list (tree fndecl)
37663 if (!TARGET_64BIT)
37664 return va_list_type_node;
37665 gcc_assert (fndecl != NULL_TREE);
37667 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37668 return ms_va_list_type_node;
37669 else
37670 return sysv_va_list_type_node;
37673 /* Returns the canonical va_list type specified by TYPE. If there
37674    is no valid TYPE provided, it returns NULL_TREE.  */
37676 static tree
37677 ix86_canonical_va_list_type (tree type)
37679 tree wtype, htype;
37681 /* Resolve references and pointers to va_list type. */
37682 if (TREE_CODE (type) == MEM_REF)
37683 type = TREE_TYPE (type);
37684 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37685 type = TREE_TYPE (type);
37686 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37687 type = TREE_TYPE (type);
37689 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37691 wtype = va_list_type_node;
37692 gcc_assert (wtype != NULL_TREE);
37693 htype = type;
37694 if (TREE_CODE (wtype) == ARRAY_TYPE)
37696 /* If va_list is an array type, the argument may have decayed
37697 to a pointer type, e.g. by being passed to another function.
37698 In that case, unwrap both types so that we can compare the
37699 underlying records. */
37700 if (TREE_CODE (htype) == ARRAY_TYPE
37701 || POINTER_TYPE_P (htype))
37703 wtype = TREE_TYPE (wtype);
37704 htype = TREE_TYPE (htype);
37707 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37708 return va_list_type_node;
37709 wtype = sysv_va_list_type_node;
37710 gcc_assert (wtype != NULL_TREE);
37711 htype = type;
37712 if (TREE_CODE (wtype) == ARRAY_TYPE)
37714 /* If va_list is an array type, the argument may have decayed
37715 to a pointer type, e.g. by being passed to another function.
37716 In that case, unwrap both types so that we can compare the
37717 underlying records. */
37718 if (TREE_CODE (htype) == ARRAY_TYPE
37719 || POINTER_TYPE_P (htype))
37721 wtype = TREE_TYPE (wtype);
37722 htype = TREE_TYPE (htype);
37725 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37726 return sysv_va_list_type_node;
37727 wtype = ms_va_list_type_node;
37728 gcc_assert (wtype != NULL_TREE);
37729 htype = type;
37730 if (TREE_CODE (wtype) == ARRAY_TYPE)
37732 /* If va_list is an array type, the argument may have decayed
37733 to a pointer type, e.g. by being passed to another function.
37734 In that case, unwrap both types so that we can compare the
37735 underlying records. */
37736 if (TREE_CODE (htype) == ARRAY_TYPE
37737 || POINTER_TYPE_P (htype))
37739 wtype = TREE_TYPE (wtype);
37740 htype = TREE_TYPE (htype);
37743 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37744 return ms_va_list_type_node;
37745 return NULL_TREE;
37747 return std_canonical_va_list_type (type);
37750 /* Iterate through the target-specific builtin types for va_list.
37751 IDX denotes the iterator, *PTREE is set to the result type of
37752 the va_list builtin, and *PNAME to its internal type.
37753 Returns zero if there is no element for this index, otherwise
37754 IDX should be increased upon the next call.
37755 Note, do not iterate a base builtin's name like __builtin_va_list.
37756 Used from c_common_nodes_and_builtins. */
37758 static int
37759 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37761 if (TARGET_64BIT)
37763 switch (idx)
37765 default:
37766 break;
37768 case 0:
37769 *ptree = ms_va_list_type_node;
37770 *pname = "__builtin_ms_va_list";
37771 return 1;
37773 case 1:
37774 *ptree = sysv_va_list_type_node;
37775 *pname = "__builtin_sysv_va_list";
37776 return 1;
37780 return 0;
37783 #undef TARGET_SCHED_DISPATCH
37784 #define TARGET_SCHED_DISPATCH has_dispatch
37785 #undef TARGET_SCHED_DISPATCH_DO
37786 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37787 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37788 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37790 /* The size of the dispatch window is the total number of bytes of
37791 object code allowed in a window. */
37792 #define DISPATCH_WINDOW_SIZE 16
37794 /* Number of dispatch windows considered for scheduling. */
37795 #define MAX_DISPATCH_WINDOWS 3
37797 /* Maximum number of instructions in a window. */
37798 #define MAX_INSN 4
37800 /* Maximum number of immediate operands in a window. */
37801 #define MAX_IMM 4
37803 /* Maximum number of immediate bits allowed in a window. */
37804 #define MAX_IMM_SIZE 128
37806 /* Maximum number of 32 bit immediates allowed in a window. */
37807 #define MAX_IMM_32 4
37809 /* Maximum number of 64 bit immediates allowed in a window. */
37810 #define MAX_IMM_64 2
37812 /* Maximum total of loads or prefetches allowed in a window. */
37813 #define MAX_LOAD 2
37815 /* Maximum total of stores allowed in a window. */
37816 #define MAX_STORE 1
37818 #undef BIG
37819 #define BIG 100
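/* For illustration, the limits above are mutually consistent:
   MAX_IMM_SIZE == 128 bits is exactly MAX_IMM_32 == 4 32-bit immediates
   or MAX_IMM_64 == 2 64-bit immediates, and MAX_DISPATCH_WINDOWS == 3
   hardware windows of DISPATCH_WINDOW_SIZE == 16 bytes give the 48-byte
   budget checked in process_end_window and fits_dispatch_window below.  */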
37822 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
37823 enum dispatch_group {
37824 disp_no_group = 0,
37825 disp_load,
37826 disp_store,
37827 disp_load_store,
37828 disp_prefetch,
37829 disp_imm,
37830 disp_imm_32,
37831 disp_imm_64,
37832 disp_branch,
37833 disp_cmp,
37834 disp_jcc,
37835 disp_last
37838 /* Number of allowable groups in a dispatch window. It is an array
37839 indexed by dispatch_group enum. 100 is used as a big number,
37840    because the number of these kinds of operations does not have any
37841    effect in a dispatch window, but we need them for other reasons in
37842 the table. */
37843 static unsigned int num_allowable_groups[disp_last] = {
37844 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37847 char group_name[disp_last + 1][16] = {
37848 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37849 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37850 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37853 /* Instruction path. */
37854 enum insn_path {
37855 no_path = 0,
37856 path_single, /* Single micro op. */
37857 path_double, /* Double micro op. */
37858   path_multi, /* Instructions with more than 2 micro ops.  */
37859 last_path
37862 /* sched_insn_info defines a window to the instructions scheduled in
37863 the basic block. It contains a pointer to the insn_info table and
37864 the instruction scheduled.
37866 Windows are allocated for each basic block and are linked
37867 together. */
37868 typedef struct sched_insn_info_s {
37869 rtx insn;
37870 enum dispatch_group group;
37871 enum insn_path path;
37872 int byte_len;
37873 int imm_bytes;
37874 } sched_insn_info;
37876 /* Linked list of dispatch windows.  This is a two-way list of
37877 dispatch windows of a basic block. It contains information about
37878 the number of uops in the window and the total number of
37879 instructions and of bytes in the object code for this dispatch
37880 window. */
37881 typedef struct dispatch_windows_s {
37882 int num_insn; /* Number of insn in the window. */
37883 int num_uops; /* Number of uops in the window. */
37884 int window_size; /* Number of bytes in the window. */
37885   int window_num;    /* Window number, either 0 or 1.  */
37886 int num_imm; /* Number of immediates in an insn. */
37887 int num_imm_32; /* Number of 32 bit immediates in an insn. */
37888 int num_imm_64; /* Number of 64 bit immediates in an insn. */
37889 int imm_size; /* Total immediates in the window. */
37890 int num_loads; /* Total memory loads in the window. */
37891 int num_stores; /* Total memory stores in the window. */
37892 int violation; /* Violation exists in window. */
37893 sched_insn_info *window; /* Pointer to the window. */
37894 struct dispatch_windows_s *next;
37895 struct dispatch_windows_s *prev;
37896 } dispatch_windows;
37898 /* Immediate values used in an insn.  */
37899 typedef struct imm_info_s
37901 int imm;
37902 int imm32;
37903 int imm64;
37904 } imm_info;
37906 static dispatch_windows *dispatch_window_list;
37907 static dispatch_windows *dispatch_window_list1;
37909 /* Get dispatch group of insn. */
37911 static enum dispatch_group
37912 get_mem_group (rtx insn)
37914 enum attr_memory memory;
37916 if (INSN_CODE (insn) < 0)
37917 return disp_no_group;
37918 memory = get_attr_memory (insn);
37919 if (memory == MEMORY_STORE)
37920 return disp_store;
37922 if (memory == MEMORY_LOAD)
37923 return disp_load;
37925 if (memory == MEMORY_BOTH)
37926 return disp_load_store;
37928 return disp_no_group;
37931 /* Return true if insn is a compare instruction. */
37933 static bool
37934 is_cmp (rtx insn)
37936 enum attr_type type;
37938 type = get_attr_type (insn);
37939 return (type == TYPE_TEST
37940 || type == TYPE_ICMP
37941 || type == TYPE_FCMP
37942 || GET_CODE (PATTERN (insn)) == COMPARE);
37945 /* Return true if a dispatch violation was encountered.  */
37947 static bool
37948 dispatch_violation (void)
37950 if (dispatch_window_list->next)
37951 return dispatch_window_list->next->violation;
37952 return dispatch_window_list->violation;
37955 /* Return true if insn is a branch instruction. */
37957 static bool
37958 is_branch (rtx insn)
37960 return (CALL_P (insn) || JUMP_P (insn));
37963 /* Return true if insn is a prefetch instruction. */
37965 static bool
37966 is_prefetch (rtx insn)
37968 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37971 /* This function initializes a dispatch window and the list container holding a
37972 pointer to the window. */
37974 static void
37975 init_window (int window_num)
37977 int i;
37978 dispatch_windows *new_list;
37980 if (window_num == 0)
37981 new_list = dispatch_window_list;
37982 else
37983 new_list = dispatch_window_list1;
37985 new_list->num_insn = 0;
37986 new_list->num_uops = 0;
37987 new_list->window_size = 0;
37988 new_list->next = NULL;
37989 new_list->prev = NULL;
37990 new_list->window_num = window_num;
37991 new_list->num_imm = 0;
37992 new_list->num_imm_32 = 0;
37993 new_list->num_imm_64 = 0;
37994 new_list->imm_size = 0;
37995 new_list->num_loads = 0;
37996 new_list->num_stores = 0;
37997 new_list->violation = false;
37999 for (i = 0; i < MAX_INSN; i++)
38001 new_list->window[i].insn = NULL;
38002 new_list->window[i].group = disp_no_group;
38003 new_list->window[i].path = no_path;
38004 new_list->window[i].byte_len = 0;
38005 new_list->window[i].imm_bytes = 0;
38007 return;
38010 /* This function allocates and initializes a dispatch window and the
38011 list container holding a pointer to the window. */
38013 static dispatch_windows *
38014 allocate_window (void)
38016 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38017 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38019 return new_list;
38022 /* This routine initializes the dispatch scheduling information. It
38023 initiates building dispatch scheduler tables and constructs the
38024 first dispatch window. */
38026 static void
38027 init_dispatch_sched (void)
38029 /* Allocate a dispatch list and a window. */
38030 dispatch_window_list = allocate_window ();
38031 dispatch_window_list1 = allocate_window ();
38032 init_window (0);
38033 init_window (1);
38036 /* This function returns true if a branch is detected. End of a basic block
38037 does not have to be a branch, but here we assume only branches end a
38038 window. */
38040 static bool
38041 is_end_basic_block (enum dispatch_group group)
38043 return group == disp_branch;
38046 /* This function is called when the end of a window processing is reached. */
38048 static void
38049 process_end_window (void)
38051 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38052 if (dispatch_window_list->next)
38054 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38055 gcc_assert (dispatch_window_list->window_size
38056 + dispatch_window_list1->window_size <= 48);
38057 init_window (1);
38059 init_window (0);
38062 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38063 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38064    for 48 bytes of instructions.  Note that these windows are not dispatch
38065    windows whose sizes are DISPATCH_WINDOW_SIZE.  */
38067 static dispatch_windows *
38068 allocate_next_window (int window_num)
38070 if (window_num == 0)
38072 if (dispatch_window_list->next)
38073 init_window (1);
38074 init_window (0);
38075 return dispatch_window_list;
38078 dispatch_window_list->next = dispatch_window_list1;
38079 dispatch_window_list1->prev = dispatch_window_list;
38081 return dispatch_window_list1;
38084 /* Increment the number of immediate operands of an instruction. */
38086 static int
38087 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38089 if (*in_rtx == 0)
38090 return 0;
38092   switch (GET_CODE (*in_rtx))
38094 case CONST:
38095 case SYMBOL_REF:
38096 case CONST_INT:
38097 (imm_values->imm)++;
38098 if (x86_64_immediate_operand (*in_rtx, SImode))
38099 (imm_values->imm32)++;
38100 else
38101 (imm_values->imm64)++;
38102 break;
38104 case CONST_DOUBLE:
38105 (imm_values->imm)++;
38106 (imm_values->imm64)++;
38107 break;
38109 case CODE_LABEL:
38110 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38112 (imm_values->imm)++;
38113 (imm_values->imm32)++;
38115 break;
38117 default:
38118 break;
38121 return 0;
38124 /* Compute number of immediate operands of an instruction. */
38126 static void
38127 find_constant (rtx in_rtx, imm_info *imm_values)
38129 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38130 (rtx_function) find_constant_1, (void *) imm_values);
38133 /* Return total size of immediate operands of an instruction along with number
38134    of corresponding immediate operands.  It initializes its parameters to zero
38135    before calling FIND_CONSTANT.
38136    INSN is the input instruction.  IMM is the total number of immediates.
38137 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
38138 bit immediates. */
38140 static int
38141 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38143 imm_info imm_values = {0, 0, 0};
38145 find_constant (insn, &imm_values);
38146 *imm = imm_values.imm;
38147 *imm32 = imm_values.imm32;
38148 *imm64 = imm_values.imm64;
38149 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
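     /* Worked example: the byte total returned above is 4 bytes per 32-bit
        immediate plus 8 per 64-bit immediate.  A minimal sketch of the same
        arithmetic (immediate_bytes is a hypothetical helper, not a GCC API):  */
     #if 0 /* illustration only */
     static int
     immediate_bytes (int imm32, int imm64)
     {
       /* One SImode-representable immediate plus one full 64-bit immediate
          give 1 * 4 + 1 * 8 == 12 bytes.  */
       return imm32 * 4 + imm64 * 8;
     }
     #endif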
38152 /* This function indicates whether an instruction has an immediate
38153    operand.  */
38155 static bool
38156 has_immediate (rtx insn)
38158 int num_imm_operand;
38159 int num_imm32_operand;
38160 int num_imm64_operand;
38162 if (insn)
38163 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38164 &num_imm64_operand);
38165 return false;
38168 /* Return single or double path for instructions. */
38170 static enum insn_path
38171 get_insn_path (rtx insn)
38173 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38175 if ((int)path == 0)
38176 return path_single;
38178 if ((int)path == 1)
38179 return path_double;
38181 return path_multi;
38184 /* Return insn dispatch group. */
38186 static enum dispatch_group
38187 get_insn_group (rtx insn)
38189 enum dispatch_group group = get_mem_group (insn);
38190 if (group)
38191 return group;
38193 if (is_branch (insn))
38194 return disp_branch;
38196 if (is_cmp (insn))
38197 return disp_cmp;
38199 if (has_immediate (insn))
38200 return disp_imm;
38202 if (is_prefetch (insn))
38203 return disp_prefetch;
38205 return disp_no_group;
38208 /* Count number of GROUP restricted instructions in a dispatch
38209 window WINDOW_LIST. */
38211 static int
38212 count_num_restricted (rtx insn, dispatch_windows *window_list)
38214 enum dispatch_group group = get_insn_group (insn);
38215 int imm_size;
38216 int num_imm_operand;
38217 int num_imm32_operand;
38218 int num_imm64_operand;
38220 if (group == disp_no_group)
38221 return 0;
38223 if (group == disp_imm)
38225 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38226 &num_imm64_operand);
38227 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38228 || num_imm_operand + window_list->num_imm > MAX_IMM
38229 || (num_imm32_operand > 0
38230 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38231 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38232 || (num_imm64_operand > 0
38233 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38234 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38235 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38236 && num_imm64_operand > 0
38237 && ((window_list->num_imm_64 > 0
38238 && window_list->num_insn >= 2)
38239 || window_list->num_insn >= 3)))
38240 return BIG;
38242 return 1;
38245 if ((group == disp_load_store
38246 && (window_list->num_loads >= MAX_LOAD
38247 || window_list->num_stores >= MAX_STORE))
38248 || ((group == disp_load
38249 || group == disp_prefetch)
38250 && window_list->num_loads >= MAX_LOAD)
38251 || (group == disp_store
38252 && window_list->num_stores >= MAX_STORE))
38253 return BIG;
38255 return 1;
38258 /* This function returns true if insn satisfies dispatch rules on the
38259 last window scheduled. */
38261 static bool
38262 fits_dispatch_window (rtx insn)
38264 dispatch_windows *window_list = dispatch_window_list;
38265 dispatch_windows *window_list_next = dispatch_window_list->next;
38266 unsigned int num_restrict;
38267 enum dispatch_group group = get_insn_group (insn);
38268 enum insn_path path = get_insn_path (insn);
38269 int sum;
38271 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
38272 instructions should be given the lowest priority in the
38273      scheduling process in the Haifa scheduler to make sure they will be
38274      scheduled in the same dispatch window as the reference to them.  */
38275 if (group == disp_jcc || group == disp_cmp)
38276 return false;
38278 /* Check nonrestricted. */
38279 if (group == disp_no_group || group == disp_branch)
38280 return true;
38282 /* Get last dispatch window. */
38283 if (window_list_next)
38284 window_list = window_list_next;
38286 if (window_list->window_num == 1)
38288 sum = window_list->prev->window_size + window_list->window_size;
38290 if (sum == 32
38291 || (min_insn_size (insn) + sum) >= 48)
38292 /* Window 1 is full. Go for next window. */
38293 return true;
38296 num_restrict = count_num_restricted (insn, window_list);
38298 if (num_restrict > num_allowable_groups[group])
38299 return false;
38301 /* See if it fits in the first window. */
38302 if (window_list->window_num == 0)
38304       /* The first window should have only single and double path
38305 uops. */
38306 if (path == path_double
38307 && (window_list->num_uops + 2) > MAX_INSN)
38308 return false;
38309 else if (path != path_single)
38310 return false;
38312 return true;
38315 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38316 dispatch window WINDOW_LIST. */
38318 static void
38319 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38321 int byte_len = min_insn_size (insn);
38322 int num_insn = window_list->num_insn;
38323 int imm_size;
38324 sched_insn_info *window = window_list->window;
38325 enum dispatch_group group = get_insn_group (insn);
38326 enum insn_path path = get_insn_path (insn);
38327 int num_imm_operand;
38328 int num_imm32_operand;
38329 int num_imm64_operand;
38331 if (!window_list->violation && group != disp_cmp
38332 && !fits_dispatch_window (insn))
38333 window_list->violation = true;
38335 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38336 &num_imm64_operand);
38338 /* Initialize window with new instruction. */
38339 window[num_insn].insn = insn;
38340 window[num_insn].byte_len = byte_len;
38341 window[num_insn].group = group;
38342 window[num_insn].path = path;
38343 window[num_insn].imm_bytes = imm_size;
38345 window_list->window_size += byte_len;
38346 window_list->num_insn = num_insn + 1;
38347 window_list->num_uops = window_list->num_uops + num_uops;
38348 window_list->imm_size += imm_size;
38349 window_list->num_imm += num_imm_operand;
38350 window_list->num_imm_32 += num_imm32_operand;
38351 window_list->num_imm_64 += num_imm64_operand;
38353 if (group == disp_store)
38354 window_list->num_stores += 1;
38355 else if (group == disp_load
38356 || group == disp_prefetch)
38357 window_list->num_loads += 1;
38358 else if (group == disp_load_store)
38360 window_list->num_stores += 1;
38361 window_list->num_loads += 1;
38365 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38366    If the total number of bytes or the number of instructions in
38367    the window exceeds the allowable limit, a new window is allocated.  */
38369 static void
38370 add_to_dispatch_window (rtx insn)
38372 int byte_len;
38373 dispatch_windows *window_list;
38374 dispatch_windows *next_list;
38375 dispatch_windows *window0_list;
38376 enum insn_path path;
38377 enum dispatch_group insn_group;
38378 bool insn_fits;
38379 int num_insn;
38380 int num_uops;
38381 int window_num;
38382 int insn_num_uops;
38383 int sum;
38385 if (INSN_CODE (insn) < 0)
38386 return;
38388 byte_len = min_insn_size (insn);
38389 window_list = dispatch_window_list;
38390 next_list = window_list->next;
38391 path = get_insn_path (insn);
38392 insn_group = get_insn_group (insn);
38394 /* Get the last dispatch window. */
38395 if (next_list)
38396 window_list = dispatch_window_list->next;
38398 if (path == path_single)
38399 insn_num_uops = 1;
38400 else if (path == path_double)
38401 insn_num_uops = 2;
38402 else
38403 insn_num_uops = (int) path;
38405 /* If current window is full, get a new window.
38406      Window number zero is full if MAX_INSN uops are scheduled in it.
38407      Window number one is full if window zero's bytes plus window
38408      one's bytes equal 32, or if adding the bytes of the new instruction
38409      to the total makes it greater than 48, or if it already has MAX_INSN
38410      instructions in it.  */
38411 num_insn = window_list->num_insn;
38412 num_uops = window_list->num_uops;
38413 window_num = window_list->window_num;
38414 insn_fits = fits_dispatch_window (insn);
38416 if (num_insn >= MAX_INSN
38417 || num_uops + insn_num_uops > MAX_INSN
38418 || !(insn_fits))
38420 window_num = ~window_num & 1;
38421 window_list = allocate_next_window (window_num);
38424 if (window_num == 0)
38426 add_insn_window (insn, window_list, insn_num_uops);
38427 if (window_list->num_insn >= MAX_INSN
38428 && insn_group == disp_branch)
38430 process_end_window ();
38431 return;
38434 else if (window_num == 1)
38436 window0_list = window_list->prev;
38437 sum = window0_list->window_size + window_list->window_size;
38438 if (sum == 32
38439 || (byte_len + sum) >= 48)
38441 process_end_window ();
38442 window_list = dispatch_window_list;
38445 add_insn_window (insn, window_list, insn_num_uops);
38447 else
38448 gcc_unreachable ();
38450 if (is_end_basic_block (insn_group))
38452       /* End of basic block is reached; do end-of-basic-block processing.  */
38453 process_end_window ();
38454 return;
38458 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38460 DEBUG_FUNCTION static void
38461 debug_dispatch_window_file (FILE *file, int window_num)
38463 dispatch_windows *list;
38464 int i;
38466 if (window_num == 0)
38467 list = dispatch_window_list;
38468 else
38469 list = dispatch_window_list1;
38471 fprintf (file, "Window #%d:\n", list->window_num);
38472 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38473 list->num_insn, list->num_uops, list->window_size);
38474 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38475 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38477 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38478 list->num_stores);
38479 fprintf (file, " insn info:\n");
38481 for (i = 0; i < MAX_INSN; i++)
38483 if (!list->window[i].insn)
38484 break;
38485 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38486 i, group_name[list->window[i].group],
38487 i, (void *)list->window[i].insn,
38488 i, list->window[i].path,
38489 i, list->window[i].byte_len,
38490 i, list->window[i].imm_bytes);
38494 /* Print to stdout a dispatch window. */
38496 DEBUG_FUNCTION void
38497 debug_dispatch_window (int window_num)
38499 debug_dispatch_window_file (stdout, window_num);
38502 /* Print INSN dispatch information to FILE. */
38504 DEBUG_FUNCTION static void
38505 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38507 int byte_len;
38508 enum insn_path path;
38509 enum dispatch_group group;
38510 int imm_size;
38511 int num_imm_operand;
38512 int num_imm32_operand;
38513 int num_imm64_operand;
38515 if (INSN_CODE (insn) < 0)
38516 return;
38518 byte_len = min_insn_size (insn);
38519 path = get_insn_path (insn);
38520 group = get_insn_group (insn);
38521 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38522 &num_imm64_operand);
38524 fprintf (file, " insn info:\n");
38525 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38526 group_name[group], path, byte_len);
38527 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38528 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38531 /* Print to STDERR the status of the ready list with respect to
38532 dispatch windows. */
38534 DEBUG_FUNCTION void
38535 debug_ready_dispatch (void)
38537 int i;
38538 int no_ready = number_in_ready ();
38540 fprintf (stdout, "Number of ready: %d\n", no_ready);
38542 for (i = 0; i < no_ready; i++)
38543 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38546 /* This routine is the driver of the dispatch scheduler. */
38548 static void
38549 do_dispatch (rtx insn, int mode)
38551 if (mode == DISPATCH_INIT)
38552 init_dispatch_sched ();
38553 else if (mode == ADD_TO_DISPATCH_WINDOW)
38554 add_to_dispatch_window (insn);
38557 /* Return TRUE if Dispatch Scheduling is supported. */
38559 static bool
38560 has_dispatch (rtx insn, int action)
38562 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38563 && flag_dispatch_scheduler)
38564 switch (action)
38566 default:
38567 return false;
38569 case IS_DISPATCH_ON:
38570 return true;
38571 break;
38573 case IS_CMP:
38574 return is_cmp (insn);
38576 case DISPATCH_VIOLATION:
38577 return dispatch_violation ();
38579 case FITS_DISPATCH_WINDOW:
38580 return fits_dispatch_window (insn);
38583 return false;
38586 /* Implementation of reassociation_width target hook used by
38587 reassoc phase to identify parallelism level in reassociated
38588    tree.  The statement's tree_code is passed in OPC.  The arguments'
38589    type is passed in MODE.
38591 Currently parallel reassociation is enabled for Atom
38592 processors only and we set reassociation width to be 2
38593 because Atom may issue up to 2 instructions per cycle.
38595 Return value should be fixed if parallel reassociation is
38596 enabled for other processors. */
38598 static int
38599 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38600 enum machine_mode mode)
38602 int res = 1;
38604 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38605 res = 2;
38606 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38607 res = 2;
38609 return res;
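     /* For illustration: with a reassociation width of 2, a chain such as
        a + b + c + d can be rewritten as (a + b) + (c + d), exposing two
        independent additions per step for a two-issue pipeline such as
        Atom's.  */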
38612 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38613 place emms and femms instructions. */
38615 static enum machine_mode
38616 ix86_preferred_simd_mode (enum machine_mode mode)
38618 if (!TARGET_SSE)
38619 return word_mode;
38621 switch (mode)
38623 case QImode:
38624 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38625 case HImode:
38626 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38627 case SImode:
38628 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38629 case DImode:
38630 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38632 case SFmode:
38633 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38634 return V8SFmode;
38635 else
38636 return V4SFmode;
38638 case DFmode:
38639 if (!TARGET_VECTORIZE_DOUBLE)
38640 return word_mode;
38641 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38642 return V4DFmode;
38643 else if (TARGET_SSE2)
38644 return V2DFmode;
38645 /* FALLTHRU */
38647 default:
38648 return word_mode;
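     /* For illustration: with -mavx (and without the prefer-128-bit tuning)
        SFmode loops vectorize in V8SFmode and DFmode loops in V4DFmode;
        with plain SSE2 the same loops use V4SFmode and V2DFmode, and DFmode
        falls back to word_mode when TARGET_VECTORIZE_DOUBLE is off.  */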
38652 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38653 vectors. */
38655 static unsigned int
38656 ix86_autovectorize_vector_sizes (void)
38658 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
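     /* For illustration: the value is a bitmask of vector sizes in bytes, so
        32 | 16 asks the vectorizer to try 256-bit vectors and fall back to
        128-bit ones, while 0 leaves the choice to the preferred SIMD mode
        hook above.  */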
38661 /* Initialize the GCC target structure. */
38662 #undef TARGET_RETURN_IN_MEMORY
38663 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38665 #undef TARGET_LEGITIMIZE_ADDRESS
38666 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38668 #undef TARGET_ATTRIBUTE_TABLE
38669 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38670 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38671 # undef TARGET_MERGE_DECL_ATTRIBUTES
38672 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38673 #endif
38675 #undef TARGET_COMP_TYPE_ATTRIBUTES
38676 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38678 #undef TARGET_INIT_BUILTINS
38679 #define TARGET_INIT_BUILTINS ix86_init_builtins
38680 #undef TARGET_BUILTIN_DECL
38681 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38682 #undef TARGET_EXPAND_BUILTIN
38683 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38685 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38686 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38687 ix86_builtin_vectorized_function
38689 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38690 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38692 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38693 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38695 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38696 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38698 #undef TARGET_BUILTIN_RECIPROCAL
38699 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38701 #undef TARGET_ASM_FUNCTION_EPILOGUE
38702 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38704 #undef TARGET_ENCODE_SECTION_INFO
38705 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38706 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38707 #else
38708 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38709 #endif
38711 #undef TARGET_ASM_OPEN_PAREN
38712 #define TARGET_ASM_OPEN_PAREN ""
38713 #undef TARGET_ASM_CLOSE_PAREN
38714 #define TARGET_ASM_CLOSE_PAREN ""
38716 #undef TARGET_ASM_BYTE_OP
38717 #define TARGET_ASM_BYTE_OP ASM_BYTE
38719 #undef TARGET_ASM_ALIGNED_HI_OP
38720 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38721 #undef TARGET_ASM_ALIGNED_SI_OP
38722 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38723 #ifdef ASM_QUAD
38724 #undef TARGET_ASM_ALIGNED_DI_OP
38725 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38726 #endif
38728 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38729 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38731 #undef TARGET_ASM_UNALIGNED_HI_OP
38732 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38733 #undef TARGET_ASM_UNALIGNED_SI_OP
38734 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38735 #undef TARGET_ASM_UNALIGNED_DI_OP
38736 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38738 #undef TARGET_PRINT_OPERAND
38739 #define TARGET_PRINT_OPERAND ix86_print_operand
38740 #undef TARGET_PRINT_OPERAND_ADDRESS
38741 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38742 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38743 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38744 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38745 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38747 #undef TARGET_SCHED_INIT_GLOBAL
38748 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38749 #undef TARGET_SCHED_ADJUST_COST
38750 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38751 #undef TARGET_SCHED_ISSUE_RATE
38752 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38753 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38754 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38755 ia32_multipass_dfa_lookahead
38757 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38758 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38760 #ifdef HAVE_AS_TLS
38761 #undef TARGET_HAVE_TLS
38762 #define TARGET_HAVE_TLS true
38763 #endif
38764 #undef TARGET_CANNOT_FORCE_CONST_MEM
38765 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38766 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38767 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38769 #undef TARGET_DELEGITIMIZE_ADDRESS
38770 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38772 #undef TARGET_MS_BITFIELD_LAYOUT_P
38773 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38775 #if TARGET_MACHO
38776 #undef TARGET_BINDS_LOCAL_P
38777 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38778 #endif
38779 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38780 #undef TARGET_BINDS_LOCAL_P
38781 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38782 #endif
38784 #undef TARGET_ASM_OUTPUT_MI_THUNK
38785 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38786 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38787 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38789 #undef TARGET_ASM_FILE_START
38790 #define TARGET_ASM_FILE_START x86_file_start
38792 #undef TARGET_OPTION_OVERRIDE
38793 #define TARGET_OPTION_OVERRIDE ix86_option_override
38795 #undef TARGET_REGISTER_MOVE_COST
38796 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38797 #undef TARGET_MEMORY_MOVE_COST
38798 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38799 #undef TARGET_RTX_COSTS
38800 #define TARGET_RTX_COSTS ix86_rtx_costs
38801 #undef TARGET_ADDRESS_COST
38802 #define TARGET_ADDRESS_COST ix86_address_cost
38804 #undef TARGET_FIXED_CONDITION_CODE_REGS
38805 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38806 #undef TARGET_CC_MODES_COMPATIBLE
38807 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38809 #undef TARGET_MACHINE_DEPENDENT_REORG
38810 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38812 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38813 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38815 #undef TARGET_BUILD_BUILTIN_VA_LIST
38816 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38818 #undef TARGET_ENUM_VA_LIST_P
38819 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38821 #undef TARGET_FN_ABI_VA_LIST
38822 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38824 #undef TARGET_CANONICAL_VA_LIST_TYPE
38825 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38827 #undef TARGET_EXPAND_BUILTIN_VA_START
38828 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38830 #undef TARGET_MD_ASM_CLOBBERS
38831 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38833 #undef TARGET_PROMOTE_PROTOTYPES
38834 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38835 #undef TARGET_STRUCT_VALUE_RTX
38836 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38837 #undef TARGET_SETUP_INCOMING_VARARGS
38838 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38839 #undef TARGET_MUST_PASS_IN_STACK
38840 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38841 #undef TARGET_FUNCTION_ARG_ADVANCE
38842 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38843 #undef TARGET_FUNCTION_ARG
38844 #define TARGET_FUNCTION_ARG ix86_function_arg
38845 #undef TARGET_FUNCTION_ARG_BOUNDARY
38846 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38847 #undef TARGET_PASS_BY_REFERENCE
38848 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38849 #undef TARGET_INTERNAL_ARG_POINTER
38850 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38851 #undef TARGET_UPDATE_STACK_BOUNDARY
38852 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38853 #undef TARGET_GET_DRAP_RTX
38854 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38855 #undef TARGET_STRICT_ARGUMENT_NAMING
38856 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38857 #undef TARGET_STATIC_CHAIN
38858 #define TARGET_STATIC_CHAIN ix86_static_chain
38859 #undef TARGET_TRAMPOLINE_INIT
38860 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38861 #undef TARGET_RETURN_POPS_ARGS
38862 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38864 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38865 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38867 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38868 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38870 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38871 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38873 #undef TARGET_C_MODE_FOR_SUFFIX
38874 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38876 #ifdef HAVE_AS_TLS
38877 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38878 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38879 #endif
38881 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38882 #undef TARGET_INSERT_ATTRIBUTES
38883 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38884 #endif
38886 #undef TARGET_MANGLE_TYPE
38887 #define TARGET_MANGLE_TYPE ix86_mangle_type
38889 #if !TARGET_MACHO
38890 #undef TARGET_STACK_PROTECT_FAIL
38891 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38892 #endif
38894 #undef TARGET_FUNCTION_VALUE
38895 #define TARGET_FUNCTION_VALUE ix86_function_value
38897 #undef TARGET_FUNCTION_VALUE_REGNO_P
38898 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38900 #undef TARGET_PROMOTE_FUNCTION_MODE
38901 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38903 #undef TARGET_SECONDARY_RELOAD
38904 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38906 #undef TARGET_CLASS_MAX_NREGS
38907 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38909 #undef TARGET_PREFERRED_RELOAD_CLASS
38910 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38911 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38912 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38913 #undef TARGET_CLASS_LIKELY_SPILLED_P
38914 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38916 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38917 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38918 ix86_builtin_vectorization_cost
38919 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38920 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38921 ix86_vectorize_vec_perm_const_ok
38922 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38923 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38924 ix86_preferred_simd_mode
38925 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38926 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38927 ix86_autovectorize_vector_sizes
38929 #undef TARGET_SET_CURRENT_FUNCTION
38930 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38932 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38933 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38935 #undef TARGET_OPTION_SAVE
38936 #define TARGET_OPTION_SAVE ix86_function_specific_save
38938 #undef TARGET_OPTION_RESTORE
38939 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38941 #undef TARGET_OPTION_PRINT
38942 #define TARGET_OPTION_PRINT ix86_function_specific_print
38944 #undef TARGET_CAN_INLINE_P
38945 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38947 #undef TARGET_EXPAND_TO_RTL_HOOK
38948 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38950 #undef TARGET_LEGITIMATE_ADDRESS_P
38951 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38953 #undef TARGET_LEGITIMATE_CONSTANT_P
38954 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38956 #undef TARGET_FRAME_POINTER_REQUIRED
38957 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38959 #undef TARGET_CAN_ELIMINATE
38960 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38962 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38963 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38965 #undef TARGET_ASM_CODE_END
38966 #define TARGET_ASM_CODE_END ix86_code_end
38968 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38969 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38971 #if TARGET_MACHO
38972 #undef TARGET_INIT_LIBFUNCS
38973 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38974 #endif
38976 struct gcc_target targetm = TARGET_INITIALIZER;
38978 #include "gt-i386.h"