/* Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "expr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (MEM_P (op))
        {
          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
        }
    }
}
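
/* For example, on !TARGET_64BIT splitting a single DImode register
   operand with
     split_double_mode (DImode, &op, 1, &lo, &hi);
   yields lo = (subreg:SI (reg:DI) 0) and hi = (subreg:SI (reg:DI) 4),
   since GET_MODE_SIZE (SImode) == 4.  */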
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
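
/* E.g. clearing an SImode register without TARGET_USE_MOV0 emits
     (parallel [(set (reg:SI x) (const_int 0))
                (clobber (reg:CC flags))])
   which assembles as "xor %reg, %reg"; the flags clobber records that
   XOR, unlike "mov $0, %reg", modifies EFLAGS.  */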
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                (TARGET_64BIT
                                 ? UNSPEC_GOTPCREL
                                 : UNSPEC_GOT));
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
#endif
        }
      else
        {
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))
        {
          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
             out the back end.  */

          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))
            {
              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);
              return;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
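
/* As an illustration of the ix86_force_load_from_GOT_p path above:
   with -fno-plt an external function address is loaded from its GOT
   slot, roughly "mov foo@GOTPCREL(%rip), %reg" on x86-64, which is
   what the UNSPEC_GOTPCREL constant memory reference expresses.  */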
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        op1 = validize_mem (force_const_mem (mode, op1));
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
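
/* The net effect for a misaligned 32-byte load, when splitting is
   enabled, is roughly
        vmovups     mem, %xmm0
        vinsertf128 $1, mem+16, %ymm0, %ymm0
   and for a store two vextractf128 halves; i.e. two 16-byte accesses
   replace the single 32-byte one.  */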
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         unpcklpd reg, reg
       }  */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
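
/* The (vec_select ... [0 2 0 0]) above matches pshufd, copying
   element 2 (bits 64:95) of the SSE result into element 1
   (bits 32:63), where the 64-bit MMX-style result expects it.  */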
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}
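
/* For example, an MMX packsswb (two V4HI sources -> V8QI) is rewritten
   as an SSE packsswb on the 128-bit modes: the saturating truncations
   of both sources are concatenated into a V16QI destination, and
   ix86_move_vector_high_sse_to_mmx then shuffles the needed high bits
   back into the low 64 bits.  */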
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  if (high_p)
    {
      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                          GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
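
/* E.g. for a commutative PLUS with dst == src2, as in
     operands = { (reg:SI 100), (mem:SI ...), (reg:SI 100) }
   this returns true, so the register becomes src1 and can be tied to
   the destination while the memory reference comes second.  */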
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}
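
/* E.g. for source like
     __m128 a, b;  ... (__m128i) a | (__m128i) b ...
   the operands arrive as integer-mode subregs of V4SF pseudos; the
   code above strips the casts and emits the IOR in V4SFmode, so a
   single "orps" results instead of a domain-crossing "por".  */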
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
  rtx (*gen_zero_extend) (rtx, rtx);
  rtx (*gen_test_ccno_1) (rtx, rtx);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
          gen_zero_extend = gen_zero_extendqisi2;
        }
      else
        {
          gen_divmod4_1
            = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
          gen_zero_extend = gen_zero_extendqidi2;
        }
      gen_test_ccno_1 = gen_testsi_ccno_1;
      break;
    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      gen_test_ccno_1 = gen_testdi_ccno_1;
      gen_zero_extend = gen_zero_extendqidi2;
      break;
    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  div = gen_divmod4_1 (operands[0], operands[1],
                       operands[2], operands[3]);
  emit_insn (div);

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
                               tmp0, GEN_INT (8), GEN_INT (8));
  if (REG_P (operands[1]))
    insn = emit_move_insn (operands[1], tmp1);
  else
    {
      /* Need a new scratch register since the old one has result
         of 8bit divide.  */
      scratch = gen_reg_rtx (GET_MODE (operands[1]));
      emit_move_insn (scratch, tmp1);
      insn = emit_move_insn (operands[1], scratch);
    }
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_zero_extend (operands[0], tmp1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
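
/* Roughly, the emitted sequence for the 32-bit unsigned case is
        mov     dividend, scratch
        or      divisor, scratch
        test    $-0x100, scratch        ; both values in [0, 255]?
        je      .Lqimode
        <full 32-bit divide>
        jmp     .Lend
   .Lqimode:
        <8-bit divide; AL = quotient, AH = remainder>
   .Lend:
   trading one well-predicted branch for the much cheaper divide
   (label names here are illustrative only).  */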
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling.  */
          ix86_emit_binop (ASHIFT, mode, target,
                           GEN_INT (exact_log2 (parts.scale)));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
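
/* As an example of the scale > 1 path with distinct registers,
   "lea (%rbx,%rcx,4), %rax" is split into
        mov     %rcx, %rax
        shl     $2, %rax
        add     %rbx, %rax
   i.e. three ALU instructions that avoid the AGU entirely.  */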
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss (value, value, input));
      else
        emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
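
/* In effect the split computes, branch-free,
     result = x < 0x1p31 ? (int) x : ((int) (x - 0x1p31)) ^ 0x80000000;
   the LE comparison builds an all-ones mask selecting between 0 and
   2**31 for the subtraction, and the same mask shifted left by 31
   supplies the final XOR that restores the high bit.  */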
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
                           0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                               0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                               0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}
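
/* I.e. the value is reassembled as
     (float) (x >> 16) * 0x1.0p16 + (float) (x & 0xffff)
   where each 16-bit half converts exactly to SFmode, sidestepping the
   signedness of the 32-bit cvtsi2ss conversion.  */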
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
                                OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);
  else
    {
      rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0,
                                   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
}

/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])
{
  rtx mask, set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;

  if (vector_mode)
    use_sse = true;
  else if (mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
    }

  /* NEG and ABS performed with SSE use bitwise mask operations.
     Create the appropriate mask now.  */
  if (use_sse)
    mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
  else
    mask = NULL_RTX;

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (mask)
    {
      rtx use, clob;
      rtvec par;

      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode)
        par = gen_rtvec (2, set, use);
      else
        {
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);
        }
      emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
    }
  else
    emit_insn (set);
}
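
/* With SSE the mask built above turns NEG into an XOR with the
   sign-bit vector (e.g. xorps) and ABS into an AND with its complement
   (e.g. andps); the USE of the mask in the parallel keeps it available
   for the splitter that picks the concrete instruction.  */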
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask, nmask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  if (CONST_DOUBLE_P (op0))
    {
      rtx (*copysign_insn)(rtx, rtx, rtx, rtx);

      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
        op0 = simplify_unary_operation (ABS, mode, op0, mode);

      if (mode == SFmode || mode == DFmode)
        {
          if (op0 == CONST0_RTX (mode))
            op0 = CONST0_RTX (vmode);
          else
            {
              rtx v = ix86_build_const_vector (vmode, false, op0);

              op0 = force_reg (vmode, v);
            }
        }
      else if (op0 != CONST0_RTX (mode))
        op0 = force_reg (mode, op0);

      mask = ix86_build_signbit_mask (vmode, 0, 0);

      if (mode == SFmode)
        copysign_insn = gen_copysignsf3_const;
      else if (mode == DFmode)
        copysign_insn = gen_copysigndf3_const;
      else
        copysign_insn = gen_copysigntf3_const;

      emit_insn (copysign_insn (dest, op0, op1, mask));
    }
  else
    {
      rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);

      nmask = ix86_build_signbit_mask (vmode, 0, 1);
      mask = ix86_build_signbit_mask (vmode, 0, 0);

      if (mode == SFmode)
        copysign_insn = gen_copysignsf3_var;
      else if (mode == DFmode)
        copysign_insn = gen_copysigndf3_var;
      else
        copysign_insn = gen_copysigntf3_var;

      emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
    }
}
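
/* Both variants implement the usual bit-mask identity
     copysign (x, y) = (x & ~signmask) | (y & signmask);
   the _const patterns cover the case where X is a compile-time
   constant whose sign-clearing AND has already been folded.  */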
/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */

void
ix86_split_copysign_const (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  if (op0 != CONST0_RTX (vmode))
    {
      x = gen_rtx_IOR (vmode, dest, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
   so we have to do two masks.  */

void
ix86_split_copysign_var (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, scratch, op0, op1, mask, nmask, x;

  dest = operands[0];
  scratch = operands[1];
  op0 = operands[2];
  op1 = operands[3];
  nmask = operands[4];
  mask = operands[5];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  if (rtx_equal_p (op0, op1))
    {
      /* Shouldn't happen often (it's useless, obviously), but when it does
         we'd generate incorrect code if we continue below.  */
      emit_move_insn (dest, op0);
      return;
    }

  if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
    {
      gcc_assert (REGNO (op1) == REGNO (scratch));

      x = gen_rtx_AND (vmode, scratch, mask);
      emit_insn (gen_rtx_SET (scratch, x));

      dest = mask;
      op0 = lowpart_subreg (vmode, op0, mode);
      x = gen_rtx_NOT (vmode, dest);
      x = gen_rtx_AND (vmode, x, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
  else
    {
      if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
        {
          x = gen_rtx_AND (vmode, scratch, mask);
        }
      else						/* alternative 2,4 */
        {
          gcc_assert (REGNO (mask) == REGNO (scratch));
          op1 = lowpart_subreg (vmode, op1, mode);
          x = gen_rtx_AND (vmode, scratch, op1);
        }
      emit_insn (gen_rtx_SET (scratch, x));

      if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
        {
          dest = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, nmask);
        }
      else						/* alternative 3,4 */
        {
          gcc_assert (REGNO (nmask) == REGNO (dest));
          dest = nmask;
          op0 = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, op0);
        }
      emit_insn (gen_rtx_SET (dest, x));
    }

  x = gen_rtx_IOR (vmode, dest, scratch);
  emit_insn (gen_rtx_SET (dest, x));
}
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  rtx (*xorsign_insn)(rtx, rtx, rtx, rtx);
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    {
      xorsign_insn = gen_xorsignsf3_1;
      vmode = V4SFmode;
    }
  else if (mode == DFmode)
    {
      xorsign_insn = gen_xorsigndf3_1;
      vmode = V2DFmode;
    }
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  emit_insn (xorsign_insn (dest, op0, op1, mask));
}
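
/* xorsign (x, y) = x ^ (y & signmask): only the sign bit of Y survives
   the AND, so the XOR flips the sign of X exactly when Y is negative,
   which equals copysign (x, y) whenever X is known nonnegative.  */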
/* Deconstruct an xorsign operation into bit masks.  */

void
ix86_split_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  op0 = lowpart_subreg (vmode, op0, mode);
  x = gen_rtx_XOR (vmode, dest, op0);
  emit_insn (gen_rtx_SET (dest, x));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
                              gen_rtx_UNSPEC (CCmode,
                                              gen_rtvec (2, tmp, tmp),
                                              UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* For 32-bit target DI comparison may be performed on
         SSE registers.  To allow this we should avoid split
         to SI mode which is achieved by doing xor in DI mode
         and then comparing with zero (which is recognized by
         STV pass).  We don't compare using xor when optimizing
         for size.  */
      if (!optimize_insn_for_size_p ()
          && TARGET_STV
          && (code == EQ || code == NE))
        {
          op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
          op1 = const0_rtx;
        }
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
           avoid two branches.  This costs one extra insn, so disable when
           optimizing for size.  */

        if ((code == EQ || code == NE)
            && (!optimize_insn_for_size_p ()
                || hi[1] == const0_rtx || lo[1] == const0_rtx))
          {
            rtx xor0, xor1;

            xor1 = hi[0];
            if (hi[1] != const0_rtx)
              xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            xor0 = lo[0];
            if (lo[1] != const0_rtx)
              xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            tmp = expand_binop (submode, ior_optab, xor1, xor0,
                                NULL_RTX, 0, OPTAB_WIDEN);

            ix86_expand_branch (code, tmp, const0_rtx, label);
            return;
          }

        /* Otherwise, if we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              rtx (*cmp_insn) (rtx, rtx);
              rtx (*sbb_insn) (rtx, rtx, rtx);
              bool uns = (code == LTU || code == GEU);

              if (TARGET_64BIT)
                {
                  cmp_insn = gen_cmpdi_1;
                  sbb_insn
                    = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
                }
              else
                {
                  cmp_insn = gen_cmpsi_1;
                  sbb_insn
                    = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
                }

              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              emit_insn (cmp_insn (lo[0], lo[1]));
              emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);

              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case GT:
    case GE:
    case LT:
    case LE:
      return false;

    case EQ:
    case NE:

    case LTGT:
    case UNORDERED:
    case ORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
    case UNEQ:
      return true;

    default:
      gcc_unreachable ();
    }
}
/* Return a comparison we can do that is equivalent to
   swap_condition (code), apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as the cost metric.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
2334 /* Swap, force into registers, or otherwise massage the two operands
2335 to a fp comparison. The operands are updated in place; the new
2336 comparison code is returned. */
2338 static enum rtx_code
2339 ix86_prepare_fp_compare_args (enum rtx_code code
, rtx
*pop0
, rtx
*pop1
)
2341 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2342 rtx op0
= *pop0
, op1
= *pop1
;
2343 machine_mode op_mode
= GET_MODE (op0
);
2344 bool is_sse
= TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (op_mode
);
2346 /* All of the unordered compare instructions only work on registers.
2347 The same is true of the fcomi compare instructions. The XFmode
2348 compare instructions require registers except when comparing
2349 against zero or when converting operand 1 from fixed point to
2353 && (unordered_compare
2354 || (op_mode
== XFmode
2355 && ! (standard_80387_constant_p (op0
) == 1
2356 || standard_80387_constant_p (op1
) == 1)
2357 && GET_CODE (op1
) != FLOAT
)
2358 || ix86_fp_comparison_strategy (code
) == IX86_FPCMP_COMI
))
2360 op0
= force_reg (op_mode
, op0
);
2361 op1
= force_reg (op_mode
, op1
);
2365 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2366 things around if they appear profitable, otherwise force op0
2369 if (standard_80387_constant_p (op0
) == 0
2371 && ! (standard_80387_constant_p (op1
) == 0
2374 enum rtx_code new_code
= ix86_fp_swap_condition (code
);
2375 if (new_code
!= UNKNOWN
)
2377 std::swap (op0
, op1
);
2383 op0
= force_reg (op_mode
, op0
);
2385 if (CONSTANT_P (op1
))
2387 int tmp
= standard_80387_constant_p (op1
);
2389 op1
= validize_mem (force_const_mem (op_mode
, op1
));
2393 op1
= force_reg (op_mode
, op1
);
2396 op1
= force_reg (op_mode
, op1
);
2400 /* Try to rearrange the comparison to make it cheaper. */
2401 if (ix86_fp_comparison_cost (code
)
2402 > ix86_fp_comparison_cost (swap_condition (code
))
2403 && (REG_P (op1
) || can_create_pseudo_p ()))
2405 std::swap (op0
, op1
);
2406 code
= swap_condition (code
);
2408 op0
= force_reg (op_mode
, op0
);
2416 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2419 ix86_expand_fp_compare (enum rtx_code code
, rtx op0
, rtx op1
)
2421 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2422 machine_mode cmp_mode
;
2425 code
= ix86_prepare_fp_compare_args (code
, &op0
, &op1
);
2427 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
2428 if (unordered_compare
)
2429 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
2431 /* Do fcomi/sahf based test when profitable. */
2432 switch (ix86_fp_comparison_strategy (code
))
2434 case IX86_FPCMP_COMI
:
2435 cmp_mode
= CCFPmode
;
2436 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode
, FLAGS_REG
), tmp
));
2439 case IX86_FPCMP_SAHF
:
2440 cmp_mode
= CCFPmode
;
2441 tmp
= gen_rtx_UNSPEC (HImode
, gen_rtvec (1, tmp
), UNSPEC_FNSTSW
);
2442 scratch
= gen_reg_rtx (HImode
);
2443 emit_insn (gen_rtx_SET (scratch
, tmp
));
2444 emit_insn (gen_x86_sahf_1 (scratch
));
2447 case IX86_FPCMP_ARITH
:
2448 cmp_mode
= CCNOmode
;
2449 tmp
= gen_rtx_UNSPEC (HImode
, gen_rtvec (1, tmp
), UNSPEC_FNSTSW
);
2450 scratch
= gen_reg_rtx (HImode
);
2451 emit_insn (gen_rtx_SET (scratch
, tmp
));
      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
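      /* A hedged sketch of the status-word layout this relies on (for
	 illustration only; the constants in the calls below are the
	 authoritative ones): after "fnstsw %ax" the x87 condition bits land
	 in %ah as

	   C0 = 0x01, C2 = 0x04, C3 = 0x40,  so  C0|C2|C3 = 0x45.

	 For example "testb $0x45, %ah" sets ZF exactly when none of
	 C0/C2/C3 is set, i.e. op0 > op1 with neither operand a NaN, which
	 is the single-instruction GT test emitted just below.  */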
2462 if (code
== GT
|| !TARGET_IEEE_FP
)
2464 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2469 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2470 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2471 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x44)));
2478 if (code
== LT
&& TARGET_IEEE_FP
)
2480 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2481 emit_insn (gen_cmpqi_ext_3 (scratch
, const1_rtx
));
2487 emit_insn (gen_testqi_ext_1_ccno (scratch
, const1_rtx
));
2493 if (code
== GE
|| !TARGET_IEEE_FP
)
2495 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x05)));
2500 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2501 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
, const1_rtx
));
2507 if (code
== LE
&& TARGET_IEEE_FP
)
2509 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2510 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2511 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2517 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2523 if (code
== EQ
&& TARGET_IEEE_FP
)
2525 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2526 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2532 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2538 if (code
== NE
&& TARGET_IEEE_FP
)
2540 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2541 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
,
2547 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2553 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
2557 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */
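/* A minimal scalar sketch (identifiers hypothetical) of the idiom this
   helper enables: an unsigned "below" comparison leaves its result in the
   carry flag, so consumers can use sbb/adc instead of a branch, e.g.

     unsigned mask_if_below (unsigned a, unsigned b)
     {
       return a < b ? ~0u : 0u;   // typically cmp a,b ; sbb r,r -- no branch
     }

   The transformations below rewrite other comparisons (a == 0, a > const,
   a >= 0, ...) into such LTU/GEU forms whenever that is possible.  */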
2632 ix86_expand_carry_flag_compare (enum rtx_code code
, rtx op0
, rtx op1
, rtx
*pop
)
2635 = GET_MODE (op0
) != VOIDmode
? GET_MODE (op0
) : GET_MODE (op1
);
  /* Do not handle double-mode compares that go through the special path.  */
2638 if (mode
== (TARGET_64BIT
? TImode
: DImode
))
2641 if (SCALAR_FLOAT_MODE_P (mode
))
2644 rtx_insn
*compare_seq
;
2646 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode
));
      /* Shortcut: the following common codes never translate
	 into carry flag compares.  */
2650 if (code
== EQ
|| code
== NE
|| code
== UNEQ
|| code
== LTGT
2651 || code
== ORDERED
|| code
== UNORDERED
)
      /* These comparisons require the zero flag; swap operands so they won't.  */
2655 if ((code
== GT
|| code
== UNLE
|| code
== LE
|| code
== UNGT
)
2658 std::swap (op0
, op1
);
2659 code
= swap_condition (code
);
      /* Try to expand the comparison and verify that we end up with
	 a carry flag based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not a common scenario.  */
2667 compare_op
= ix86_expand_fp_compare (code
, op0
, op1
);
2668 compare_seq
= get_insns ();
2671 if (GET_MODE (XEXP (compare_op
, 0)) == CCFPmode
)
2672 code
= ix86_fp_compare_code_to_integer (GET_CODE (compare_op
));
2674 code
= GET_CODE (compare_op
);
2676 if (code
!= LTU
&& code
!= GEU
)
2679 emit_insn (compare_seq
);
2684 if (!INTEGRAL_MODE_P (mode
))
2693 /* Convert a==0 into (unsigned)a<1. */
2696 if (op1
!= const0_rtx
)
2699 code
= (code
== EQ
? LTU
: GEU
);
2702 /* Convert a>b into b<a or a>=b-1. */
2705 if (CONST_INT_P (op1
))
2707 op1
= gen_int_mode (INTVAL (op1
) + 1, GET_MODE (op0
));
	  /* Bail out on overflow.  We could still swap the operands, but
	     that would force loading the constant into a register.  */
2710 if (op1
== const0_rtx
2711 || !x86_64_immediate_operand (op1
, GET_MODE (op1
)))
2713 code
= (code
== GTU
? GEU
: LTU
);
2717 std::swap (op0
, op1
);
2718 code
= (code
== GTU
? LTU
: GEU
);
2722 /* Convert a>=0 into (unsigned)a<0x80000000. */
2725 if (mode
== DImode
|| op1
!= const0_rtx
)
2727 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2728 code
= (code
== LT
? GEU
: LTU
);
2732 if (mode
== DImode
|| op1
!= constm1_rtx
)
2734 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2735 code
= (code
== LE
? GEU
: LTU
);
2741 /* Swapping operands may cause constant to appear as first operand. */
2742 if (!nonimmediate_operand (op0
, VOIDmode
))
2744 if (!can_create_pseudo_p ())
2746 op0
= force_reg (mode
, op0
);
2748 *pop
= ix86_expand_compare (code
, op0
, op1
);
2749 gcc_assert (GET_CODE (*pop
) == LTU
|| GET_CODE (*pop
) == GEU
);
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
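/* A hedged sketch (hypothetical helper name) of what this expander targets:
   when the comparison can be expressed through the carry flag, a conditional
   +/-1 needs neither a branch nor a cmov, e.g.

     unsigned count_below (unsigned a, unsigned b, unsigned acc)
     {
       return acc + (a < b);     // typically cmp a,b ; adc $0, acc
     }

   operands[3] being const1_rtx or constm1_rtx below selects the adc or the
   sbb form of the same trick.  */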
2757 ix86_expand_int_addcc (rtx operands
[])
2759 enum rtx_code code
= GET_CODE (operands
[1]);
2761 rtx (*insn
)(rtx
, rtx
, rtx
, rtx
, rtx
);
2763 rtx val
= const0_rtx
;
2766 rtx op0
= XEXP (operands
[1], 0);
2767 rtx op1
= XEXP (operands
[1], 1);
2769 if (operands
[3] != const1_rtx
2770 && operands
[3] != constm1_rtx
)
2772 if (!ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2774 code
= GET_CODE (compare_op
);
2776 flags
= XEXP (compare_op
, 0);
2778 if (GET_MODE (flags
) == CCFPmode
)
2781 code
= ix86_fp_compare_code_to_integer (code
);
2788 PUT_CODE (compare_op
,
2789 reverse_condition_maybe_unordered
2790 (GET_CODE (compare_op
)));
2792 PUT_CODE (compare_op
, reverse_condition (GET_CODE (compare_op
)));
2795 mode
= GET_MODE (operands
[0]);
2797 /* Construct either adc or sbb insn. */
2798 if ((code
== LTU
) == (operands
[3] == constm1_rtx
))
2803 insn
= gen_subqi3_carry
;
2806 insn
= gen_subhi3_carry
;
2809 insn
= gen_subsi3_carry
;
2812 insn
= gen_subdi3_carry
;
2823 insn
= gen_addqi3_carry
;
2826 insn
= gen_addhi3_carry
;
2829 insn
= gen_addsi3_carry
;
2832 insn
= gen_adddi3_carry
;
2838 emit_insn (insn (operands
[0], operands
[2], val
, flags
, compare_op
));
2844 ix86_expand_int_movcc (rtx operands
[])
2846 enum rtx_code code
= GET_CODE (operands
[1]), compare_code
;
2847 rtx_insn
*compare_seq
;
2849 machine_mode mode
= GET_MODE (operands
[0]);
2850 bool sign_bit_compare_p
= false;
2851 rtx op0
= XEXP (operands
[1], 0);
2852 rtx op1
= XEXP (operands
[1], 1);
2854 if (GET_MODE (op0
) == TImode
2855 || (GET_MODE (op0
) == DImode
2860 compare_op
= ix86_expand_compare (code
, op0
, op1
);
2861 compare_seq
= get_insns ();
2864 compare_code
= GET_CODE (compare_op
);
2866 if ((op1
== const0_rtx
&& (code
== GE
|| code
== LT
))
2867 || (op1
== constm1_rtx
&& (code
== GT
|| code
== LE
)))
2868 sign_bit_compare_p
= true;
2870 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2871 HImode insns, we'd be swallowed in word prefix ops. */
2873 if ((mode
!= HImode
|| TARGET_FAST_PREFIX
)
2874 && (mode
!= (TARGET_64BIT
? TImode
: DImode
))
2875 && CONST_INT_P (operands
[2])
2876 && CONST_INT_P (operands
[3]))
2878 rtx out
= operands
[0];
2879 HOST_WIDE_INT ct
= INTVAL (operands
[2]);
2880 HOST_WIDE_INT cf
= INTVAL (operands
[3]);
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
2886 if (sign_bit_compare_p
2887 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2889 /* Detect overlap between destination and compare sources. */
2892 if (!sign_bit_compare_p
)
2897 compare_code
= GET_CODE (compare_op
);
2899 flags
= XEXP (compare_op
, 0);
2901 if (GET_MODE (flags
) == CCFPmode
)
2905 = ix86_fp_compare_code_to_integer (compare_code
);
	  /* To simplify the rest of the code, restrict to the GEU case.  */
2909 if (compare_code
== LTU
)
2912 compare_code
= reverse_condition (compare_code
);
2913 code
= reverse_condition (code
);
2918 PUT_CODE (compare_op
,
2919 reverse_condition_maybe_unordered
2920 (GET_CODE (compare_op
)));
2922 PUT_CODE (compare_op
,
2923 reverse_condition (GET_CODE (compare_op
)));
2927 if (reg_overlap_mentioned_p (out
, op0
)
2928 || reg_overlap_mentioned_p (out
, op1
))
2929 tmp
= gen_reg_rtx (mode
);
2932 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
2934 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
2935 flags
, compare_op
));
2939 if (code
== GT
|| code
== GE
)
2940 code
= reverse_condition (code
);
2946 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
2959 tmp
= expand_simple_binop (mode
, PLUS
,
2961 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2972 tmp
= expand_simple_binop (mode
, IOR
,
2974 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2976 else if (diff
== -1 && ct
)
2986 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
2988 tmp
= expand_simple_binop (mode
, PLUS
,
2989 copy_rtx (tmp
), GEN_INT (cf
),
2990 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2998 * andl cf - ct, dest
3008 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3011 tmp
= expand_simple_binop (mode
, AND
,
3013 gen_int_mode (cf
- ct
, mode
),
3014 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3016 tmp
= expand_simple_binop (mode
, PLUS
,
3017 copy_rtx (tmp
), GEN_INT (ct
),
3018 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3021 if (!rtx_equal_p (tmp
, out
))
3022 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3029 machine_mode cmp_mode
= GET_MODE (op0
);
3030 enum rtx_code new_code
;
3032 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3034 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
	  /* We may be reversing an unordered compare to a normal compare,
	     which is not valid in general (we may convert a non-trapping
	     condition to a trapping one); however, on i386 we currently
	     emit all comparisons unordered.  */
3040 new_code
= reverse_condition_maybe_unordered (code
);
3043 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3044 if (new_code
!= UNKNOWN
)
3052 compare_code
= UNKNOWN
;
3053 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3054 && CONST_INT_P (op1
))
3056 if (op1
== const0_rtx
3057 && (code
== LT
|| code
== GE
))
3058 compare_code
= code
;
3059 else if (op1
== constm1_rtx
)
3063 else if (code
== GT
)
3068 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3069 if (compare_code
!= UNKNOWN
3070 && GET_MODE (op0
) == GET_MODE (out
)
3071 && (cf
== -1 || ct
== -1))
3073 /* If lea code below could be used, only optimize
3074 if it results in a 2 insn sequence. */
3076 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3077 || diff
== 3 || diff
== 5 || diff
== 9)
3078 || (compare_code
== LT
&& ct
== -1)
3079 || (compare_code
== GE
&& cf
== -1))
3082 * notl op1 (if necessary)
3090 code
= reverse_condition (code
);
3093 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3095 out
= expand_simple_binop (mode
, IOR
,
3097 out
, 1, OPTAB_DIRECT
);
3098 if (out
!= operands
[0])
3099 emit_move_insn (operands
[0], out
);
3106 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3107 || diff
== 3 || diff
== 5 || diff
== 9)
3108 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3110 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
	  /*
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * This also catches the degenerate setcc-only case.
	   */
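	  /* Rough scalar picture (illustration only) of the lea form used
	     here: with the 0/1 result of setcc in DEST,

	       dest = cf + dest * (ct - cf);

	     selects CT when the condition held and CF otherwise, and when
	     ct - cf is 1, 2, 4 or 8 (or 3, 5, 9 with the extra add) a
	     single lea computes this without a branch.  */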
3126 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
	      /* On x86_64 the lea instruction operates on Pmode, so we need
		 to get the arithmetic done in the proper mode to match.  */
3132 tmp
= copy_rtx (out
);
3136 out1
= copy_rtx (out
);
3137 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3141 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3147 tmp
= gen_rtx_PLUS (mode
, tmp
, GEN_INT (cf
));
3150 if (!rtx_equal_p (tmp
, out
))
3153 out
= force_operand (tmp
, copy_rtx (out
));
3155 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3157 if (!rtx_equal_p (out
, operands
[0]))
3158 emit_move_insn (operands
[0], copy_rtx (out
));
3164 * General case: Jumpful:
3165 * xorl dest,dest cmpl op1, op2
3166 * cmpl op1, op2 movl ct, dest
3168 * decl dest movl cf, dest
3169 * andl (cf-ct),dest 1:
3174 * This is reasonably steep, but branch mispredict costs are
3175 * high on modern cpus, so consider failing only if optimizing
3179 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3180 && BRANCH_COST (optimize_insn_for_speed_p (),
3185 machine_mode cmp_mode
= GET_MODE (op0
);
3186 enum rtx_code new_code
;
3188 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3190 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
	      /* We may be reversing an unordered compare to a normal compare,
		 which is not valid in general (we may convert a non-trapping
		 condition to a trapping one); however, on i386 we currently
		 emit all comparisons unordered.  */
3196 new_code
= reverse_condition_maybe_unordered (code
);
3200 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3201 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3202 compare_code
= reverse_condition (compare_code
);
3205 if (new_code
!= UNKNOWN
)
3213 if (compare_code
!= UNKNOWN
)
3215 /* notl op1 (if needed)
3220 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
3223 True/false will be -1/0 while code below (store flag
3224 followed by decrement) is 0/-1, so the constants need
3225 to be exchanged once more. */
3227 if (compare_code
== GE
|| !cf
)
3229 code
= reverse_condition (code
);
3235 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3239 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3241 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3243 copy_rtx (out
), 1, OPTAB_DIRECT
);
3246 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3247 gen_int_mode (cf
- ct
, mode
),
3248 copy_rtx (out
), 1, OPTAB_DIRECT
);
3250 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3251 copy_rtx (out
), 1, OPTAB_DIRECT
);
3252 if (!rtx_equal_p (out
, operands
[0]))
3253 emit_move_insn (operands
[0], copy_rtx (out
));
3259 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
      /* Try a few more things with specific constants and a variable.  */
3264 rtx var
, orig_out
, out
, tmp
;
3266 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3269 /* If one of the two operands is an interesting constant, load a
3270 constant with the above and mask it in with a logical operation. */
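      /* Scalar sketch (illustration only) of the masking trick used here:
	 a condition materialised as 0 / -1 turns the conditional move into
	 pure logic,

	   x = cond ? v : 0     ==>   x = (-cond) & v;    // and_optab case
	   x = cond ? -1 : v    ==>   x = (-cond) | v;    // ior_optab case

	 which is why operands[2]/operands[3] are nudged towards 0 and -1
	 below before recursing.  */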
3272 if (CONST_INT_P (operands
[2]))
3275 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3276 operands
[3] = constm1_rtx
, op
= and_optab
;
3277 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3278 operands
[3] = const0_rtx
, op
= ior_optab
;
3282 else if (CONST_INT_P (operands
[3]))
3285 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3286 operands
[2] = constm1_rtx
, op
= and_optab
;
3287 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3288 operands
[2] = const0_rtx
, op
= ior_optab
;
3295 orig_out
= operands
[0];
3296 tmp
= gen_reg_rtx (mode
);
3299 /* Recurse to get the constant loaded. */
3300 if (!ix86_expand_int_movcc (operands
))
3303 /* Mask in the interesting variable. */
3304 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3306 if (!rtx_equal_p (out
, orig_out
))
3307 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3313 * For comparison with above,
3323 if (! nonimmediate_operand (operands
[2], mode
))
3324 operands
[2] = force_reg (mode
, operands
[2]);
3325 if (! nonimmediate_operand (operands
[3], mode
))
3326 operands
[3] = force_reg (mode
, operands
[3]);
3328 if (! register_operand (operands
[2], VOIDmode
)
3330 || ! register_operand (operands
[3], VOIDmode
)))
3331 operands
[2] = force_reg (mode
, operands
[2]);
3334 && ! register_operand (operands
[3], VOIDmode
))
3335 operands
[3] = force_reg (mode
, operands
[3]);
3337 emit_insn (compare_seq
);
3338 emit_insn (gen_rtx_SET (operands
[0],
3339 gen_rtx_IF_THEN_ELSE (mode
,
3340 compare_op
, operands
[2],
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */
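/* Hedged illustration of the operand-order subtlety: the SSE scalar min/max
   instructions return the second operand when the inputs are unordered or
   compare equal, so only the exact shapes

     dest = cmp_op0 < cmp_op1 ? cmp_op0 : cmp_op1;   // min-shaped
     dest = cmp_op0 < cmp_op1 ? cmp_op1 : cmp_op0;   // max-shaped

   can be taken as MIN/MAX without changing NaN and -0.0/+0.0 behaviour,
   which is why the matching below is so literal.  */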
3353 ix86_expand_sse_fp_minmax (rtx dest
, enum rtx_code code
, rtx cmp_op0
,
3354 rtx cmp_op1
, rtx if_true
, rtx if_false
)
3362 else if (code
== UNGE
)
3363 std::swap (if_true
, if_false
);
3367 if (rtx_equal_p (cmp_op0
, if_true
) && rtx_equal_p (cmp_op1
, if_false
))
3369 else if (rtx_equal_p (cmp_op1
, if_true
) && rtx_equal_p (cmp_op0
, if_false
))
3374 mode
= GET_MODE (dest
);
3376 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3377 but MODE may be a vector mode and thus not appropriate. */
3378 if (!flag_finite_math_only
|| flag_signed_zeros
)
3380 int u
= is_min
? UNSPEC_IEEE_MIN
: UNSPEC_IEEE_MAX
;
3383 if_true
= force_reg (mode
, if_true
);
3384 v
= gen_rtvec (2, if_true
, if_false
);
3385 tmp
= gen_rtx_UNSPEC (mode
, v
, u
);
3389 code
= is_min
? SMIN
: SMAX
;
3390 if (MEM_P (if_true
) && MEM_P (if_false
))
3391 if_true
= force_reg (mode
, if_true
);
3392 tmp
= gen_rtx_fmt_ee (code
, mode
, if_true
, if_false
);
3395 emit_insn (gen_rtx_SET (dest
, tmp
));
3399 /* Expand an SSE comparison. Return the register with the result. */
3402 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
3403 rtx op_true
, rtx op_false
)
3405 machine_mode mode
= GET_MODE (dest
);
3406 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
3408 /* In general case result of comparison can differ from operands' type. */
3409 machine_mode cmp_mode
;
3411 /* In AVX512F the result of comparison is an integer mask. */
3412 bool maskcmp
= false;
3415 if (GET_MODE_SIZE (cmp_ops_mode
) == 64)
3417 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3418 cmp_mode
= int_mode_for_size (nbits
, 0).require ();
3422 cmp_mode
= cmp_ops_mode
;
3424 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3426 int (*op1_predicate
)(rtx
, machine_mode
)
3427 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3429 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3430 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3433 || (maskcmp
&& cmp_mode
!= mode
)
3434 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3435 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3436 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3438 /* Compare patterns for int modes are unspec in AVX512F only. */
3439 if (maskcmp
&& (code
== GT
|| code
== EQ
))
3441 rtx (*gen
)(rtx
, rtx
, rtx
);
3443 switch (cmp_ops_mode
)
3446 gcc_assert (TARGET_AVX512BW
);
3447 gen
= code
== GT
? gen_avx512bw_gtv64qi3
: gen_avx512bw_eqv64qi3_1
;
3450 gcc_assert (TARGET_AVX512BW
);
3451 gen
= code
== GT
? gen_avx512bw_gtv32hi3
: gen_avx512bw_eqv32hi3_1
;
3454 gen
= code
== GT
? gen_avx512f_gtv16si3
: gen_avx512f_eqv16si3_1
;
3457 gen
= code
== GT
? gen_avx512f_gtv8di3
: gen_avx512f_eqv8di3_1
;
3465 emit_insn (gen (dest
, cmp_op0
, cmp_op1
));
3469 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3471 if (cmp_mode
!= mode
&& !maskcmp
)
3473 x
= force_reg (cmp_ops_mode
, x
);
3474 convert_move (dest
, x
, false);
3477 emit_insn (gen_rtx_SET (dest
, x
));
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */
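/* A minimal sketch of the fallback sequence at the bottom of this function,
   written element-wise for clarity (illustration only):

     dest = (cmp & op_true) | (~cmp & op_false);

   with CMP being an all-zeros / all-ones mask per element.  The special
   cases above it simply drop whichever half of that expression is trivial
   (op_true == -1, op_false == 0, ...) or use a blend / mask-conversion
   instruction when one is available.  */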
3486 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
3488 machine_mode mode
= GET_MODE (dest
);
3489 machine_mode cmpmode
= GET_MODE (cmp
);
3491 /* In AVX512F the result of comparison is an integer mask. */
3492 bool maskcmp
= (mode
!= cmpmode
&& TARGET_AVX512F
);
3496 /* If we have an integer mask and FP value then we need
3497 to cast mask to FP mode. */
3498 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
3500 cmp
= force_reg (cmpmode
, cmp
);
3501 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
3506 rtx (*gen
) (rtx
, rtx
) = NULL
;
3507 if ((op_true
== CONST0_RTX (mode
)
3508 && vector_all_ones_operand (op_false
, mode
))
3509 || (op_false
== CONST0_RTX (mode
)
3510 && vector_all_ones_operand (op_true
, mode
)))
3514 if (TARGET_AVX512BW
)
3515 gen
= gen_avx512bw_cvtmask2bv64qi
;
3518 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3519 gen
= gen_avx512vl_cvtmask2bv32qi
;
3522 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3523 gen
= gen_avx512vl_cvtmask2bv16qi
;
3526 if (TARGET_AVX512BW
)
3527 gen
= gen_avx512bw_cvtmask2wv32hi
;
3530 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3531 gen
= gen_avx512vl_cvtmask2wv16hi
;
3534 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3535 gen
= gen_avx512vl_cvtmask2wv8hi
;
3538 if (TARGET_AVX512DQ
)
3539 gen
= gen_avx512f_cvtmask2dv16si
;
3542 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3543 gen
= gen_avx512vl_cvtmask2dv8si
;
3546 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3547 gen
= gen_avx512vl_cvtmask2dv4si
;
3550 if (TARGET_AVX512DQ
)
3551 gen
= gen_avx512f_cvtmask2qv8di
;
3554 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3555 gen
= gen_avx512vl_cvtmask2qv4di
;
3558 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3559 gen
= gen_avx512vl_cvtmask2qv2di
;
3564 if (gen
&& SCALAR_INT_MODE_P (cmpmode
))
3566 cmp
= force_reg (cmpmode
, cmp
);
3567 if (op_true
== CONST0_RTX (mode
))
3569 rtx (*gen_not
) (rtx
, rtx
);
3572 case E_QImode
: gen_not
= gen_knotqi
; break;
3573 case E_HImode
: gen_not
= gen_knothi
; break;
3574 case E_SImode
: gen_not
= gen_knotsi
; break;
3575 case E_DImode
: gen_not
= gen_knotdi
; break;
3576 default: gcc_unreachable ();
3578 rtx n
= gen_reg_rtx (cmpmode
);
3579 emit_insn (gen_not (n
, cmp
));
3582 emit_insn (gen (dest
, cmp
));
3586 else if (vector_all_ones_operand (op_true
, mode
)
3587 && op_false
== CONST0_RTX (mode
))
3589 emit_insn (gen_rtx_SET (dest
, cmp
));
3592 else if (op_false
== CONST0_RTX (mode
))
3594 op_true
= force_reg (mode
, op_true
);
3595 x
= gen_rtx_AND (mode
, cmp
, op_true
);
3596 emit_insn (gen_rtx_SET (dest
, x
));
3599 else if (op_true
== CONST0_RTX (mode
))
3601 op_false
= force_reg (mode
, op_false
);
3602 x
= gen_rtx_NOT (mode
, cmp
);
3603 x
= gen_rtx_AND (mode
, x
, op_false
);
3604 emit_insn (gen_rtx_SET (dest
, x
));
3607 else if (INTEGRAL_MODE_P (mode
) && op_true
== CONSTM1_RTX (mode
))
3609 op_false
= force_reg (mode
, op_false
);
3610 x
= gen_rtx_IOR (mode
, cmp
, op_false
);
3611 emit_insn (gen_rtx_SET (dest
, x
));
3614 else if (TARGET_XOP
)
3616 op_true
= force_reg (mode
, op_true
);
3618 if (!nonimmediate_operand (op_false
, mode
))
3619 op_false
= force_reg (mode
, op_false
);
3621 emit_insn (gen_rtx_SET (dest
, gen_rtx_IF_THEN_ELSE (mode
, cmp
,
3627 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
3630 if (!vector_operand (op_true
, mode
))
3631 op_true
= force_reg (mode
, op_true
);
3633 op_false
= force_reg (mode
, op_false
);
3639 gen
= gen_sse4_1_blendvps
;
3643 gen
= gen_sse4_1_blendvpd
;
3648 gen
= gen_sse4_1_blendvss
;
3649 op_true
= force_reg (mode
, op_true
);
3655 gen
= gen_sse4_1_blendvsd
;
3656 op_true
= force_reg (mode
, op_true
);
3665 gen
= gen_sse4_1_pblendvb
;
3666 if (mode
!= V16QImode
)
3667 d
= gen_reg_rtx (V16QImode
);
3668 op_false
= gen_lowpart (V16QImode
, op_false
);
3669 op_true
= gen_lowpart (V16QImode
, op_true
);
3670 cmp
= gen_lowpart (V16QImode
, cmp
);
3675 gen
= gen_avx_blendvps256
;
3679 gen
= gen_avx_blendvpd256
;
3687 gen
= gen_avx2_pblendvb
;
3688 if (mode
!= V32QImode
)
3689 d
= gen_reg_rtx (V32QImode
);
3690 op_false
= gen_lowpart (V32QImode
, op_false
);
3691 op_true
= gen_lowpart (V32QImode
, op_true
);
3692 cmp
= gen_lowpart (V32QImode
, cmp
);
3697 gen
= gen_avx512bw_blendmv64qi
;
3700 gen
= gen_avx512bw_blendmv32hi
;
3703 gen
= gen_avx512f_blendmv16si
;
3706 gen
= gen_avx512f_blendmv8di
;
3709 gen
= gen_avx512f_blendmv8df
;
3712 gen
= gen_avx512f_blendmv16sf
;
3721 emit_insn (gen (d
, op_false
, op_true
, cmp
));
3723 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
3727 op_true
= force_reg (mode
, op_true
);
3729 t2
= gen_reg_rtx (mode
);
3731 t3
= gen_reg_rtx (mode
);
3735 x
= gen_rtx_AND (mode
, op_true
, cmp
);
3736 emit_insn (gen_rtx_SET (t2
, x
));
3738 x
= gen_rtx_NOT (mode
, cmp
);
3739 x
= gen_rtx_AND (mode
, x
, op_false
);
3740 emit_insn (gen_rtx_SET (t3
, x
));
3742 x
= gen_rtx_IOR (mode
, t3
, t2
);
3743 emit_insn (gen_rtx_SET (dest
, x
));
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */
3755 static enum rtx_code
3756 ix86_prepare_sse_fp_compare_args (rtx dest
, enum rtx_code code
,
3757 rtx
*pop0
, rtx
*pop1
)
3763 /* AVX supports all the needed comparisons. */
3766 /* We have no LTGT as an operator. We could implement it with
3767 NE & ORDERED, but this requires an extra temporary. It's
3768 not clear that it's worth it. */
3775 /* These are supported directly. */
3782 /* AVX has 3 operand comparisons, no need to swap anything. */
3785 /* For commutative operators, try to canonicalize the destination
3786 operand to be first in the comparison - this helps reload to
3787 avoid extra moves. */
3788 if (!dest
|| !rtx_equal_p (dest
, *pop1
))
3796 /* These are not supported directly before AVX, and furthermore
3797 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
	 comparison operands to transform into something that is
	 supported.  */
3800 std::swap (*pop0
, *pop1
);
3801 code
= swap_condition (code
);
3811 /* Expand a floating-point conditional move. Return true if successful. */
3814 ix86_expand_fp_movcc (rtx operands
[])
3816 machine_mode mode
= GET_MODE (operands
[0]);
3817 enum rtx_code code
= GET_CODE (operands
[1]);
3818 rtx tmp
, compare_op
;
3819 rtx op0
= XEXP (operands
[1], 0);
3820 rtx op1
= XEXP (operands
[1], 1);
3822 if (TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (mode
))
3826 /* Since we've no cmove for sse registers, don't force bad register
3827 allocation just to gain access to it. Deny movcc when the
3828 comparison mode doesn't match the move mode. */
3829 cmode
= GET_MODE (op0
);
3830 if (cmode
== VOIDmode
)
3831 cmode
= GET_MODE (op1
);
3835 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
3836 if (code
== UNKNOWN
)
3839 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
3840 operands
[2], operands
[3]))
3843 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
3844 operands
[2], operands
[3]);
3845 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
3849 if (GET_MODE (op0
) == TImode
3850 || (GET_MODE (op0
) == DImode
3854 /* The floating point conditional move instructions don't directly
3855 support conditions resulting from a signed integer comparison. */
3857 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3858 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
3860 tmp
= gen_reg_rtx (QImode
);
3861 ix86_expand_setcc (tmp
, code
, op0
, op1
);
3863 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
3866 emit_insn (gen_rtx_SET (operands
[0],
3867 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
3868 operands
[2], operands
[3])));
3873 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3876 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3901 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3904 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3941 /* Return immediate value to be used in UNSPEC_PCMP
3942 for comparison CODE in MODE. */
3945 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
3947 if (FLOAT_MODE_P (mode
))
3948 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
3949 return ix86_int_cmp_code_to_pcmp_immediate (code
);
3952 /* Expand AVX-512 vector comparison. */
3955 ix86_expand_mask_vec_cmp (rtx operands
[])
3957 machine_mode mask_mode
= GET_MODE (operands
[0]);
3958 machine_mode cmp_mode
= GET_MODE (operands
[2]);
3959 enum rtx_code code
= GET_CODE (operands
[1]);
3960 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
3970 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
3974 unspec_code
= UNSPEC_PCMP
;
3977 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, operands
[2],
3980 emit_insn (gen_rtx_SET (operands
[0], unspec
));
3985 /* Expand fp vector comparison. */
3988 ix86_expand_fp_vec_cmp (rtx operands
[])
3990 enum rtx_code code
= GET_CODE (operands
[1]);
3993 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
3994 &operands
[2], &operands
[3]);
3995 if (code
== UNKNOWN
)
3998 switch (GET_CODE (operands
[1]))
4001 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4002 operands
[3], NULL
, NULL
);
4003 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4004 operands
[3], NULL
, NULL
);
4008 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4009 operands
[3], NULL
, NULL
);
4010 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4011 operands
[3], NULL
, NULL
);
4017 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4021 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4022 operands
[1], operands
[2]);
4024 if (operands
[0] != cmp
)
4025 emit_move_insn (operands
[0], cmp
);
4031 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4032 rtx op_true
, rtx op_false
, bool *negate
)
4034 machine_mode data_mode
= GET_MODE (dest
);
4035 machine_mode mode
= GET_MODE (cop0
);
4040 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4042 && (mode
== V16QImode
|| mode
== V8HImode
4043 || mode
== V4SImode
|| mode
== V2DImode
))
4047 /* Canonicalize the comparison to EQ, GT, GTU. */
4058 code
= reverse_condition (code
);
4064 code
= reverse_condition (code
);
4070 std::swap (cop0
, cop1
);
4071 code
= swap_condition (code
);
4078 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4079 if (mode
== V2DImode
)
4084 /* SSE4.1 supports EQ. */
4091 /* SSE4.2 supports GT/GTU. */
4101 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4102 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4104 std::swap (optrue
, opfalse
);
4106 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4107 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4108 min (x, y) == x). While we add one instruction (the minimum),
4109 we remove the need for two instructions in the negation, as the
4110 result is done this way.
4111 When using masks, do it for SI/DImode element types, as it is shorter
4112 than the two subtractions. */
4114 && GET_MODE_SIZE (mode
) != 64
4115 && vector_all_ones_operand (opfalse
, data_mode
)
4116 && optrue
== CONST0_RTX (data_mode
))
4118 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4119 /* Don't do it if not using integer masks and we'd end up with
4120 the right values in the registers though. */
4121 && (GET_MODE_SIZE (mode
) == 64
4122 || !vector_all_ones_operand (optrue
, data_mode
)
4123 || opfalse
!= CONST0_RTX (data_mode
))))
4125 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4130 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4133 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4134 cop0
= force_reg (mode
, cop0
);
4135 cop1
= force_reg (mode
, cop1
);
4139 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4143 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4147 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4150 if (TARGET_AVX512VL
)
4152 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4153 cop0
= force_reg (mode
, cop0
);
4154 cop1
= force_reg (mode
, cop1
);
4158 if (code
== GTU
&& TARGET_SSE2
)
4159 gen
= gen_uminv16qi3
;
4160 else if (code
== GT
&& TARGET_SSE4_1
)
4161 gen
= gen_sminv16qi3
;
4164 if (code
== GTU
&& TARGET_SSE4_1
)
4165 gen
= gen_uminv8hi3
;
4166 else if (code
== GT
&& TARGET_SSE2
)
4167 gen
= gen_sminv8hi3
;
4171 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4174 if (TARGET_AVX512VL
)
4176 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4177 cop0
= force_reg (mode
, cop0
);
4178 cop1
= force_reg (mode
, cop1
);
4187 rtx tem
= gen_reg_rtx (mode
);
4188 if (!vector_operand (cop0
, mode
))
4189 cop0
= force_reg (mode
, cop0
);
4190 if (!vector_operand (cop1
, mode
))
4191 cop1
= force_reg (mode
, cop1
);
4193 emit_insn (gen (tem
, cop0
, cop1
));
      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
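      /* Hedged scalar sketch of the trick: flipping the sign bit turns an
	 unsigned comparison into a signed one on the biased values,

	   (unsigned) a > (unsigned) b
	     <==>  (int) (a ^ 0x80000000) > (int) (b ^ 0x80000000)

	 and subtracting INT_MIN applies the same bias, which is what the
	 gen_sub3 calls below implement vector-wide.  */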
4204 cop0
= force_reg (mode
, cop0
);
4216 rtx (*gen_sub3
) (rtx
, rtx
, rtx
);
4220 case E_V16SImode
: gen_sub3
= gen_subv16si3
; break;
4221 case E_V8DImode
: gen_sub3
= gen_subv8di3
; break;
4222 case E_V8SImode
: gen_sub3
= gen_subv8si3
; break;
4223 case E_V4DImode
: gen_sub3
= gen_subv4di3
; break;
4224 case E_V4SImode
: gen_sub3
= gen_subv4si3
; break;
4225 case E_V2DImode
: gen_sub3
= gen_subv2di3
; break;
	  /* Subtract (-(INT MAX) - 1) from both operands to make
	     them signed.  */
4231 mask
= ix86_build_signbit_mask (mode
, true, false);
4232 t1
= gen_reg_rtx (mode
);
4233 emit_insn (gen_sub3 (t1
, cop0
, mask
));
4235 t2
= gen_reg_rtx (mode
);
4236 emit_insn (gen_sub3 (t2
, cop1
, mask
));
4250 /* Perform a parallel unsigned saturating subtraction. */
4251 x
= gen_reg_rtx (mode
);
4252 emit_insn (gen_rtx_SET (x
, gen_rtx_US_MINUS (mode
, cop0
,
4256 cop1
= CONST0_RTX (mode
);
4268 std::swap (op_true
, op_false
);
4270 /* Allow the comparison to be done in one mode, but the movcc to
4271 happen in another mode. */
4272 if (data_mode
== mode
)
4274 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4279 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4280 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4282 if (GET_MODE (x
) == mode
)
4283 x
= gen_lowpart (data_mode
, x
);
4289 /* Expand integer vector comparison. */
4292 ix86_expand_int_vec_cmp (rtx operands
[])
4294 rtx_code code
= GET_CODE (operands
[1]);
4295 bool negate
= false;
4296 rtx cmp
= ix86_expand_int_sse_cmp (operands
[0], code
, operands
[2],
4297 operands
[3], NULL
, NULL
, &negate
);
4303 cmp
= ix86_expand_int_sse_cmp (operands
[0], EQ
, cmp
,
4304 CONST0_RTX (GET_MODE (cmp
)),
4305 NULL
, NULL
, &negate
);
4307 gcc_assert (!negate
);
4309 if (operands
[0] != cmp
)
4310 emit_move_insn (operands
[0], cmp
);
4315 /* Expand a floating-point vector conditional move; a vcond operation
4316 rather than a movcc operation. */
4319 ix86_expand_fp_vcond (rtx operands
[])
4321 enum rtx_code code
= GET_CODE (operands
[3]);
4324 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4325 &operands
[4], &operands
[5]);
4326 if (code
== UNKNOWN
)
4329 switch (GET_CODE (operands
[3]))
4332 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4333 operands
[5], operands
[0], operands
[0]);
4334 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4335 operands
[5], operands
[1], operands
[2]);
4339 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4340 operands
[5], operands
[0], operands
[0]);
4341 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4342 operands
[5], operands
[1], operands
[2]);
4348 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4350 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4354 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4355 operands
[5], operands
[1], operands
[2]))
4358 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4359 operands
[1], operands
[2]);
4360 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4364 /* Expand a signed/unsigned integral vector conditional move. */
4367 ix86_expand_int_vcond (rtx operands
[])
4369 machine_mode data_mode
= GET_MODE (operands
[0]);
4370 machine_mode mode
= GET_MODE (operands
[4]);
4371 enum rtx_code code
= GET_CODE (operands
[3]);
4372 bool negate
= false;
  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
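  /* Scalar illustration of the shift forms used here (for a 32-bit element;
     in general the shift count is the element width minus one):

       x < 0 ? -1 : 0   ==>   x >> 31              // arithmetic shift
       x < 0 ?  1 : 0   ==>   (unsigned) x >> 31   // logical shift

     i.e. broadcasting or extracting the sign bit, done per element by the
     vector shifts emitted below.  */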
4380 if ((code
== LT
|| code
== GE
)
4381 && data_mode
== mode
4382 && cop1
== CONST0_RTX (mode
)
4383 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4384 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4385 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4386 && (GET_MODE_SIZE (data_mode
) == 16
4387 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4389 rtx negop
= operands
[2 - (code
== LT
)];
4390 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4391 if (negop
== CONST1_RTX (data_mode
))
4393 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4394 operands
[0], 1, OPTAB_DIRECT
);
4395 if (res
!= operands
[0])
4396 emit_move_insn (operands
[0], res
);
4399 else if (GET_MODE_INNER (data_mode
) != DImode
4400 && vector_all_ones_operand (negop
, data_mode
))
4402 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4403 operands
[0], 0, OPTAB_DIRECT
);
4404 if (res
!= operands
[0])
4405 emit_move_insn (operands
[0], res
);
4410 if (!nonimmediate_operand (cop1
, mode
))
4411 cop1
= force_reg (mode
, cop1
);
4412 if (!general_operand (operands
[1], data_mode
))
4413 operands
[1] = force_reg (data_mode
, operands
[1]);
4414 if (!general_operand (operands
[2], data_mode
))
4415 operands
[2] = force_reg (data_mode
, operands
[2]);
4417 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4418 operands
[1], operands
[2], &negate
);
4423 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4424 operands
[2-negate
]);
4429 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
4430 struct expand_vec_perm_d
*d
)
4432 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4433 expander, so args are either in d, or in op0, op1 etc. */
4434 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
4435 machine_mode maskmode
= mode
;
4436 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4441 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4442 gen
= gen_avx512vl_vpermt2varv8hi3
;
4445 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4446 gen
= gen_avx512vl_vpermt2varv16hi3
;
4449 if (TARGET_AVX512VBMI
)
4450 gen
= gen_avx512bw_vpermt2varv64qi3
;
4453 if (TARGET_AVX512BW
)
4454 gen
= gen_avx512bw_vpermt2varv32hi3
;
4457 if (TARGET_AVX512VL
)
4458 gen
= gen_avx512vl_vpermt2varv4si3
;
4461 if (TARGET_AVX512VL
)
4462 gen
= gen_avx512vl_vpermt2varv8si3
;
4466 gen
= gen_avx512f_vpermt2varv16si3
;
4469 if (TARGET_AVX512VL
)
4471 gen
= gen_avx512vl_vpermt2varv4sf3
;
4472 maskmode
= V4SImode
;
4476 if (TARGET_AVX512VL
)
4478 gen
= gen_avx512vl_vpermt2varv8sf3
;
4479 maskmode
= V8SImode
;
4485 gen
= gen_avx512f_vpermt2varv16sf3
;
4486 maskmode
= V16SImode
;
4490 if (TARGET_AVX512VL
)
4491 gen
= gen_avx512vl_vpermt2varv2di3
;
4494 if (TARGET_AVX512VL
)
4495 gen
= gen_avx512vl_vpermt2varv4di3
;
4499 gen
= gen_avx512f_vpermt2varv8di3
;
4502 if (TARGET_AVX512VL
)
4504 gen
= gen_avx512vl_vpermt2varv2df3
;
4505 maskmode
= V2DImode
;
4509 if (TARGET_AVX512VL
)
4511 gen
= gen_avx512vl_vpermt2varv4df3
;
4512 maskmode
= V4DImode
;
4518 gen
= gen_avx512f_vpermt2varv8df3
;
4519 maskmode
= V8DImode
;
4529 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4530 expander, so args are either in d, or in op0, op1 etc. */
4537 for (int i
= 0; i
< d
->nelt
; ++i
)
4538 vec
[i
] = GEN_INT (d
->perm
[i
]);
4539 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4542 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
4546 /* Expand a variable vector permutation. */
4549 ix86_expand_vec_perm (rtx operands
[])
4551 rtx target
= operands
[0];
4552 rtx op0
= operands
[1];
4553 rtx op1
= operands
[2];
4554 rtx mask
= operands
[3];
4555 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4556 machine_mode mode
= GET_MODE (op0
);
4557 machine_mode maskmode
= GET_MODE (mask
);
4559 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4561 /* Number of elements in the vector. */
4562 w
= GET_MODE_NUNITS (mode
);
4563 e
= GET_MODE_UNIT_SIZE (mode
);
4564 gcc_assert (w
<= 64);
4566 if (TARGET_AVX512F
&& one_operand_shuffle
)
4568 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4572 gen
=gen_avx512f_permvarv16si
;
4575 gen
= gen_avx512f_permvarv16sf
;
4578 gen
= gen_avx512f_permvarv8di
;
4581 gen
= gen_avx512f_permvarv8df
;
4588 emit_insn (gen (target
, op0
, mask
));
4593 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4598 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode we can, after preparing suitable
	 masks, use vpshufb; vpshufb; vpermq; vpor.  */
4608 if (mode
== V16HImode
)
4610 maskmode
= mode
= V32QImode
;
4616 maskmode
= mode
= V8SImode
;
4620 t1
= gen_reg_rtx (maskmode
);
4622 /* Replicate the low bits of the V4DImode mask into V8SImode:
4624 t1 = { A A B B C C D D }. */
4625 for (i
= 0; i
< w
/ 2; ++i
)
4626 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4627 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4628 vt
= force_reg (maskmode
, vt
);
4629 mask
= gen_lowpart (maskmode
, mask
);
4630 if (maskmode
== V8SImode
)
4631 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4633 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
	  /* Multiply the shuffle indices by two.  */
4636 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
	  /* Add one to the odd shuffle indices:
	     t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4641 for (i
= 0; i
< w
/ 2; ++i
)
4643 vec
[i
* 2] = const0_rtx
;
4644 vec
[i
* 2 + 1] = const1_rtx
;
4646 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4647 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4648 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4651 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4652 operands
[3] = mask
= t1
;
4653 target
= gen_reg_rtx (mode
);
4654 op0
= gen_lowpart (mode
, op0
);
4655 op1
= gen_lowpart (mode
, op1
);
4661 /* The VPERMD and VPERMPS instructions already properly ignore
4662 the high bits of the shuffle elements. No need for us to
4663 perform an AND ourselves. */
4664 if (one_operand_shuffle
)
4666 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4667 if (target
!= operands
[0])
4668 emit_move_insn (operands
[0],
4669 gen_lowpart (GET_MODE (operands
[0]), target
));
4673 t1
= gen_reg_rtx (V8SImode
);
4674 t2
= gen_reg_rtx (V8SImode
);
4675 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4676 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4682 mask
= gen_lowpart (V8SImode
, mask
);
4683 if (one_operand_shuffle
)
4684 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4687 t1
= gen_reg_rtx (V8SFmode
);
4688 t2
= gen_reg_rtx (V8SFmode
);
4689 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4690 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4696 /* By combining the two 128-bit input vectors into one 256-bit
4697 input vector, we can use VPERMD and VPERMPS for the full
4698 two-operand shuffle. */
4699 t1
= gen_reg_rtx (V8SImode
);
4700 t2
= gen_reg_rtx (V8SImode
);
4701 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4702 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4703 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4704 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4708 t1
= gen_reg_rtx (V8SFmode
);
4709 t2
= gen_reg_rtx (V8SImode
);
4710 mask
= gen_lowpart (V4SImode
, mask
);
4711 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4712 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4713 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4714 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4718 t1
= gen_reg_rtx (V32QImode
);
4719 t2
= gen_reg_rtx (V32QImode
);
4720 t3
= gen_reg_rtx (V32QImode
);
4721 vt2
= GEN_INT (-128);
4722 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4723 vt
= force_reg (V32QImode
, vt
);
4724 for (i
= 0; i
< 32; i
++)
4725 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4726 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4727 vt2
= force_reg (V32QImode
, vt2
);
4728 /* From mask create two adjusted masks, which contain the same
4729 bits as mask in the low 7 bits of each vector element.
4730 The first mask will have the most significant bit clear
4731 if it requests element from the same 128-bit lane
4732 and MSB set if it requests element from the other 128-bit lane.
4733 The second mask will have the opposite values of the MSB,
4734 and additionally will have its 128-bit lanes swapped.
4735 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4736 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4737 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4738 stands for other 12 bytes. */
4739 /* The bit whether element is from the same lane or the other
4740 lane is bit 4, so shift it up by 3 to the MSB position. */
4741 t5
= gen_reg_rtx (V4DImode
);
4742 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4744 /* Clear MSB bits from the mask just in case it had them set. */
4745 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4746 /* After this t1 will have MSB set for elements from other lane. */
4747 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4748 /* Clear bits other than MSB. */
4749 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4750 /* Or in the lower bits from mask into t3. */
4751 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
	  /* And invert MSB bits in t1, so MSB is set for elements from the
	     same lane.  */
4754 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4755 /* Swap 128-bit lanes in t3. */
4756 t6
= gen_reg_rtx (V4DImode
);
4757 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4758 const2_rtx
, GEN_INT (3),
4759 const0_rtx
, const1_rtx
));
4760 /* And or in the lower bits from mask into t1. */
4761 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4762 if (one_operand_shuffle
)
4764 /* Each of these shuffles will put 0s in places where
4765 element from the other 128-bit lane is needed, otherwise
4766 will shuffle in the requested value. */
4767 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4768 gen_lowpart (V32QImode
, t6
)));
4769 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4770 /* For t3 the 128-bit lanes are swapped again. */
4771 t7
= gen_reg_rtx (V4DImode
);
4772 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4773 const2_rtx
, GEN_INT (3),
4774 const0_rtx
, const1_rtx
));
4775 /* And oring both together leads to the result. */
4776 emit_insn (gen_iorv32qi3 (target
, t1
,
4777 gen_lowpart (V32QImode
, t7
)));
4778 if (target
!= operands
[0])
4779 emit_move_insn (operands
[0],
4780 gen_lowpart (GET_MODE (operands
[0]), target
));
4784 t4
= gen_reg_rtx (V32QImode
);
	  /* Similarly to the above one_operand_shuffle code, just
	     repeated twice, once for each operand.  The merge_two:
	     code will merge the two results together.  */
4788 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4789 gen_lowpart (V32QImode
, t6
)));
4790 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4791 gen_lowpart (V32QImode
, t6
)));
4792 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4793 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4794 t7
= gen_reg_rtx (V4DImode
);
4795 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4796 const2_rtx
, GEN_INT (3),
4797 const0_rtx
, const1_rtx
));
4798 t8
= gen_reg_rtx (V4DImode
);
4799 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4800 const2_rtx
, GEN_INT (3),
4801 const0_rtx
, const1_rtx
));
4802 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4803 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4809 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4816 /* The XOP VPPERM insn supports three inputs. By ignoring the
4817 one_operand_shuffle special case, we avoid creating another
4818 set of constant vectors in memory. */
4819 one_operand_shuffle
= false;
4821 /* mask = mask & {2*w-1, ...} */
4822 vt
= GEN_INT (2*w
- 1);
4826 /* mask = mask & {w-1, ...} */
4827 vt
= GEN_INT (w
- 1);
4830 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4831 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4832 NULL_RTX
, 0, OPTAB_DIRECT
);
4834 /* For non-QImode operations, convert the word permutation control
4835 into a byte permutation control. */
4836 if (mode
!= V16QImode
)
4838 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4839 GEN_INT (exact_log2 (e
)),
4840 NULL_RTX
, 0, OPTAB_DIRECT
);
4842 /* Convert mask to vector of chars. */
4843 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4845 /* Replicate each of the input bytes into byte positions:
4846 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4847 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4848 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4849 for (i
= 0; i
< 16; ++i
)
4850 vec
[i
] = GEN_INT (i
/e
* e
);
4851 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4852 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4854 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4856 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4858 /* Convert it into the byte positions by doing
4859 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4860 for (i
= 0; i
< 16; ++i
)
4861 vec
[i
] = GEN_INT (i
% e
);
4862 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4863 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4864 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4867 /* The actual shuffle operations all operate on V16QImode. */
4868 op0
= gen_lowpart (V16QImode
, op0
);
4869 op1
= gen_lowpart (V16QImode
, op1
);
4873 if (GET_MODE (target
) != V16QImode
)
4874 target
= gen_reg_rtx (V16QImode
);
4875 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4876 if (target
!= operands
[0])
4877 emit_move_insn (operands
[0],
4878 gen_lowpart (GET_MODE (operands
[0]), target
));
4880 else if (one_operand_shuffle
)
4882 if (GET_MODE (target
) != V16QImode
)
4883 target
= gen_reg_rtx (V16QImode
);
4884 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4885 if (target
!= operands
[0])
4886 emit_move_insn (operands
[0],
4887 gen_lowpart (GET_MODE (operands
[0]), target
));
4894 /* Shuffle the two input vectors independently. */
4895 t1
= gen_reg_rtx (V16QImode
);
4896 t2
= gen_reg_rtx (V16QImode
);
4897 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4898 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4901 /* Then merge them together. The key is whether any given control
4902 element contained a bit set that indicates the second word. */
4905 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking that expand_int_vcond
	     performs will work as desired.  */
4911 rtx t3
= gen_reg_rtx (V4SImode
);
4912 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4913 const0_rtx
, const0_rtx
,
4914 const2_rtx
, const2_rtx
));
4916 maskmode
= V4SImode
;
4920 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4921 vt
= force_reg (maskmode
, vt
);
4922 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4923 NULL_RTX
, 0, OPTAB_DIRECT
);
4925 if (GET_MODE (target
) != mode
)
4926 target
= gen_reg_rtx (mode
);
4928 xops
[1] = gen_lowpart (mode
, t2
);
4929 xops
[2] = gen_lowpart (mode
, t1
);
4930 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4933 ok
= ix86_expand_int_vcond (xops
);
4935 if (target
!= operands
[0])
4936 emit_move_insn (operands
[0],
4937 gen_lowpart (GET_MODE (operands
[0]), target
));
/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */
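/* Hedged scalar picture of what "unpack" means here, for e.g. an 8-element
   halfword source widened to 4 words (index arithmetic illustrative only):

     j = i + (high_p ? 4 : 0);
     dest[i] = unsigned_p ? (uint32_t) src[j]
			  : (int32_t) (int16_t) src[j];

   With the pmovzx/pmovsx-style patterns this is a single extension insn;
   the pre-SSE4.1 fallback below instead interleaves SRC with zeros (or
   with a computed sign mask) to get the same widening.  */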
4946 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4948 machine_mode imode
= GET_MODE (src
);
4953 rtx (*unpack
)(rtx
, rtx
);
4954 rtx (*extract
)(rtx
, rtx
) = NULL
;
4955 machine_mode halfmode
= BLKmode
;
4961 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4963 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4964 halfmode
= V32QImode
;
4966 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4970 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4972 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4973 halfmode
= V16QImode
;
4975 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4979 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4981 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4982 halfmode
= V16HImode
;
4984 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
4988 unpack
= gen_avx2_zero_extendv8hiv8si2
;
4990 unpack
= gen_avx2_sign_extendv8hiv8si2
;
4991 halfmode
= V8HImode
;
4993 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
4997 unpack
= gen_avx512f_zero_extendv8siv8di2
;
4999 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5000 halfmode
= V8SImode
;
5002 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5006 unpack
= gen_avx2_zero_extendv4siv4di2
;
5008 unpack
= gen_avx2_sign_extendv4siv4di2
;
5009 halfmode
= V4SImode
;
5011 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5015 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5017 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5021 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5023 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5027 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5029 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5035 if (GET_MODE_SIZE (imode
) >= 32)
5037 tmp
= gen_reg_rtx (halfmode
);
5038 emit_insn (extract (tmp
, src
));
5042 /* Shift higher 8 bytes to lower 8 bytes. */
5043 tmp
= gen_reg_rtx (V1TImode
);
5044 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5046 tmp
= gen_lowpart (imode
, tmp
);
5051 emit_insn (unpack (dest
, tmp
));
5055 rtx (*unpack
)(rtx
, rtx
, rtx
);
5061 unpack
= gen_vec_interleave_highv16qi
;
5063 unpack
= gen_vec_interleave_lowv16qi
;
5067 unpack
= gen_vec_interleave_highv8hi
;
5069 unpack
= gen_vec_interleave_lowv8hi
;
5073 unpack
= gen_vec_interleave_highv4si
;
5075 unpack
= gen_vec_interleave_lowv4si
;
5082 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5084 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5085 src
, pc_rtx
, pc_rtx
);
5087 rtx tmp2
= gen_reg_rtx (imode
);
5088 emit_insn (unpack (tmp2
, src
, tmp
));
5089 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
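/* Illustrative sketch (not emitted verbatim): on x86-64 with SSE4.1,
   sign-extending the two high elements of a V4SI source into V2DI takes
   the shift-then-extend path above, roughly:

       psrldq   $8, %xmm0        ; move the high 8 bytes to the low half
       pmovsxdq %xmm0, %xmm0     ; sign-extend two dwords to qwords

   Without SSE4.1, the interleave path instead pairs each element with
   either zero or its sign mask computed by pcmpgtd.  */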
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating-point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  At most four parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          int i;

          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
          else
            gcc_unreachable ();
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          machine_mode upper_mode = mode == XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                          << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
          else
            gcc_unreachable ();
        }
    }

  return size;
}
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Operands 2-4 contain the input values in the correct order; operands
   5-7 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
          && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts[1] || collisionparts[2]))
        {
          if (collisionparts[1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this
             is a register, it is OK - we will just use the larger
             counterpart.  We also retype memory - these come from an attempt
             to avoid REX prefix on moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
        if (CONST_INT_P (operands[6 + j])
            && operands[6 + j] != const0_rtx
            && REG_P (operands[2 + j]))
          for (i = j; i < nparts - 1; i++)
            if (CONST_INT_P (operands[7 + i])
                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
              operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  Note that MODE is the double-word
   mode being split, so the halves are SImode when MODE is DImode.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  rtx (*insn)(rtx, rtx, rtx);

  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
          && !optimize_insn_for_size_p ()))
    {
      insn = mode == DImode ? gen_addsi3 : gen_adddi3;
      while (count-- > 0)
        emit_insn (insn (operand, operand, operand));
    }
  else
    {
      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
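/* For example, when ADDs are cheap enough, shifting one half left by 2
   may come out as the sequence below rather than a single SAL
   (illustrative assembly, not emitted verbatim):

       addl %eax, %eax
       addl %eax, %eax
*/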
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (high[0], low[1]);
          emit_move_insn (low[0], const0_rtx);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else
        {
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  split_double_mode (mode, operands, 1, low, high);

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen QImode-capable registers, 1 << N can be
         done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          machine_mode half_mode;
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          if (mode == DImode)
            {
              half_mode = SImode;
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              half_mode = DImode;
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
        = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;

      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
    }
  else
    {
      rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
        = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;

      emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
    }
}
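/* A non-constant DImode shift in 32-bit code therefore expands to
   roughly the following shape in the scratch/cmove variant
   (illustrative assembly, not emitted verbatim):

       shldl %cl, %eax, %edx    ; double-word shift of the two halves
       sall  %cl, %eax
       xorl  %ebx, %ebx         ; scratch cleared
       testb $32, %cl           ; was the count >= 32?
       cmovne %eax, %edx        ; then the low half moves to the high half
       cmovne %ebx, %eax        ; and the low half becomes zero
*/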
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
            = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;

          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
                                          scratch));
        }
      else
        {
          rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
            = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;

          emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
        }
    }
}
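/* For instance, a constant DImode arithmetic shift right by 63 in
   32-bit code reduces to broadcasting the sign bit of the high half
   into both halves (illustrative, not emitted verbatim):

       sarl $31, %edx           ; high half becomes 0 or -1
       movl %edx, %eax          ; low half gets the same value
*/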
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
            = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;

          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
                                          scratch));
        }
      else
        {
          rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
            = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;

          emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
        }
    }
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
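/* E.g. a constant count of 0x100 yields SImode even on 64-bit targets,
   while a constant of 0x100000000 needs DImode; a count already held in
   a register simply keeps that register's mode.  */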
/* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to
   by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, overall
   size is COUNT specified in bytes.  When ISSETMEM is TRUE, output the
   equivalent loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to whole number of chunk size moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */

static void
expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     the smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using a single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
                      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
                            GEN_INT (exact_log2 (scale)),
                            NULL, 1, OPTAB_DIRECT);
  return sc;
}
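/* For a register count and SCALE 4, this comes out as a single logical
   shift (illustrative, not emitted verbatim):

       shrl $2, %ecx
*/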
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
                              rtx destptr, rtx srcptr, rtx value,
                              rtx orig_value, rtx count,
                              machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count),
                          (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else
        {
          if (MEM_SIZE_KNOWN_P (srcmem))
            clear_mem_size (srcmem);
        }
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              destexp, srcexp));
    }
}
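/* So a 4-byte-aligned, constant-size memset to zero gets scaled to dword
   stores, roughly (illustrative assembly, not emitted verbatim):

       xorl %eax, %eax          ; promoted zero fill value
       movl $COUNT/4, %ecx
       rep stosl
*/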
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, adjust, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
      emit_move_insn (srcptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
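/* For example, a 16-byte chunk on x86-64 with SSE first finds TImode,
   then switches to the two-word vector mode, so each iteration is a pair
   of 16-byte vector moves through a temporary register (illustrative,
   not emitted verbatim):

       movdqu (%rsi), %xmm0
       movdqu %xmm0, (%rdi)
*/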
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If it is, jump to the returned label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
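/* The emitted test is just an AND into a scratch register followed by a
   conditional jump, e.g. for VALUE == 4 (illustrative, not emitted
   verbatim):

       movl %ecx, %eax
       andl $4, %eax
       je   .Lskip              ; bit clear: skip the 4-byte step
*/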
/* Output code to copy at most count & (max_size - 1) bytes from SRC to
   DEST.  */
static void
expand_movmem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
        }
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count,
                                   GEN_INT (max_size - 1),
                                   count, 1, OPTAB_DIRECT);
      expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, adjust;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting at
   DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
                                 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
                               GEN_INT (max_size - 1), count, 1,
                               OPTAB_DIRECT);
  expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
                                 gen_lowpart (QImode, value), count, QImode,
                                 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting at
   DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
        }
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count,
                                       max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  rtx (*gen_add)(rtx, rtx, rtx)
    = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;

  emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE
   are ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx vec_value, rtx count, int align,
                               int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a movmem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
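/* The trick above covers SIZE..2*SIZE-1 bytes with exactly two
   potentially overlapping block moves.  E.g. for SIZE == 4 and
   COUNT == 7, the first move writes bytes 0..3 and the second writes
   bytes COUNT-4..COUNT-1, i.e. 3..6, so all seven bytes are covered
   without any residual loop.  */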
/* Handle small memcpy (up to SIZE, which is supposed to be a small power
   of 2), and get ready for the main copy loop by copying the initial
   DESIRED_ALIGN - ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted
   for new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates
   whether we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN - ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
  (rtx destmem, rtx srcmem,
   rtx *destptr, rtx *srcptr,
   machine_mode mode,
   rtx value, rtx vec_value,
   rtx *count,
   rtx_code_label **done_label,
   int size, int desired_align, int align,
   unsigned HOST_WIDE_INT *min_size,
   bool dynamic_check, bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
        *done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0,
                               GET_MODE (*count), 1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
        expand_small_movmem_or_setmem (destmem, srcmem,
                                       *destptr, *srcptr,
                                       value, vec_value,
                                       *count,
                                       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
                               1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
        {
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
        }

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
        {
          srcmem = change_address (srcmem, HImode, *srcptr);
          srcmem = offset_address (srcmem, *count, 1);
          srcmem = offset_address (srcmem, GEN_INT (-2), 2);
          emit_move_insn (destmem, srcmem);
        }

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
                || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-size - prolog_size),
                               1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
        }
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
        REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
                                       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
        *min_size
          = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
        *min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
        *count = expand_simple_binop (GET_MODE (*count), AND, *count,
                                      GEN_INT (-size), *count, 1,
                                      OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
        *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
        *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
                                      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
        *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST,
   which is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                        rtx srcreg, rtx value, rtx vec_value,
                                        int desired_align, int align_bytes,
                                        bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                   == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
        return false;
      if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
          || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
        return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
            bool memset, bool zero_memset, bool have_as,
            int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
          && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
        max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
        return alg_usable_p (rep_prefix_1_byte, memset, have_as)
               ? rep_prefix_1_byte : loop_1_byte;
      else
        return alg_usable_p (rep_prefix_4_byte, memset, have_as)
               ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          /* We get here if the algorithms that were not libcall-based
             were rep-prefix based and we are unable to use rep prefixes
             based on global register usage.  Break out of the loop and
             use the heuristic below.  */
          if (algs->size[i].max == 0)
            break;
          if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
            {
              enum stringop_alg candidate = algs->size[i].alg;

              if (candidate != libcall
                  && alg_usable_p (candidate, memset, have_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
                }
              /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
                 last non-libcall inline algorithm.  */
              if (TARGET_INLINE_ALL_STRINGOPS)
                {
                  /* When the current size is best to be copied by a libcall,
                     but we are still forced to inline, run the heuristic
                     below that will pick code for medium sized blocks.  */
                  if (alg != libcall)
                    {
                      *noalign = alg_noalign;
                      return alg;
                    }
                  else if (!any_alg_usable_p)
                    break;
                }
              else if (alg_usable_p (candidate, memset, have_as))
                {
                  *noalign = algs->size[i].noalign;
                  return candidate;
                }
            }
        }
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
          || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
         then recursing on smaller sizes or same size isn't going to
         find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
            *dynamic_check = 128;
          return loop_1_byte;
        }
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
                        zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
      else
        gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
          ? algs->unknown_size : libcall);
}
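/* Note that the user override honored above comes from the
   -mstringop-strategy= option (for instance -mstringop-strategy=libcall
   or =rep_byte), which sets ix86_stringop_alg; the chosen strategy still
   has to pass alg_usable_p, so e.g. rep variants are rejected for
   non-default address spaces.  */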
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100%
   guaranteed).  */
static int
decide_alignment (int align,
                  enum stringop_alg alg,
                  int expected_size,
                  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (alg == libcall)
    return 0;
  if (move_mode == VOIDmode)
    return 0;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cacheline at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memset.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   VAL * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
         + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
                                  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
        {
          if (mode == SImode)
            emit_insn (gen_insvsi_1 (reg, reg));
          else
            emit_insn (gen_insvdi_1 (reg, reg));
        }
      else
        {
          tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
                                     NULL, 1, OPTAB_DIRECT);
          reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
                                     OPTAB_DIRECT);
        }
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
        return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */
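/* Illustrative note (not part of the original comment): with the conditions
   below, a 64-bit target asking for SIZE_NEEDED == 32 promotes VAL as a full
   DImode broadcast, while SIZE_NEEDED == 2 with no extra alignment to reach
   only needs the cheaper HImode promotion.  */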
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);

  return promoted_val;
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */

ix86_copy_addr_to_reg (rtx addr)
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence

     1) misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	   needed by single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) dispatch to library call, if needed,

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
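/* Illustrative note (not part of the original comment): SIZE_NEEDED is the
   number of bytes moved per main-loop iteration.  For example, an
   unrolled_loop copy on a 64-bit target uses word-size (DImode) moves with
   an unroll factor of 4, so SIZE_NEEDED is 32 and the epilogue is left with
   the final count % 32 bytes.  */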
ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
  rtx_code_label *label = NULL;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  bool need_zero_guard = false;
  machine_mode move_mode = VOIDmode;
  machine_mode wider_mode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = 0;
  unsigned HOST_WIDE_INT max_size = -1;
  unsigned HOST_WIDE_INT probable_max_size = -1;
  bool misaligned_prologue_used = false;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access at reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care here
     just about destination alignment.  */
	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
	min_size = INTVAL (min_size_exp);
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem && val_exp == const0_rtx, have_as,
		    &dynamic_check, &noalign, false);
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);
  gcc_assert (alg != no_stringop);

  /* For now the vector version of memset is generated only for memory
     zeroing, as creating the promoted vector value is very cheap in
     this case.  */
  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
    alg = unrolled_loop;
    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
      move_mode = word_mode;
      need_zero_guard = true;
      need_zero_guard = true;
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      need_zero_guard = true;
      /* Find the widest supported mode.  */
      move_mode = word_mode;
      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
	move_mode = wider_mode;
      if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
      /* Find the corresponding vector mode with the same size as MOVE_MODE.
	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
	    move_mode = word_mode;
      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happens before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();
  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = desired_align - align_bytes;
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
  gcc_assert (desired_align >= 1 && align >= 1);

  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
  misaligned_prologue_used
    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
       && MAX (desired_align, epilogue_size_needed) <= 32
       && desired_align <= epilogue_size_needed
       && ((desired_align > align && !align_bytes)
	   || (!count && epilogue_size_needed > 1)));

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant in the
     front of all code.
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
      if (alg == vector_loop)
	  gcc_assert (val_exp == const0_rtx);
	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
						GET_MODE_SIZE (word_mode),
						desired_align, align);
	promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						       desired_align, align);

  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant work when sizes are known precisely.  */
  if (misaligned_prologue_used)
      /* Misaligned move prologue handles small blocks by itself.  */
      expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
	   (dst, src, &destreg, &srcreg,
	    move_mode, promoted_val, vec_promoted_val,
	    desired_align < align
	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	    desired_align, align, &min_size, dynamic_check, issetmem);
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1,
				   jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
      if (!issetmem && CONST_INT_P (count_exp))
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT) dynamic_check)
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	    set_storage_via_libcall (dst, count_exp, val_exp);
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);
  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
      if (align_bytes == 0)
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
	  dst = change_address (dst, BLKmode, destreg);
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned) (desired_align - align))
	    min_size -= desired_align - align;
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
  if (label && size_needed == 1)
      LABEL_NUSES (label) = 1;
      epilogue_size_needed = 1;
	promoted_val = val_exp;
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */
      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);

  /* Step 4: Epilogue to copy the remaining bytes.  */
      /* When the main loop is done, COUNT_EXP might hold original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */
      if (size_needed < epilogue_size_needed)
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
      LABEL_NUSES (label) = 1;
  if (count_exp != const0_rtx && epilogue_size_needed > 1)
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
	expand_setmem_epilogue (dst, destreg, promoted_val,
				vec_promoted_val, count_exp,
				epilogue_size_needed);
	expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
				epilogue_size_needed);
  if (jump_around_label)
    emit_label (jump_around_label);
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);

  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if it is aligned to 4 bytes.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
	  emit_label (align_2_label);
	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
	  emit_insn (gen_add2_insn (out, const1_rtx));
	  emit_label (align_3_label);
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
      emit_insn (gen_add2_insn (out, const1_rtx));

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */
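  /* Illustrative note (not part of the original comment): for a single byte
     B the computation below evaluates (B - 1) & ~B & 0x80, which is 0x80
     when B == 0 (0xFF & 0xFF & 0x80) and 0 for any nonzero B, e.g.
     B == 0x80 gives 0x7F & 0x7F & 0x80 == 0.  Doing the subtraction,
     complement and masks on the whole SImode word tests all four bytes at
     once; a borrow only propagates into a higher byte when a lower byte is
     already zero, so the zero/nonzero outcome is unaffected.  */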
  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);

  /* Avoid branch in fixing the byte.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
/* Expand strlen.  */

ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      /* The generic case of strlen expander is long.  Avoid its
	 expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well it seems that some optimizer does not combine a call like
	 foo(strlen(bar), strlen(bar));
	 when the move and the subtraction is done here.  It does calculate
	 the length just once when these instructions are done inside of
	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
	 often used and I use one fewer register for the lifetime of
	 output_strlen_unroll() this is better.  */
      emit_move_insn (out, addr);
      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
/* For given symbol (function) construct code to compute address of its PLT
   entry in large x86-64 PIC model.  */

construct_plt_address (rtx symbol)
  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx pop, bool sibcall)
  rtx use = NULL, call;
  unsigned int vec_len = 0;

  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
  if (pop == const0_rtx)
  gcc_assert (!TARGET_64BIT || !pop);

  if (TARGET_MACHO && !TARGET_64BIT)
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
      /* Static functions and indirect calls don't need the pic register.  Also,
	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
	 it an indirect call.  */
      rtx addr = XEXP (fnaddr, 0);
	  && GET_CODE (addr) == SYMBOL_REF
	  && !SYMBOL_REF_LOCAL_P (addr))
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
			DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	      || (ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI))
		use_reg (&use, gen_rtx_REG (Pmode,
					    REAL_PIC_OFFSET_TABLE_REGNUM));
	      if (ix86_use_pseudo_pic_reg ())
		emit_move_insn (gen_rtx_REG (Pmode,
					     REAL_PIC_OFFSET_TABLE_REGNUM),
				pic_offset_table_rtx);
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);

  /* Skip setting up RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
  if (ix86_cmodel == CM_LARGE_PIC
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	   : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));

  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
  if (cfun->machine->no_caller_saved_registers
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
	  clobber_reg (&use, gen_rtx_REG (mode, regno));
      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  */

ix86_split_simple_return_pop_internal (rtx popc)
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
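/* Illustrative note (not part of the original comment): the insns emitted
   above amount to
     pop  %ecx          ; load the return address
     add  $POPC, %esp   ; discard the callee-popped argument bytes
     jmp  *%ecx
   which, as asserted above, is only ever used on 32-bit targets.  */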
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

safe_vector_operand (rtx x, machine_mode mode)
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */

ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
  bool comparison_p = false;
  bool last_arg_constant = false;
  machine_mode tmode = insn_data[icode].operand[0].mode;

    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      last_arg_constant = true;
    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      last_arg_constant = true;
    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      comparison_p = true;
    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
  gcc_assert (nargs <= 4);

  for (i = 0; i < nargs; i++)
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	      enum insn_code new_icode = icode;
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		  if (CONST_INT_P (op))
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
			 (insn_data[icode].operand[i + 1].predicate (op, mode));
			      && insn_data[new_icode].operand[0].mode == tmode
			      && insn_data[new_icode].operand[1].mode == tmode
			      && insn_data[new_icode].operand[2].mode == mode
			      && insn_data[new_icode].operand[0].predicate
				 == insn_data[icode].operand[0].predicate
			      && insn_data[new_icode].operand[1].predicate
				 == insn_data[icode].operand[1].predicate);
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	    op = force_reg (mode, op);
      args[i].mode = mode;

      pat = GEN_FCN (icode) (target, args[0].op);
	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
/* Subroutine of ix86_expand_builtin to take care of comi insns.  */

ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
    std::swap (op0, op1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
  return SUBREG_REG (target);
/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

ix86_expand_sse_round (const struct builtin_description *d, tree exp,
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
				     tree exp, rtx target)
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  op0 = safe_vector_operand (op0, mode0);
  op1 = safe_vector_operand (op1, mode1);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
  return SUBREG_REG (target);
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */

ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
      error ("the fifth argument must be an 8-bit immediate");

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
      return SUBREG_REG (target);
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */

ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
      error ("the third argument must be an 8-bit immediate");

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
      return SUBREG_REG (target);
/* Fixup modeless constants to fit required mode.  */

fixup_modeless_constant (rtx x, machine_mode mode)
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
/* Subroutine of ix86_expand_builtin to take care of insns with
   variable number of operands.  */

ix86_expand_args_builtin (const struct builtin_description *d,
			  tree exp, rtx target)
  rtx pat, real_target;
  unsigned int i, nargs;
  unsigned int nargs_constant = 0;
  unsigned int mask_pos = 0;
  bool second_arg_count = false;
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  machine_mode rmode = VOIDmode;
  enum rtx_code comparison = d->comparison;

  switch ((enum ix86_builtin_func_type) d->flag)
    case V2DF_FTYPE_V2DF_ROUND:
    case V4DF_FTYPE_V4DF_ROUND:
    case V8DF_FTYPE_V8DF_ROUND:
    case V4SF_FTYPE_V4SF_ROUND:
    case V8SF_FTYPE_V8SF_ROUND:
    case V16SF_FTYPE_V16SF_ROUND:
    case V4SI_FTYPE_V4SF_ROUND:
    case V8SI_FTYPE_V8SF_ROUND:
    case V16SI_FTYPE_V16SF_ROUND:
      return ix86_expand_sse_round (d, exp, target);
    case V4SI_FTYPE_V2DF_V2DF_ROUND:
    case V8SI_FTYPE_V4DF_V4DF_ROUND:
    case V16SI_FTYPE_V8DF_V8DF_ROUND:
      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
    case INT_FTYPE_V8SF_V8SF_PTEST:
    case INT_FTYPE_V4DI_V4DI_PTEST:
    case INT_FTYPE_V4DF_V4DF_PTEST:
    case INT_FTYPE_V4SF_V4SF_PTEST:
    case INT_FTYPE_V2DI_V2DI_PTEST:
    case INT_FTYPE_V2DF_V2DF_PTEST:
      return ix86_expand_sse_ptest (d, exp, target);
    case FLOAT128_FTYPE_FLOAT128:
    case FLOAT_FTYPE_FLOAT:
    case UINT_FTYPE_UINT:
    case UINT16_FTYPE_UINT16:
    case UINT64_FTYPE_INT:
    case UINT64_FTYPE_UINT64:
    case INT64_FTYPE_INT64:
    case INT64_FTYPE_V4SF:
    case INT64_FTYPE_V2DF:
    case INT_FTYPE_V16QI:
    case INT_FTYPE_V8QI:
    case INT_FTYPE_V8SF:
    case INT_FTYPE_V4DF:
    case INT_FTYPE_V4SF:
    case INT_FTYPE_V2DF:
    case INT_FTYPE_V32QI:
    case V16QI_FTYPE_V16QI:
    case V8SI_FTYPE_V8SF:
    case V8SI_FTYPE_V4SI:
    case V8HI_FTYPE_V8HI:
    case V8HI_FTYPE_V16QI:
    case V8QI_FTYPE_V8QI:
    case V8SF_FTYPE_V8SF:
    case V8SF_FTYPE_V8SI:
    case V8SF_FTYPE_V4SF:
    case V8SF_FTYPE_V8HI:
    case V4SI_FTYPE_V4SI:
    case V4SI_FTYPE_V16QI:
    case V4SI_FTYPE_V4SF:
    case V4SI_FTYPE_V8SI:
    case V4SI_FTYPE_V8HI:
    case V4SI_FTYPE_V4DF:
    case V4SI_FTYPE_V2DF:
    case V4HI_FTYPE_V4HI:
    case V4DF_FTYPE_V4DF:
    case V4DF_FTYPE_V4SI:
    case V4DF_FTYPE_V4SF:
    case V4DF_FTYPE_V2DF:
    case V4SF_FTYPE_V4SF:
    case V4SF_FTYPE_V4SI:
    case V4SF_FTYPE_V8SF:
    case V4SF_FTYPE_V4DF:
    case V4SF_FTYPE_V8HI:
    case V4SF_FTYPE_V2DF:
    case V2DI_FTYPE_V2DI:
    case V2DI_FTYPE_V16QI:
    case V2DI_FTYPE_V8HI:
    case V2DI_FTYPE_V4SI:
    case V2DF_FTYPE_V2DF:
    case V2DF_FTYPE_V4SI:
    case V2DF_FTYPE_V4DF:
    case V2DF_FTYPE_V4SF:
    case V2DF_FTYPE_V2SI:
    case V2SI_FTYPE_V2SI:
    case V2SI_FTYPE_V4SF:
    case V2SI_FTYPE_V2SF:
    case V2SI_FTYPE_V2DF:
    case V2SF_FTYPE_V2SF:
    case V2SF_FTYPE_V2SI:
    case V32QI_FTYPE_V32QI:
    case V32QI_FTYPE_V16QI:
    case V16HI_FTYPE_V16HI:
    case V16HI_FTYPE_V8HI:
    case V8SI_FTYPE_V8SI:
    case V16HI_FTYPE_V16QI:
    case V8SI_FTYPE_V16QI:
    case V4DI_FTYPE_V16QI:
    case V8SI_FTYPE_V8HI:
    case V4DI_FTYPE_V8HI:
    case V4DI_FTYPE_V4SI:
    case V4DI_FTYPE_V2DI:
    case UHI_FTYPE_V16QI:
    case USI_FTYPE_V32QI:
    case UDI_FTYPE_V64QI:
    case V16QI_FTYPE_UHI:
    case V32QI_FTYPE_USI:
    case V64QI_FTYPE_UDI:
    case V8HI_FTYPE_UQI:
    case V16HI_FTYPE_UHI:
    case V32HI_FTYPE_USI:
    case V4SI_FTYPE_UQI:
    case V8SI_FTYPE_UQI:
    case V4SI_FTYPE_UHI:
    case V8SI_FTYPE_UHI:
    case UQI_FTYPE_V8HI:
    case UHI_FTYPE_V16HI:
    case USI_FTYPE_V32HI:
    case UQI_FTYPE_V4SI:
    case UQI_FTYPE_V8SI:
    case UHI_FTYPE_V16SI:
    case UQI_FTYPE_V2DI:
    case UQI_FTYPE_V4DI:
    case UQI_FTYPE_V8DI:
    case V16SI_FTYPE_UHI:
    case V2DI_FTYPE_UQI:
    case V4DI_FTYPE_UQI:
    case V16SI_FTYPE_INT:
    case V16SF_FTYPE_V8SF:
    case V16SI_FTYPE_V8SI:
    case V16SF_FTYPE_V4SF:
    case V16SI_FTYPE_V4SI:
    case V16SI_FTYPE_V16SF:
    case V16SI_FTYPE_V16SI:
    case V64QI_FTYPE_V64QI:
    case V32HI_FTYPE_V32HI:
    case V16SF_FTYPE_V16SF:
    case V8DI_FTYPE_UQI:
    case V8DI_FTYPE_V8DI:
    case V8DF_FTYPE_V4DF:
    case V8DF_FTYPE_V2DF:
    case V8DF_FTYPE_V8DF:
    case V4DI_FTYPE_V4DI:
    case V16HI_FTYPE_V16SF:
    case V8HI_FTYPE_V8SF:
    case V8HI_FTYPE_V4SF:
    case V4SF_FTYPE_V4SF_VEC_MERGE:
    case V2DF_FTYPE_V2DF_VEC_MERGE:
      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
    case V16QI_FTYPE_V16QI_V16QI:
    case V16QI_FTYPE_V8HI_V8HI:
    case V16SF_FTYPE_V16SF_V16SF:
    case V8QI_FTYPE_V8QI_V8QI:
    case V8QI_FTYPE_V4HI_V4HI:
    case V8HI_FTYPE_V8HI_V8HI:
    case V8HI_FTYPE_V16QI_V16QI:
    case V8HI_FTYPE_V4SI_V4SI:
    case V8SF_FTYPE_V8SF_V8SF:
    case V8SF_FTYPE_V8SF_V8SI:
    case V8DF_FTYPE_V8DF_V8DF:
    case V4SI_FTYPE_V4SI_V4SI:
    case V4SI_FTYPE_V8HI_V8HI:
    case V4SI_FTYPE_V2DF_V2DF:
    case V4HI_FTYPE_V4HI_V4HI:
    case V4HI_FTYPE_V8QI_V8QI:
    case V4HI_FTYPE_V2SI_V2SI:
    case V4DF_FTYPE_V4DF_V4DF:
    case V4DF_FTYPE_V4DF_V4DI:
    case V4SF_FTYPE_V4SF_V4SF:
    case V4SF_FTYPE_V4SF_V4SI:
    case V4SF_FTYPE_V4SF_V2SI:
    case V4SF_FTYPE_V4SF_V2DF:
    case V4SF_FTYPE_V4SF_UINT:
    case V4SF_FTYPE_V4SF_DI:
    case V4SF_FTYPE_V4SF_SI:
    case V2DI_FTYPE_V2DI_V2DI:
    case V2DI_FTYPE_V16QI_V16QI:
    case V2DI_FTYPE_V4SI_V4SI:
    case V2DI_FTYPE_V2DI_V16QI:
    case V2SI_FTYPE_V2SI_V2SI:
    case V2SI_FTYPE_V4HI_V4HI:
    case V2SI_FTYPE_V2SF_V2SF:
    case V2DF_FTYPE_V2DF_V2DF:
    case V2DF_FTYPE_V2DF_V4SF:
    case V2DF_FTYPE_V2DF_V2DI:
    case V2DF_FTYPE_V2DF_DI:
    case V2DF_FTYPE_V2DF_SI:
    case V2DF_FTYPE_V2DF_UINT:
    case V2SF_FTYPE_V2SF_V2SF:
    case V1DI_FTYPE_V1DI_V1DI:
    case V1DI_FTYPE_V8QI_V8QI:
    case V1DI_FTYPE_V2SI_V2SI:
    case V32QI_FTYPE_V16HI_V16HI:
    case V16HI_FTYPE_V8SI_V8SI:
    case V64QI_FTYPE_V64QI_V64QI:
    case V32QI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V16HI_V16HI:
    case V8SI_FTYPE_V4DF_V4DF:
    case V8SI_FTYPE_V8SI_V8SI:
    case V8SI_FTYPE_V16HI_V16HI:
    case V4DI_FTYPE_V4DI_V4DI:
    case V4DI_FTYPE_V8SI_V8SI:
    case V8DI_FTYPE_V64QI_V64QI:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
    case V16HI_FTYPE_V16HI_V8HI_COUNT:
    case V16HI_FTYPE_V16HI_SI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_COUNT:
    case V8SI_FTYPE_V8SI_SI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_COUNT:
    case V4DI_FTYPE_V4DI_INT_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_COUNT:
    case V8HI_FTYPE_V8HI_SI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_COUNT:
    case V4SI_FTYPE_V4SI_SI_COUNT:
    case V4HI_FTYPE_V4HI_V4HI_COUNT:
    case V4HI_FTYPE_V4HI_SI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_COUNT:
    case V2DI_FTYPE_V2DI_SI_COUNT:
    case V2SI_FTYPE_V2SI_V2SI_COUNT:
    case V2SI_FTYPE_V2SI_SI_COUNT:
    case V1DI_FTYPE_V1DI_V1DI_COUNT:
    case V1DI_FTYPE_V1DI_SI_COUNT:
      second_arg_count = true;
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
:
9210 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9211 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9212 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9213 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
9215 second_arg_count
= true;
9217 case UINT64_FTYPE_UINT64_UINT64
:
9218 case UINT_FTYPE_UINT_UINT
:
9219 case UINT_FTYPE_UINT_USHORT
:
9220 case UINT_FTYPE_UINT_UCHAR
:
9221 case UINT16_FTYPE_UINT16_INT
:
9222 case UINT8_FTYPE_UINT8_INT
:
9223 case UQI_FTYPE_UQI_UQI
:
9224 case UHI_FTYPE_UHI_UHI
:
9225 case USI_FTYPE_USI_USI
:
9226 case UDI_FTYPE_UDI_UDI
:
9227 case V16SI_FTYPE_V8DF_V8DF
:
9228 case V32HI_FTYPE_V16SF_V16SF
:
9229 case V16HI_FTYPE_V8SF_V8SF
:
9230 case V8HI_FTYPE_V4SF_V4SF
:
9231 case V16HI_FTYPE_V16SF_UHI
:
9232 case V8HI_FTYPE_V8SF_UQI
:
9233 case V8HI_FTYPE_V4SF_UQI
:
9236 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9241 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9246 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9251 case V8HI_FTYPE_V8HI_INT
:
9252 case V8HI_FTYPE_V8SF_INT
:
9253 case V16HI_FTYPE_V16SF_INT
:
9254 case V8HI_FTYPE_V4SF_INT
:
9255 case V8SF_FTYPE_V8SF_INT
:
9256 case V4SF_FTYPE_V16SF_INT
:
9257 case V16SF_FTYPE_V16SF_INT
:
9258 case V4SI_FTYPE_V4SI_INT
:
9259 case V4SI_FTYPE_V8SI_INT
:
9260 case V4HI_FTYPE_V4HI_INT
:
9261 case V4DF_FTYPE_V4DF_INT
:
9262 case V4DF_FTYPE_V8DF_INT
:
9263 case V4SF_FTYPE_V4SF_INT
:
9264 case V4SF_FTYPE_V8SF_INT
:
9265 case V2DI_FTYPE_V2DI_INT
:
9266 case V2DF_FTYPE_V2DF_INT
:
9267 case V2DF_FTYPE_V4DF_INT
:
9268 case V16HI_FTYPE_V16HI_INT
:
9269 case V8SI_FTYPE_V8SI_INT
:
9270 case V16SI_FTYPE_V16SI_INT
:
9271 case V4SI_FTYPE_V16SI_INT
:
9272 case V4DI_FTYPE_V4DI_INT
:
9273 case V2DI_FTYPE_V4DI_INT
:
9274 case V4DI_FTYPE_V8DI_INT
:
9275 case QI_FTYPE_V4SF_INT
:
9276 case QI_FTYPE_V2DF_INT
:
9277 case UQI_FTYPE_UQI_UQI_CONST
:
9278 case UHI_FTYPE_UHI_UQI
:
9279 case USI_FTYPE_USI_UQI
:
9280 case UDI_FTYPE_UDI_UQI
:
9284 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9285 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9286 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9287 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9288 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9289 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9290 case UHI_FTYPE_V16SI_V16SI_UHI
:
9291 case UQI_FTYPE_V8DI_V8DI_UQI
:
9292 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9293 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9294 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9295 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9296 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9297 case V16SI_FTYPE_SI_V16SI_UHI
:
9298 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9299 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9300 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9301 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9302 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9303 case V8SI_FTYPE_SI_V8SI_UQI
:
9304 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9305 case V4SI_FTYPE_SI_V4SI_UQI
:
9306 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9307 case V4DI_FTYPE_DI_V4DI_UQI
:
9308 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9309 case V2DI_FTYPE_DI_V2DI_UQI
:
9310 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9311 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9312 case V64QI_FTYPE_QI_V64QI_UDI
:
9313 case V32QI_FTYPE_V32QI_V32QI_USI
:
9314 case V32QI_FTYPE_V16QI_V32QI_USI
:
9315 case V32QI_FTYPE_QI_V32QI_USI
:
9316 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9317 case V16QI_FTYPE_QI_V16QI_UHI
:
9318 case V32HI_FTYPE_V8HI_V32HI_USI
:
9319 case V32HI_FTYPE_HI_V32HI_USI
:
9320 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9321 case V16HI_FTYPE_HI_V16HI_UHI
:
9322 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9323 case V8HI_FTYPE_HI_V8HI_UQI
:
9324 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9325 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9326 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9327 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9328 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9329 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9330 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9331 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9332 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9333 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9334 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9335 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9336 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9337 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9338 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9339 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9340 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9341 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9342 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9343 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9344 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9345 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9346 case V32QI_FTYPE_V32HI_V32QI_USI
:
9347 case UHI_FTYPE_V16QI_V16QI_UHI
:
9348 case USI_FTYPE_V32QI_V32QI_USI
:
9349 case UDI_FTYPE_V64QI_V64QI_UDI
:
9350 case UQI_FTYPE_V8HI_V8HI_UQI
:
9351 case UHI_FTYPE_V16HI_V16HI_UHI
:
9352 case USI_FTYPE_V32HI_V32HI_USI
:
9353 case UQI_FTYPE_V4SI_V4SI_UQI
:
9354 case UQI_FTYPE_V8SI_V8SI_UQI
:
9355 case UQI_FTYPE_V2DI_V2DI_UQI
:
9356 case UQI_FTYPE_V4DI_V4DI_UQI
:
9357 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9358 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9359 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9360 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9361 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9362 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9363 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9364 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9365 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9366 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9367 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9368 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9369 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9370 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9371 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9372 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9373 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9374 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9375 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9376 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9377 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9378 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9379 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9380 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9381 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9382 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9383 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9384 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9385 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9386 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9387 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9388 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9389 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9390 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9391 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9392 case V8DI_FTYPE_DI_V8DI_UQI
:
9393 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9394 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9395 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9396 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9397 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9398 case V32HI_FTYPE_V32HI_V32HI_USI
:
9399 case V32HI_FTYPE_V32QI_V32HI_USI
:
9400 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9401 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9402 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9403 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9404 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9405 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9406 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9407 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9408 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9409 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9410 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9411 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9412 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9413 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9414 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9415 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9416 case V32HI_FTYPE_V16SF_V16SF_USI
:
9417 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9418 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9419 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9420 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9421 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9422 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9423 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9424 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9427 case V32QI_FTYPE_V32QI_V32QI_INT
:
9428 case V16HI_FTYPE_V16HI_V16HI_INT
:
9429 case V16QI_FTYPE_V16QI_V16QI_INT
:
9430 case V4DI_FTYPE_V4DI_V4DI_INT
:
9431 case V8HI_FTYPE_V8HI_V8HI_INT
:
9432 case V8SI_FTYPE_V8SI_V8SI_INT
:
9433 case V8SI_FTYPE_V8SI_V4SI_INT
:
9434 case V8SF_FTYPE_V8SF_V8SF_INT
:
9435 case V8SF_FTYPE_V8SF_V4SF_INT
:
9436 case V4SI_FTYPE_V4SI_V4SI_INT
:
9437 case V4DF_FTYPE_V4DF_V4DF_INT
:
9438 case V16SF_FTYPE_V16SF_V16SF_INT
:
9439 case V16SF_FTYPE_V16SF_V4SF_INT
:
9440 case V16SI_FTYPE_V16SI_V4SI_INT
:
9441 case V4DF_FTYPE_V4DF_V2DF_INT
:
9442 case V4SF_FTYPE_V4SF_V4SF_INT
:
9443 case V2DI_FTYPE_V2DI_V2DI_INT
:
9444 case V4DI_FTYPE_V4DI_V2DI_INT
:
9445 case V2DF_FTYPE_V2DF_V2DF_INT
:
9446 case UQI_FTYPE_V8DI_V8UDI_INT
:
9447 case UQI_FTYPE_V8DF_V8DF_INT
:
9448 case UQI_FTYPE_V2DF_V2DF_INT
:
9449 case UQI_FTYPE_V4SF_V4SF_INT
:
9450 case UHI_FTYPE_V16SI_V16SI_INT
:
9451 case UHI_FTYPE_V16SF_V16SF_INT
:
9452 case V64QI_FTYPE_V64QI_V64QI_INT
:
9453 case V32HI_FTYPE_V32HI_V32HI_INT
:
9454 case V16SI_FTYPE_V16SI_V16SI_INT
:
9455 case V8DI_FTYPE_V8DI_V8DI_INT
:
9459 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9464 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9469 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9474 case V2DI_FTYPE_V2DI_UINT_UINT
:
9478 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9483 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9489 case QI_FTYPE_V8DF_INT_UQI
:
9490 case QI_FTYPE_V4DF_INT_UQI
:
9491 case QI_FTYPE_V2DF_INT_UQI
:
9492 case HI_FTYPE_V16SF_INT_UHI
:
9493 case QI_FTYPE_V8SF_INT_UQI
:
9494 case QI_FTYPE_V4SF_INT_UQI
:
9495 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9496 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9501 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9507 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9513 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9514 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9515 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9516 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9517 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9518 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9519 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9520 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9521 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9522 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9523 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9524 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9525 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9526 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9527 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9528 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9529 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9530 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9531 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9532 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9533 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9534 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9535 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9536 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9537 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9538 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9539 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9540 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9541 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9542 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9543 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9544 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9545 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9546 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9547 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9548 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9549 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9550 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9551 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9552 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9553 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9554 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9555 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9556 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9557 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9558 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9559 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9560 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9561 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9562 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9563 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9564 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9565 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9566 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9569 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9570 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9571 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9572 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9573 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9577 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9578 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9579 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9580 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9581 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9582 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9583 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9584 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9585 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9586 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9587 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9588 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9589 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9590 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9591 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT
:
9592 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT
:
9593 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT
:
9594 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT
:
9595 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT
:
9596 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT
:
9597 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT
:
9598 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT
:
9599 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT
:
9604 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9608 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9609 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9610 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9611 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9612 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9615 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9616 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9621 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9622 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9623 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9624 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9625 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9626 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9627 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9628 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9629 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9630 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9631 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9632 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9633 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9634 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9635 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9636 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9637 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9638 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9639 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9640 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9641 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9642 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9643 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9644 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9645 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9646 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9647 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9648 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9649 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9650 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9655 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9656 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9657 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9658 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9659 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9660 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9661 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9662 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9663 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9664 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9665 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9666 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9667 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9668 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9669 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9670 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9671 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9672 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9673 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9674 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9675 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9676 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9677 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9678 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9679 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9680 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9681 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9686 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9687 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9688 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9689 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9690 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9691 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9692 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9693 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9694 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9695 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9700 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9701 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9702 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9703 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9704 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9705 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9706 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9707 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9708 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9709 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9710 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9711 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
9721 gcc_assert (nargs
<= ARRAY_SIZE (args
));
9723 if (comparison
!= UNKNOWN
)
9725 gcc_assert (nargs
== 2);
9726 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
9729 if (rmode
== VOIDmode
|| rmode
== tmode
)
9733 || GET_MODE (target
) != tmode
9734 || !insn_p
->operand
[0].predicate (target
, tmode
))
9735 target
= gen_reg_rtx (tmode
);
9736 else if (memory_operand (target
, tmode
))
9738 real_target
= target
;
9742 real_target
= gen_reg_rtx (tmode
);
9743 target
= lowpart_subreg (rmode
, real_target
, tmode
);
9746 for (i
= 0; i
< nargs
; i
++)
9748 tree arg
= CALL_EXPR_ARG (exp
, i
);
9749 rtx op
= expand_normal (arg
);
9750 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
9751 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
9753 if (second_arg_count
&& i
== 1)
9755 /* SIMD shift insns take either an 8-bit immediate or
9756 register as count. But builtin functions take int as
9757 count. If count doesn't match, we put it in register.
9758 The instructions are using 64-bit count, if op is just
9759 32-bit, zero-extend it, as negative shift counts
9760 are undefined behavior and zero-extension is more
9764 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
9765 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
9767 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9768 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
9769 op
= copy_to_reg (op
);
9772 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9773 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
9778 case CODE_FOR_avx_vinsertf128v4di
:
9779 case CODE_FOR_avx_vextractf128v4di
:
9780 error ("the last argument must be an 1-bit immediate");
9783 case CODE_FOR_avx512f_cmpv8di3_mask
:
9784 case CODE_FOR_avx512f_cmpv16si3_mask
:
9785 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9786 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9787 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9788 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9789 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9790 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9791 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9792 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9793 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9794 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9795 error ("the last argument must be a 3-bit immediate");
9798 case CODE_FOR_sse4_1_roundsd
:
9799 case CODE_FOR_sse4_1_roundss
:
9801 case CODE_FOR_sse4_1_roundpd
:
9802 case CODE_FOR_sse4_1_roundps
:
9803 case CODE_FOR_avx_roundpd256
:
9804 case CODE_FOR_avx_roundps256
:
9806 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9807 case CODE_FOR_sse4_1_roundps_sfix
:
9808 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9809 case CODE_FOR_avx_roundps_sfix256
:
9811 case CODE_FOR_sse4_1_blendps
:
9812 case CODE_FOR_avx_blendpd256
:
9813 case CODE_FOR_avx_vpermilv4df
:
9814 case CODE_FOR_avx_vpermilv4df_mask
:
9815 case CODE_FOR_avx512f_getmantv8df_mask
:
9816 case CODE_FOR_avx512f_getmantv16sf_mask
:
9817 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9818 case CODE_FOR_avx512vl_getmantv4df_mask
:
9819 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9820 case CODE_FOR_avx512vl_getmantv2df_mask
:
9821 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9822 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9823 case CODE_FOR_avx512dq_rangepv4df_mask
:
9824 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9825 case CODE_FOR_avx512dq_rangepv2df_mask
:
9826 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9827 case CODE_FOR_avx_shufpd256_mask
:
9828 error ("the last argument must be a 4-bit immediate");
9831 case CODE_FOR_sha1rnds4
:
9832 case CODE_FOR_sse4_1_blendpd
:
9833 case CODE_FOR_avx_vpermilv2df
:
9834 case CODE_FOR_avx_vpermilv2df_mask
:
9835 case CODE_FOR_xop_vpermil2v2df3
:
9836 case CODE_FOR_xop_vpermil2v4sf3
:
9837 case CODE_FOR_xop_vpermil2v4df3
:
9838 case CODE_FOR_xop_vpermil2v8sf3
:
9839 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9840 case CODE_FOR_avx512f_vinserti32x4_mask
:
9841 case CODE_FOR_avx512f_vextractf32x4_mask
:
9842 case CODE_FOR_avx512f_vextracti32x4_mask
:
9843 case CODE_FOR_sse2_shufpd
:
9844 case CODE_FOR_sse2_shufpd_mask
:
9845 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9846 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9847 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9848 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9849 error ("the last argument must be a 2-bit immediate");
9852 case CODE_FOR_avx_vextractf128v4df
:
9853 case CODE_FOR_avx_vextractf128v8sf
:
9854 case CODE_FOR_avx_vextractf128v8si
:
9855 case CODE_FOR_avx_vinsertf128v4df
:
9856 case CODE_FOR_avx_vinsertf128v8sf
:
9857 case CODE_FOR_avx_vinsertf128v8si
:
9858 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9859 case CODE_FOR_avx512f_vinserti64x4_mask
:
9860 case CODE_FOR_avx512f_vextractf64x4_mask
:
9861 case CODE_FOR_avx512f_vextracti64x4_mask
:
9862 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9863 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9864 case CODE_FOR_avx512vl_vinsertv4df
:
9865 case CODE_FOR_avx512vl_vinsertv4di
:
9866 case CODE_FOR_avx512vl_vinsertv8sf
:
9867 case CODE_FOR_avx512vl_vinsertv8si
:
9868 error ("the last argument must be a 1-bit immediate");
9871 case CODE_FOR_avx_vmcmpv2df3
:
9872 case CODE_FOR_avx_vmcmpv4sf3
:
9873 case CODE_FOR_avx_cmpv2df3
:
9874 case CODE_FOR_avx_cmpv4sf3
:
9875 case CODE_FOR_avx_cmpv4df3
:
9876 case CODE_FOR_avx_cmpv8sf3
:
9877 case CODE_FOR_avx512f_cmpv8df3_mask
:
9878 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9879 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9880 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9881 error ("the last argument must be a 5-bit immediate");
9885 switch (nargs_constant
)
9888 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9889 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
9891 error ("the next to last argument must be an 8-bit immediate");
9896 error ("the last argument must be an 8-bit immediate");
9906 if (VECTOR_MODE_P (mode
))
9907 op
= safe_vector_operand (op
, mode
);
9909 /* If we aren't optimizing, only allow one memory operand to
9911 if (memory_operand (op
, mode
))
9914 op
= fixup_modeless_constant (op
, mode
);
9916 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
9918 if (optimize
|| !match
|| num_memory
> 1)
9919 op
= copy_to_mode_reg (mode
, op
);
9923 op
= copy_to_reg (op
);
9924 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9929 args
[i
].mode
= mode
;
9935 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
);
9938 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
);
9941 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9945 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9946 args
[2].op
, args
[3].op
);
9949 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9950 args
[2].op
, args
[3].op
, args
[4].op
);
9953 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9954 args
[2].op
, args
[3].op
, args
[4].op
,
9968 /* Transform pattern of following layout:
9970 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9976 ix86_erase_embedded_rounding (rtx pat
)
9978 if (GET_CODE (pat
) == INSN
)
9979 pat
= PATTERN (pat
);
9981 gcc_assert (GET_CODE (pat
) == SET
);
9982 rtx src
= SET_SRC (pat
);
9983 gcc_assert (XVECLEN (src
, 0) == 2);
9984 rtx p0
= XVECEXP (src
, 0, 0);
9985 gcc_assert (GET_CODE (src
) == UNSPEC
9986 && XINT (src
, 1) == UNSPEC_EMBEDDED_ROUNDING
);
9987 rtx res
= gen_rtx_SET (SET_DEST (pat
), p0
);
9991 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9994 ix86_expand_sse_comi_round (const struct builtin_description
*d
,
9995 tree exp
, rtx target
)
9998 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9999 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10000 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10001 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10002 rtx op0
= expand_normal (arg0
);
10003 rtx op1
= expand_normal (arg1
);
10004 rtx op2
= expand_normal (arg2
);
10005 rtx op3
= expand_normal (arg3
);
10006 enum insn_code icode
= d
->icode
;
10007 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10008 machine_mode mode0
= insn_p
->operand
[0].mode
;
10009 machine_mode mode1
= insn_p
->operand
[1].mode
;
10010 enum rtx_code comparison
= UNEQ
;
10011 bool need_ucomi
= false;
10013 /* See avxintrin.h for values. */
10014 enum rtx_code comi_comparisons
[32] =
10016 UNEQ
, GT
, GE
, UNORDERED
, LTGT
, UNLE
, UNLT
, ORDERED
, UNEQ
, UNLT
,
10017 UNLE
, LT
, LTGT
, GE
, GT
, LT
, UNEQ
, GT
, GE
, UNORDERED
, LTGT
, UNLE
,
10018 UNLT
, ORDERED
, UNEQ
, UNLT
, UNLE
, LT
, LTGT
, GE
, GT
, LT
10020 bool need_ucomi_values
[32] =
10022 true, false, false, true, true, false, false, true,
10023 true, false, false, true, true, false, false, true,
10024 false, true, true, false, false, true, true, false,
10025 false, true, true, false, false, true, true, false
10028 if (!CONST_INT_P (op2
))
10030 error ("the third argument must be comparison constant");
10033 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
10035 error ("incorrect comparison mode");
10039 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
10041 error ("incorrect rounding operand");
10045 comparison
= comi_comparisons
[INTVAL (op2
)];
10046 need_ucomi
= need_ucomi_values
[INTVAL (op2
)];
10048 if (VECTOR_MODE_P (mode0
))
10049 op0
= safe_vector_operand (op0
, mode0
);
10050 if (VECTOR_MODE_P (mode1
))
10051 op1
= safe_vector_operand (op1
, mode1
);
10053 target
= gen_reg_rtx (SImode
);
10054 emit_move_insn (target
, const0_rtx
);
10055 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10057 if ((optimize
&& !register_operand (op0
, mode0
))
10058 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10059 op0
= copy_to_mode_reg (mode0
, op0
);
10060 if ((optimize
&& !register_operand (op1
, mode1
))
10061 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10062 op1
= copy_to_mode_reg (mode1
, op1
);
10065 icode
= icode
== CODE_FOR_sse_comi_round
10066 ? CODE_FOR_sse_ucomi_round
10067 : CODE_FOR_sse2_ucomi_round
;
10069 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
10073 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10074 if (INTVAL (op3
) == NO_ROUND
)
10076 pat
= ix86_erase_embedded_rounding (pat
);
10080 set_dst
= SET_DEST (pat
);
10084 gcc_assert (GET_CODE (pat
) == SET
);
10085 set_dst
= SET_DEST (pat
);
10089 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10090 gen_rtx_fmt_ee (comparison
, QImode
,
10094 return SUBREG_REG (target
);
10098 ix86_expand_round_builtin (const struct builtin_description
*d
,
10099 tree exp
, rtx target
)
10102 unsigned int i
, nargs
;
10108 enum insn_code icode
= d
->icode
;
10109 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10110 machine_mode tmode
= insn_p
->operand
[0].mode
;
10111 unsigned int nargs_constant
= 0;
10112 unsigned int redundant_embed_rnd
= 0;
10114 switch ((enum ix86_builtin_func_type
) d
->flag
)
10116 case UINT64_FTYPE_V2DF_INT
:
10117 case UINT64_FTYPE_V4SF_INT
:
10118 case UINT_FTYPE_V2DF_INT
:
10119 case UINT_FTYPE_V4SF_INT
:
10120 case INT64_FTYPE_V2DF_INT
:
10121 case INT64_FTYPE_V4SF_INT
:
10122 case INT_FTYPE_V2DF_INT
:
10123 case INT_FTYPE_V4SF_INT
:
10126 case V4SF_FTYPE_V4SF_UINT_INT
:
10127 case V4SF_FTYPE_V4SF_UINT64_INT
:
10128 case V2DF_FTYPE_V2DF_UINT64_INT
:
10129 case V4SF_FTYPE_V4SF_INT_INT
:
10130 case V4SF_FTYPE_V4SF_INT64_INT
:
10131 case V2DF_FTYPE_V2DF_INT64_INT
:
10132 case V4SF_FTYPE_V4SF_V4SF_INT
:
10133 case V2DF_FTYPE_V2DF_V2DF_INT
:
10134 case V4SF_FTYPE_V4SF_V2DF_INT
:
10135 case V2DF_FTYPE_V2DF_V4SF_INT
:
10138 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10139 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10140 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10141 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10142 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10143 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10144 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10145 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10146 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10147 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10148 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10149 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10150 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10151 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10154 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10155 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10156 nargs_constant
= 2;
10159 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10160 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10161 return ix86_expand_sse_comi_round (d
, exp
, target
);
10162 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10163 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10164 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10165 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10166 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10167 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10168 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10169 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10172 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10173 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10174 nargs_constant
= 4;
10177 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10178 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10179 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10180 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10181 nargs_constant
= 3;
10184 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10185 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10186 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10187 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10188 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10189 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10191 nargs_constant
= 4;
10193 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10194 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10195 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10196 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10198 nargs_constant
= 3;
10201 gcc_unreachable ();
10203 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10207 || GET_MODE (target
) != tmode
10208 || !insn_p
->operand
[0].predicate (target
, tmode
))
10209 target
= gen_reg_rtx (tmode
);
10211 for (i
= 0; i
< nargs
; i
++)
10213 tree arg
= CALL_EXPR_ARG (exp
, i
);
10214 rtx op
= expand_normal (arg
);
10215 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10216 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10218 if (i
== nargs
- nargs_constant
)
10224 case CODE_FOR_avx512f_getmantv8df_mask_round
:
10225 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
10226 case CODE_FOR_avx512f_vgetmantv2df_round
:
10227 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
10228 case CODE_FOR_avx512f_vgetmantv4sf_round
:
10229 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
10230 error ("the immediate argument must be a 4-bit immediate");
10232 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
10233 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
10234 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
10235 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
10236 error ("the immediate argument must be a 5-bit immediate");
10239 error ("the immediate argument must be an 8-bit immediate");
10244 else if (i
== nargs
-1)
10246 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
10248 error ("incorrect rounding operand");
10252 /* If there is no rounding use normal version of the pattern. */
10253 if (INTVAL (op
) == NO_ROUND
)
10254 redundant_embed_rnd
= 1;
10258 if (VECTOR_MODE_P (mode
))
10259 op
= safe_vector_operand (op
, mode
);
10261 op
= fixup_modeless_constant (op
, mode
);
10263 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10265 if (optimize
|| !match
)
10266 op
= copy_to_mode_reg (mode
, op
);
10270 op
= copy_to_reg (op
);
10271 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10276 args
[i
].mode
= mode
;
10282 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10285 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10288 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10292 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10293 args
[2].op
, args
[3].op
);
10296 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10297 args
[2].op
, args
[3].op
, args
[4].op
);
10300 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10301 args
[2].op
, args
[3].op
, args
[4].op
,
10305 gcc_unreachable ();
10311 if (redundant_embed_rnd
)
10312 pat
= ix86_erase_embedded_rounding (pat
);
10318 /* Subroutine of ix86_expand_builtin to take care of special insns
10319 with variable number of operands. */
10322 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10323 tree exp
, rtx target
)
10327 unsigned int i
, nargs
, arg_adjust
, memory
;
10328 bool aligned_mem
= false;
10334 enum insn_code icode
= d
->icode
;
10335 bool last_arg_constant
= false;
10336 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10337 machine_mode tmode
= insn_p
->operand
[0].mode
;
10338 enum { load
, store
} klass
;
10340 switch ((enum ix86_builtin_func_type
) d
->flag
)
10342 case VOID_FTYPE_VOID
:
10343 emit_insn (GEN_FCN (icode
) (target
));
10345 case VOID_FTYPE_UINT64
:
10346 case VOID_FTYPE_UNSIGNED
:
10352 case INT_FTYPE_VOID
:
10353 case USHORT_FTYPE_VOID
:
10354 case UINT64_FTYPE_VOID
:
10355 case UINT_FTYPE_VOID
:
10356 case UNSIGNED_FTYPE_VOID
:
10361 case UINT64_FTYPE_PUNSIGNED
:
10362 case V2DI_FTYPE_PV2DI
:
10363 case V4DI_FTYPE_PV4DI
:
10364 case V32QI_FTYPE_PCCHAR
:
10365 case V16QI_FTYPE_PCCHAR
:
10366 case V8SF_FTYPE_PCV4SF
:
10367 case V8SF_FTYPE_PCFLOAT
:
10368 case V4SF_FTYPE_PCFLOAT
:
10369 case V4DF_FTYPE_PCV2DF
:
10370 case V4DF_FTYPE_PCDOUBLE
:
10371 case V2DF_FTYPE_PCDOUBLE
:
10372 case VOID_FTYPE_PVOID
:
10373 case V8DI_FTYPE_PV8DI
:
10379 case CODE_FOR_sse4_1_movntdqa
:
10380 case CODE_FOR_avx2_movntdqa
:
10381 case CODE_FOR_avx512f_movntdqa
:
10382 aligned_mem
= true;
10388 case VOID_FTYPE_PV2SF_V4SF
:
10389 case VOID_FTYPE_PV8DI_V8DI
:
10390 case VOID_FTYPE_PV4DI_V4DI
:
10391 case VOID_FTYPE_PV2DI_V2DI
:
10392 case VOID_FTYPE_PCHAR_V32QI
:
10393 case VOID_FTYPE_PCHAR_V16QI
:
10394 case VOID_FTYPE_PFLOAT_V16SF
:
10395 case VOID_FTYPE_PFLOAT_V8SF
:
10396 case VOID_FTYPE_PFLOAT_V4SF
:
10397 case VOID_FTYPE_PDOUBLE_V8DF
:
10398 case VOID_FTYPE_PDOUBLE_V4DF
:
10399 case VOID_FTYPE_PDOUBLE_V2DF
:
10400 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10401 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10402 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10403 case VOID_FTYPE_PINT_INT
:
10406 /* Reserve memory operand for target. */
10407 memory
= ARRAY_SIZE (args
);
10410 /* These builtins and instructions require the memory
10411 to be properly aligned. */
10412 case CODE_FOR_avx_movntv4di
:
10413 case CODE_FOR_sse2_movntv2di
:
10414 case CODE_FOR_avx_movntv8sf
:
10415 case CODE_FOR_sse_movntv4sf
:
10416 case CODE_FOR_sse4a_vmmovntv4sf
:
10417 case CODE_FOR_avx_movntv4df
:
10418 case CODE_FOR_sse2_movntv2df
:
10419 case CODE_FOR_sse4a_vmmovntv2df
:
10420 case CODE_FOR_sse2_movntidi
:
10421 case CODE_FOR_sse_movntq
:
10422 case CODE_FOR_sse2_movntisi
:
10423 case CODE_FOR_avx512f_movntv16sf
:
10424 case CODE_FOR_avx512f_movntv8df
:
10425 case CODE_FOR_avx512f_movntv8di
:
10426 aligned_mem
= true;
10432 case VOID_FTYPE_PVOID_PCVOID
:
10438 case V4SF_FTYPE_V4SF_PCV2SF
:
10439 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10444 case V8SF_FTYPE_PCV8SF_V8SI
:
10445 case V4DF_FTYPE_PCV4DF_V4DI
:
10446 case V4SF_FTYPE_PCV4SF_V4SI
:
10447 case V2DF_FTYPE_PCV2DF_V2DI
:
10448 case V8SI_FTYPE_PCV8SI_V8SI
:
10449 case V4DI_FTYPE_PCV4DI_V4DI
:
10450 case V4SI_FTYPE_PCV4SI_V4SI
:
10451 case V2DI_FTYPE_PCV2DI_V2DI
:
10452 case VOID_FTYPE_INT_INT64
:
10457 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10458 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10459 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10460 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10461 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10462 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10463 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10464 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10465 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10466 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10467 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10468 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10469 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10470 case VOID_FTYPE_PV32HI_V32HI_USI
:
10471 case VOID_FTYPE_PV32QI_V32QI_USI
:
10472 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10473 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10474 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10477 /* These builtins and instructions require the memory
10478 to be properly aligned. */
10479 case CODE_FOR_avx512f_storev16sf_mask
:
10480 case CODE_FOR_avx512f_storev16si_mask
:
10481 case CODE_FOR_avx512f_storev8df_mask
:
10482 case CODE_FOR_avx512f_storev8di_mask
:
10483 case CODE_FOR_avx512vl_storev8sf_mask
:
10484 case CODE_FOR_avx512vl_storev8si_mask
:
10485 case CODE_FOR_avx512vl_storev4df_mask
:
10486 case CODE_FOR_avx512vl_storev4di_mask
:
10487 case CODE_FOR_avx512vl_storev4sf_mask
:
10488 case CODE_FOR_avx512vl_storev4si_mask
:
10489 case CODE_FOR_avx512vl_storev2df_mask
:
10490 case CODE_FOR_avx512vl_storev2di_mask
:
10491 aligned_mem
= true;
10497 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10498 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10499 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10500 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10501 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10502 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10503 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10504 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10505 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10506 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10507 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10508 case VOID_FTYPE_PV16QI_V8DI_UQI
:
10509 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10510 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10511 case VOID_FTYPE_PV4SI_V2DI_UQI
:
10512 case VOID_FTYPE_PV8HI_V4DI_UQI
:
10513 case VOID_FTYPE_PV8HI_V2DI_UQI
:
10514 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10515 case VOID_FTYPE_PV8HI_V4SI_UQI
:
10516 case VOID_FTYPE_PV16QI_V4DI_UQI
:
10517 case VOID_FTYPE_PV16QI_V2DI_UQI
:
10518 case VOID_FTYPE_PV16QI_V8SI_UQI
:
10519 case VOID_FTYPE_PV16QI_V4SI_UQI
:
10520 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10521 case VOID_FTYPE_PCHAR_V32QI_USI
:
10522 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10523 case VOID_FTYPE_PSHORT_V32HI_USI
:
10524 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10525 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10526 case VOID_FTYPE_PINT_V16SI_UHI
:
10527 case VOID_FTYPE_PINT_V8SI_UQI
:
10528 case VOID_FTYPE_PINT_V4SI_UQI
:
10529 case VOID_FTYPE_PINT64_V8DI_UQI
:
10530 case VOID_FTYPE_PINT64_V4DI_UQI
:
10531 case VOID_FTYPE_PINT64_V2DI_UQI
:
10532 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10533 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10534 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10535 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10536 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10537 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10538 case VOID_FTYPE_PV32QI_V32HI_USI
:
10539 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10540 case VOID_FTYPE_PV8QI_V8HI_UQI
:
10543 /* Reserve memory operand for target. */
10544 memory
= ARRAY_SIZE (args
);
10546 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10547 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10548 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10549 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10550 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10551 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10552 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10553 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10554 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10555 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10556 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10557 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10558 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10559 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10560 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10561 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10562 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10563 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10566 /* These builtins and instructions require the memory
10567 to be properly aligned. */
10568 case CODE_FOR_avx512f_loadv16sf_mask
:
10569 case CODE_FOR_avx512f_loadv16si_mask
:
10570 case CODE_FOR_avx512f_loadv8df_mask
:
10571 case CODE_FOR_avx512f_loadv8di_mask
:
10572 case CODE_FOR_avx512vl_loadv8sf_mask
:
10573 case CODE_FOR_avx512vl_loadv8si_mask
:
10574 case CODE_FOR_avx512vl_loadv4df_mask
:
10575 case CODE_FOR_avx512vl_loadv4di_mask
:
10576 case CODE_FOR_avx512vl_loadv4sf_mask
:
10577 case CODE_FOR_avx512vl_loadv4si_mask
:
10578 case CODE_FOR_avx512vl_loadv2df_mask
:
10579 case CODE_FOR_avx512vl_loadv2di_mask
:
10580 case CODE_FOR_avx512bw_loadv64qi_mask
:
10581 case CODE_FOR_avx512vl_loadv32qi_mask
:
10582 case CODE_FOR_avx512vl_loadv16qi_mask
:
10583 case CODE_FOR_avx512bw_loadv32hi_mask
:
10584 case CODE_FOR_avx512vl_loadv16hi_mask
:
10585 case CODE_FOR_avx512vl_loadv8hi_mask
:
10586 aligned_mem
= true;
10592 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10593 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10594 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10595 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10596 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10597 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10598 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10599 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10600 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10601 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10602 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10603 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10604 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10605 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10606 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10607 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10608 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10609 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10614 case VOID_FTYPE_UINT_UINT_UINT
:
10615 case VOID_FTYPE_UINT64_UINT_UINT
:
10616 case UCHAR_FTYPE_UINT_UINT_UINT
:
10617 case UCHAR_FTYPE_UINT64_UINT_UINT
:
10620 memory
= ARRAY_SIZE (args
);
10621 last_arg_constant
= true;
10624 gcc_unreachable ();
10627 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10629 if (klass
== store
)
10631 arg
= CALL_EXPR_ARG (exp
, 0);
10632 op
= expand_normal (arg
);
10633 gcc_assert (target
== 0);
10636 op
= ix86_zero_extend_to_Pmode (op
);
10637 target
= gen_rtx_MEM (tmode
, op
);
10638 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10639 on it. Try to improve it using get_pointer_alignment,
10640 and if the special builtin is one that requires strict
10641 mode alignment, also from it's GET_MODE_ALIGNMENT.
10642 Failure to do so could lead to ix86_legitimate_combined_insn
10643 rejecting all changes to such insns. */
10644 unsigned int align
= get_pointer_alignment (arg
);
10645 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10646 align
= GET_MODE_ALIGNMENT (tmode
);
10647 if (MEM_ALIGN (target
) < align
)
10648 set_mem_align (target
, align
);
10651 target
= force_reg (tmode
, op
);
10659 || !register_operand (target
, tmode
)
10660 || GET_MODE (target
) != tmode
)
10661 target
= gen_reg_rtx (tmode
);
10664 for (i
= 0; i
< nargs
; i
++)
10666 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10669 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10670 op
= expand_normal (arg
);
10671 match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10673 if (last_arg_constant
&& (i
+ 1) == nargs
)
10677 if (icode
== CODE_FOR_lwp_lwpvalsi3
10678 || icode
== CODE_FOR_lwp_lwpinssi3
10679 || icode
== CODE_FOR_lwp_lwpvaldi3
10680 || icode
== CODE_FOR_lwp_lwpinsdi3
)
10681 error ("the last argument must be a 32-bit immediate");
10683 error ("the last argument must be an 8-bit immediate");
10691 /* This must be the memory operand. */
10692 op
= ix86_zero_extend_to_Pmode (op
);
10693 op
= gen_rtx_MEM (mode
, op
);
10694 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10695 on it. Try to improve it using get_pointer_alignment,
10696 and if the special builtin is one that requires strict
10697 mode alignment, also from it's GET_MODE_ALIGNMENT.
10698 Failure to do so could lead to ix86_legitimate_combined_insn
10699 rejecting all changes to such insns. */
10700 unsigned int align
= get_pointer_alignment (arg
);
10701 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10702 align
= GET_MODE_ALIGNMENT (mode
);
10703 if (MEM_ALIGN (op
) < align
)
10704 set_mem_align (op
, align
);
10708 /* This must be register. */
10709 if (VECTOR_MODE_P (mode
))
10710 op
= safe_vector_operand (op
, mode
);
10712 op
= fixup_modeless_constant (op
, mode
);
10714 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10715 op
= copy_to_mode_reg (mode
, op
);
10718 op
= copy_to_reg (op
);
10719 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10725 args
[i
].mode
= mode
;
10731 pat
= GEN_FCN (icode
) (target
);
10734 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10737 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10740 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
10743 gcc_unreachable ();
10749 return klass
== store
? 0 : target
;
10752 /* Return the integer constant in ARG. Constrain it to be in the range
10753 of the subparts of VEC_TYPE; issue an error if not. */
10756 get_element_number (tree vec_type
, tree arg
)
10758 unsigned HOST_WIDE_INT elt
, max
= TYPE_VECTOR_SUBPARTS (vec_type
) - 1;
10760 if (!tree_fits_uhwi_p (arg
)
10761 || (elt
= tree_to_uhwi (arg
), elt
> max
))
10763 error ("selector must be an integer constant in the range "
10771 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10772 ix86_expand_vector_init. We DO have language-level syntax for this, in
10773 the form of (type){ init-list }. Except that since we can't place emms
10774 instructions from inside the compiler, we can't allow the use of MMX
10775 registers unless the user explicitly asks for it. So we do *not* define
10776 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10777 we have builtins invoked by mmintrin.h that gives us license to emit
10778 these sorts of instructions. */
10781 ix86_expand_vec_init_builtin (tree type
, tree exp
, rtx target
)
10783 machine_mode tmode
= TYPE_MODE (type
);
10784 machine_mode inner_mode
= GET_MODE_INNER (tmode
);
10785 int i
, n_elt
= GET_MODE_NUNITS (tmode
);
10786 rtvec v
= rtvec_alloc (n_elt
);
10788 gcc_assert (VECTOR_MODE_P (tmode
));
10789 gcc_assert (call_expr_nargs (exp
) == n_elt
);
10791 for (i
= 0; i
< n_elt
; ++i
)
10793 rtx x
= expand_normal (CALL_EXPR_ARG (exp
, i
));
10794 RTVEC_ELT (v
, i
) = gen_lowpart (inner_mode
, x
);
10797 if (!target
|| !register_operand (target
, tmode
))
10798 target
= gen_reg_rtx (tmode
);
10800 ix86_expand_vector_init (true, target
, gen_rtx_PARALLEL (tmode
, v
));
10804 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10805 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10806 had a language-level syntax for referencing vector elements. */
10809 ix86_expand_vec_ext_builtin (tree exp
, rtx target
)
10811 machine_mode tmode
, mode0
;
10816 arg0
= CALL_EXPR_ARG (exp
, 0);
10817 arg1
= CALL_EXPR_ARG (exp
, 1);
10819 op0
= expand_normal (arg0
);
10820 elt
= get_element_number (TREE_TYPE (arg0
), arg1
);
10822 tmode
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10823 mode0
= TYPE_MODE (TREE_TYPE (arg0
));
10824 gcc_assert (VECTOR_MODE_P (mode0
));
10826 op0
= force_reg (mode0
, op0
);
10828 if (optimize
|| !target
|| !register_operand (target
, tmode
))
10829 target
= gen_reg_rtx (tmode
);
10831 ix86_expand_vector_extract (true, target
, op0
, elt
);
10836 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10837 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10838 a language-level syntax for referencing vector elements. */
10841 ix86_expand_vec_set_builtin (tree exp
)
10843 machine_mode tmode
, mode1
;
10844 tree arg0
, arg1
, arg2
;
10846 rtx op0
, op1
, target
;
10848 arg0
= CALL_EXPR_ARG (exp
, 0);
10849 arg1
= CALL_EXPR_ARG (exp
, 1);
10850 arg2
= CALL_EXPR_ARG (exp
, 2);
10852 tmode
= TYPE_MODE (TREE_TYPE (arg0
));
10853 mode1
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10854 gcc_assert (VECTOR_MODE_P (tmode
));
10856 op0
= expand_expr (arg0
, NULL_RTX
, tmode
, EXPAND_NORMAL
);
10857 op1
= expand_expr (arg1
, NULL_RTX
, mode1
, EXPAND_NORMAL
);
10858 elt
= get_element_number (TREE_TYPE (arg0
), arg2
);
10860 if (GET_MODE (op1
) != mode1
&& GET_MODE (op1
) != VOIDmode
)
10861 op1
= convert_modes (mode1
, GET_MODE (op1
), op1
, true);
10863 op0
= force_reg (tmode
, op0
);
10864 op1
= force_reg (mode1
, op1
);
10866 /* OP0 is the source of these builtin functions and shouldn't be
10867 modified. Create a copy, use it and return it as target. */
10868 target
= gen_reg_rtx (tmode
);
10869 emit_move_insn (target
, op0
);
10870 ix86_expand_vector_set (true, target
, op1
, elt
);
10875 /* Expand an expression EXP that calls a built-in function,
10876 with result going to TARGET if that's convenient
10877 (and in mode MODE if that's convenient).
10878 SUBTARGET may be used as the target for computing one of EXP's operands.
10879 IGNORE is nonzero if the value is to be ignored. */
10882 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
10883 machine_mode mode
, int ignore
)
10886 enum insn_code icode
, icode2
;
10887 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
10888 tree arg0
, arg1
, arg2
, arg3
, arg4
;
10889 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
10890 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
10891 unsigned int fcode
= DECL_FUNCTION_CODE (fndecl
);
10893 /* For CPU builtins that can be folded, fold first and expand the fold. */
10896 case IX86_BUILTIN_CPU_INIT
:
10898 /* Make it call __cpu_indicator_init in libgcc. */
10899 tree call_expr
, fndecl
, type
;
10900 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
10901 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
10902 call_expr
= build_call_expr (fndecl
, 0);
10903 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
10905 case IX86_BUILTIN_CPU_IS
:
10906 case IX86_BUILTIN_CPU_SUPPORTS
:
10908 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10909 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
10910 gcc_assert (fold_expr
!= NULL_TREE
);
10911 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
10915 HOST_WIDE_INT isa
= ix86_isa_flags
;
10916 HOST_WIDE_INT isa2
= ix86_isa_flags2
;
10917 HOST_WIDE_INT bisa
= ix86_builtins_isa
[fcode
].isa
;
10918 HOST_WIDE_INT bisa2
= ix86_builtins_isa
[fcode
].isa2
;
10919 /* The general case is we require all the ISAs specified in bisa{,2}
10921 The exceptions are:
10922 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10923 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10924 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10925 where for each this pair it is sufficient if either of the ISAs is
10926 enabled, plus if it is ored with other options also those others. */
10927 if (((bisa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
10928 == (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
10929 && (isa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
)) != 0)
10930 isa
|= (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
);
10931 if (((bisa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
10932 == (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
10933 && (isa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
)) != 0)
10934 isa
|= (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
);
10935 if (((bisa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
10936 == (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
10937 && (isa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
)) != 0)
10938 isa
|= (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
);
10939 /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
10940 MMX is disabled. NB: Since MMX intrinsics are marked with
10941 SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
10943 if (TARGET_MMX
|| TARGET_MMX_WITH_SSE
)
10945 if (((bisa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_MMX
))
10946 == (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_MMX
))
10947 && (isa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_MMX
)) != 0)
10948 isa
|= (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_MMX
);
10949 if (((bisa
& (OPTION_MASK_ISA_SSE2
| OPTION_MASK_ISA_MMX
))
10950 == (OPTION_MASK_ISA_SSE2
| OPTION_MASK_ISA_MMX
))
10951 && (isa
& (OPTION_MASK_ISA_SSE2
| OPTION_MASK_ISA_MMX
)) != 0)
10952 isa
|= (OPTION_MASK_ISA_SSE2
| OPTION_MASK_ISA_MMX
);
10953 if (((bisa
& (OPTION_MASK_ISA_SSSE3
| OPTION_MASK_ISA_MMX
))
10954 == (OPTION_MASK_ISA_SSSE3
| OPTION_MASK_ISA_MMX
))
10955 && (isa
& (OPTION_MASK_ISA_SSSE3
| OPTION_MASK_ISA_MMX
)) != 0)
10956 isa
|= (OPTION_MASK_ISA_SSSE3
| OPTION_MASK_ISA_MMX
);
10958 if ((bisa
& isa
) != bisa
|| (bisa2
& isa2
) != bisa2
)
10960 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
10961 if (TARGET_ABI_X32
)
10962 bisa
|= OPTION_MASK_ABI_X32
;
10964 bisa
|= OPTION_MASK_ABI_64
;
10965 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
10966 (enum fpmath_unit
) 0, false, add_abi_p
);
10968 error ("%qE needs unknown isa option", fndecl
);
10971 gcc_assert (opts
!= NULL
);
10972 error ("%qE needs isa option %s", fndecl
, opts
);
10975 return expand_call (exp
, target
, ignore
);
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (! pat)
	return 0;
      emit_insn (pat);
      return 0;
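      /* Illustrative note (added, hedged): the user-level intrinsic passes
	 (data, mask, address) while the maskmov insn pattern wants
	 (mem, data, mask), which is why arg 2 is expanded into op0 above:

	     __m128i data, mask;
	     char buf[16];
	     _mm_maskmoveu_si128 (data, mask, buf);
	     // stores the bytes of DATA whose MASK byte has its top bit set

	 The intrinsic shown is the usual SSE2 entry point; the MMX variant
	 behaves analogously.  */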
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);

      insn = (TARGET_64BIT
	      ? gen_umonitor_di (op0)
	      : gen_umonitor_si (op0));
      emit_insn (insn);
      return 0;
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
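      /* Note (added): the builtin's QImode result is the instruction's
	 carry-flag outcome; the EQ-on-CCCmode rtx above is how CF is read
	 back out of FLAGS_REG without a branch.  */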
    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;

    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:
      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
	{
	  emit_insn (gen_movdir64b (Pmode, op0, op1));
	  return 0;
	}
      else
	{
	  target = gen_reg_rtx (SImode);
	  emit_move_insn (target, const0_rtx);
	  target = gen_rtx_SUBREG (QImode, target, 0);

	  if (fcode == IX86_BUILTIN_ENQCMD)
	    pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
	  else
	    pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);

	  emit_insn (pat);

	  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
				  gen_rtx_fmt_ee (EQ, QImode,
						  SET_DEST (pat),
						  const0_rtx)));

	  return SUBREG_REG (target);
	}
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
	{
	case IX86_BUILTIN_FXSAVE:
	  icode = CODE_FOR_fxsave;
	  break;
	case IX86_BUILTIN_FXRSTOR:
	  icode = CODE_FOR_fxrstor;
	  break;
	case IX86_BUILTIN_FXSAVE64:
	  icode = CODE_FOR_fxsave64;
	  break;
	case IX86_BUILTIN_FXRSTOR64:
	  icode = CODE_FOR_fxrstor64;
	  break;
	case IX86_BUILTIN_FNSTENV:
	  icode = CODE_FOR_fnstenv;
	  break;
	case IX86_BUILTIN_FLDENV:
	  icode = CODE_FOR_fldenv;
	  break;
	default:
	  gcc_unreachable ();
	}

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);

	  icode = CODE_FOR_xsetbv_rex64;

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  icode = CODE_FOR_xsetbv;

	  pat = GEN_FCN (icode) (op0, op1);
	}
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave_rex64;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor_rex64;
	      break;
	    case IX86_BUILTIN_XSAVE64:
	      icode = CODE_FOR_xsave64;
	      break;
	    case IX86_BUILTIN_XRSTOR64:
	      icode = CODE_FOR_xrstor64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT64:
	      icode = CODE_FOR_xsaveopt64;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves_rex64;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors_rex64;
	      break;
	    case IX86_BUILTIN_XSAVES64:
	      icode = CODE_FOR_xsaves64;
	      break;
	    case IX86_BUILTIN_XRSTORS64:
	      icode = CODE_FOR_xrstors64;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEC64:
	      icode = CODE_FOR_xsavec64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;
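      /* Note (added): the DImode argument is the xsave feature mask, which
	 the xsave-family instructions consume in EDX:EAX; on 64-bit targets
	 the LSHIFTRT-by-32 plus the two gen_lowpart calls above split the
	 mask into exactly those two SImode halves.  */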
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_lwp_llwpcb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      icode = CODE_FOR_lwp_slwpcb;
      if (!target
	  || !insn_data[icode].operand[0].predicate (target, Pmode))
	target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (target));
      return target;
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      icode = (fcode == IX86_BUILTIN_BEXTRI32
	       ? CODE_FOR_tbm_bextri_si
	       : CODE_FOR_tbm_bextri_di);
      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
	  unsigned char lsb_index = INTVAL (op1) & 0xFF;
	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  pat = GEN_FCN (icode) (target, op0, op1, op2);
	  if (pat)
	    emit_insn (pat);
	  return target;
	}
    case IX86_BUILTIN_RDRAND16_STEP:
      icode = CODE_FOR_rdrandhi_1;
      mode0 = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      icode = CODE_FOR_rdrandsi_1;
      mode0 = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      icode = CODE_FOR_rdranddi_1;
      mode0 = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode0);
      emit_insn (GEN_FCN (icode) (op0));

      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);

      op1 = gen_reg_rtx (SImode);
      emit_move_insn (op1, CONST1_RTX (SImode));

      /* Emit SImode conditional move.  */
      if (mode0 == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode0 == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
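      /* Hedged usage sketch (added, not from this file): the *_STEP
	 builtins follow the rdrand intrinsic protocol, where the random
	 value goes through the pointer and the return value mirrors CF:

	     unsigned short v;
	     while (!__builtin_ia32_rdrand16_step (&v))
	       ;  // CF == 0 means no entropy was ready; retry

	 The IF_THEN_ELSE emitted above produces that 0/1 result from CF
	 without a branch.  */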
    case IX86_BUILTIN_RDSEED16_STEP:
      icode = CODE_FOR_rdseedhi_1;
      mode0 = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      icode = CODE_FOR_rdseedsi_1;
      mode0 = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      icode = CODE_FOR_rdseeddi_1;
      mode0 = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode0);
      emit_insn (GEN_FCN (icode) (op0));

      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
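      /* Hedged usage sketch (added, not from this file): these builtins
	 back the ADX-style intrinsics, which chain through the carry flag,
	 exactly what the LTU-on-flags rtxes above model:

	     unsigned int lo, hi;
	     unsigned char c;
	     c = _addcarryx_u32 (0, a0, b0, &lo);  // sets CF
	     c = _addcarryx_u32 (c, a1, b1, &hi);  // consumes, regenerates CF

	 With a literal 0 carry-in the cheaper plain add/sub form (icode2)
	 is emitted instead of the flag-consuming one.  */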
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
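      /* Note (added): the C and Z variants of each width share one icode,
	 so the same ktest/kortest insn is emitted for both; they differ
	 only in mode3, i.e. in whether the setcc above reads CF (CCCmode)
	 or ZF (CCZmode) out of FLAGS_REG.  */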
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
    gather_gen:
      rtx half;
      rtx (*gen) (rtx, rtx);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      if (target == NULL_RTX
	  || GET_MODE (target) != insn_data[icode].operand[0].mode
	  || !insn_data[icode].operand[0].predicate (target,
						     GET_MODE (target)))
	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
	subtarget = target;

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3ALTSIV8DF:
	case IX86_BUILTIN_GATHER3ALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTSIV4DF:
	case IX86_BUILTIN_GATHER3ALTSIV4DI:
	case IX86_BUILTIN_GATHERALTSIV4DF:
	case IX86_BUILTIN_GATHERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV16SF:
	case IX86_BUILTIN_GATHER3ALTDIV16SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  op3 = lowpart_subreg (QImode, op3, HImode);
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV8SF:
	case IX86_BUILTIN_GATHER3ALTDIV8SI:
	case IX86_BUILTIN_GATHERALTDIV8SF:
	case IX86_BUILTIN_GATHERALTDIV8SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  if (VECTOR_MODE_P (GET_MODE (op3)))
	    {
	      half = gen_reg_rtx (mode0);
	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	      emit_insn (gen (half, op3));
	      op3 = half;
	    }
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
	op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
	{
	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
	    op3 = copy_to_mode_reg (mode3, op3);
	}
      else
	{
	  op3 = copy_to_reg (op3);
	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
	}
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
      if (optimize)
	{
	  if (TREE_CODE (arg3) == INTEGER_CST)
	    {
	      if (integer_all_onesp (arg3))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == VECTOR_CST)
	    {
	      unsigned int negative = 0;
	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
		{
		  tree cst = VECTOR_CST_ELT (arg3, i);
		  if (TREE_CODE (cst) == INTEGER_CST
		      && tree_int_cst_sign_bit (cst))
		    negative++;
		  else if (TREE_CODE (cst) == REAL_CST
			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
		    negative++;
		}
	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == SSA_NAME
		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
	    {
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
	      if (is_gimple_call (def_stmt))
		{
		  tree fndecl = gimple_call_fndecl (def_stmt);
		  if (fndecl
		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
		      {
		      case IX86_BUILTIN_CMPPD:
		      case IX86_BUILTIN_CMPPS:
		      case IX86_BUILTIN_CMPPD256:
		      case IX86_BUILTIN_CMPPS256:
			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
			  break;
			/* FALLTHRU */
		      case IX86_BUILTIN_CMPEQPD:
		      case IX86_BUILTIN_CMPEQPS:
			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
			    && initializer_zerop (gimple_call_arg (def_stmt,
								   1)))
			  op0 = pc_rtx;
			break;
		      default:
			break;
		      }
		}
	    }
	}

      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;
      emit_insn (pat);

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3DIV16SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SFmode);
	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV16SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SImode);
	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SF:
	case IX86_BUILTIN_GATHERDIV8SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SFmode);
	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SI:
	case IX86_BUILTIN_GATHERDIV8SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
	  break;
	default:
	  target = subtarget;
	  break;
	}
      return target;
    scatter_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
      switch (fcode)
	{
	case IX86_BUILTIN_SCATTERALTSIV8DF:
	case IX86_BUILTIN_SCATTERALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV16SF:
	case IX86_BUILTIN_SCATTERALTDIV16SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV4DF:
	case IX86_BUILTIN_SCATTERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV8SF:
	case IX86_BUILTIN_SCATTERALTDIV8SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV2DF:
	case IX86_BUILTIN_SCATTERALTSIV2DI:
	  if (!nonimmediate_operand (op2, V4SImode))
	    op2 = copy_to_mode_reg (V4SImode, op2);
	  break;
	case IX86_BUILTIN_SCATTERALTDIV4SF:
	case IX86_BUILTIN_SCATTERALTDIV4SI:
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
	{
	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
	    op1 = copy_to_mode_reg (mode1, op1);
	}
      else
	{
	  op1 = copy_to_reg (op1);
	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
	}

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;

    vec_prefetch_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      op0 = fixup_modeless_constant (op0, mode0);

      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
	{
	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
	    op0 = copy_to_mode_reg (mode0, op0);
	}
      else
	{
	  op0 = copy_to_reg (op0);
	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
	}

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));

      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
	op2 = copy_to_mode_reg (Pmode, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	{
	  error ("the fourth argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("incorrect hint operand");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;

    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);
      if (!address_operand (op0, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op1);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
      return 0;
    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);
      switch (fcode)
	{
	case IX86_BUILTIN_WRSSD:
	  icode = CODE_FOR_wrsssi;
	  mode = SImode;
	  break;
	case IX86_BUILTIN_WRSSQ:
	  icode = CODE_FOR_wrssdi;
	  mode = DImode;
	  break;
	case IX86_BUILTIN_WRUSSD:
	  icode = CODE_FOR_wrusssi;
	  mode = SImode;
	  break;
	case IX86_BUILTIN_WRUSSQ:
	  icode = CODE_FOR_wrussdi;
	  mode = DImode;
	  break;
	}
      op0 = force_reg (mode, op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op2 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op2);
	}
      emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
      return 0;

    default:
      break;
    }
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }
  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      int masked = 1;
      machine_mode mode, wide_mode, nar_mode;

      nar_mode = V4SFmode;
      mode = V16SFmode;
      wide_mode = V64SFmode;
      fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;

      switch (fcode)
	{
	case IX86_BUILTIN_4FMAPS:
	  fcn = gen_avx5124fmaddps_4fmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssd;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssds;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS:
	  fcn = gen_avx5124fmaddps_4fnmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS_MASK:
	  fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD_MASK:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS_MASK:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FMAPS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	  v4fma_expand:
	    wide_reg = gen_reg_rtx (wide_mode);
	    for (i = 0; i < 4; i++)
	      {
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
				ops[i]);
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (mode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (nar_mode, addr);

	    target = gen_reg_rtx (mode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, HImode);

		mask = force_reg (HImode, mask);

		if (GET_MODE (mask) != HImode)
		  mask = gen_rtx_SUBREG (HImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit merge-masked variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }

	case IX86_BUILTIN_4FNMASS:
	  fcn = gen_avx5124fmaddps_4fnmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS:
	  fcn = gen_avx5124fmaddps_4fmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FNMASS_MASK:
	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;

	  s4fma_expand:
	    mode = V4SFmode;
	    wide_reg = gen_reg_rtx (V64SFmode);
	    for (i = 0; i < 4; i++)
	      {
		rtx tmp;
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		tmp = gen_reg_rtx (SFmode);
		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));

		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
				gen_rtx_SUBREG (V16SFmode, tmp, 0));
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (V4SFmode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (V4SFmode, addr);

	    target = gen_reg_rtx (V4SFmode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, QImode);

		mask = force_reg (QImode, mask);

		if (GET_MODE (mask) != QImode)
		  mask = gen_rtx_SUBREG (QImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit merge-masked
		   variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask
		   w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }

	case IX86_BUILTIN_RDPID:
	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
						   target);

	case IX86_BUILTIN_FABSQ:
	case IX86_BUILTIN_COPYSIGNQ:
	  if (!TARGET_SSE)
	    /* Emit a normal call if SSE isn't available.  */
	    return expand_call (exp, target, ignore);
	  /* FALLTHRU */
	default:
	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
	}
    }
  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
					    (enum ix86_builtin_func_type)
					    d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
					       target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
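      /* Note (added): the retry patches the already-emitted SET in place,
	 trying vec_duplicate first on VAL unchanged (which may be a
	 constant or a memory reference) and paying for a register copy
	 only when no insn pattern recognizes the cheap form.  */
      gcc_assert (ok);
    }
  return true;
}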
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}

static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
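/* Example (added, illustrative): get_mode_wider_vector (V16QImode) yields
   V8HImode: the same 16-byte vector with half as many elements, each twice
   as wide.  The two asserts above check exactly those invariants.  */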
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V8QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  /* Extend to SImode using a paradoxical SUBREG.  */
	  tmp1 = gen_reg_rtx (SImode);
	  emit_move_insn (tmp1, gen_lowpart (SImode, val));

	  /* Insert the SImode value as low element of a V4SImode vector.  */
	  tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));

	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);
	x = expand_simple_binop (wsmode, ASHIFT, val,
				 GEN_INT (GET_MODE_BITSIZE (smode)),
				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V64QImode:
    case E_V32HImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
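/* Worked example of the "widen" strategy above (added, hedged):
   broadcasting the QImode value 0xAB into V8QImode on a target with no
   byte-broadcast insn first builds the HImode value 0xABAB:

       val = 0x00AB;        // zero-extended to the wider scalar mode
       x   = val << 8;      // 0xAB00
       val = val | x;       // 0xABAB

   and then recurses to broadcast 0xABAB into V4HImode, which covers the
   same bytes of the destination vector.  */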
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V32QImode:
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;

    case E_V8QImode:
      wmode = V4HImode;
      goto widen;

    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}

      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
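/* A model of the QImode combining step above (illustrative only): the
   variable byte and its neighbouring constant byte are fused into one
   HImode value, e.g. for an odd ONE_VAR

     hi = ((uint16_t) var << 8) | (adjacent_const & 0xff);

   so a single HImode vector set stands in for the unsupported QImode
   one.  ADJACENT_CONST is just a name for the constant lane here, not a
   variable in the code above.  */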
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
  rtx first[16], second[8], third[4];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* CMODE is the mode with half as many elements as MODE.  */
      switch (mode)
	{
	case E_V16SImode: cmode = V8SImode; break;
	case E_V16SFmode: cmode = V8SFmode; break;
	case E_V8DImode: cmode = V4DImode; break;
	case E_V8DFmode: cmode = V4DFmode; break;
	case E_V8SImode: cmode = V4SImode; break;
	case E_V8SFmode: cmode = V4SFmode; break;
	case E_V4DImode: cmode = V2DImode; break;
	case E_V4DFmode: cmode = V2DFmode; break;
	case E_V4SImode: cmode = V2SImode; break;
	case E_V4SFmode: cmode = V2SFmode; break;
	case E_V2DImode: cmode = DImode; break;
	case E_V2SImode: cmode = SImode; break;
	case E_V2DFmode: cmode = DFmode; break;
	case E_V2SFmode: cmode = SFmode; break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], cmode))
	ops[1] = force_reg (cmode, ops[1]);
      if (!register_operand (ops[0], cmode))
	ops[0] = force_reg (cmode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode: cmode = V2DImode; break;
	case E_V4DFmode: cmode = V2DFmode; break;
	case E_V4SImode: cmode = V2SImode; break;
	case E_V4SFmode: cmode = V2SFmode; break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode: cmode = V2DImode; hmode = V4DImode; break;
	case E_V8DFmode: cmode = V2DFmode; hmode = V4DFmode; break;
	case E_V8SImode: cmode = V2SImode; hmode = V4SImode; break;
	case E_V8SFmode: cmode = V2SFmode; hmode = V4SFmode; break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  cmode = V2SImode;
	  hmode = V4SImode;
	  gmode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmode = V2SFmode;
	  hmode = V4SFmode;
	  gmode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      j = (n >> 1) - 1;
      for (; i > 0; i -= 2, j--)
	{
	  first[j] = gen_reg_rtx (cmode);
	  v = gen_rtvec (2, ops[i - 1], ops[i]);
	  ix86_expand_vector_init (false, first[j],
				   gen_rtx_PARALLEL (cmode, v));
	}

      n >>= 1;
      if (n > 4)
	{
	  gcc_assert (hmode != VOIDmode);
	  gcc_assert (gmode != VOIDmode);
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      second[j] = gen_reg_rtx (hmode);
	      ix86_expand_vector_init_concat (hmode, second[j],
					      &first[i], 2);
	    }

	  n >>= 1;
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      third[j] = gen_reg_rtx (gmode);
	      ix86_expand_vector_init_concat (gmode, third[j],
					      &second[i], 2);
	    }

	  n >>= 1;
	  ix86_expand_vector_init_concat (mode, target, third, n);
	}
      else if (n > 2)
	{
	  gcc_assert (hmode != VOIDmode);
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      second[j] = gen_reg_rtx (hmode);
	      ix86_expand_vector_init_concat (hmode, second[j],
					      &first[i], 2);
	    }

	  n >>= 1;
	  ix86_expand_vector_init_concat (mode, target, second, n);
	}
      else
	ix86_expand_vector_init_concat (mode, target, first, n);
      break;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;

    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = 0; i < n; i++)
    {
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
      op0 = gen_reg_rtx (SImode);
      emit_move_insn (op0, gen_lowpart (SImode, ops[i + i]));

      /* Insert the SImode value as low element of V4SImode vector.  */
      op1 = gen_reg_rtx (V4SImode);
      op0 = gen_rtx_VEC_MERGE (V4SImode,
			       gen_rtx_VEC_DUPLICATE (V4SImode,
						      op0),
			       CONST0_RTX (V4SImode),
			       const1_rtx);
      emit_insn (gen_rtx_SET (op1, op0));

      /* Cast the V4SImode vector back to a vector in original mode.  */
      op0 = gen_reg_rtx (mode);
      emit_move_insn (op0, gen_lowpart (mode, op1));

      /* Load even elements into the second position.  */
      emit_insn (gen_load_even (op0,
				force_reg (inner_mode,
					   ops[i + i + 1]),
				const1_rtx));

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target,
			      gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
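/* The routine above builds the vector in logarithmically many rounds:
   scalars enter pairwise (the odd element in the low lane, the even one
   via GEN_LOAD_EVEN into the second lane), then each interleave-low pass
   doubles the unit width (QI to HI to SI to DI) until one register holds
   every element.  */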
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      break;

    case E_V2TImode:
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      break;

    case E_V4TImode:
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    half:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops[n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      break;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    quarter:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops[n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops[n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops[(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      break;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:
      break;

    default:
      gcc_unreachable ();
    }

  {
    int i, j, n_elts, n_words, n_elt_per_word;
    machine_mode inner_mode;
    rtx words[4], shift;

    inner_mode = GET_MODE_INNER (mode);
    n_elts = GET_MODE_NUNITS (mode);
    n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
    n_elt_per_word = n_elts / n_words;
    shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

    for (i = 0; i < n_words; ++i)
      {
	rtx word = NULL_RTX;

	for (j = 0; j < n_elt_per_word; ++j)
	  {
	    rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	    elt = convert_modes (word_mode, inner_mode, elt, true);

	    if (j == 0)
	      word = elt;
	    else
	      {
		word = expand_simple_binop (word_mode, ASHIFT, word, shift,
					    word, 1, OPTAB_LIB_WIDEN);
		word = expand_simple_binop (word_mode, IOR, word, elt,
					    word, 1, OPTAB_LIB_WIDEN);
	      }
	  }

	words[i] = word;
      }

    if (n_words == 1)
      emit_move_insn (target, gen_lowpart (mode, words[0]));
    else if (n_words == 2)
      {
	rtx tmp = gen_reg_rtx (mode);
	emit_clobber (tmp);
	emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
	emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
	emit_move_insn (target, tmp);
      }
    else if (n_words == 4)
      {
	rtx tmp = gen_reg_rtx (V4SImode);
	gcc_assert (word_mode == SImode);
	vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else
      gcc_unreachable ();
  }
}
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode || inner_mode == HImode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      mode = mode_for_vector (SImode, n_bits / 4).require ();
	      inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target),
						 subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals,
					   one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  rtx tmp;
  static rtx (*gen_extract[6][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
      };
  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
      };
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
  int i, j, n;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (mmx_ok)
	{
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */
	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
      use_vec_merge = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

    quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      emit_insn (gen_blendm (target, target, tmp,
			     force_reg (mmode,
					gen_int_mode (HOST_WIDE_INT_1U << elt,
						      mmode))));
    }
  else if (use_vec_merge)
    {
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode,
			    elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
      use_vec_extr = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V8QImode:
      /* ??? Could extract the appropriate HImode element and shift.  */
    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  target = gen_lowpart (SImode, target);
	}

      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;

  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;

    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;

    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;

    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;

    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;

    case E_V64QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (i == 128 ? 0x3 : 0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (i == 128 ? 0x7 : 0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (i == 128 ? 0xB : 0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (i == 128 ? 0xF : 0xF));
      break;

    default:
      gcc_unreachable ();
    }

  emit_insn (tem);
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
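/* Note that for the 128-bit integer vector modes the fold above is a
   single whole-register logical shift right by I/2 bits done in V1TImode
   (a psrldq); the wider modes first bring the upper lane(s) down with a
   lane permute before the recursion reaches that point.  */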
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
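/* Illustrative scalar model of the reduction loop above (not part of
   the compiler), here for a 4-lane maximum:

     int reduce_max4 (const int v[4])
     {
       int a = v[0] > v[2] ? v[0] : v[2];    fold high half onto low half
       int b = v[1] > v[3] ? v[1] : v[3];
       return a > b ? a : b;                 and fold once more
     }

   Each emit_reduc_half halves the number of live lanes, so an N-lane
   reduction applies FN only log2(N) times.  */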
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
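/* The expm1-based formulation above follows from e1 = e^|x| - 1:

     e1 / (e1 + 1) + e1 = (e^a - 1) / e^a + (e^a - 1)
			= e^a - e^-a	    with a = |x|

   so 0.5 * e2 is sinh (|x|), and the fxam/negate step restores the sign.
   Going through expm1 keeps the result accurate for small |x|, where
   computing e^x - 1 directly would cancel catastrophically.  */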
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
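/* This is the plain identity cosh (x) = 0.5 * (e^x + 1/e^x); no signbit
   handling is needed because cosh is an even function.  */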
/* Output code to perform a tanh XFmode calculation.  */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an asinh XFmode calculation.  */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
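/* This expands acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)), valid
   for x >= 1; writing sqrt (x^2 - 1) as a product of two square roots
   avoids forming x * x, which could overflow for large x.  */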
/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
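/* For a = |x| the code computes e1 = -2a / (a + 1), and

     log1p (e1) = log ((1 - a) / (1 + a)) = -2 * atanh (a)

   so after the conditional negation op0 = 0.5 * e2 equals atanh (x).  */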
/* Output code to perform a log1p XFmode calculation.  */

void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
     XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
/* Emit code for round calculation.  */
void ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  if (outmode == SFmode || outmode == DFmode)
    {
      tmp = gen_reg_rtx (XFmode);

      emit_insn (floor_insn (tmp, e2));
      emit_insn (gen_rtx_SET (res,
			      gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
					      UNSPEC_TRUNC_NOOP)));
    }
  else
    emit_insn (floor_insn (res, e2));

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
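/* A scalar C model of the identity the expander implements (illustrative
   only; assumes a C99 libm):

     double round_model (double a)
     {
       double r = floor (fabs (a) + 0.5);
       return signbit (a) ? -r : r;
     }

   Unlike rint, halfway cases round away from zero regardless of the
   current rounding mode.  */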
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
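/* The above is one Newton-Raphson step for the reciprocal: from an
   estimate x0 ~= 1/b, the refinement

     x1 = x0 * (2 - b * x0) = 2 * x0 - b * x0 * x0

   roughly doubles the number of correct bits (the hardware rcp estimate
   supplies about 12, so one step yields nearly single precision), after
   which a / b ~= a * x1.  */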
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode,
						       gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
  /* e1 = e0 * x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

  /* e2 = e1 - 3. */
  mthree = force_reg (mode, mthree);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));

  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
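/* Likewise one Newton-Raphson step, here for the reciprocal square root:
   from x0 ~= a^(-1/2),

     x1 = -0.5 * x0 * (a * x0 * x0 - 3) = 0.5 * x0 * (3 - a * x0 * x0)

   and for the forward square root the extra factor of a comes from
   sqrt (a) = a * a^(-1/2), which is why E3 is built from e0 = x0 * a
   rather than from x0 in that case.  */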
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign of SIGN.  */
static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf,
		   &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
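/* Using nextafter (0.5, 0.0), i.e. 0.5 - 2^(-p-1) for a p-bit mantissa,
   rather than plain 0.5 keeps values just below one half (such as the
   largest representable value smaller than 0.5) from being nudged past
   the halfway point by the addition before the truncating conversion.  */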
/* Expand SSE2 sequence for computing lround from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
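/* 2^52 (resp. 2^23) is the first magnitude at which the DFmode (SFmode)
   mantissa has no fractional bits, so computing xa + TWO52 - TWO52 rounds
   xa to an integer in the current rounding mode.  The expanders below
   build rint, floor, ceil and round out of this primitive.  */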
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);
	Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
	Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
	Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1),
				   mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  */
  tmp = gen_reg_rtx (mode);
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
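
/* An illustrative scalar model of the rounding correction above (a
   sketch, assuming IEEE double; not emitted by the compiler): dxa
   measures how far the TWO52 trick moved the value, and
   round-half-away-from-zero must undo moves larger than half an ulp:

	dxa = xa2 - xa;		// in [-0.5, 0.5] after round-to-nearest
	if (dxa > 0.5)		// rounded too far up
	  xa2 -= 1.0;
	else if (dxa <= -0.5)	// rounded too far down (ties round up)
	  xa2 += 1.0;
*/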
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
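
/* Why nextafter (0.5, 0.0) instead of plain 0.5 (an illustrative sketch,
   assuming IEEE double with p == 53 significand bits): pred_half is
   0.5 - 2**(-p-1), the largest double below 0.5.  With plain 0.5, the
   largest double below 0.5 would sum to exactly halfway between
   1.0 - 2**-53 and 1.0, round up to 1.0 under ties-to-even, and then
   truncate to 1 instead of the correct round (x) == 0:

	double pred_half = 0.5 - 0x1p-54;	// nextafter (0.5, 0.0)
	double r = (double) (long) (fabs (x) + pred_half);
*/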
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
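
/* The identity used above, in scalar form (illustrative only):
   round (a) == trunc (a + copysign (nextafter (0.5, 0.0), a)).  The
   SSE4.1 roundss/roundsd insn with the ROUND_TRUNC immediate performs
   the truncation directly, avoiding the float->int->float round trip
   of the non-SSE4.1 sequences.  */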
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
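
/* Worked example (illustrative): for V4SFmode, the two-operand
   permutation { 4, 1, 2, 3 } takes element 0 from op1 and elements
   1..3 from op0, which is exactly what movss does; it is emitted as
   (vec_merge op1 op0 (const_int 1)).  */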
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, mask, nelt = d->nelt;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
	mask |= (d->perm[i] >= nelt) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V4SImode:
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= (8 + 4);
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}
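
/* Worked example (illustrative): the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, ... } moves bytes in aligned pairs
   (chunk == 2: every d->perm[i] at an even index is even and followed
   by its successor), so the same shuffle is valid as the V8HImode
   permutation { 1, 0, 3, 2, ... } and may match a cheaper word
   shuffle.  */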
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode = V16QImode;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    {
      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
	{
	  if (TARGET_AVX2
	      && valid_perm_using_mode_p (V2TImode, d))
	    {
	      if (d->testing_p)
		return true;

	      /* Use vperm2i128 insn.  The pattern uses
		 V4DImode instead of V2TImode.  */
	      target = d->target;
	      if (d->vmode != V4DImode)
		target = gen_reg_rtx (V4DImode);
	      op0 = gen_lowpart (V4DImode, d->op0);
	      op1 = gen_lowpart (V4DImode, d->op1);
	      rperm[0]
		= GEN_INT ((d->perm[0] / (nelt / 2))
			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	      if (target != d->target)
		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	      return true;
	    }
	  return false;
	}
    }
  else
    {
      if (GET_MODE_SIZE (d->vmode) == 16)
	{
	  if (!TARGET_SSSE3)
	    return false;
	}
      else if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  if (!TARGET_AVX2)
	    return false;

	  /* V4DImode should be already handled through
	     expand_vselect by vpermq instruction.  */
	  gcc_assert (d->vmode != V4DImode);

	  vmode = V32QImode;
	  if (d->vmode == V8SImode
	      || d->vmode == V16HImode
	      || d->vmode == V32QImode)
	    {
	      /* First see if vpermq can be used for
		 V8SImode/V16HImode/V32QImode.  */
	      if (valid_perm_using_mode_p (V4DImode, d))
		{
		  for (i = 0; i < 4; i++)
		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V4DImode);
		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				      perm, 4, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V8SImode, d))
		vmode = V8SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V8SFmode)
	    vmode = V8SFmode;

	  if (vmode == V32QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (nelt / 2))
		  return false;
	    }
	}
      else if (GET_MODE_SIZE (d->vmode) == 64)
	{
	  if (!TARGET_AVX512BW)
	    return false;

	  /* If vpermq didn't work, vpshufb won't work either.  */
	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
	    return false;

	  vmode = V64QImode;
	  if (d->vmode == V16SImode
	      || d->vmode == V32HImode
	      || d->vmode == V64QImode)
	    {
	      /* First see if vpermq can be used for
		 V16SImode/V32HImode/V64QImode.  */
	      if (valid_perm_using_mode_p (V8DImode, d))
		{
		  for (i = 0; i < 8; i++)
		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V8DImode);
		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				      perm, 8, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V16SImode, d))
		vmode = V16SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V16SFmode)
	    vmode = V16SFmode;

	  if (vmode == V64QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (nelt / 4))
		  return false;
	    }
	}
      else
	return false;
    }

  if (d->testing_p)
    return true;

  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V16QImode)
	mask = nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else
	mask = nelt / 2 - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  vperm = gen_rtx_CONST_VECTOR (vmode,
				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
  vperm = force_reg (vmode, vperm);

  target = d->target;
  if (d->vmode != vmode)
    target = gen_reg_rtx (vmode);
  op0 = gen_lowpart (vmode, d->op0);
  if (d->one_operand_p)
    {
      if (vmode == V16QImode)
	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
      else if (vmode == V32QImode)
	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
      else if (vmode == V64QImode)
	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
      else if (vmode == V8SFmode)
	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
      else if (vmode == V8SImode)
	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
      else if (vmode == V16SFmode)
	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
      else if (vmode == V16SImode)
	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
      else
	gcc_unreachable ();
    }
  else
    {
      op1 = gen_lowpart (vmode, d->op1);
      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
    }
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    default:
      return false;
    }

  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{s,d} instructions.  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;

      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
				      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}

static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}

static bool canonicalize_perm (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv32qi;
      else
	gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv16hi;
      else
	gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8si;
      else
	gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4di;
      else
	gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8sf;
      else
	gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4df;
      else
	gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (which)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-byte lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
	 general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
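/* Illustration of the mask-and-pack trick above (example values only): for an
   even extraction of V16QImode elements, each half_mode (V8HImode) word of an
   operand holds a byte pair "hi:lo".  ANDing with the duplicated constant
   c == 0xff (0x00ff per word) keeps only the even byte of every pair, and
   packuswb then narrows the two masked operands back to bytes, yielding the
   even-indexed bytes of op0 followed by those of op1.  The odd extraction
   instead shifts each word right by s == 8 so the odd byte lands in the low
   half before packing.  */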
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even."
   Have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
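/* For instance (illustrative only), with nelt == 8 an even extraction is the
   selector { 0, 2, 4, 6, 8, 10, 12, 14 } and an odd extraction is
   { 1, 3, 5, 7, 9, 11, 13, 15 }; both satisfy d->perm[i] == 2 * i + odd,
   and any other selector is rejected here.  */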
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  rtx (*gen) (rtx, rtx, rtx)
	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				 : gen_vec_interleave_lowv8hi;

	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
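/* Example (illustrative only): the selector { 3, 3, 3, 3 } on a single
   V4SImode operand is matched here with elt == 3, i.e. replicate element 3
   into every result position; a selector such as { 3, 3, 0, 3 } fails the
   loop above and is left to the other strategies.  */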
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW
      || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));

  return true;
}
/* Implement arbitrary permutation of two V32QImode and V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     The other mask has non-negative elements when the element is requested
     from the other lane, but it is also moved to the other lane, so that
     the result of vpshufb can have its two V2TImode halves swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
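/* Example (illustrative only): with nelt == 4 and op0 identical to op1, the
   two-operand selector { 4, 1, 6, 3 } nominally references both inputs, but
   because the operands are equal the masking above folds it into
   { 0, 1, 2, 3 } on a single operand; the function still returns true so the
   caller knows the original selector mentioned both halves.  */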
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
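/* Example (illustrative only): for a V4SImode target with high_p == true the
   selector built above is { 2, 6, 3, 7 }, i.e. the high halves of op0 and op1
   interleaved; with high_p == false it is { 0, 4, 1, 5 }.  */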
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
			       1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
			       1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remain the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
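/* Rough sketch of the whole expansion for a V16QImode multiply (illustrative
   only): punpcklbw/punpckhbw with identical operands copy each byte into both
   halves of a word, the two V8HImode multiplies then leave the desired
   low-byte products in the even byte positions of res_l and res_h, and the
   final even-extract selector { 0, 2, 4, ..., 30 } gathers those bytes back
   into dest.  */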
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}
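/* A derivation sketch for the PMULDQ-less fallback above (exposition only):
   writing a signed 32-bit value as  a = (uint32) a - (a < 0 ? 1 : 0) * 2^32,
   the exact product satisfies
     a * b == (uint64)(uint32) a * (uint32) b
	      - ((a < 0 ? (uint32) b : 0) + (b < 0 ? (uint32) a : 0)) * 2^32
   modulo 2^64.  s1 and s2 are the all-ones masks for a < 0 and b < 0, i.e.
   -1 in two's complement, so the unsigned widening products s1 * b and
   s2 * a, added together and shifted left by 32, contribute exactly that
   negative correction term.  */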
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
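/* In short (illustrative only): pmuludq produces the full 64-bit products of
   the even and of the odd SImode elements; the low 32 bits of each 64-bit
   result are the wraparound products we want, the pshufd insns move those
   low halves into elements 0 and 1, and the final punpckldq interleaves them
   back into element order 0, 1, 2, 3.  */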
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
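/* The generic path above implements the usual schoolbook identity for a
   64x64 -> 64 bit multiply built from 32x32 -> 64 widening multiplies
   (sketched here for exposition):
     (hi1 * 2^32 + lo1) * (hi2 * 2^32 + lo2)
       == lo1 * lo2 + ((hi1 * lo2 + hi2 * lo1) << 32)   (mod 2^64),
   where t1 holds lo1 * lo2, and t4/t5 hold the two cross products obtained
   after shifting each operand right by 32 bits.  */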
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }

  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
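/* Worked example for the shift/xor/subtract form used above (numbers for
   exposition only): for the 32-bit value x = -5, x >> 31 (arithmetic) is -1,
   (-5 ^ -1) == 4, and 4 - (-1) == 5; for x = 5 the shift gives 0 and the
   expression degenerates to (5 ^ 0) - 0 == 5.  */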
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"