/* Copyright (C) 1988-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
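/* As an illustration, splitting a single DImode operand on a 32-bit
   target yields its two SImode halves: lo_half[0] becomes the low-word
   subreg and hi_half[0] the word at offset GET_MODE_SIZE (SImode);
   for an offsettable MEM the halves address offsets 0 and 4.  */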
void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
/* Emit the double word assignment DST = { LO, HI }.  */

void
split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
{
  rtx dlo, dhi;
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);

  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
     dlo/dhi are registers.  */
  if (MEM_P (lo)
      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))
    {
      /* If dlo is same as hi and lo's address uses dhi register,
	 code below would first emit_move_insn (dhi, hi)
	 and then emit_move_insn (dlo, lo).  But the former
	 would invalidate lo's address.  Load into dhi first,
	 then swap.  */
      emit_move_insn (dhi, lo);
      lo = dhi;
    }
  else if (MEM_P (hi)
	   && !rtx_equal_p (dlo, lo)
	   && reg_overlap_mentioned_p (dlo, hi))
    {
      /* In this case, code below would first emit_move_insn (dlo, lo)
	 and then emit_move_insn (dhi, hi).  But the former would
	 invalidate hi's address.  */
      if (rtx_equal_p (dhi, lo))
	{
	  /* We can't load into dhi first, so load into dlo
	     first and we'll swap.  */
	  emit_move_insn (dlo, hi);
	  hi = dlo;
	}
      else
	{
	  /* Load into dhi first.  */
	  emit_move_insn (dhi, hi);
	  hi = dhi;
	}
    }

  if (!rtx_equal_p (dlo, hi))
    {
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
    }
  else if (!rtx_equal_p (lo, dhi))
    {
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
    }
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
  else
    emit_insn (gen_swapsi (dlo, dhi));

  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
}
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
/* Return true if V can be broadcasted from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */
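/* For example, V == 0x1212121212121212 broadcasts from an 8-bit value:
   every 8-bit chunk equals 0x12, so VAL_BROADCAST is set to 0x12 (sign
   extended from WIDTH bits).  V == 0x1234123412341234 fails for
   WIDTH == 8 but succeeds for WIDTH == 16.  */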
static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
		HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
	return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  unsigned int msize = GET_MODE_SIZE (mode);

  /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
  if (msize != 16 && msize != 32 && msize != 64)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  /* vpbroadcastb zmm requires TARGET_AVX512BW.  */
  if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  /* vbroadcasts[sd] only support memory operand w/o AVX2.
     When msize == 16, pshufs is used for vec_duplicate.
     When msize == 64, vpbroadcastd is used, and TARGET_AVX512F must be
     available.  */
  else if ((msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			      val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = gen_reg_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				UNSPEC_GOTPCREL);
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    op1 = tmp;
	}

      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);
      break;

    default:
      break;
    }

  /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
  if (mode == TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == DImode
      && SUBREG_BYTE (op1) == 0)
    op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
  /* As not all values in XFmode are representable in real_value,
     we might be called with unfoldable SUBREGs of constants.  */
  else if (SUBREG_P (op1)
	   && CONSTANT_P (SUBREG_REG (op1))
	   && can_create_pseudo_p ())
    {
      machine_mode imode = GET_MODE (SUBREG_REG (op1));
      rtx r = force_const_mem (imode, SUBREG_REG (op1));
      if (r)
	r = validize_mem (r);
      else
	r = force_reg (imode, SUBREG_REG (op1));
      op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
	{
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  rtx temp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (temp, op1));
		  emit_move_insn (op0, temp);
		  return;
		}
	    }
	}
    }

  /* Special case inserting 64-bit values into a TImode register.  */
  if (TARGET_64BIT
      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
      && (optimize || ix86_function_naked (current_function_decl))
      && (mode == DImode || mode == DFmode)
      && SUBREG_P (op0)
      && GET_MODE (SUBREG_REG (op0)) == TImode
      && REG_P (SUBREG_REG (op0))
      && REG_P (op1))
    {
      /* Use *insvti_lowpart_1 to set lowpart.  */
      if (SUBREG_BYTE (op0) == 0)
	{
	  wide_int mask = wi::mask (64, true, 128);
	  rtx tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
      /* Use *insvti_highpart_1 to set highpart.  */
      else if (SUBREG_BYTE (op0) == 8)
	{
	  wide_int mask = wi::mask (64, false, 128);
	  rtx tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */

static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  int nunits = GET_MODE_NUNITS (mode);

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (standard_sse_constant_p (op, mode))
    return nullptr;

  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" refer to V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
				  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
	return nullptr;
    }

  return first;
}
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  machine_mode mode = GET_MODE (op0);
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  rtx tmp = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       tmp, first);
	  if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
	    {
	      first = force_const_mem (GET_MODE_INNER (mode), first);
	      ok = ix86_expand_vector_init_duplicate (false, mode,
						      tmp, first);
	    }
	  if (ok)
	    {
	      emit_move_insn (op0, tmp);
	      return;
	    }
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = gen_reg_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to 128-bit vector conversions via V2DI.  */
  if (VECTOR_MODE_P (mode)
      && GET_MODE_SIZE (mode) == 16
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (mode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16BFmode:
      extract = gen_avx_vextractf128v16bf;
      mode = V8BFmode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
	 xorps  reg, reg
	 movlps mem, reg
	 movhps mem+8, reg
       }
     else
       {
	 movlps mem, reg
	 movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
	 movlpd mem, reg
	 movhpd mem+8, reg
       }
     else
       {
	 movsd  mem, reg
	 movhpd mem+8, reg
       }
 */
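/* As a concrete example (assuming the tuning flags above), an unaligned
   V4SF load on a target without x86_sse_unaligned_move_optimal is split
   into a movlps of the low 8 bytes followed by a movhps of the high
   8 bytes, optionally preceded by an xorps to break the false
   dependency on the destination register.  */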
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
					  GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx src;

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  /* packusdw/packuswb does unsigned saturation of a signed source
     which is different from generic us_truncate RTX.  */
  if (code == US_TRUNCATE)
    src = gen_rtx_UNSPEC (sse_dmode,
			  gen_rtvec (2, op1, op2),
			  UNSPEC_US_TRUNCATE);
  else
    {
      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    }

  emit_move_insn (dest, src);

  ix86_move_vector_high_sse_to_mmx (op0);
}
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
   for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
   OPERANDS[0].  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op1);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
    case E_V4QImode:
    case E_V2QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */
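/* For example, for "dst = src2 + dst" the operands are swapped so that
   src1 matches dst and the two-address "add src2, dst" form can be
   used; likewise "dst = 5 + reg" becomes "dst = reg + 5" so that the
   immediate ends up as the second operand.  */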
static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required except
   under TARGET_APX_NDD.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
				    machine_mode mode, rtx operands[],
				    bool use_ndd)
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[], bool use_ndd)
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1)
      && !use_ndd)
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[3], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
			    rtx operands[], bool use_ndd)
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
	matching_memory = true;
      else
	dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (!use_ndd && MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Return TRUE or FALSE depending on whether the unary operator meets the
   appropriate constraints.  */

bool
ix86_unary_operator_ok (enum rtx_code,
			machine_mode,
			rtx operands[2],
			bool use_ndd)
{
  /* If one of operands is memory, source and destination must match.  */
  if ((MEM_P (operands[0])
       || (!use_ndd && MEM_P (operands[1])))
      && ! rtx_equal_p (operands[0], operands[1]))
    return false;
  return true;
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
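/* The fast path below ORs the dividend and the divisor into a scratch
   register and tests the result against ~0xff: if no bits above bit 7
   are set, both values fit in 8 bits and a single 8-bit unsigned divide
   produces the quotient in AL and the remainder in AH.  */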
void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }

  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
		 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp, tmp1;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
	emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
						 machine_mode mode, rtx target,
						 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */
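  /* Worked example: for input 0x0000000500000007 the low word pairs
     with exponent 0x43300000 to form the double 0x1.0p52 + 7, and the
     high word pairs with 0x45300000 to form 0x1.0p84 + 5*0x1.0p32.
     Both sums are exact because 53 significand bits suffice to hold
     the bias plus a 32-bit payload; subtracting the biases below
     leaves 7.0 and 5*2^32, whose sum is 0x500000007 as a double.  */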
  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */
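/* E.g. the input 0x89abcdef is split into int_hi = 0x89ab and
   int_lo = 0xcdef; both halves convert to SFmode exactly, and the
   result is computed as fp_hi * 0x1.0p16 + fp_lo (with a single fma
   when TARGET_FMA is available).  */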
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
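/* E.g. a V4SF lane holding 3e9 (>= 0x1p31) gets 0x1p31 subtracted
   before the signed conversion, giving 852516352, and the matching
   lane of *XORP has bit 31 set, so xoring afterwards restores
   3000000000; lanes below 0x1p31 are left unchanged and get a zero
   xor mask.  */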
rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode || mode == TFmode || mode == HFmode)
    {
      use_sse = true;
      if (mode == HFmode)
	vmode = V8HFmode;
    }
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation.  Special case operand 0 being a constant.  */
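/* The generic path below computes
     dest = (~signbit_mask & operands[1]) | (signbit_mask & operands[2]),
   i.e. the magnitude bits of operands[1] combined with the sign bit of
   operands[2]; e.g. copysign (1.25, -3.0) yields -1.25.  */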
2317 ix86_expand_copysign (rtx operands
[])
2319 machine_mode mode
, vmode
;
2320 rtx dest
, vdest
, op0
, op1
, mask
, op2
, op3
;
2322 mode
= GET_MODE (operands
[0]);
2326 else if (mode
== SFmode
)
2328 else if (mode
== DFmode
)
2330 else if (mode
== TFmode
)
2335 if (rtx_equal_p (operands
[1], operands
[2]))
2337 emit_move_insn (operands
[0], operands
[1]);
2342 vdest
= lowpart_subreg (vmode
, dest
, mode
);
2343 if (vdest
== NULL_RTX
)
2344 vdest
= gen_reg_rtx (vmode
);
2347 op1
= lowpart_subreg (vmode
, force_reg (mode
, operands
[2]), mode
);
2348 mask
= ix86_build_signbit_mask (vmode
, TARGET_AVX512F
&& mode
!= HFmode
, 0);
2350 if (CONST_DOUBLE_P (operands
[1]))
2352 op0
= simplify_unary_operation (ABS
, mode
, operands
[1], mode
);
2353 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2354 if (op0
== CONST0_RTX (mode
))
2356 emit_move_insn (vdest
, gen_rtx_AND (vmode
, mask
, op1
));
2358 emit_move_insn (dest
, lowpart_subreg (mode
, vdest
, vmode
));
2362 if (GET_MODE_SIZE (mode
) < 16)
2363 op0
= ix86_build_const_vector (vmode
, false, op0
);
2364 op0
= force_reg (vmode
, op0
);
2367 op0
= lowpart_subreg (vmode
, force_reg (mode
, operands
[1]), mode
);
2369 op2
= gen_reg_rtx (vmode
);
2370 op3
= gen_reg_rtx (vmode
);
2371 emit_move_insn (op2
, gen_rtx_AND (vmode
,
2372 gen_rtx_NOT (vmode
, mask
),
2374 emit_move_insn (op3
, gen_rtx_AND (vmode
, mask
, op1
));
2375 emit_move_insn (vdest
, gen_rtx_IOR (vmode
, op2
, op3
));
2377 emit_move_insn (dest
, lowpart_subreg (mode
, vdest
, vmode
));
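/* Illustrative sketch (not part of GCC): the RTL emitted above computes the
   classic copysign bit formula, (mag & ~signmask) | (sgn & signmask), here
   written as scalar C on the SFmode bit pattern (names are ad hoc):

     float my_copysign (float mag, float sgn)
     {
       unsigned m, s, signmask = 0x80000000u;
       __builtin_memcpy (&m, &mag, 4);
       __builtin_memcpy (&s, &sgn, 4);
       m = (m & ~signmask) | (s & signmask);	// op2 | op3 above
       __builtin_memcpy (&mag, &m, 4);
       return mag;
     }  */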
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction or vpcmpeq + kortest.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode
      || GET_MODE_SIZE (mode) == 64)
    {
      unsigned msize = GET_MODE_SIZE (mode);
      machine_mode p_mode
	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest sets CF when the result is 0xFFFF (op0 == op1).  */
      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);

      gcc_assert (code == EQ || code == NE);

      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
      if (msize == 64)
	{
	  if (mode != V16SImode)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	    }

	  tmp = gen_reg_rtx (HImode);
	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
	}
      /* Using ptest for 128/256-bit vectors.  */
      else
	{
	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	      mode = p_mode;
	    }

	  /* Generate XOR since we can't check that one operand is a zero
	     vector.  */
	  tmp = gen_reg_rtx (mode);
	  rtx ops[3] = { tmp, op0, op1 };
	  ix86_expand_vector_logical_operator (XOR, mode, ops);
	  tmp = gen_lowpart (p_mode, tmp);
	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
				  gen_rtx_UNSPEC (CCZmode,
						  gen_rtvec (2, tmp, tmp),
						  UNSPEC_PTEST)));
	}
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())
	goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
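/* Illustrative sketch (not part of GCC): on a 32-bit target the double-word
   LTU path above conceptually emits

	cmpl	lo(b), lo(a)	; CF = borrow from lo(a) - lo(b)
	sbbl	hi(b), scratch	; hi(a) - hi(b) - CF, only the flags are kept
	jc	label		; branch on the borrow of the 64-bit subtract

   while EQ/NE and the remaining cases fall back to the two-or-three-jump
   sequence shown in the comment above.  */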
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case GT:
    case GE:
    case LT:
    case LE:
      return false;

    default:
      return true;
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (ix86_fp_compare_code_to_integer (code))
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  if (op_mode == BFmode)
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op0, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop0 = op;
      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op1, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop1 = op;
      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* Attempt to use PTEST, if available, when testing vector modes for
     equality/inequality against zero.  */
  if (op1 == const0_rtx
      && SUBREG_P (op0)
      && cmpmode == CCZmode
      && SUBREG_BYTE (op0) == 0
      && REG_P (SUBREG_REG (op0))
      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
      && TARGET_SSE4_1
      && GET_MODE (op0) == TImode
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    {
      tmp = SUBREG_REG (op0);
      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    }
  else
    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;

  if (l2)
    {
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
    }
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  emit_jump (lend);
  if (l2)
    {
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
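/* Illustrative sketch (not part of GCC): with TARGET_IEEE_FP the spaceship
   expansion above behaves like

	comiss/fcomi	op1, op0
	jp	.Lunordered	; PF set  -> store 2
	je	.Lequal		; ZF set  -> store 0
	ja	.Lgreater	; CF=ZF=0 -> store 1
	; fall through: less	-> store -1

   which matches dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */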
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b+1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
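/* Illustrative examples (not part of GCC) of the rewrites performed above,
   all of which leave only a carry-flag test for the consumer:

     a == 0   ->  (unsigned) a < 1			; LTU
     a >  b   ->  b < a, or a >= b + 1 for constant b	; GTU/LEU
     a >= 0   ->  (unsigned) a < 0x80000000		; sign bit via carry

   so a single sbb/adc or setc can be used instead of setcc plus cmov.  */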
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
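/* Illustrative sketch (not part of GCC): the conditional increment
   "x = (unsigned) a < b ? x + 1 : x" produced above needs no setcc or cmov,
   only the carry left behind by the compare:

	cmpl	b, a		; CF = (unsigned) a < b
	adcl	$0, x		; x += CF

   and the decrement variant uses sbb instead of adc.  */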
3340 ix86_expand_int_movcc (rtx operands
[])
3342 enum rtx_code code
= GET_CODE (operands
[1]), compare_code
;
3343 rtx_insn
*compare_seq
;
3345 machine_mode mode
= GET_MODE (operands
[0]);
3346 bool sign_bit_compare_p
= false;
3347 bool negate_cc_compare_p
= false;
3348 rtx op0
= XEXP (operands
[1], 0);
3349 rtx op1
= XEXP (operands
[1], 1);
3350 rtx op2
= operands
[2];
3351 rtx op3
= operands
[3];
3353 if (GET_MODE (op0
) == TImode
3354 || (GET_MODE (op0
) == DImode
3358 if (GET_MODE (op0
) == BFmode
3359 && !ix86_fp_comparison_operator (operands
[1], VOIDmode
))
3363 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3364 compare_seq
= get_insns ();
3367 compare_code
= GET_CODE (compare_op
);
3369 if ((op1
== const0_rtx
&& (code
== GE
|| code
== LT
))
3370 || (op1
== constm1_rtx
&& (code
== GT
|| code
== LE
)))
3371 sign_bit_compare_p
= true;
3373 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3374 but if op1 is a constant, the latter form allows more optimizations,
3375 either through the last 2 ops being constant handling, or the one
3376 constant and one variable cases. On the other side, for cmov the
3377 former might be better as we don't need to load the constant into
3378 another register. */
3379 if (code
== EQ
&& CONST_INT_P (op1
) && rtx_equal_p (op0
, op2
))
3381 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3382 else if (code
== NE
&& CONST_INT_P (op1
) && rtx_equal_p (op0
, op3
))
3385 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3386 HImode insns, we'd be swallowed in word prefix ops. */
3388 if ((mode
!= HImode
|| TARGET_FAST_PREFIX
)
3389 && (mode
!= (TARGET_64BIT
? TImode
: DImode
))
3390 && CONST_INT_P (op2
)
3391 && CONST_INT_P (op3
))
3393 rtx out
= operands
[0];
3394 HOST_WIDE_INT ct
= INTVAL (op2
);
3395 HOST_WIDE_INT cf
= INTVAL (op3
);
3399 || (TARGET_64BIT
&& mode
== DImode
))
3400 && (GET_MODE (op0
) == SImode
3401 || (TARGET_64BIT
&& GET_MODE (op0
) == DImode
)))
3403 /* Special case x != 0 ? -1 : y. */
3404 if (code
== NE
&& op1
== const0_rtx
&& ct
== -1)
3406 negate_cc_compare_p
= true;
3410 else if (code
== EQ
&& op1
== const0_rtx
&& cf
== -1)
3411 negate_cc_compare_p
= true;
3415 /* Sign bit compares are better done using shifts than we do by using
3417 if (sign_bit_compare_p
3418 || negate_cc_compare_p
3419 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
3421 /* Detect overlap between destination and compare sources. */
3424 if (negate_cc_compare_p
)
3426 if (GET_MODE (op0
) == DImode
)
3427 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode
), op0
));
3429 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode
),
3430 gen_lowpart (SImode
, op0
)));
3432 tmp
= gen_reg_rtx (mode
);
3434 emit_insn (gen_x86_movdicc_0_m1_neg (tmp
));
3436 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode
,
3439 else if (!sign_bit_compare_p
)
3444 compare_code
= GET_CODE (compare_op
);
3446 flags
= XEXP (compare_op
, 0);
3448 if (GET_MODE (flags
) == CCFPmode
)
3452 = ix86_fp_compare_code_to_integer (compare_code
);
3455 /* To simplify rest of code, restrict to the GEU case. */
3456 if (compare_code
== LTU
)
3459 compare_code
= reverse_condition (compare_code
);
3460 code
= reverse_condition (code
);
3465 PUT_CODE (compare_op
,
3466 reverse_condition_maybe_unordered
3467 (GET_CODE (compare_op
)));
3469 PUT_CODE (compare_op
,
3470 reverse_condition (GET_CODE (compare_op
)));
3474 if (reg_overlap_mentioned_p (out
, compare_op
))
3475 tmp
= gen_reg_rtx (mode
);
3478 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
3480 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
3481 flags
, compare_op
));
3485 if (code
== GT
|| code
== GE
)
3486 code
= reverse_condition (code
);
3492 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
3505 tmp
= expand_simple_binop (mode
, PLUS
,
3507 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3518 tmp
= expand_simple_binop (mode
, IOR
,
3520 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3522 else if (diff
== -1 && ct
)
3532 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3534 tmp
= expand_simple_binop (mode
, PLUS
,
3535 copy_rtx (tmp
), GEN_INT (cf
),
3536 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3544 * andl cf - ct, dest
3554 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3557 tmp
= expand_simple_binop (mode
, AND
,
3559 gen_int_mode (cf
- ct
, mode
),
3560 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3562 tmp
= expand_simple_binop (mode
, PLUS
,
3563 copy_rtx (tmp
), GEN_INT (ct
),
3564 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3567 if (!rtx_equal_p (tmp
, out
))
3568 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3575 machine_mode cmp_mode
= GET_MODE (op0
);
3576 enum rtx_code new_code
;
3578 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3580 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3582 /* We may be reversing a non-trapping
3583 comparison to a trapping comparison. */
3584 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3585 && code
!= EQ
&& code
!= NE
3586 && code
!= ORDERED
&& code
!= UNORDERED
)
3589 new_code
= reverse_condition_maybe_unordered (code
);
3592 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3593 if (new_code
!= UNKNOWN
)
3601 compare_code
= UNKNOWN
;
3602 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3603 && CONST_INT_P (op1
))
3605 if (op1
== const0_rtx
3606 && (code
== LT
|| code
== GE
))
3607 compare_code
= code
;
3608 else if (op1
== constm1_rtx
)
3612 else if (code
== GT
)
3617 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3618 if (compare_code
!= UNKNOWN
3619 && GET_MODE (op0
) == GET_MODE (out
)
3620 && (cf
== -1 || ct
== -1))
3622 /* If lea code below could be used, only optimize
3623 if it results in a 2 insn sequence. */
3625 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3626 || diff
== 3 || diff
== 5 || diff
== 9)
3627 || (compare_code
== LT
&& ct
== -1)
3628 || (compare_code
== GE
&& cf
== -1))
3631 * notl op1 (if necessary)
3639 code
= reverse_condition (code
);
3642 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3644 out
= expand_simple_binop (mode
, IOR
,
3646 out
, 1, OPTAB_DIRECT
);
3647 if (out
!= operands
[0])
3648 emit_move_insn (operands
[0], out
);
3655 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3656 || diff
== 3 || diff
== 5 || diff
== 9)
3657 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3659 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3665 * lea cf(dest*(ct-cf)),dest
3669 * This also catches the degenerate setcc-only case.
3675 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3678 /* On x86_64 the lea instruction operates on Pmode, so we need
3679 to get arithmetics done in proper mode to match. */
3681 tmp
= copy_rtx (out
);
3685 out1
= copy_rtx (out
);
3686 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3690 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3696 tmp
= plus_constant (mode
, tmp
, cf
);
3699 if (!rtx_equal_p (tmp
, out
))
3702 out
= force_operand (tmp
, copy_rtx (out
));
3704 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3706 if (!rtx_equal_p (out
, operands
[0]))
3707 emit_move_insn (operands
[0], copy_rtx (out
));
3713 * General case: Jumpful:
3714 * xorl dest,dest cmpl op1, op2
3715 * cmpl op1, op2 movl ct, dest
3717 * decl dest movl cf, dest
3718 * andl (cf-ct),dest 1:
3723 * This is reasonably steep, but branch mispredict costs are
3724 * high on modern cpus, so consider failing only if optimizing
3728 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3729 && BRANCH_COST (optimize_insn_for_speed_p (),
3734 machine_mode cmp_mode
= GET_MODE (op0
);
3735 enum rtx_code new_code
;
3737 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3739 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3741 /* We may be reversing a non-trapping
3742 comparison to a trapping comparison. */
3743 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3744 && code
!= EQ
&& code
!= NE
3745 && code
!= ORDERED
&& code
!= UNORDERED
)
3748 new_code
= reverse_condition_maybe_unordered (code
);
3753 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3754 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3755 compare_code
= reverse_condition (compare_code
);
3758 if (new_code
!= UNKNOWN
)
3766 if (compare_code
!= UNKNOWN
)
3768 /* notl op1 (if needed)
3773 For x < 0 (resp. x <= -1) there will be no notl,
3774 so if possible swap the constants to get rid of the
3776 True/false will be -1/0 while code below (store flag
3777 followed by decrement) is 0/-1, so the constants need
3778 to be exchanged once more. */
3780 if (compare_code
== GE
|| !cf
)
3782 code
= reverse_condition (code
);
3788 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3792 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3794 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3796 copy_rtx (out
), 1, OPTAB_DIRECT
);
3799 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3800 gen_int_mode (cf
- ct
, mode
),
3801 copy_rtx (out
), 1, OPTAB_DIRECT
);
3803 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3804 copy_rtx (out
), 1, OPTAB_DIRECT
);
3805 if (!rtx_equal_p (out
, operands
[0]))
3806 emit_move_insn (operands
[0], copy_rtx (out
));
3812 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3814 /* Try a few things more with specific constants and a variable. */
3817 rtx var
, orig_out
, out
, tmp
;
3819 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3825 /* If one of the two operands is an interesting constant, load a
3826 constant with the above and mask it in with a logical operation. */
3828 if (CONST_INT_P (operands
[2]))
3831 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3832 operands
[3] = constm1_rtx
, op
= and_optab
;
3833 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3834 operands
[3] = const0_rtx
, op
= ior_optab
;
3838 else if (CONST_INT_P (operands
[3]))
3841 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3843 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3844 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3845 if (code
== LE
&& op1
== const0_rtx
&& rtx_equal_p (op0
, var
))
3846 operands
[1] = simplify_gen_relational (LT
, VOIDmode
,
3850 operands
[2] = constm1_rtx
;
3853 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3854 operands
[2] = const0_rtx
, op
= ior_optab
;
3861 orig_out
= operands
[0];
3862 tmp
= gen_reg_rtx (mode
);
3865 /* Recurse to get the constant loaded. */
3866 if (!ix86_expand_int_movcc (operands
))
3869 /* Mask in the interesting variable. */
3870 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3872 if (!rtx_equal_p (out
, orig_out
))
3873 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3879 * For comparison with above,
3889 if (! nonimmediate_operand (operands
[2], mode
))
3890 operands
[2] = force_reg (mode
, operands
[2]);
3891 if (! nonimmediate_operand (operands
[3], mode
))
3892 operands
[3] = force_reg (mode
, operands
[3]);
3894 if (! register_operand (operands
[2], VOIDmode
)
3896 || ! register_operand (operands
[3], VOIDmode
)))
3897 operands
[2] = force_reg (mode
, operands
[2]);
3900 && ! register_operand (operands
[3], VOIDmode
))
3901 operands
[3] = force_reg (mode
, operands
[3]);
3903 emit_insn (compare_seq
);
3904 emit_insn (gen_rtx_SET (operands
[0],
3905 gen_rtx_IF_THEN_ELSE (mode
,
3906 compare_op
, operands
[2],
3911 /* Detect conditional moves that exactly match min/max operational
3912 semantics. Note that this is IEEE safe, as long as we don't
3913 interchange the operands.
3915 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3916 and TRUE if the operation is successful and instructions are emitted. */
3919 ix86_expand_sse_fp_minmax (rtx dest
, enum rtx_code code
, rtx cmp_op0
,
3920 rtx cmp_op1
, rtx if_true
, rtx if_false
)
3928 else if (code
== UNGE
)
3929 std::swap (if_true
, if_false
);
3933 if (rtx_equal_p (cmp_op0
, if_true
) && rtx_equal_p (cmp_op1
, if_false
))
3935 else if (rtx_equal_p (cmp_op1
, if_true
) && rtx_equal_p (cmp_op0
, if_false
))
3940 mode
= GET_MODE (dest
);
3942 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3943 but MODE may be a vector mode and thus not appropriate. */
3944 if (!flag_finite_math_only
|| flag_signed_zeros
)
3946 int u
= is_min
? UNSPEC_IEEE_MIN
: UNSPEC_IEEE_MAX
;
3949 if_true
= force_reg (mode
, if_true
);
3950 v
= gen_rtvec (2, if_true
, if_false
);
3951 tmp
= gen_rtx_UNSPEC (mode
, v
, u
);
3955 code
= is_min
? SMIN
: SMAX
;
3956 if (MEM_P (if_true
) && MEM_P (if_false
))
3957 if_true
= force_reg (mode
, if_true
);
3958 tmp
= gen_rtx_fmt_ee (code
, mode
, if_true
, if_false
);
3961 emit_insn (gen_rtx_SET (dest
, tmp
));
/* Return true if MODE is valid for vector compare to mask register,
   Same result for conditional vector move with mask register.  */
static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)
    return true;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return (vector_size == 64 && TARGET_EVEX512) || TARGET_AVX512VL;
}
/* Return true if integer mask comparison should be used.  */
static bool
ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
{
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
    return true;
  else if (vector_size < 16)
    return false;
  else if (vector_size == 64)
    return true;
  else if (GET_MODE_INNER (cmp_mode) == HFmode)
    return true;

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    return false;

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
    return false;

  return true;
}
4027 /* Expand an SSE comparison. Return the register with the result. */
4030 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
4031 rtx op_true
, rtx op_false
)
4033 machine_mode mode
= GET_MODE (dest
);
4034 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
4036 /* In general case result of comparison can differ from operands' type. */
4037 machine_mode cmp_mode
;
4039 /* In AVX512F the result of comparison is an integer mask. */
4040 bool maskcmp
= false;
4043 if (ix86_use_mask_cmp_p (mode
, cmp_ops_mode
, op_true
, op_false
))
4045 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
4047 cmp_mode
= nbits
> 8 ? int_mode_for_size (nbits
, 0).require () : E_QImode
;
4050 cmp_mode
= cmp_ops_mode
;
4052 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
4054 bool (*op1_predicate
)(rtx
, machine_mode
)
4055 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
4057 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
4058 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
4061 || (maskcmp
&& cmp_mode
!= mode
)
4062 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
4063 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
4064 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
4068 bool ok
= ix86_expand_mask_vec_cmp (dest
, code
, cmp_op0
, cmp_op1
);
4073 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
4075 if (cmp_mode
!= mode
)
4077 x
= force_reg (cmp_ops_mode
, x
);
4078 convert_move (dest
, x
, false);
4081 emit_insn (gen_rtx_SET (dest
, x
));
/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

static void
ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
{
  rtx tmp;

  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
4107 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4108 operations. This is used for both scalar and vector conditional moves. */
4111 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
4113 machine_mode mode
= GET_MODE (dest
);
4114 machine_mode cmpmode
= GET_MODE (cmp
);
4117 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4118 if (rtx_equal_p (op_true
, op_false
))
4120 emit_move_insn (dest
, op_true
);
4124 /* If we have an integer mask and FP value then we need
4125 to cast mask to FP mode. */
4126 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
4128 cmp
= force_reg (cmpmode
, cmp
);
4129 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
4132 /* In AVX512F the result of comparison is an integer mask. */
4134 && GET_MODE_CLASS (cmpmode
) == MODE_INT
)
4136 gcc_assert (ix86_valid_mask_cmp_mode (mode
));
4137 /* Using scalar/vector move with mask register. */
4138 cmp
= force_reg (cmpmode
, cmp
);
4139 /* Optimize for mask zero. */
4140 op_true
= (op_true
!= CONST0_RTX (mode
)
4141 ? force_reg (mode
, op_true
) : op_true
);
4142 op_false
= (op_false
!= CONST0_RTX (mode
)
4143 ? force_reg (mode
, op_false
) : op_false
);
4144 if (op_true
== CONST0_RTX (mode
))
4146 if (cmpmode
== E_DImode
&& !TARGET_64BIT
)
4148 x
= gen_reg_rtx (cmpmode
);
4149 emit_insn (gen_knotdi (x
, cmp
));
4152 x
= expand_simple_unop (cmpmode
, NOT
, cmp
, NULL
, 1);
4154 /* Reverse op_true op_false. */
4155 std::swap (op_true
, op_false
);
4159 emit_insn (gen_movhf_mask (dest
, op_true
, op_false
, cmp
));
4161 emit_insn (gen_rtx_SET (dest
,
4162 gen_rtx_VEC_MERGE (mode
,
4163 op_true
, op_false
, cmp
)));
4167 if (vector_all_ones_operand (op_true
, mode
)
4168 && op_false
== CONST0_RTX (mode
))
4170 emit_move_insn (dest
, cmp
);
4173 else if (op_false
== CONST0_RTX (mode
))
4175 x
= expand_simple_binop (mode
, AND
, cmp
, op_true
,
4176 dest
, 1, OPTAB_DIRECT
);
4178 emit_move_insn (dest
, x
);
4181 else if (op_true
== CONST0_RTX (mode
))
4183 op_false
= force_reg (mode
, op_false
);
4184 x
= gen_rtx_NOT (mode
, cmp
);
4185 ix86_emit_vec_binop (AND
, mode
, dest
, x
, op_false
);
4188 else if (vector_all_ones_operand (op_true
, mode
))
4190 x
= expand_simple_binop (mode
, IOR
, cmp
, op_false
,
4191 dest
, 1, OPTAB_DIRECT
);
4193 emit_move_insn (dest
, x
);
4199 op_true
= force_reg (mode
, op_true
);
4201 if (GET_MODE_SIZE (mode
) < 16
4202 || !nonimmediate_operand (op_false
, mode
))
4203 op_false
= force_reg (mode
, op_false
);
4205 emit_insn (gen_rtx_SET (dest
,
4206 gen_rtx_IF_THEN_ELSE (mode
, cmp
,
4207 op_true
, op_false
)));
4211 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4212 machine_mode blend_mode
= mode
;
4214 if (GET_MODE_SIZE (mode
) < 16
4215 || !vector_operand (op_true
, mode
))
4216 op_true
= force_reg (mode
, op_true
);
4218 op_false
= force_reg (mode
, op_false
);
4224 gen
= gen_mmx_blendvps
;
4228 gen
= gen_sse4_1_blendvps
;
4232 gen
= gen_sse4_1_blendvpd
;
4236 gen
= gen_sse4_1_blendvss
;
4240 gen
= gen_sse4_1_blendvsd
;
4249 gen
= gen_mmx_pblendvb_v8qi
;
4250 blend_mode
= V8QImode
;
4259 gen
= gen_mmx_pblendvb_v4qi
;
4260 blend_mode
= V4QImode
;
4265 gen
= gen_mmx_pblendvb_v2qi
;
4276 gen
= gen_sse4_1_pblendvb
;
4277 blend_mode
= V16QImode
;
4282 gen
= gen_avx_blendvps256
;
4286 gen
= gen_avx_blendvpd256
;
4296 gen
= gen_avx2_pblendvb
;
4297 blend_mode
= V32QImode
;
4302 gen
= gen_avx512bw_blendmv64qi
;
4305 gen
= gen_avx512bw_blendmv32hi
;
4308 gen
= gen_avx512bw_blendmv32hf
;
4311 gen
= gen_avx512bw_blendmv32bf
;
4314 gen
= gen_avx512f_blendmv16si
;
4317 gen
= gen_avx512f_blendmv8di
;
4320 gen
= gen_avx512f_blendmv8df
;
4323 gen
= gen_avx512f_blendmv16sf
;
4332 if (blend_mode
== mode
)
4336 x
= gen_reg_rtx (blend_mode
);
4337 op_false
= gen_lowpart (blend_mode
, op_false
);
4338 op_true
= gen_lowpart (blend_mode
, op_true
);
4339 cmp
= gen_lowpart (blend_mode
, cmp
);
4342 emit_insn (gen (x
, op_false
, op_true
, cmp
));
4345 emit_move_insn (dest
, gen_lowpart (mode
, x
));
4351 t2
= expand_simple_binop (mode
, AND
, op_true
, cmp
,
4352 NULL
, 1, OPTAB_DIRECT
);
4354 t3
= gen_reg_rtx (mode
);
4355 x
= gen_rtx_NOT (mode
, cmp
);
4356 ix86_emit_vec_binop (AND
, mode
, t3
, x
, op_false
);
4358 x
= expand_simple_binop (mode
, IOR
, t3
, t2
,
4359 dest
, 1, OPTAB_DIRECT
);
4361 emit_move_insn (dest
, x
);
4365 /* Swap, force into registers, or otherwise massage the two operands
4366 to an sse comparison with a mask result. Thus we differ a bit from
4367 ix86_prepare_fp_compare_args which expects to produce a flags result.
4369 The DEST operand exists to help determine whether to commute commutative
4370 operators. The POP0/POP1 operands are updated in place. The new
4371 comparison code is returned, or UNKNOWN if not implementable. */
4373 static enum rtx_code
4374 ix86_prepare_sse_fp_compare_args (rtx dest
, enum rtx_code code
,
4375 rtx
*pop0
, rtx
*pop1
)
4381 /* AVX supports all the needed comparisons. */
4384 /* We have no LTGT as an operator. We could implement it with
4385 NE & ORDERED, but this requires an extra temporary. It's
4386 not clear that it's worth it. */
4393 /* These are supported directly. */
4400 /* AVX has 3 operand comparisons, no need to swap anything. */
4403 /* For commutative operators, try to canonicalize the destination
4404 operand to be first in the comparison - this helps reload to
4405 avoid extra moves. */
4406 if (!dest
|| !rtx_equal_p (dest
, *pop1
))
4414 /* These are not supported directly before AVX, and furthermore
4415 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4416 comparison operands to transform into something that is
4418 std::swap (*pop0
, *pop1
);
4419 code
= swap_condition (code
);
4429 /* Expand a floating-point conditional move. Return true if successful. */
4432 ix86_expand_fp_movcc (rtx operands
[])
4434 machine_mode mode
= GET_MODE (operands
[0]);
4435 enum rtx_code code
= GET_CODE (operands
[1]);
4436 rtx tmp
, compare_op
;
4437 rtx op0
= XEXP (operands
[1], 0);
4438 rtx op1
= XEXP (operands
[1], 1);
4440 if (GET_MODE (op0
) == BFmode
4441 && !ix86_fp_comparison_operator (operands
[1], VOIDmode
))
4444 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode
))
4448 /* Since we've no cmove for sse registers, don't force bad register
4449 allocation just to gain access to it. Deny movcc when the
4450 comparison mode doesn't match the move mode. */
4451 cmode
= GET_MODE (op0
);
4452 if (cmode
== VOIDmode
)
4453 cmode
= GET_MODE (op1
);
4457 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
4458 if (code
== UNKNOWN
)
4461 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
4462 operands
[2], operands
[3]))
4465 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
4466 operands
[2], operands
[3]);
4467 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
4471 if (GET_MODE (op0
) == TImode
4472 || (GET_MODE (op0
) == DImode
4476 /* The floating point conditional move instructions don't directly
4477 support conditions resulting from a signed integer comparison. */
4479 compare_op
= ix86_expand_compare (code
, op0
, op1
);
4480 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
4482 tmp
= gen_reg_rtx (QImode
);
4483 ix86_expand_setcc (tmp
, code
, op0
, op1
);
4485 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
4488 emit_insn (gen_rtx_SET (operands
[0],
4489 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
4490 operands
[2], operands
[3])));
4495 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4498 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
4523 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4526 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
4563 /* Return immediate value to be used in UNSPEC_PCMP
4564 for comparison CODE in MODE. */
4567 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
4569 if (FLOAT_MODE_P (mode
))
4570 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
4571 return ix86_int_cmp_code_to_pcmp_immediate (code
);
4574 /* Expand AVX-512 vector comparison. */
4577 ix86_expand_mask_vec_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
)
4579 machine_mode mask_mode
= GET_MODE (dest
);
4580 machine_mode cmp_mode
= GET_MODE (cmp_op0
);
4581 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
4591 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
4595 unspec_code
= UNSPEC_PCMP
;
4598 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, cmp_op0
, cmp_op1
, imm
),
4600 emit_insn (gen_rtx_SET (dest
, unspec
));
4605 /* Expand fp vector comparison. */
4608 ix86_expand_fp_vec_cmp (rtx operands
[])
4610 enum rtx_code code
= GET_CODE (operands
[1]);
4613 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4614 &operands
[2], &operands
[3]);
4615 if (code
== UNKNOWN
)
4618 switch (GET_CODE (operands
[1]))
4621 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4622 operands
[3], NULL
, NULL
);
4623 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4624 operands
[3], NULL
, NULL
);
4628 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4629 operands
[3], NULL
, NULL
);
4630 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4631 operands
[3], NULL
, NULL
);
4637 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4641 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4644 if (operands
[0] != cmp
)
4645 emit_move_insn (operands
[0], cmp
);
4651 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4652 rtx op_true
, rtx op_false
, bool *negate
)
4654 machine_mode data_mode
= GET_MODE (dest
);
4655 machine_mode mode
= GET_MODE (cop0
);
4660 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4662 && GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
4663 && GET_MODE_SIZE (mode
) <= 16)
4665 /* AVX512F supports all of the comparsions
4666 on all 128/256/512-bit vector int types. */
4667 else if (ix86_use_mask_cmp_p (data_mode
, mode
, op_true
, op_false
))
4671 /* Canonicalize the comparison to EQ, GT, GTU. */
4681 /* x <= cst can be handled as x < cst + 1 unless there is
4682 wrap around in cst + 1. */
4683 if (GET_CODE (cop1
) == CONST_VECTOR
4684 && GET_MODE_INNER (mode
) != TImode
)
4686 unsigned int n_elts
= GET_MODE_NUNITS (mode
), i
;
4687 machine_mode eltmode
= GET_MODE_INNER (mode
);
4688 for (i
= 0; i
< n_elts
; ++i
)
4690 rtx elt
= CONST_VECTOR_ELT (cop1
, i
);
4691 if (!CONST_INT_P (elt
))
4695 /* For LE punt if some element is signed maximum. */
4696 if ((INTVAL (elt
) & (GET_MODE_MASK (eltmode
) >> 1))
4697 == (GET_MODE_MASK (eltmode
) >> 1))
4700 /* For LEU punt if some element is unsigned maximum. */
4701 else if (elt
== constm1_rtx
)
4706 rtvec v
= rtvec_alloc (n_elts
);
4707 for (i
= 0; i
< n_elts
; ++i
)
4709 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1
, i
)) + 1,
4711 cop1
= gen_rtx_CONST_VECTOR (mode
, v
);
4712 std::swap (cop0
, cop1
);
4713 code
= code
== LE
? GT
: GTU
;
4719 code
= reverse_condition (code
);
4725 /* x >= cst can be handled as x > cst - 1 unless there is
4726 wrap around in cst - 1. */
4727 if (GET_CODE (cop1
) == CONST_VECTOR
4728 && GET_MODE_INNER (mode
) != TImode
)
4730 unsigned int n_elts
= GET_MODE_NUNITS (mode
), i
;
4731 machine_mode eltmode
= GET_MODE_INNER (mode
);
4732 for (i
= 0; i
< n_elts
; ++i
)
4734 rtx elt
= CONST_VECTOR_ELT (cop1
, i
);
4735 if (!CONST_INT_P (elt
))
4739 /* For GE punt if some element is signed minimum. */
4740 if (INTVAL (elt
) < 0
4741 && ((INTVAL (elt
) & (GET_MODE_MASK (eltmode
) >> 1))
4745 /* For GEU punt if some element is zero. */
4746 else if (elt
== const0_rtx
)
4751 rtvec v
= rtvec_alloc (n_elts
);
4752 for (i
= 0; i
< n_elts
; ++i
)
4754 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1
, i
)) - 1,
4756 cop1
= gen_rtx_CONST_VECTOR (mode
, v
);
4757 code
= code
== GE
? GT
: GTU
;
4761 code
= reverse_condition (code
);
4767 std::swap (cop0
, cop1
);
4768 code
= swap_condition (code
);
4775 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4776 if (mode
== V2DImode
)
4781 /* SSE4.1 supports EQ. */
4788 /* SSE4.2 supports GT/GTU. */
4798 if (GET_CODE (cop0
) == CONST_VECTOR
)
4799 cop0
= force_reg (mode
, cop0
);
4800 else if (GET_CODE (cop1
) == CONST_VECTOR
)
4801 cop1
= force_reg (mode
, cop1
);
4803 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4804 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4806 std::swap (optrue
, opfalse
);
4808 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4809 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4810 min (x, y) == x). While we add one instruction (the minimum),
4811 we remove the need for two instructions in the negation, as the
4812 result is done this way.
4813 When using masks, do it for SI/DImode element types, as it is shorter
4814 than the two subtractions. */
4816 && GET_MODE_SIZE (mode
) != 64
4817 && vector_all_ones_operand (opfalse
, data_mode
)
4818 && optrue
== CONST0_RTX (data_mode
))
4820 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4821 /* Don't do it if not using integer masks and we'd end up with
4822 the right values in the registers though. */
4823 && ((GET_MODE_SIZE (mode
) == 64 && TARGET_EVEX512
)
4824 || !vector_all_ones_operand (optrue
, data_mode
)
4825 || opfalse
!= CONST0_RTX (data_mode
))))
4827 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4832 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4835 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4836 cop0
= force_reg (mode
, cop0
);
4837 cop1
= force_reg (mode
, cop1
);
4841 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4845 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4849 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4852 if (TARGET_AVX512VL
)
4854 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4855 cop0
= force_reg (mode
, cop0
);
4856 cop1
= force_reg (mode
, cop1
);
4860 if (code
== GTU
&& TARGET_SSE2
)
4861 gen
= gen_uminv16qi3
;
4862 else if (code
== GT
&& TARGET_SSE4_1
)
4863 gen
= gen_sminv16qi3
;
4866 if (code
== GTU
&& TARGET_SSE2
)
4867 gen
= gen_uminv8qi3
;
4868 else if (code
== GT
&& TARGET_SSE4_1
)
4869 gen
= gen_sminv8qi3
;
4872 if (code
== GTU
&& TARGET_SSE2
)
4873 gen
= gen_uminv4qi3
;
4874 else if (code
== GT
&& TARGET_SSE4_1
)
4875 gen
= gen_sminv4qi3
;
4878 if (code
== GTU
&& TARGET_SSE2
)
4879 gen
= gen_uminv2qi3
;
4880 else if (code
== GT
&& TARGET_SSE4_1
)
4881 gen
= gen_sminv2qi3
;
4884 if (code
== GTU
&& TARGET_SSE4_1
)
4885 gen
= gen_uminv8hi3
;
4886 else if (code
== GT
&& TARGET_SSE2
)
4887 gen
= gen_sminv8hi3
;
4890 if (code
== GTU
&& TARGET_SSE4_1
)
4891 gen
= gen_uminv4hi3
;
4892 else if (code
== GT
&& TARGET_SSE2
)
4893 gen
= gen_sminv4hi3
;
4896 if (code
== GTU
&& TARGET_SSE4_1
)
4897 gen
= gen_uminv2hi3
;
4898 else if (code
== GT
&& TARGET_SSE2
)
4899 gen
= gen_sminv2hi3
;
4903 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4907 gen
= (code
== GTU
) ? gen_uminv2si3
: gen_sminv2si3
;
4910 if (TARGET_AVX512VL
)
4912 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4913 cop0
= force_reg (mode
, cop0
);
4914 cop1
= force_reg (mode
, cop1
);
4923 rtx tem
= gen_reg_rtx (mode
);
4924 if (!vector_operand (cop0
, mode
))
4925 cop0
= force_reg (mode
, cop0
);
4926 if (!vector_operand (cop1
, mode
))
4927 cop1
= force_reg (mode
, cop1
);
4929 emit_insn (gen (tem
, cop0
, cop1
));
4935 /* Unsigned parallel compare is not supported by the hardware.
4936 Play some tricks to turn this into a signed comparison
4940 cop0
= force_reg (mode
, cop0
);
4954 /* Subtract (-(INT MAX) - 1) from both operands to make
4956 mask
= ix86_build_signbit_mask (mode
, true, false);
4957 t1
= gen_reg_rtx (mode
);
4958 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4960 t2
= gen_reg_rtx (mode
);
4961 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4980 /* Perform a parallel unsigned saturating subtraction. */
4981 x
= gen_reg_rtx (mode
);
4982 emit_insn (gen_rtx_SET
4983 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4985 cop1
= CONST0_RTX (mode
);
4997 std::swap (op_true
, op_false
);
4999 if (GET_CODE (cop1
) == CONST_VECTOR
)
5000 cop1
= force_reg (mode
, cop1
);
5002 /* Allow the comparison to be done in one mode, but the movcc to
5003 happen in another mode. */
5004 if (data_mode
== mode
)
5005 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
, op_true
, op_false
);
5008 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
5009 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
5011 if (GET_MODE (x
) == mode
)
5012 x
= gen_lowpart (data_mode
, x
);
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    {
      cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				     CONST0_RTX (GET_MODE (cmp)),
				     NULL, NULL, &negate);

      gcc_assert (!negate);
    }

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0 = operands[4], cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      gen = gen_avx512f_vpermt2varv16sf3;
      maskmode = V16SImode;
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      gen = gen_avx512f_vpermt2varv8df3;
      maskmode = V8DImode;
      break;
    default:
      return false;
    }

  if (!gen)
    return false;

  if (d && d->testing_p)
    return true;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
/* Expand a variable vector permutation.  */

void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (mode) == HFmode)
    {
      machine_mode orig_mode = mode;
      mode = mode_for_vector (HImode, w).require ();
      target = lowpart_subreg (mode, target, orig_mode);
      op0 = lowpart_subreg (mode, op0, orig_mode);
      op1 = lowpart_subreg (mode, op1, orig_mode);
    }

  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
      switch (mode)
	{
	case E_V16SImode:
	  gen = gen_avx512f_permvarv16si;
	  break;
	case E_V16SFmode:
	  gen = gen_avx512f_permvarv16sf;
	  break;
	case E_V8DImode:
	  gen = gen_avx512f_permvarv8di;
	  break;
	case E_V8DFmode:
	  gen = gen_avx512f_permvarv8df;
	  break;
	default:
	  break;
	}
      if (gen)
	{
	  emit_insn (gen (target, op0, mask));
	  return;
	}
    }

  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    return;

  if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
    {
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode, after preparing suitable masks,
	 we can use vpshufb; vpshufb; vpermq; vpor.  */
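      /* Illustrative sketch (editorial note, not in the original
	 source): a V4DI selector such as { 3 1 2 0 } is rewritten for
	 VPERMD by replicating each index per 32-bit half, doubling it,
	 and adding one to the odd copies:
	   { 3 3 1 1 2 2 0 0 }  ->  { 6 6 2 2 4 4 0 0 }
				->  { 6 7 2 3 4 5 0 1 }
	 which picks up both halves of each original 64-bit element.  */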
5358 if (mode
== V16HImode
)
5360 maskmode
= mode
= V32QImode
;
5366 maskmode
= mode
= V8SImode
;
5370 t1
= gen_reg_rtx (maskmode
);
      /* Replicate the low bits of the V4DImode mask into V8SImode:
	   mask = { A B C D }
	   t1 = { A A B B C C D D }.  */
5375 for (i
= 0; i
< w
/ 2; ++i
)
5376 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5377 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5378 vt
= force_reg (maskmode
, vt
);
5379 mask
= gen_lowpart (maskmode
, mask
);
5380 if (maskmode
== V8SImode
)
5381 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5383 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
      /* Multiply the shuffle indices by two.  */
5386 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
      /* Add one to the odd shuffle indices:
		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5391 for (i
= 0; i
< w
/ 2; ++i
)
5393 vec
[i
* 2] = const0_rtx
;
5394 vec
[i
* 2 + 1] = const1_rtx
;
5396 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5397 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5398 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5401 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5402 operands
[3] = mask
= t1
;
5403 target
= gen_reg_rtx (mode
);
5404 op0
= gen_lowpart (mode
, op0
);
5405 op1
= gen_lowpart (mode
, op1
);
5411 /* The VPERMD and VPERMPS instructions already properly ignore
5412 the high bits of the shuffle elements. No need for us to
5413 perform an AND ourselves. */
5414 if (one_operand_shuffle
)
5416 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
5417 if (target
!= operands
[0])
5418 emit_move_insn (operands
[0],
5419 gen_lowpart (GET_MODE (operands
[0]), target
));
5423 t1
= gen_reg_rtx (V8SImode
);
5424 t2
= gen_reg_rtx (V8SImode
);
5425 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
5426 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
5432 mask
= gen_lowpart (V8SImode
, mask
);
5433 if (one_operand_shuffle
)
5434 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
5437 t1
= gen_reg_rtx (V8SFmode
);
5438 t2
= gen_reg_rtx (V8SFmode
);
5439 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
5440 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
5446 /* By combining the two 128-bit input vectors into one 256-bit
5447 input vector, we can use VPERMD and VPERMPS for the full
5448 two-operand shuffle. */
5449 t1
= gen_reg_rtx (V8SImode
);
5450 t2
= gen_reg_rtx (V8SImode
);
5451 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
5452 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5453 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
5454 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
5458 t1
= gen_reg_rtx (V8SFmode
);
5459 t2
= gen_reg_rtx (V8SImode
);
5460 mask
= gen_lowpart (V4SImode
, mask
);
5461 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
5462 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5463 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
5464 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
5468 t1
= gen_reg_rtx (V32QImode
);
5469 t2
= gen_reg_rtx (V32QImode
);
5470 t3
= gen_reg_rtx (V32QImode
);
5471 vt2
= GEN_INT (-128);
5472 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
5473 vt
= force_reg (V32QImode
, vt
);
5474 for (i
= 0; i
< 32; i
++)
5475 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
5476 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
5477 vt2
= force_reg (V32QImode
, vt2
);
      /* From mask create two adjusted masks, which contain the same
	 bits as mask in the low 7 bits of each vector element.
	 The first mask will have the most significant bit clear
	 if it requests an element from the same 128-bit lane
	 and MSB set if it requests an element from the other 128-bit lane.
	 The second mask will have the opposite values of the MSB,
	 and additionally will have its 128-bit lanes swapped.
	 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	 t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	 t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	 stands for the other 12 bytes.  */
      /* The bit that says whether an element is from the same lane or
	 the other lane is bit 4, so shift it up by 3 to the MSB
	 position.  */
5491 t5
= gen_reg_rtx (V4DImode
);
5492 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
5494 /* Clear MSB bits from the mask just in case it had them set. */
5495 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
5496 /* After this t1 will have MSB set for elements from other lane. */
5497 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
5498 /* Clear bits other than MSB. */
5499 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
5500 /* Or in the lower bits from mask into t3. */
5501 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
      /* And invert MSB bits in t1, so MSB is set for elements from the
	 same lane.  */
5504 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
5505 /* Swap 128-bit lanes in t3. */
5506 t6
= gen_reg_rtx (V4DImode
);
5507 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
5508 const2_rtx
, GEN_INT (3),
5509 const0_rtx
, const1_rtx
));
5510 /* And or in the lower bits from mask into t1. */
5511 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
5512 if (one_operand_shuffle
)
5514 /* Each of these shuffles will put 0s in places where
5515 element from the other 128-bit lane is needed, otherwise
5516 will shuffle in the requested value. */
5517 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
5518 gen_lowpart (V32QImode
, t6
)));
5519 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
5520 /* For t3 the 128-bit lanes are swapped again. */
5521 t7
= gen_reg_rtx (V4DImode
);
5522 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
5523 const2_rtx
, GEN_INT (3),
5524 const0_rtx
, const1_rtx
));
5525 /* And oring both together leads to the result. */
5526 emit_insn (gen_iorv32qi3 (target
, t1
,
5527 gen_lowpart (V32QImode
, t7
)));
5528 if (target
!= operands
[0])
5529 emit_move_insn (operands
[0],
5530 gen_lowpart (GET_MODE (operands
[0]), target
));
5534 t4
= gen_reg_rtx (V32QImode
);
      /* Similar to the above one_operand_shuffle code, just
	 repeated twice for each operand.  The merge_two: code
	 below will merge the two results together.  */
5538 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
5539 gen_lowpart (V32QImode
, t6
)));
5540 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
5541 gen_lowpart (V32QImode
, t6
)));
5542 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
5543 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
5544 t7
= gen_reg_rtx (V4DImode
);
5545 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
5546 const2_rtx
, GEN_INT (3),
5547 const0_rtx
, const1_rtx
));
5548 t8
= gen_reg_rtx (V4DImode
);
5549 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
5550 const2_rtx
, GEN_INT (3),
5551 const0_rtx
, const1_rtx
));
5552 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
5553 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
5559 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
5566 /* The XOP VPPERM insn supports three inputs. By ignoring the
5567 one_operand_shuffle special case, we avoid creating another
5568 set of constant vectors in memory. */
5569 one_operand_shuffle
= false;
5571 /* mask = mask & {2*w-1, ...} */
5572 vt
= GEN_INT (2*w
- 1);
5576 /* mask = mask & {w-1, ...} */
5577 vt
= GEN_INT (w
- 1);
5580 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5581 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5582 NULL_RTX
, 0, OPTAB_DIRECT
);
5584 /* For non-QImode operations, convert the word permutation control
5585 into a byte permutation control. */
5586 if (mode
!= V16QImode
)
5588 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
5589 GEN_INT (exact_log2 (e
)),
5590 NULL_RTX
, 0, OPTAB_DIRECT
);
5592 /* Convert mask to vector of chars. */
5593 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
5595 /* Replicate each of the input bytes into byte positions:
5596 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5597 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5598 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5599 for (i
= 0; i
< 16; ++i
)
5600 vec
[i
] = GEN_INT (i
/e
* e
);
5601 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5602 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5604 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
5606 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
5608 /* Convert it into the byte positions by doing
5609 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5610 for (i
= 0; i
< 16; ++i
)
5611 vec
[i
] = GEN_INT (i
% e
);
5612 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5613 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5614 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
5617 /* The actual shuffle operations all operate on V16QImode. */
5618 op0
= gen_lowpart (V16QImode
, op0
);
5619 op1
= gen_lowpart (V16QImode
, op1
);
5623 if (GET_MODE (target
) != V16QImode
)
5624 target
= gen_reg_rtx (V16QImode
);
5625 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
5626 if (target
!= operands
[0])
5627 emit_move_insn (operands
[0],
5628 gen_lowpart (GET_MODE (operands
[0]), target
));
5630 else if (one_operand_shuffle
)
5632 if (GET_MODE (target
) != V16QImode
)
5633 target
= gen_reg_rtx (V16QImode
);
5634 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
5635 if (target
!= operands
[0])
5636 emit_move_insn (operands
[0],
5637 gen_lowpart (GET_MODE (operands
[0]), target
));
5644 /* Shuffle the two input vectors independently. */
5645 t1
= gen_reg_rtx (V16QImode
);
5646 t2
= gen_reg_rtx (V16QImode
);
5647 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
5648 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
  /* Then merge them together.  The key is whether any given control
     element contained a bit set that indicates the second word.  */
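  /* Illustrative note (editorial, not in the original source): once
     the control has been converted to byte positions there are 16
     bytes per input vector, so selector values 0..15 pick from op0
     and 16..31 pick from op1; bit 4 of each control byte is exactly
     the "use the second vector" flag that the merge code below masks
     out and tests.  */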
5655 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At that point the masking done by
	     ix86_expand_int_vcond below will work as desired.  */
5661 rtx t3
= gen_reg_rtx (V4SImode
);
5662 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
5663 const0_rtx
, const0_rtx
,
5664 const2_rtx
, const2_rtx
));
5666 maskmode
= V4SImode
;
5670 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5671 vt
= force_reg (maskmode
, vt
);
5672 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5673 NULL_RTX
, 0, OPTAB_DIRECT
);
5675 if (GET_MODE (target
) != mode
)
5676 target
= gen_reg_rtx (mode
);
5678 xops
[1] = gen_lowpart (mode
, t2
);
5679 xops
[2] = gen_lowpart (mode
, t1
);
5680 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
5683 ok
= ix86_expand_int_vcond (xops
);
5685 if (target
!= operands
[0])
5686 emit_move_insn (operands
[0],
5687 gen_lowpart (GET_MODE (operands
[0]), target
));
/* Extend SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  */
5695 ix86_expand_sse_extend (rtx dest
, rtx src
, bool unsigned_p
)
5697 machine_mode imode
= GET_MODE (src
);
5715 ops
[1] = force_reg (imode
, src
);
5718 ops
[2] = force_reg (imode
, CONST0_RTX (imode
));
5720 ops
[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5721 ops
[1], pc_rtx
, pc_rtx
);
5723 ix86_split_mmx_punpck (ops
, false);
/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */
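/* Illustrative example (editorial note, not in the original source):
   for a V16QImode SRC { 0 1 2 ... 15 }, the low unpack produces a
   V8HImode vector from elements 0..7 and the high unpack one from
   elements 8..15, each element zero- or sign-extended to 16 bits
   according to UNSIGNED_P.  */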
5731 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
5733 machine_mode imode
= GET_MODE (src
);
5738 rtx (*unpack
)(rtx
, rtx
);
5739 rtx (*extract
)(rtx
, rtx
) = NULL
;
5740 machine_mode halfmode
= BLKmode
;
5746 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
5748 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
5749 halfmode
= V32QImode
;
5751 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
5755 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
5757 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
5758 halfmode
= V16QImode
;
5760 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5764 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5766 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5767 halfmode
= V16HImode
;
5769 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5773 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5775 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5776 halfmode
= V8HImode
;
5778 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5782 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5784 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5785 halfmode
= V8SImode
;
5787 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5791 unpack
= gen_avx2_zero_extendv4siv4di2
;
5793 unpack
= gen_avx2_sign_extendv4siv4di2
;
5794 halfmode
= V4SImode
;
5796 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5800 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5802 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5806 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5808 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5812 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5814 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5818 unpack
= gen_sse4_1_zero_extendv4qiv4hi2
;
5820 unpack
= gen_sse4_1_sign_extendv4qiv4hi2
;
5824 unpack
= gen_sse4_1_zero_extendv2hiv2si2
;
5826 unpack
= gen_sse4_1_sign_extendv2hiv2si2
;
5830 unpack
= gen_sse4_1_zero_extendv2qiv2hi2
;
5832 unpack
= gen_sse4_1_sign_extendv2qiv2hi2
;
5838 if (GET_MODE_SIZE (imode
) >= 32)
5840 tmp
= gen_reg_rtx (halfmode
);
5841 emit_insn (extract (tmp
, src
));
5845 switch (GET_MODE_SIZE (imode
))
5848 /* Shift higher 8 bytes to lower 8 bytes. */
5849 tmp
= gen_reg_rtx (V1TImode
);
5850 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5854 /* Shift higher 4 bytes to lower 4 bytes. */
5855 tmp
= gen_reg_rtx (V1DImode
);
5856 emit_insn (gen_mmx_lshrv1di3 (tmp
, gen_lowpart (V1DImode
, src
),
5860 /* Shift higher 2 bytes to lower 2 bytes. */
5861 tmp
= gen_reg_rtx (V1SImode
);
5862 emit_insn (gen_mmx_lshrv1si3 (tmp
, gen_lowpart (V1SImode
, src
),
5869 tmp
= gen_lowpart (imode
, tmp
);
5874 emit_insn (unpack (dest
, tmp
));
5878 rtx (*unpack
)(rtx
, rtx
, rtx
);
5884 unpack
= gen_vec_interleave_highv16qi
;
5886 unpack
= gen_vec_interleave_lowv16qi
;
5890 unpack
= gen_vec_interleave_highv8hi
;
5892 unpack
= gen_vec_interleave_lowv8hi
;
5896 unpack
= gen_vec_interleave_highv4si
;
5898 unpack
= gen_vec_interleave_lowv4si
;
5902 unpack
= gen_mmx_punpckhbw
;
5904 unpack
= gen_mmx_punpcklbw
;
5908 unpack
= gen_mmx_punpckhwd
;
5910 unpack
= gen_mmx_punpcklwd
;
5914 unpack
= gen_mmx_punpckhbw_low
;
5916 unpack
= gen_mmx_punpcklbw_low
;
5923 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5925 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5926 src
, pc_rtx
, pc_rtx
);
5928 rtx tmp2
= gen_reg_rtx (imode
);
5929 emit_insn (unpack (tmp2
, src
, tmp
));
5930 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
/* Return true if MEM is a constant pool reference containing a
   CONST_VECTOR permutation index; if so, store the index into PERM.  */
5937 ix86_extract_perm_from_pool_constant (int* perm
, rtx mem
)
5939 machine_mode mode
= GET_MODE (mem
);
5940 int nelt
= GET_MODE_NUNITS (mode
);
5942 if (!INTEGRAL_MODE_P (mode
))
5945 /* Needs to be constant pool. */
5947 || !SYMBOL_REF_P (XEXP (mem
, 0))
5948 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem
, 0)))
5951 rtx constant
= get_pool_constant (XEXP (mem
, 0));
5953 if (GET_CODE (constant
) != CONST_VECTOR
)
  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
5959 if (GET_MODE (constant
) != mode
)
5961 constant
= simplify_subreg (mode
, constant
, GET_MODE (constant
), 0);
5963 if (constant
== nullptr || GET_CODE (constant
) != CONST_VECTOR
)
5967 for (int i
= 0; i
!= nelt
; i
++)
5968 perm
[i
] = UINTVAL (XVECEXP (constant
, 0, i
));
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  At most four parts are generated.  */
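/* Illustrative example (editorial note, not in the original source):
   on a 32-bit target a DImode operand yields two SImode parts and an
   XFmode operand yields three; e.g. the constant pool double 1.0
   becomes the SImode immediates 0x00000000 and 0x3ff00000.  */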
5979 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5984 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5986 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5988 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5989 gcc_assert (size
>= 2 && size
<= 4);
  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, which force all constants to memory to allow combining.  */
5993 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5994 operand
= avoid_constant_pool_reference (operand
);
5996 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
      /* The only non-offsettable memories we handle are pushes.  */
5999 int ok
= push_operand (operand
, VOIDmode
);
6003 operand
= copy_rtx (operand
);
6004 PUT_MODE (operand
, word_mode
);
6005 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
6009 if (GET_CODE (operand
) == CONST_VECTOR
)
6011 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
6012 /* Caution: if we looked through a constant pool memory above,
6013 the operand may actually have a different mode now. That's
6014 ok, since we want to pun this all the way back to an integer. */
6015 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
6016 gcc_assert (operand
!= NULL
);
6023 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
6028 if (REG_P (operand
))
6030 gcc_assert (reload_completed
);
6031 for (i
= 0; i
< size
; i
++)
6032 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
6034 else if (offsettable_memref_p (operand
))
6036 operand
= adjust_address (operand
, SImode
, 0);
6038 for (i
= 1; i
< size
; i
++)
6039 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
6041 else if (CONST_DOUBLE_P (operand
))
6043 const REAL_VALUE_TYPE
*r
;
6046 r
= CONST_DOUBLE_REAL_VALUE (operand
);
6050 real_to_target (l
, r
, mode
);
6051 parts
[3] = gen_int_mode (l
[3], SImode
);
6052 parts
[2] = gen_int_mode (l
[2], SImode
);
6055 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6056 long double may not be 80-bit. */
6057 real_to_target (l
, r
, mode
);
6058 parts
[2] = gen_int_mode (l
[2], SImode
);
6061 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
6066 parts
[1] = gen_int_mode (l
[1], SImode
);
6067 parts
[0] = gen_int_mode (l
[0], SImode
);
6076 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
6077 if (mode
== XFmode
|| mode
== TFmode
)
6079 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
6080 if (REG_P (operand
))
6082 gcc_assert (reload_completed
);
6083 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
6084 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
6086 else if (offsettable_memref_p (operand
))
6088 operand
= adjust_address (operand
, DImode
, 0);
6090 parts
[1] = adjust_address (operand
, upper_mode
, 8);
6092 else if (CONST_DOUBLE_P (operand
))
6096 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
6098 /* real_to_target puts 32-bit pieces in each long. */
6099 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
6100 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
6103 if (upper_mode
== SImode
)
6104 parts
[1] = gen_int_mode (l
[2], SImode
);
6107 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
6108 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
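/* Illustrative note (editorial, not in the original source): on a
   32-bit target a DImode copy is rewritten here as two SImode moves,
   ordered (or rewritten via lea) so that no half of the source is
   clobbered before it has been read.  */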
6125 ix86_split_long_move (rtx operands
[])
6131 machine_mode mode
= GET_MODE (operands
[0]);
6132 bool collisionparts
[4];
  /* The DFmode expanders may ask us to move double.
     For 64bit target this is a single move.  By hiding the fact
     here we simplify i386.md splitters.  */
6137 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, which force all constants to memory to allow combining.  */
6142 if (MEM_P (operands
[1])
6143 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
6144 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
6145 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
6146 if (push_operand (operands
[0], VOIDmode
))
6148 operands
[0] = copy_rtx (operands
[0]);
6149 PUT_MODE (operands
[0], word_mode
);
6152 operands
[0] = gen_lowpart (DImode
, operands
[0]);
6153 operands
[1] = gen_lowpart (DImode
, operands
[1]);
6154 emit_move_insn (operands
[0], operands
[1]);
6158 /* The only non-offsettable memory we handle is push. */
6159 if (push_operand (operands
[0], VOIDmode
))
6162 gcc_assert (!MEM_P (operands
[0])
6163 || offsettable_memref_p (operands
[0]));
6165 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
6166 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
6168 /* When emitting push, take care for source operands on the stack. */
6169 if (push
&& MEM_P (operands
[1])
6170 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
6172 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
6174 /* Compensate for the stack decrement by 4. */
6175 if (!TARGET_64BIT
&& nparts
== 3
6176 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
6177 src_base
= plus_constant (Pmode
, src_base
, 4);
6179 /* src_base refers to the stack pointer and is
6180 automatically decreased by emitted push. */
6181 for (i
= 0; i
< nparts
; i
++)
6182 part
[1][i
] = change_address (part
[1][i
],
6183 GET_MODE (part
[1][i
]), src_base
);
6186 /* We need to do copy in the right order in case an address register
6187 of the source overlaps the destination. */
6188 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
6192 for (i
= 0; i
< nparts
; i
++)
6195 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
6196 if (collisionparts
[i
])
6200 /* Collision in the middle part can be handled by reordering. */
6201 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
6203 std::swap (part
[0][1], part
[0][2]);
6204 std::swap (part
[1][1], part
[1][2]);
6206 else if (collisions
== 1
6208 && (collisionparts
[1] || collisionparts
[2]))
6210 if (collisionparts
[1])
6212 std::swap (part
[0][1], part
[0][2]);
6213 std::swap (part
[1][1], part
[1][2]);
6217 std::swap (part
[0][2], part
[0][3]);
6218 std::swap (part
[1][2], part
[1][3]);
6222 /* If there are more collisions, we can't handle it by reordering.
6223 Do an lea to the last part and use only one colliding move. */
6224 else if (collisions
> 1)
6230 base
= part
[0][nparts
- 1];
6232 /* Handle the case when the last part isn't valid for lea.
6233 Happens in 64-bit mode storing the 12-byte XFmode. */
6234 if (GET_MODE (base
) != Pmode
)
6235 base
= gen_rtx_REG (Pmode
, REGNO (base
));
6237 addr
= XEXP (part
[1][0], 0);
6238 if (TARGET_TLS_DIRECT_SEG_REFS
)
6240 struct ix86_address parts
;
6241 int ok
= ix86_decompose_address (addr
, &parts
);
6243 /* It is not valid to use %gs: or %fs: in lea. */
6244 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
6246 emit_insn (gen_rtx_SET (base
, addr
));
6247 part
[1][0] = replace_equiv_address (part
[1][0], base
);
6248 for (i
= 1; i
< nparts
; i
++)
6250 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
6251 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
6262 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
6263 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
6264 emit_move_insn (part
[0][2], part
[1][2]);
6266 else if (nparts
== 4)
6268 emit_move_insn (part
[0][3], part
[1][3]);
6269 emit_move_insn (part
[0][2], part
[1][2]);
      /* In 64bit mode we don't have a 32bit push available.  In case this
	 is a register, it is OK - we will just use the larger counterpart.
	 We also retype memory - these come from an attempt to avoid the
	 REX prefix on moving the second half of a TFmode value.  */
6278 if (GET_MODE (part
[1][1]) == SImode
)
6280 switch (GET_CODE (part
[1][1]))
6283 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
6287 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
6294 if (GET_MODE (part
[1][0]) == SImode
)
6295 part
[1][0] = part
[1][1];
6298 emit_move_insn (part
[0][1], part
[1][1]);
6299 emit_move_insn (part
[0][0], part
[1][0]);
6303 /* Choose correct order to not overwrite the source before it is copied. */
6304 if ((REG_P (part
[0][0])
6305 && REG_P (part
[1][1])
6306 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
6308 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
6310 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
6312 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
6314 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
6316 operands
[2 + i
] = part
[0][j
];
6317 operands
[6 + i
] = part
[1][j
];
6322 for (i
= 0; i
< nparts
; i
++)
6324 operands
[2 + i
] = part
[0][i
];
6325 operands
[6 + i
] = part
[1][i
];
6329 /* Attempt to locally unCSE nonzero constants. */
6330 for (j
= 0; j
< nparts
- 1; j
++)
6331 if (CONST_INT_P (operands
[6 + j
])
6332 && operands
[6 + j
] != const0_rtx
6333 && REG_P (operands
[2 + j
]))
6334 for (i
= j
; i
< nparts
- 1; i
++)
6335 if (CONST_INT_P (operands
[7 + i
])
6336 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
6337 operands
[7 + i
] = operands
[2 + j
];
6339 for (i
= 0; i
< nparts
; i
++)
6340 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */
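/* Illustrative note (editorial, not in the original source): a left
   shift by a small constant can be cheaper as "add reg, reg" repeated
   count times when count * add-cost <= shift-cost, e.g. x << 1 becomes
   a single add; larger or size-optimized cases fall back to one
   sal/shl.  */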
6350 ix86_expand_ashl_const (rtx operand
, int count
, machine_mode mode
)
6353 || (count
* ix86_cost
->add
<= ix86_cost
->shift_const
6354 && !optimize_insn_for_size_p ()))
6357 emit_insn (gen_add2_insn (operand
, operand
));
6361 rtx (*insn
)(rtx
, rtx
, rtx
);
6363 insn
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6364 emit_insn (insn (operand
, operand
, GEN_INT (count
)));
6369 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
6371 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
6372 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
6373 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6374 machine_mode half_mode
;
6376 rtx low
[2], high
[2];
6379 if (CONST_INT_P (operands
[2]))
6381 split_double_mode (mode
, operands
, 2, low
, high
);
6382 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6384 if (count
>= half_width
)
6386 emit_move_insn (high
[0], low
[1]);
6387 ix86_expand_clear (low
[0]);
6389 if (count
> half_width
)
6390 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
6392 else if (count
== 1)
6394 if (!rtx_equal_p (operands
[0], operands
[1]))
6395 emit_move_insn (operands
[0], operands
[1]);
6396 rtx x3
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
6397 rtx x4
= gen_rtx_LTU (mode
, x3
, const0_rtx
);
6398 half_mode
= mode
== DImode
? SImode
: DImode
;
6399 emit_insn (gen_add3_cc_overflow_1 (half_mode
, low
[0],
6401 emit_insn (gen_add3_carry (half_mode
, high
[0], high
[0], high
[0],
6406 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6408 if (!rtx_equal_p (operands
[0], operands
[1]))
6409 emit_move_insn (operands
[0], operands
[1]);
6411 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
6412 ix86_expand_ashl_const (low
[0], count
, mode
);
6417 split_double_mode (mode
, operands
, 1, low
, high
);
6418 half_mode
= mode
== DImode
? SImode
: DImode
;
6420 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6422 if (operands
[1] == const1_rtx
)
	  /* Assuming we've chosen QImode-capable registers, then 1 << N
	     can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6426 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
6428 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
6430 ix86_expand_clear (low
[0]);
6431 ix86_expand_clear (high
[0]);
6432 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
6434 d
= gen_lowpart (QImode
, low
[0]);
6435 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6436 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
6437 emit_insn (gen_rtx_SET (d
, s
));
6439 d
= gen_lowpart (QImode
, high
[0]);
6440 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6441 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
6442 emit_insn (gen_rtx_SET (d
, s
));
	  /* Otherwise, we can get the same results by manually performing
	     a bit extract operation on bit 5/6, and then performing the two
	     shifts.  The two methods of getting 0/1 into low/high are exactly
	     the same size.  Avoiding the shift in the bit extract case helps
	     pentium4 a bit; no one else seems to care much either way.  */
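	  /* Illustrative note (editorial, not in the original source):
	     for 1 << N on a 32-bit double word, bit 5 of N says which
	     half receives the 1 (bit 6 for the 64-bit TImode case), so
	     extracting that single bit into high[0], putting its
	     complement into low[0], and then shifting both halves by N
	     gives the same result as the conditional-move variant.  */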
6452 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
6453 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
6454 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
6460 gen_lshr3
= gen_lshrsi3
;
6461 gen_and3
= gen_andsi3
;
6462 gen_xor3
= gen_xorsi3
;
6467 gen_lshr3
= gen_lshrdi3
;
6468 gen_and3
= gen_anddi3
;
6469 gen_xor3
= gen_xordi3
;
6473 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
6474 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
6476 x
= gen_lowpart (half_mode
, operands
[2]);
6477 emit_insn (gen_rtx_SET (high
[0], x
));
6479 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
6480 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
6481 emit_move_insn (low
[0], high
[0]);
6482 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
6485 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6486 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
6490 if (operands
[1] == constm1_rtx
)
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
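      /* Illustrative note (editorial, not in the original source): for
	 a count below the half width, e.g. (-1) << 5 on a 64-bit pair,
	 the result is high:low = 0xFFFFFFFF : 0xFFFFFFE0; the shld
	 that would compute the high half only shifts ones into ones,
	 so it can be skipped and only the low half is really shifted
	 (larger counts are fixed up by the usual adjustment below).  */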
6494 emit_move_insn (low
[0], constm1_rtx
);
6495 if (optimize_insn_for_size_p ())
6496 emit_move_insn (high
[0], low
[0]);
6498 emit_move_insn (high
[0], constm1_rtx
);
6502 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6504 if (!rtx_equal_p (operands
[0], operands
[1]))
6505 emit_move_insn (operands
[0], operands
[1]);
6507 split_double_mode (mode
, operands
, 1, low
, high
);
6508 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
6511 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6513 if (TARGET_CMOVE
&& scratch
)
6515 ix86_expand_clear (scratch
);
6516 emit_insn (gen_x86_shift_adj_1
6517 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
6520 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
6524 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6526 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
6527 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
6528 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6529 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6531 rtx low
[2], high
[2];
6534 if (CONST_INT_P (operands
[2]))
6536 split_double_mode (mode
, operands
, 2, low
, high
);
6537 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6539 if (count
== GET_MODE_BITSIZE (mode
) - 1)
6541 emit_move_insn (high
[0], high
[1]);
6542 emit_insn (gen_ashr3 (high
[0], high
[0],
6543 GEN_INT (half_width
- 1)));
6544 emit_move_insn (low
[0], high
[0]);
6547 else if (count
>= half_width
)
6549 emit_move_insn (low
[0], high
[1]);
6550 emit_move_insn (high
[0], low
[0]);
6551 emit_insn (gen_ashr3 (high
[0], high
[0],
6552 GEN_INT (half_width
- 1)));
6554 if (count
> half_width
)
6555 emit_insn (gen_ashr3 (low
[0], low
[0],
6556 GEN_INT (count
- half_width
)));
6559 && (TARGET_USE_RCR
|| optimize_size
> 1))
6561 if (!rtx_equal_p (operands
[0], operands
[1]))
6562 emit_move_insn (operands
[0], operands
[1]);
6565 emit_insn (gen_ashrsi3_carry (high
[0], high
[0]));
6566 emit_insn (gen_rcrsi2 (low
[0], low
[0]));
6570 emit_insn (gen_ashrdi3_carry (high
[0], high
[0]));
6571 emit_insn (gen_rcrdi2 (low
[0], low
[0]));
6576 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6578 if (!rtx_equal_p (operands
[0], operands
[1]))
6579 emit_move_insn (operands
[0], operands
[1]);
6581 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6582 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
6587 machine_mode half_mode
;
6589 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6591 if (!rtx_equal_p (operands
[0], operands
[1]))
6592 emit_move_insn (operands
[0], operands
[1]);
6594 split_double_mode (mode
, operands
, 1, low
, high
);
6595 half_mode
= mode
== DImode
? SImode
: DImode
;
6597 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6598 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
6600 if (TARGET_CMOVE
&& scratch
)
6602 emit_move_insn (scratch
, high
[0]);
6603 emit_insn (gen_ashr3 (scratch
, scratch
,
6604 GEN_INT (half_width
- 1)));
6605 emit_insn (gen_x86_shift_adj_1
6606 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6609 emit_insn (gen_x86_shift_adj_3
6610 (half_mode
, low
[0], high
[0], operands
[2]));
6615 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6617 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
6618 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
6619 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6620 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6622 rtx low
[2], high
[2];
6625 if (CONST_INT_P (operands
[2]))
6627 split_double_mode (mode
, operands
, 2, low
, high
);
6628 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6630 if (count
>= half_width
)
6632 emit_move_insn (low
[0], high
[1]);
6633 ix86_expand_clear (high
[0]);
6635 if (count
> half_width
)
6636 emit_insn (gen_lshr3 (low
[0], low
[0],
6637 GEN_INT (count
- half_width
)));
6640 && (TARGET_USE_RCR
|| optimize_size
> 1))
6642 if (!rtx_equal_p (operands
[0], operands
[1]))
6643 emit_move_insn (operands
[0], operands
[1]);
6646 emit_insn (gen_lshrsi3_carry (high
[0], high
[0]));
6647 emit_insn (gen_rcrsi2 (low
[0], low
[0]));
6651 emit_insn (gen_lshrdi3_carry (high
[0], high
[0]));
6652 emit_insn (gen_rcrdi2 (low
[0], low
[0]));
6657 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6659 if (!rtx_equal_p (operands
[0], operands
[1]))
6660 emit_move_insn (operands
[0], operands
[1]);
6662 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6663 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
6668 machine_mode half_mode
;
6670 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6672 if (!rtx_equal_p (operands
[0], operands
[1]))
6673 emit_move_insn (operands
[0], operands
[1]);
6675 split_double_mode (mode
, operands
, 1, low
, high
);
6676 half_mode
= mode
== DImode
? SImode
: DImode
;
6678 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6679 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
6681 if (TARGET_CMOVE
&& scratch
)
6683 ix86_expand_clear (scratch
);
6684 emit_insn (gen_x86_shift_adj_1
6685 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6688 emit_insn (gen_x86_shift_adj_2
6689 (half_mode
, low
[0], high
[0], operands
[2]));
6693 /* Helper function to split TImode ashl under NDD. */
6695 ix86_split_ashl_ndd (rtx
*operands
, rtx scratch
)
6697 gcc_assert (TARGET_APX_NDD
);
6698 int half_width
= GET_MODE_BITSIZE (TImode
) >> 1;
6700 rtx low
[2], high
[2];
6703 split_double_mode (TImode
, operands
, 2, low
, high
);
6704 if (CONST_INT_P (operands
[2]))
6706 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (TImode
) - 1);
6708 if (count
>= half_width
)
6710 count
= count
- half_width
;
6713 if (!rtx_equal_p (high
[0], low
[1]))
6714 emit_move_insn (high
[0], low
[1]);
6716 else if (count
== 1)
6717 emit_insn (gen_adddi3 (high
[0], low
[1], low
[1]));
6719 emit_insn (gen_ashldi3 (high
[0], low
[1], GEN_INT (count
)));
6721 ix86_expand_clear (low
[0]);
6723 else if (count
== 1)
6725 rtx x3
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
6726 rtx x4
= gen_rtx_LTU (TImode
, x3
, const0_rtx
);
6727 emit_insn (gen_add3_cc_overflow_1 (DImode
, low
[0],
6729 emit_insn (gen_add3_carry (DImode
, high
[0], high
[1], high
[1],
6734 emit_insn (gen_x86_64_shld_ndd (high
[0], high
[1], low
[1],
6736 emit_insn (gen_ashldi3 (low
[0], low
[1], GEN_INT (count
)));
6741 emit_insn (gen_x86_64_shld_ndd (high
[0], high
[1], low
[1],
6743 emit_insn (gen_ashldi3 (low
[0], low
[1], operands
[2]));
6744 if (TARGET_CMOVE
&& scratch
)
6746 ix86_expand_clear (scratch
);
6747 emit_insn (gen_x86_shift_adj_1
6748 (DImode
, high
[0], low
[0], operands
[2], scratch
));
6751 emit_insn (gen_x86_shift_adj_2 (DImode
, high
[0], low
[0], operands
[2]));
6755 /* Helper function to split TImode l/ashr under NDD. */
6757 ix86_split_rshift_ndd (enum rtx_code code
, rtx
*operands
, rtx scratch
)
6759 gcc_assert (TARGET_APX_NDD
);
6760 int half_width
= GET_MODE_BITSIZE (TImode
) >> 1;
6761 bool ashr_p
= code
== ASHIFTRT
;
6762 rtx (*gen_shr
)(rtx
, rtx
, rtx
) = ashr_p
? gen_ashrdi3
6765 rtx low
[2], high
[2];
6768 split_double_mode (TImode
, operands
, 2, low
, high
);
6769 if (CONST_INT_P (operands
[2]))
6771 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (TImode
) - 1);
6773 if (ashr_p
&& (count
== GET_MODE_BITSIZE (TImode
) - 1))
6775 emit_insn (gen_shr (high
[0], high
[1],
6776 GEN_INT (half_width
- 1)));
6777 emit_move_insn (low
[0], high
[0]);
6779 else if (count
>= half_width
)
6782 emit_insn (gen_shr (high
[0], high
[1],
6783 GEN_INT (half_width
- 1)));
6785 ix86_expand_clear (high
[0]);
6787 if (count
> half_width
)
6788 emit_insn (gen_shr (low
[0], high
[1],
6789 GEN_INT (count
- half_width
)));
6791 emit_move_insn (low
[0], high
[1]);
6795 emit_insn (gen_x86_64_shrd_ndd (low
[0], low
[1], high
[1],
6797 emit_insn (gen_shr (high
[0], high
[1], GEN_INT (count
)));
6802 emit_insn (gen_x86_64_shrd_ndd (low
[0], low
[1], high
[1],
6804 emit_insn (gen_shr (high
[0], high
[1], operands
[2]));
6806 if (TARGET_CMOVE
&& scratch
)
6810 emit_move_insn (scratch
, high
[0]);
6811 emit_insn (gen_shr (scratch
, scratch
,
6812 GEN_INT (half_width
- 1)));
6815 ix86_expand_clear (scratch
);
6817 emit_insn (gen_x86_shift_adj_1
6818 (DImode
, low
[0], high
[0], operands
[2], scratch
));
6821 emit_insn (gen_x86_shift_adj_3
6822 (DImode
, low
[0], high
[0], operands
[2]));
6824 emit_insn (gen_x86_shift_adj_2
6825 (DImode
, low
[0], high
[0], operands
[2]));
6829 /* Expand move of V1TI mode register X to a new TI mode register. */
6831 ix86_expand_v1ti_to_ti (rtx x
)
6833 rtx result
= gen_reg_rtx (TImode
);
6836 rtx temp
= force_reg (V2DImode
, gen_lowpart (V2DImode
, x
));
6837 rtx lo
= gen_lowpart (DImode
, result
);
6838 emit_insn (gen_vec_extractv2didi (lo
, temp
, const0_rtx
));
6839 rtx hi
= gen_highpart (DImode
, result
);
6840 emit_insn (gen_vec_extractv2didi (hi
, temp
, const1_rtx
));
6843 emit_move_insn (result
, gen_lowpart (TImode
, x
));
6847 /* Expand move of TI mode register X to a new V1TI mode register. */
6849 ix86_expand_ti_to_v1ti (rtx x
)
6853 rtx lo
= gen_lowpart (DImode
, x
);
6854 rtx hi
= gen_highpart (DImode
, x
);
6855 rtx tmp
= gen_reg_rtx (V2DImode
);
6856 emit_insn (gen_vec_concatv2di (tmp
, lo
, hi
));
6857 return force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp
));
6860 return force_reg (V1TImode
, gen_lowpart (V1TImode
, x
));
6863 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6865 ix86_expand_v1ti_shift (enum rtx_code code
, rtx operands
[])
6867 rtx op1
= force_reg (V1TImode
, operands
[1]);
6869 if (!CONST_INT_P (operands
[2]))
6871 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6872 rtx tmp2
= gen_reg_rtx (TImode
);
6873 rtx (*shift
) (rtx
, rtx
, rtx
)
6874 = (code
== ASHIFT
) ? gen_ashlti3
: gen_lshrti3
;
6875 emit_insn (shift (tmp2
, tmp1
, operands
[2]));
6876 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6877 emit_move_insn (operands
[0], tmp3
);
6881 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6885 emit_move_insn (operands
[0], op1
);
6889 if ((bits
& 7) == 0)
6891 rtx tmp
= gen_reg_rtx (V1TImode
);
6893 emit_insn (gen_sse2_ashlv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6895 emit_insn (gen_sse2_lshrv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6896 emit_move_insn (operands
[0], tmp
);
6900 rtx tmp1
= gen_reg_rtx (V1TImode
);
6902 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (64)));
6904 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6906 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6907 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6909 /* tmp3 will be the V2DImode result. */
6910 rtx tmp3
= gen_reg_rtx (V2DImode
);
6915 emit_insn (gen_ashlv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6917 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6921 /* tmp4 is operands[1], in V2DImode. */
6922 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6924 rtx tmp5
= gen_reg_rtx (V2DImode
);
6926 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6928 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6930 rtx tmp6
= gen_reg_rtx (V2DImode
);
6932 emit_insn (gen_lshrv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6934 emit_insn (gen_ashlv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6936 emit_insn (gen_iorv2di3 (tmp3
, tmp5
, tmp6
));
6939 /* Convert the result back to V1TImode and store in operands[0]. */
6940 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6941 emit_move_insn (operands
[0], tmp7
);
6944 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6946 ix86_expand_v1ti_rotate (enum rtx_code code
, rtx operands
[])
6948 rtx op1
= force_reg (V1TImode
, operands
[1]);
6950 if (!CONST_INT_P (operands
[2]))
6952 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6953 rtx tmp2
= gen_reg_rtx (TImode
);
6954 rtx (*rotate
) (rtx
, rtx
, rtx
)
6955 = (code
== ROTATE
) ? gen_rotlti3
: gen_rotrti3
;
6956 emit_insn (rotate (tmp2
, tmp1
, operands
[2]));
6957 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6958 emit_move_insn (operands
[0], tmp3
);
6962 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6966 emit_move_insn (operands
[0], op1
);
6970 if (code
== ROTATERT
)
6973 if ((bits
& 31) == 0)
6975 rtx tmp2
= gen_reg_rtx (V4SImode
);
6976 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6978 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x93)));
6979 else if (bits
== 64)
6980 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x4e)));
6982 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x39)));
6983 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp2
));
6987 if ((bits
& 7) == 0)
6989 rtx tmp1
= gen_reg_rtx (V1TImode
);
6990 rtx tmp2
= gen_reg_rtx (V1TImode
);
6991 rtx tmp3
= gen_reg_rtx (V1TImode
);
6993 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (bits
)));
6994 emit_insn (gen_sse2_lshrv1ti3 (tmp2
, op1
, GEN_INT (128 - bits
)));
6995 emit_insn (gen_iorv1ti3 (tmp3
, tmp1
, tmp2
));
6996 emit_move_insn (operands
[0], tmp3
);
7000 rtx op1_v4si
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7009 hibits
= gen_reg_rtx (V4SImode
);
7010 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x93)));
7014 lobits
= gen_reg_rtx (V4SImode
);
7015 hibits
= gen_reg_rtx (V4SImode
);
7016 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x93)));
7017 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x4e)));
7021 lobits
= gen_reg_rtx (V4SImode
);
7022 hibits
= gen_reg_rtx (V4SImode
);
7023 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x4e)));
7024 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x39)));
7028 lobits
= gen_reg_rtx (V4SImode
);
7029 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x39)));
7034 rtx tmp1
= gen_reg_rtx (V4SImode
);
7035 rtx tmp2
= gen_reg_rtx (V4SImode
);
7036 rtx tmp3
= gen_reg_rtx (V4SImode
);
7038 emit_insn (gen_ashlv4si3 (tmp1
, lobits
, GEN_INT (bits
& 31)));
7039 emit_insn (gen_lshrv4si3 (tmp2
, hibits
, GEN_INT (32 - (bits
& 31))));
7040 emit_insn (gen_iorv4si3 (tmp3
, tmp1
, tmp2
));
7042 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
7045 /* Expand V1TI mode ashiftrt by constant. */
7047 ix86_expand_v1ti_ashiftrt (rtx operands
[])
7049 rtx op1
= force_reg (V1TImode
, operands
[1]);
7051 if (!CONST_INT_P (operands
[2]))
7053 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
7054 rtx tmp2
= gen_reg_rtx (TImode
);
7055 emit_insn (gen_ashrti3 (tmp2
, tmp1
, operands
[2]));
7056 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
7057 emit_move_insn (operands
[0], tmp3
);
7061 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
7065 emit_move_insn (operands
[0], op1
);
7071 /* Two operations. */
7072 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
7073 rtx tmp2
= gen_reg_rtx (V4SImode
);
7074 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7076 rtx tmp3
= gen_reg_rtx (V4SImode
);
7077 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7079 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
7085 /* Three operations. */
7086 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
7087 rtx tmp2
= gen_reg_rtx (V4SImode
);
7088 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7090 rtx tmp3
= gen_reg_rtx (V4SImode
);
7091 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7093 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
7094 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7095 rtx tmp6
= gen_reg_rtx (V2DImode
);
7096 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
7098 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
7104 /* Three operations. */
7105 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
7106 rtx tmp2
= gen_reg_rtx (V4SImode
);
7107 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
7109 rtx tmp3
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
7110 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
7111 rtx tmp5
= gen_reg_rtx (V2DImode
);
7112 emit_insn (gen_vec_interleave_highv2di (tmp5
, tmp3
, tmp4
));
7114 rtx tmp6
= force_reg(V4SImode
, gen_lowpart (V4SImode
, tmp5
));
7115 rtx tmp7
= gen_reg_rtx (V4SImode
);
7116 emit_insn (gen_sse2_pshufd (tmp7
, tmp6
, GEN_INT (0xfd)));
7118 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
      /* Three operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp4 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));

      rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (TARGET_AVX2 || TARGET_SSE4_1)
    {
      /* Three operations.  */
      if (bits == 32)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}

      /* Three operations.  */
      if (bits == 8 || bits == 16 || bits == 24)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}
    }

  if (bits > 96)
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
      rtx tmp8 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
      return;
    }

  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
      rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
      rtx tmp7 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
				     GEN_INT (bits == 48 ? 0x1f : 0x07)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Five operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
      return;
    }

  if (TARGET_AVX2 && bits < 32)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
      rtx tmp10 = gen_reg_rtx (V4SImode);
      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
      return;
    }

  if (TARGET_SSE4_1 && bits < 15)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
      rtx tmp11 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
      return;
    }

  if (bits == 1)
    {
      /* Eight operations.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

      rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp3 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));

      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));

      rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
      rtx tmp9 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));

      rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
      rtx tmp11 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));

      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
      return;
    }

  if (bits > 64)
    {
      /* Eight operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));

      rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp8 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));

      rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp10 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));

      rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));

      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    }
  else
    {
      /* Nine operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));

      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));

      rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp11 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));

      rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));

      rtx tmp14 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    }
}
/* Replace all occurrences of REG FROM with REG TO in X, including
   occurrences with different modes.  */

rtx
ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
{
  gcc_checking_assert (REG_P (from)
		       && REG_P (to)
		       && GET_MODE (from) == GET_MODE (to));
  if (!reg_overlap_mentioned_p (from, x))
    return x;

  rtx ret = copy_rtx (x);
  subrtx_ptr_iterator::array_type array;
  FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
    {
      rtx *loc = *iter;
      x = *loc;
      if (REG_P (x) && REGNO (x) == REGNO (from))
	{
	  if (x == from)
	    *loc = to;
	  else
	    {
	      gcc_checking_assert (REG_NREGS (x) == 1);
	      *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
	    }
	}
    }
  return ret;
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of the chunk size moved at
   once.  SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
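/* Illustrative sketch (not emitted literally): for a copy with MODE == DImode
   and UNROLL == 2 the code below generates roughly

       size = count & ~15;
       iter = 0;
     top:
       dest[iter] = src[iter];
       dest[iter + 8] = src[iter + 8];
       iter += 16;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;

   with branch probabilities attached via predict_jump.  */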
static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
	predict_jump (REG_BR_PROB_BASE
		      - (REG_BR_PROB_BASE + expected_size / 2)
			/ expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
/* Divide COUNTREG by SCALE.  */
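/* For example, a rep_prefix_8_byte expansion passes SCALE == 8, so a
   register count is divided by emitting a single logical shift right by 3;
   SCALE is always a mode size here, hence a power of two.  (Illustrative
   note.)  */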
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size and
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
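/* As an illustration (assumed instruction selection; the insn patterns are
   authoritative): a zeroing memset of COUNT bytes with MODE == SImode comes
   out roughly as

       mov  ecx, count / 4        ; scale_counter
       mov  eax, 0                ; promoted VALUE
       rep  stosd

   and DESTEXP below describes the final destination pointer,
   destptr + (countreg << 2), for the MEM bookkeeping.  */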
static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
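/* Example (illustrative): emit_memmov (dst, &src, destptr, srcptr, 16)
   without a usable 16-byte vector mode falls back to word_mode and emits
   two 8-byte load/store pairs through a temporary register, bumping
   DESTPTR and SRCPTR by 8 after each pair.  */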
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If true, jump to the label.  */
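/* E.g. ix86_expand_aligntest (count, 4, true) emits, roughly,
       test $4, count
       jz   .Lskip
   so the caller can place a 4-byte move before the returned label that
   executes only when bit 2 of COUNT is set.  (Illustrative note.)  */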
static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE
   are ignored.
   Return value is updated DESTMEM.  */
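/* For instance, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop below
   emits three conditional copies (1, 2 and 4 bytes), each guarded by an
   ix86_expand_aligntest on the corresponding low bit of DESTPTR, so that
   afterwards DESTPTR is 8-byte aligned and COUNT has been decreased by the
   number of bytes consumed.  (Illustrative summary.)  */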
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
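/* Worked example (illustrative): with SIZE == 4 and COUNT == 6 the code
   below copies bytes [0,4) and bytes [COUNT-4,COUNT) == [2,6); the two
   ranges overlap harmlessly and together cover the whole block, which is
   what makes a single sequence valid for any length in SIZE..2*SIZE-1.  */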
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle a small memcpy (up to SIZE, which is supposed to be a small power
   of 2), and get ready for the main copy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for
   new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Chose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    GET_MODE_SIZE (mode));
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       GET_MODE_SIZE (mode));
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       NULL_RTX, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
						  * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		  == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  /* It is not possible to use a library call if we have non-default
     address space.  We can do better than the generic byte-at-a-time
     loop, used as a fallback.  */
  if (alg == libcall && have_as)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
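/* As a concrete (illustrative) reading of the cost tables consulted below:
   an entry of the form
       {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}
   means "use rep movsl up to 256 bytes, otherwise call the library"; the
   loops below scan these {max, alg, noalign} triples for the first entry
   whose MAX covers the expected size.  */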
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }

  /* Try to use some reasonable fallback algorithm.  Note that for
     non-default address spaces we default to a loop instead of
     a libcall.  */
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : have_as ? loop : libcall);
}
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return align;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
     copying whole cacheline at once.  */
  if (TARGET_CPU_P (PENTIUMPRO)
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
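/* Worked example (illustrative): promoting VAL == 0xAB to DImode yields
   0xAB * 0x0101010101010101 == 0xABABABABABABABAB; the shift/or fallback
   below computes the same value as
       reg |= reg << 8;  reg |= reg << 16;  reg |= reg << 32;
   i.e. three shift+or pairs instead of a multiply.  */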
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by the epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	   needed by single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
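/* Rough shape of the aligned sequence for a memset, as an illustration only:

     if (count < epilogue_size_needed) goto epilogue;            // 1) guard
     while (dest & (desired_align - 1)) { *dest++ = val; count--; }   // 2)
     for (; count >= size_needed; count -= size_needed)          // 3) main body
       store size_needed bytes at dest;
   epilogue:
     store the remaining count & (epilogue_size_needed - 1) bytes;   // 4)
*/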
8928 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
8929 rtx align_exp
, rtx expected_align_exp
,
8930 rtx expected_size_exp
, rtx min_size_exp
,
8931 rtx max_size_exp
, rtx probable_max_size_exp
,
8936 rtx_code_label
*label
= NULL
;
8938 rtx_code_label
*jump_around_label
= NULL
;
8939 HOST_WIDE_INT align
= 1;
8940 unsigned HOST_WIDE_INT count
= 0;
8941 HOST_WIDE_INT expected_size
= -1;
8942 int size_needed
= 0, epilogue_size_needed
;
8943 int desired_align
= 0, align_bytes
= 0;
8944 enum stringop_alg alg
;
8945 rtx promoted_val
= NULL
;
8946 rtx vec_promoted_val
= NULL
;
8947 bool force_loopy_epilogue
= false;
8949 bool need_zero_guard
= false;
8951 machine_mode move_mode
= VOIDmode
;
8952 machine_mode wider_mode
;
8953 int unroll_factor
= 1;
8954 /* TODO: Once value ranges are available, fill in proper data. */
8955 unsigned HOST_WIDE_INT min_size
= 0;
8956 unsigned HOST_WIDE_INT max_size
= -1;
8957 unsigned HOST_WIDE_INT probable_max_size
= -1;
8958 bool misaligned_prologue_used
= false;
8961 if (CONST_INT_P (align_exp
))
8962 align
= INTVAL (align_exp
);
8963 /* i386 can do misaligned access on reasonably increased cost. */
8964 if (CONST_INT_P (expected_align_exp
)
8965 && INTVAL (expected_align_exp
) > align
)
8966 align
= INTVAL (expected_align_exp
);
8967 /* ALIGN is the minimum of destination and source alignment, but we care here
8968 just about destination alignment. */
8970 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
8971 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
8973 if (CONST_INT_P (count_exp
))
8975 min_size
= max_size
= probable_max_size
= count
= expected_size
8976 = INTVAL (count_exp
);
8977 /* When COUNT is 0, there is nothing to do. */
8984 min_size
= INTVAL (min_size_exp
);
8986 max_size
= INTVAL (max_size_exp
);
8987 if (probable_max_size_exp
)
8988 probable_max_size
= INTVAL (probable_max_size_exp
);
8989 if (CONST_INT_P (expected_size_exp
))
8990 expected_size
= INTVAL (expected_size_exp
);
8993 /* Make sure we don't need to care about overflow later on. */
8994 if (count
> (HOST_WIDE_INT_1U
<< 30))
8997 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
8999 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
9001 /* Step 0: Decide on preferred algorithm, desired alignment and
9002 size of chunks to be copied by main loop. */
9003 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
9005 issetmem
&& val_exp
== const0_rtx
, have_as
,
9006 &dynamic_check
, &noalign
, false);
9009 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
9010 stringop_alg_names
[alg
]);
9014 gcc_assert (alg
!= no_stringop
);
9016 /* For now vector-version of memset is generated only for memory zeroing, as
9017 creating of promoted vector value is very cheap in this case. */
9018 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
9019 alg
= unrolled_loop
;
9022 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
9023 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
9025 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
9028 move_mode
= word_mode
;
9036 need_zero_guard
= true;
9040 need_zero_guard
= true;
9043 need_zero_guard
= true;
9044 unroll_factor
= (TARGET_64BIT
? 4 : 2);
9047 need_zero_guard
= true;
9049 /* Find the widest supported mode. */
9050 move_mode
= word_mode
;
9051 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
9052 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
9053 move_mode
= wider_mode
;
9055 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
9057 if (TARGET_AVX512_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 256)
9060 /* Find the corresponding vector mode with the same size as MOVE_MODE.
9061 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
9062 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
9064 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
9065 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
9066 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
9067 move_mode
= word_mode
;
9069 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
9071 case rep_prefix_8_byte
:
9074 case rep_prefix_4_byte
:
9077 case rep_prefix_1_byte
:
9081 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
9082 epilogue_size_needed
= size_needed
;
9084 /* If we are going to call any library calls conditionally, make sure any
9085 pending stack adjustment happen before the first conditional branch,
9086 otherwise they will be emitted before the library call only and won't
9087 happen from the other branches. */
9088 if (dynamic_check
!= -1)
9089 do_pending_stack_adjust ();
9091 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
9092 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
9093 align
= desired_align
;
9095 /* Step 1: Prologue guard. */
9097 /* Alignment code needs count to be in register. */
9098 if (CONST_INT_P (count_exp
) && desired_align
> align
)
9100 if (INTVAL (count_exp
) > desired_align
9101 && INTVAL (count_exp
) > size_needed
)
9104 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
9105 if (align_bytes
<= 0)
9108 align_bytes
= desired_align
- align_bytes
;
9110 if (align_bytes
== 0)
9111 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
9113 gcc_assert (desired_align
>= 1 && align
>= 1);
9115 /* Misaligned move sequences handle both prologue and epilogue at once.
9116 Default code generation results in a smaller code for large alignments
9117 and also avoids redundant job when sizes are known precisely. */
9118 misaligned_prologue_used
9119 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
9120 && MAX (desired_align
, epilogue_size_needed
) <= 32
9121 && desired_align
<= epilogue_size_needed
9122 && ((desired_align
> align
&& !align_bytes
)
9123 || (!count
&& epilogue_size_needed
> 1)));
9125 /* Do the cheap promotion to allow better CSE across the
9126 main loop and epilogue (ie one load of the big constant in the
9128 For now the misaligned move sequences do not have fast path
9129 without broadcasting. */
9130 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
9132 if (alg
== vector_loop
)
9134 gcc_assert (val_exp
== const0_rtx
);
9135 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
9136 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
9137 GET_MODE_SIZE (word_mode
),
9138 desired_align
, align
);
9142 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
9143 desired_align
, align
);
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant work when sizes are known precisely.  */
9149 if (misaligned_prologue_used
)
      /* The misaligned move prologue handles small blocks by itself.  */
9152 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
9153 (dst
, src
, &destreg
, &srcreg
,
9154 move_mode
, promoted_val
, vec_promoted_val
,
9157 desired_align
< align
9158 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
9159 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
9161 src
= change_address (src
, BLKmode
, srcreg
);
9162 dst
= change_address (dst
, BLKmode
, destreg
);
9163 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
9164 epilogue_size_needed
= 0;
9166 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
      /* It is possible that we copied enough so the main loop will not
	 execute.  */
      gcc_assert (size_needed > 1);
9171 if (jump_around_label
== NULL_RTX
)
9172 jump_around_label
= gen_label_rtx ();
9173 emit_cmp_and_jump_insns (count_exp
,
9174 GEN_INT (size_needed
),
9175 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
9176 if (expected_size
== -1
9177 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
9178 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
9180 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
  /* Ensure that the alignment prologue won't copy past the end of the
     block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    {
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* The epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is a power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
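      /* Illustrative note (added here, not part of the original sources):
	 the statement above rounds EPILOGUE_SIZE_NEEDED up to the next
	 power of two strictly greater than its current value, e.g.

	     5  -> 1 << (2 + 1) = 8
	     15 -> 1 << (3 + 1) = 16
	     16 -> 1 << (4 + 1) = 32

	 so the epilogue always works with a power-of-two byte count.  */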
      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use a byte
	 loop instead.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
9197 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
9198 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
	  /* If the main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
9208 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
9210 label
= gen_label_rtx ();
9211 emit_cmp_and_jump_insns (count_exp
,
9212 GEN_INT (epilogue_size_needed
),
9213 LTU
, 0, counter_mode (count_exp
), 1, label
);
9214 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
9215 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
9217 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
  /* Emit code to decide at runtime whether a library call or inline code
     should be used.  */
9223 if (dynamic_check
!= -1)
9225 if (!issetmem
&& CONST_INT_P (count_exp
))
9227 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
9229 emit_block_copy_via_libcall (dst
, src
, count_exp
);
9230 count_exp
= const0_rtx
;
9236 rtx_code_label
*hot_label
= gen_label_rtx ();
9237 if (jump_around_label
== NULL_RTX
)
9238 jump_around_label
= gen_label_rtx ();
9239 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
9240 LEU
, 0, counter_mode (count_exp
),
9242 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
9244 set_storage_via_libcall (dst
, count_exp
, val_exp
);
9246 emit_block_copy_via_libcall (dst
, src
, count_exp
);
9247 emit_jump (jump_around_label
);
9248 emit_label (hot_label
);
9252 /* Step 2: Alignment prologue. */
9253 /* Do the expensive promotion once we branched off the small blocks. */
9254 if (issetmem
&& !promoted_val
)
9255 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
9256 desired_align
, align
);
9258 if (desired_align
> align
&& !misaligned_prologue_used
)
9260 if (align_bytes
== 0)
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw it
	     away early.  */
	  dst = change_address (dst, BLKmode, destreg);
9268 src
= change_address (src
, BLKmode
, srcreg
);
9269 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
9270 promoted_val
, vec_promoted_val
,
9271 count_exp
, align
, desired_align
,
9273 /* At most desired_align - align bytes are copied. */
9274 if (min_size
< (unsigned)(desired_align
- align
))
9277 min_size
-= desired_align
- align
;
9281 /* If we know how many bytes need to be stored before dst is
9282 sufficiently aligned, maintain aliasing info accurately. */
9283 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
9291 count_exp
= plus_constant (counter_mode (count_exp
),
9292 count_exp
, -align_bytes
);
9293 count
-= align_bytes
;
9294 min_size
-= align_bytes
;
9295 max_size
-= align_bytes
;
9298 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
9299 && (count
< (unsigned HOST_WIDE_INT
) size_needed
9300 || (align_bytes
== 0
9301 && count
< ((unsigned HOST_WIDE_INT
) size_needed
9302 + desired_align
- align
))))
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
9307 if (label
== NULL_RTX
)
9308 label
= gen_label_rtx ();
9309 emit_cmp_and_jump_insns (count_exp
,
9310 GEN_INT (size_needed
),
9311 LTU
, 0, counter_mode (count_exp
), 1, label
);
9312 if (expected_size
== -1
9313 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
9314 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
9316 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
9319 if (label
&& size_needed
== 1)
9322 LABEL_NUSES (label
) = 1;
9324 epilogue_size_needed
= 1;
9326 promoted_val
= val_exp
;
9328 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
9329 epilogue_size_needed
= size_needed
;
9331 /* Step 3: Main loop. */
9342 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
9343 count_exp
, move_mode
, unroll_factor
,
9344 expected_size
, issetmem
);
9347 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
9348 vec_promoted_val
, count_exp
, move_mode
,
9349 unroll_factor
, expected_size
, issetmem
);
9351 case rep_prefix_8_byte
:
9352 case rep_prefix_4_byte
:
9353 case rep_prefix_1_byte
:
9354 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
9355 val_exp
, count_exp
, move_mode
, issetmem
);
  /* Properly adjust the offsets of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
9362 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
9363 (count
/ size_needed
) * size_needed
);
9364 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
9365 (count
/ size_needed
) * size_needed
);
9370 src
= change_address (src
, BLKmode
, srcreg
);
9371 dst
= change_address (dst
, BLKmode
, destreg
);
  /* Step 4: Epilogue to copy the remaining bytes.  */

  /* When the main loop is done, COUNT_EXP might hold the original count,
     while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
     Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
     bytes.  Compensate if needed.  */
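  /* Worked example (added for illustration): with count = 100 and
     size_needed = 16 the main loop handles (100 / 16) * 16 = 96 bytes,
     so the masking below leaves count_exp = 100 & (16 - 1) = 4 bytes
     for the epilogue to finish.  */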
9383 if (size_needed
< epilogue_size_needed
)
9385 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
9386 GEN_INT (size_needed
- 1), count_exp
, 1,
9388 if (tmp
!= count_exp
)
9389 emit_move_insn (count_exp
, tmp
);
9392 LABEL_NUSES (label
) = 1;
9395 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
9397 if (force_loopy_epilogue
)
9398 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
9399 epilogue_size_needed
);
9403 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
9404 vec_promoted_val
, count_exp
,
9405 epilogue_size_needed
);
9407 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
9408 epilogue_size_needed
);
9411 if (jump_around_label
)
9412 emit_label (jump_around_label
);
9416 /* Expand cmpstrn or memcmp. */
9419 ix86_expand_cmpstrn_or_cmpmem (rtx result
, rtx src1
, rtx src2
,
9420 rtx length
, rtx align
, bool is_cmpstrn
)
9422 /* Expand strncmp and memcmp only with -minline-all-stringops since
9423 "repz cmpsb" can be much slower than strncmp and memcmp functions
9424 implemented with vector instructions, see
9426 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9428 if (!TARGET_INLINE_ALL_STRINGOPS
)
9431 /* Can't use this if the user has appropriated ecx, esi or edi. */
9432 if (fixed_regs
[CX_REG
] || fixed_regs
[SI_REG
] || fixed_regs
[DI_REG
])
9437 /* For strncmp, length is the maximum length, which can be larger
9438 than actual string lengths. We can expand the cmpstrn pattern
9439 to "repz cmpsb" only if one of the strings is a constant so
9440 that expand_builtin_strncmp() can write the length argument to
9441 be the minimum of the const string length and the actual length
9442 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
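  /* Illustrative example (added; assumes a typical call pattern): for
     strncmp (buf, "hi", 100) the second operand is a constant string, so
     expand_builtin_strncmp () can bound the length argument by the constant
     string's length and "repz cmpsb" never compares past the terminating
     NUL.  With two non-constant strings no such bound exists, which is why
     the check below rejects that case.  */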
9443 tree t1
= MEM_EXPR (src1
);
9444 tree t2
= MEM_EXPR (src2
);
9445 if (!((t1
&& TREE_CODE (t1
) == MEM_REF
9446 && TREE_CODE (TREE_OPERAND (t1
, 0)) == ADDR_EXPR
9447 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1
, 0), 0))
9449 || (t2
&& TREE_CODE (t2
) == MEM_REF
9450 && TREE_CODE (TREE_OPERAND (t2
, 0)) == ADDR_EXPR
9451 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2
, 0), 0))
9456 rtx addr1
= copy_addr_to_reg (XEXP (src1
, 0));
9457 rtx addr2
= copy_addr_to_reg (XEXP (src2
, 0));
9458 if (addr1
!= XEXP (src1
, 0))
9459 src1
= replace_equiv_address_nv (src1
, addr1
);
9460 if (addr2
!= XEXP (src2
, 0))
9461 src2
= replace_equiv_address_nv (src2
, addr2
);
9463 /* NB: Make a copy of the data length to avoid changing the original
9464 data length by cmpstrnqi patterns. */
9465 length
= ix86_zero_extend_to_Pmode (length
);
9466 rtx lengthreg
= gen_reg_rtx (Pmode
);
9467 emit_move_insn (lengthreg
, length
);
9469 /* If we are testing strict equality, we can use known alignment to
9470 good advantage. This may be possible with combine, particularly
9471 once cc0 is dead. */
9472 if (CONST_INT_P (length
))
9474 if (length
== const0_rtx
)
9476 emit_move_insn (result
, const0_rtx
);
9479 emit_insn (gen_cmpstrnqi_nz_1 (addr1
, addr2
, lengthreg
, align
,
9484 emit_insn (gen_cmp_1 (Pmode
, lengthreg
, lengthreg
));
9485 emit_insn (gen_cmpstrnqi_1 (addr1
, addr2
, lengthreg
, align
,
9489 rtx out
= gen_lowpart (QImode
, result
);
9490 emit_insn (gen_cmpintqi (out
));
9491 emit_move_insn (result
, gen_rtx_SIGN_EXTEND (SImode
, out
));
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */
9508 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
9512 rtx_code_label
*align_2_label
= NULL
;
9513 rtx_code_label
*align_3_label
= NULL
;
9514 rtx_code_label
*align_4_label
= gen_label_rtx ();
9515 rtx_code_label
*end_0_label
= gen_label_rtx ();
9517 rtx tmpreg
= gen_reg_rtx (SImode
);
9518 rtx scratch
= gen_reg_rtx (SImode
);
9522 if (CONST_INT_P (align_rtx
))
9523 align
= INTVAL (align_rtx
);
9525 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9527 /* Is there a known alignment and is it less than 4? */
9530 rtx scratch1
= gen_reg_rtx (Pmode
);
9531 emit_move_insn (scratch1
, out
);
9532 /* Is there a known alignment and is it not 2? */
9535 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
9536 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
9538 /* Leave just the 3 lower bits. */
9539 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
9540 NULL_RTX
, 0, OPTAB_WIDEN
);
9542 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9543 Pmode
, 1, align_4_label
);
9544 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
9545 Pmode
, 1, align_2_label
);
9546 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
9547 Pmode
, 1, align_3_label
);
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if it is aligned to 4 bytes.  */
9554 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
9555 NULL_RTX
, 0, OPTAB_WIDEN
);
9557 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9558 Pmode
, 1, align_4_label
);
9561 mem
= change_address (src
, QImode
, out
);
9563 /* Now compare the bytes. */
      /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
9566 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
9567 QImode
, 1, end_0_label
);
9569 /* Increment the address. */
9570 emit_insn (gen_add2_insn (out
, const1_rtx
));
9572 /* Not needed with an alignment of 2 */
9575 emit_label (align_2_label
);
9577 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9580 emit_insn (gen_add2_insn (out
, const1_rtx
));
9582 emit_label (align_3_label
);
9585 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9588 emit_insn (gen_add2_insn (out
, const1_rtx
));
  /* Generate the loop to check 4 bytes at a time.  It is not a good idea
     to align this loop: it only makes the program bigger and does not
     help to speed it up.  */
9594 emit_label (align_4_label
);
9596 mem
= change_address (src
, SImode
, out
);
9597 emit_move_insn (scratch
, mem
);
9598 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside the loop and many cycles.  */
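  /* Worked example (added for illustration): the insns below compute
     (x - 0x01010101) & ~x & 0x80808080.  For x = 0x11003344, whose
     second-highest byte is zero:

	 x - 0x01010101 = 0x0FFF3243
	 ~x             = 0xEEFFCCBB
	 and of both    = 0x0EFF0003
	 & 0x80808080   = 0x00800000   (nonzero: a zero byte was found)

     while a word with no zero byte always yields 0.  */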
  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
9608 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
9613 rtx reg
= gen_reg_rtx (SImode
);
9614 rtx reg2
= gen_reg_rtx (Pmode
);
9615 emit_move_insn (reg
, tmpreg
);
9616 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
9618 /* If zero is not in the first two bytes, move two bytes forward. */
9619 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9620 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9621 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9622 emit_insn (gen_rtx_SET (tmpreg
,
9623 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
9626 /* Emit lea manually to avoid clobbering of flags. */
9627 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
9629 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9630 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9631 emit_insn (gen_rtx_SET (out
,
9632 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
9638 rtx_code_label
*end_2_label
= gen_label_rtx ();
9639 /* Is zero in the first two bytes? */
9641 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9642 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9643 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
9644 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
9645 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
9647 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
9648 JUMP_LABEL (tmp
) = end_2_label
;
9650 /* Not in the first two. Move two bytes forward. */
9651 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
9652 emit_insn (gen_add2_insn (out
, const2_rtx
));
9654 emit_label (end_2_label
);
9658 /* Avoid branch in fixing the byte. */
9659 tmpreg
= gen_lowpart (QImode
, tmpreg
);
9660 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
9661 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
9662 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
9663 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
9665 emit_label (end_0_label
);
9668 /* Expand strlen. */
9671 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
9673 if (TARGET_UNROLL_STRLEN
9674 && TARGET_INLINE_ALL_STRINGOPS
9675 && eoschar
== const0_rtx
      /* The generic case of the strlen expander is long.  Avoid expanding
	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
9680 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
      /* Well, it seems that some optimizer does not combine a call like
	 foo (strlen (bar), strlen (bar));
	 when the move and the subtraction are done here.  It does calculate
	 the length just once when these instructions are done inside of
	 output_strlen_unroll ().  But I think that since &bar[strlen (bar)] is
	 often used and I use one fewer register for the lifetime of
	 output_strlen_unroll (), this is better.  */
9689 emit_move_insn (out
, addr
);
9691 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
9693 /* strlensi_unroll_1 returns the address of the zero at the end of
9694 the string, like memchr(), so compute the length by subtracting
9695 the start address. */
9696 emit_insn (gen_sub2_insn (out
, addr
));
/* For a given symbol (function), construct code to compute the address of
   its PLT entry in the large x86-64 PIC model.  */
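/* Illustrative sketch (added; the exact assembly is an assumption): the
   function below builds the RTL

	tmp = const (unspec [symbol] UNSPEC_PLTOFF)
	tmp = tmp + pic_offset_table_rtx

   i.e. the PLT entry address is the GOT base plus the 64-bit symbol@PLTOFF
   offset, which typically ends up as a movabs of the offset followed by an
   add of the GOT-base register.  */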
static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
9723 /* Additional registers that are clobbered by SYSV calls. */
9725 static int const x86_64_ms_sysv_extra_clobbered_registers
9726 [NUM_X86_64_MS_CLOBBERED_REGS
] =
9730 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
9731 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
9735 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
9737 rtx pop
, bool sibcall
)
9740 rtx use
= NULL
, call
;
9741 unsigned int vec_len
= 0;
9743 bool call_no_callee_saved_registers
= false;
9745 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9747 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
9750 if (lookup_attribute ("interrupt",
9751 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))
9752 error ("interrupt service routine cannot be called directly");
9753 else if (lookup_attribute ("no_callee_saved_registers",
9754 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))
9755 call_no_callee_saved_registers
= true;
9762 tree mem_expr
= MEM_EXPR (fnaddr
);
9763 if (mem_expr
!= nullptr
9764 && TREE_CODE (mem_expr
) == MEM_REF
9765 && lookup_attribute ("no_callee_saved_registers",
9766 TYPE_ATTRIBUTES (TREE_TYPE (mem_expr
))))
9767 call_no_callee_saved_registers
= true;
9773 if (pop
== const0_rtx
)
9775 gcc_assert (!TARGET_64BIT
|| !pop
);
9777 rtx addr
= XEXP (fnaddr
, 0);
9778 if (TARGET_MACHO
&& !TARGET_64BIT
)
9781 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9782 fnaddr
= machopic_indirect_call_target (fnaddr
);
9787 /* Static functions and indirect calls don't need the pic register. Also,
9788 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9789 it an indirect call. */
9791 && GET_CODE (addr
) == SYMBOL_REF
9792 && ix86_call_use_plt_p (addr
))
9795 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
9796 || !lookup_attribute ("noplt",
9797 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
9800 || (ix86_cmodel
== CM_LARGE_PIC
9801 && DEFAULT_ABI
!= MS_ABI
))
9803 use_reg (&use
, gen_rtx_REG (Pmode
,
9804 REAL_PIC_OFFSET_TABLE_REGNUM
));
9805 if (ix86_use_pseudo_pic_reg ())
9806 emit_move_insn (gen_rtx_REG (Pmode
,
9807 REAL_PIC_OFFSET_TABLE_REGNUM
),
9808 pic_offset_table_rtx
);
9811 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
9814 && ix86_cmodel
== CM_LARGE_PIC
9815 && DEFAULT_ABI
!= MS_ABI
)
9817 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9819 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9820 fnaddr
= force_reg (Pmode
, fnaddr
);
9821 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
9823 else if (TARGET_64BIT
)
9825 fnaddr
= gen_rtx_UNSPEC (Pmode
,
9826 gen_rtvec (1, addr
),
9828 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9832 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9834 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9835 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
9838 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
9839 /* Pmode may not be the same as word_mode for x32, which
9840 doesn't support indirect branch via 32-bit memory slot.
9841 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9842 indirect branch via x32 GOT slot is OK. */
9843 if (GET_MODE (fnaddr
) != word_mode
)
9844 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
9845 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
9850 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9851 parameters passed in vector registers. */
9853 && (INTVAL (callarg2
) > 0
9854 || (INTVAL (callarg2
) == 0
9855 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
9857 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
9858 emit_move_insn (al
, callarg2
);
9862 if (ix86_cmodel
== CM_LARGE_PIC
9865 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
9866 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
9867 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
9868 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9869 branch via x32 GOT slot is OK. */
9870 else if (!(TARGET_X32
9872 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
9873 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
9875 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
9876 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
9878 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
9879 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
  /* PR100665: Hwasan may tag a code pointer, which is not supported by LAM;
     mask off code pointers here.
     TODO: also need to handle indirect jumps.  */
9885 if (ix86_memtag_can_tag_addresses () && !fndecl
9886 && sanitize_flags_p (SANITIZE_HWADDRESS
))
9888 rtx untagged_addr
= ix86_memtag_untagged_pointer (XEXP (fnaddr
, 0),
9890 fnaddr
= gen_rtx_MEM (QImode
, untagged_addr
);
9893 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
9896 call
= gen_rtx_SET (retval
, call
);
9897 vec
[vec_len
++] = call
;
9901 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
9902 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
9903 vec
[vec_len
++] = pop
;
9906 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
9908 if ((cfun
->machine
->call_saved_registers
9909 == TYPE_NO_CALLER_SAVED_REGISTERS
)
9911 || (!TREE_THIS_VOLATILE (fndecl
)
9912 && !lookup_attribute ("no_caller_saved_registers",
9913 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
9915 bool is_64bit_ms_abi
= (TARGET_64BIT
9916 && ix86_function_abi (fndecl
) == MS_ABI
);
9917 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
9919 /* If there are no caller-saved registers, add all registers
9920 that are clobbered by the call which returns. */
9921 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
9923 && (ix86_call_used_regs
[i
] == 1
9924 || (ix86_call_used_regs
[i
] & c_mask
))
9925 && !STACK_REGNO_P (i
)
9926 && !MMX_REGNO_P (i
))
9928 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
9930 else if (TARGET_64BIT_MS_ABI
9931 && (!callarg2
|| INTVAL (callarg2
) != -2))
9935 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
9937 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
9938 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
9940 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
9943 /* Set here, but it may get cleared later. */
9944 if (TARGET_CALL_MS2SYSV_XLOGUES
)
9949 /* Don't break hot-patched functions. */
9950 else if (ix86_function_ms_hook_prologue (current_function_decl
))
9953 /* TODO: Cases not yet examined. */
9954 else if (flag_split_stack
)
9955 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9959 gcc_assert (!reload_completed
);
9960 cfun
->machine
->call_ms2sysv
= true;
9965 if (TARGET_MACHO
&& TARGET_64BIT
&& !sibcall
9966 && ((GET_CODE (addr
) == SYMBOL_REF
&& !SYMBOL_REF_LOCAL_P (addr
))
9967 || !fndecl
|| TREE_PUBLIC (fndecl
)))
9969 /* We allow public functions defined in a TU to bind locally for PIC
9970 code (the default) on 64bit Mach-O.
9971 If such functions are not inlined, we cannot tell at compile-time if
9972 they will be called via the lazy symbol resolver (this can depend on
9973 options given at link-time). Therefore, we must assume that the lazy
9974 resolver could be used which clobbers R11 and R10. */
9975 clobber_reg (&use
, gen_rtx_REG (DImode
, R11_REG
));
9976 clobber_reg (&use
, gen_rtx_REG (DImode
, R10_REG
));
9979 if (call_no_callee_saved_registers
)
9981 /* After calling a no_callee_saved_registers function, all
9982 registers may be clobbered. Clobber all registers that are
9983 not used by the callee. */
9984 bool is_64bit_ms_abi
= (TARGET_64BIT
9985 && ix86_function_abi (fndecl
) == MS_ABI
);
9986 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
9987 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
9989 && !(ix86_call_used_regs
[i
] == 1
9990 || (ix86_call_used_regs
[i
] & c_mask
))
9991 && !STACK_REGNO_P (i
)
9992 && !MMX_REGNO_P (i
))
9994 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
9998 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
9999 rtx_insn
*call_insn
= emit_call_insn (call
);
10001 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch with a stack adjustment.  */
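/* Rough shape of the emitted sequence (added illustration, not from the
   original comment): for a "ret $N" this split produces

	popl	%ecx		# return address -> ECX
	addl	$N, %esp	# drop the N bytes of stack arguments
	jmp	*%ecx		# return via an indirect branch

   with REG_CFA notes attached so the unwinder still sees a normal return.  */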
10010 ix86_split_simple_return_pop_internal (rtx popc
)
10012 struct machine_function
*m
= cfun
->machine
;
10013 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
10016 /* There is no "pascal" calling convention in any 64bit ABI. */
10017 gcc_assert (!TARGET_64BIT
);
10019 insn
= emit_insn (gen_pop (ecx
));
10020 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
10021 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
10023 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
10024 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
10025 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
10026 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
10027 RTX_FRAME_RELATED_P (insn
) = 1;
10029 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
10030 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
10031 insn
= emit_insn (x
);
10032 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
10033 RTX_FRAME_RELATED_P (insn
) = 1;
10035 /* Now return address is in ECX. */
10036 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
10051 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
10054 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
10057 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10058 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10059 rtx op0
= expand_normal (arg0
);
10060 rtx op1
= expand_normal (arg1
);
10061 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
10062 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
10063 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
10065 if (VECTOR_MODE_P (mode0
))
10066 op0
= safe_vector_operand (op0
, mode0
);
10067 if (VECTOR_MODE_P (mode1
))
10068 op1
= safe_vector_operand (op1
, mode1
);
10070 if (optimize
|| !target
10071 || GET_MODE (target
) != tmode
10072 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
10073 target
= gen_reg_rtx (tmode
);
10075 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
10077 rtx x
= gen_reg_rtx (V4SImode
);
10078 emit_insn (gen_sse2_loadd (x
, op1
));
10079 op1
= gen_lowpart (TImode
, x
);
10082 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
10083 op0
= copy_to_mode_reg (mode0
, op0
);
10084 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
10085 op1
= copy_to_mode_reg (mode1
, op1
);
10087 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
10096 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
10099 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
10100 enum ix86_builtin_func_type m_type
,
10101 enum rtx_code sub_code
)
10104 unsigned int i
, nargs
;
10105 bool comparison_p
= false;
10107 bool last_arg_constant
= false;
10108 int num_memory
= 0;
10111 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
10115 case MULTI_ARG_4_DF2_DI_I
:
10116 case MULTI_ARG_4_DF2_DI_I1
:
10117 case MULTI_ARG_4_SF2_SI_I
:
10118 case MULTI_ARG_4_SF2_SI_I1
:
10120 last_arg_constant
= true;
10123 case MULTI_ARG_3_SF
:
10124 case MULTI_ARG_3_DF
:
10125 case MULTI_ARG_3_SF2
:
10126 case MULTI_ARG_3_DF2
:
10127 case MULTI_ARG_3_DI
:
10128 case MULTI_ARG_3_SI
:
10129 case MULTI_ARG_3_SI_DI
:
10130 case MULTI_ARG_3_HI
:
10131 case MULTI_ARG_3_HI_SI
:
10132 case MULTI_ARG_3_QI
:
10133 case MULTI_ARG_3_DI2
:
10134 case MULTI_ARG_3_SI2
:
10135 case MULTI_ARG_3_HI2
:
10136 case MULTI_ARG_3_QI2
:
10140 case MULTI_ARG_2_SF
:
10141 case MULTI_ARG_2_DF
:
10142 case MULTI_ARG_2_DI
:
10143 case MULTI_ARG_2_SI
:
10144 case MULTI_ARG_2_HI
:
10145 case MULTI_ARG_2_QI
:
10149 case MULTI_ARG_2_DI_IMM
:
10150 case MULTI_ARG_2_SI_IMM
:
10151 case MULTI_ARG_2_HI_IMM
:
10152 case MULTI_ARG_2_QI_IMM
:
10154 last_arg_constant
= true;
10157 case MULTI_ARG_1_SF
:
10158 case MULTI_ARG_1_DF
:
10159 case MULTI_ARG_1_SF2
:
10160 case MULTI_ARG_1_DF2
:
10161 case MULTI_ARG_1_DI
:
10162 case MULTI_ARG_1_SI
:
10163 case MULTI_ARG_1_HI
:
10164 case MULTI_ARG_1_QI
:
10165 case MULTI_ARG_1_SI_DI
:
10166 case MULTI_ARG_1_HI_DI
:
10167 case MULTI_ARG_1_HI_SI
:
10168 case MULTI_ARG_1_QI_DI
:
10169 case MULTI_ARG_1_QI_SI
:
10170 case MULTI_ARG_1_QI_HI
:
10174 case MULTI_ARG_2_DI_CMP
:
10175 case MULTI_ARG_2_SI_CMP
:
10176 case MULTI_ARG_2_HI_CMP
:
10177 case MULTI_ARG_2_QI_CMP
:
10179 comparison_p
= true;
10182 case MULTI_ARG_2_SF_TF
:
10183 case MULTI_ARG_2_DF_TF
:
10184 case MULTI_ARG_2_DI_TF
:
10185 case MULTI_ARG_2_SI_TF
:
10186 case MULTI_ARG_2_HI_TF
:
10187 case MULTI_ARG_2_QI_TF
:
10193 gcc_unreachable ();
10196 if (optimize
|| !target
10197 || GET_MODE (target
) != tmode
10198 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
10199 target
= gen_reg_rtx (tmode
);
10200 else if (memory_operand (target
, tmode
))
10203 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
10205 for (i
= 0; i
< nargs
; i
++)
10207 tree arg
= CALL_EXPR_ARG (exp
, i
);
10208 rtx op
= expand_normal (arg
);
10209 int adjust
= (comparison_p
) ? 1 : 0;
10210 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
10212 if (last_arg_constant
&& i
== nargs
- 1)
10214 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
10216 enum insn_code new_icode
= icode
;
10219 case CODE_FOR_xop_vpermil2v2df3
:
10220 case CODE_FOR_xop_vpermil2v4sf3
:
10221 case CODE_FOR_xop_vpermil2v4df3
:
10222 case CODE_FOR_xop_vpermil2v8sf3
:
10223 error ("the last argument must be a 2-bit immediate");
10224 return gen_reg_rtx (tmode
);
10225 case CODE_FOR_xop_rotlv2di3
:
10226 new_icode
= CODE_FOR_rotlv2di3
;
10228 case CODE_FOR_xop_rotlv4si3
:
10229 new_icode
= CODE_FOR_rotlv4si3
;
10231 case CODE_FOR_xop_rotlv8hi3
:
10232 new_icode
= CODE_FOR_rotlv8hi3
;
10234 case CODE_FOR_xop_rotlv16qi3
:
10235 new_icode
= CODE_FOR_rotlv16qi3
;
10237 if (CONST_INT_P (op
))
10239 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
10240 op
= GEN_INT (INTVAL (op
) & mask
);
10241 gcc_checking_assert
10242 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
10246 gcc_checking_assert
10248 && insn_data
[new_icode
].operand
[0].mode
== tmode
10249 && insn_data
[new_icode
].operand
[1].mode
== tmode
10250 && insn_data
[new_icode
].operand
[2].mode
== mode
10251 && insn_data
[new_icode
].operand
[0].predicate
10252 == insn_data
[icode
].operand
[0].predicate
10253 && insn_data
[new_icode
].operand
[1].predicate
10254 == insn_data
[icode
].operand
[1].predicate
);
10260 gcc_unreachable ();
10267 if (VECTOR_MODE_P (mode
))
10268 op
= safe_vector_operand (op
, mode
);
	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
10275 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
10278 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
10280 op
= force_reg (mode
, op
);
10289 pat
= GEN_FCN (icode
) (target
, xops
[0]);
10294 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
10295 GEN_INT ((int)sub_code
));
10296 else if (! comparison_p
)
10297 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
10300 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
10303 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
10308 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
10312 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
10316 gcc_unreachable ();
10326 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
10327 insns with vec_merge. */
10330 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
10334 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10335 rtx op1
, op0
= expand_normal (arg0
);
10336 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
10337 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
10339 if (optimize
|| !target
10340 || GET_MODE (target
) != tmode
10341 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
10342 target
= gen_reg_rtx (tmode
);
10344 if (VECTOR_MODE_P (mode0
))
10345 op0
= safe_vector_operand (op0
, mode0
);
10347 if ((optimize
&& !register_operand (op0
, mode0
))
10348 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
10349 op0
= copy_to_mode_reg (mode0
, op0
);
10352 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
10353 op1
= copy_to_mode_reg (mode0
, op1
);
10355 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
10362 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
10365 ix86_expand_sse_compare (const struct builtin_description
*d
,
10366 tree exp
, rtx target
, bool swap
)
10369 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10370 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10371 rtx op0
= expand_normal (arg0
);
10372 rtx op1
= expand_normal (arg1
);
10374 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10375 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10376 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
10377 enum rtx_code comparison
= d
->comparison
;
10379 if (VECTOR_MODE_P (mode0
))
10380 op0
= safe_vector_operand (op0
, mode0
);
10381 if (VECTOR_MODE_P (mode1
))
10382 op1
= safe_vector_operand (op1
, mode1
);
  /* Swap operands if we have a comparison that isn't available in
     SSE.  */
  if (swap)
    std::swap (op0, op1);
10389 if (optimize
|| !target
10390 || GET_MODE (target
) != tmode
10391 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10392 target
= gen_reg_rtx (tmode
);
10394 if ((optimize
&& !register_operand (op0
, mode0
))
10395 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
10396 op0
= copy_to_mode_reg (mode0
, op0
);
10397 if ((optimize
&& !register_operand (op1
, mode1
))
10398 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
10399 op1
= copy_to_mode_reg (mode1
, op1
);
10401 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
10402 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
   ordered EQ or unordered NE; generate a PF jump.  */
10413 ix86_ssecom_setcc (const enum rtx_code comparison
,
10414 bool check_unordered
, machine_mode mode
,
10415 rtx set_dst
, rtx target
)
10418 rtx_code_label
*label
= NULL
;
  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
     with NAN operands.  */
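  /* Background note (added): comis/ucomis set ZF=PF=CF=1 for unordered
     operands and ZF=1, PF=0, CF=0 for equal ones, so ZF=1 alone cannot
     distinguish "equal" from "unordered".  Ordered EQ therefore also has
     to verify PF=0 and unordered NE has to accept PF=1, which is what the
     extra PF jump emitted below handles.  */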
10422 if (check_unordered
)
10424 gcc_assert (comparison
== EQ
|| comparison
== NE
);
10426 rtx flag
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
10427 label
= gen_label_rtx ();
10428 rtx tmp
= gen_rtx_fmt_ee (UNORDERED
, VOIDmode
, flag
, const0_rtx
);
10429 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
10430 gen_rtx_LABEL_REF (VOIDmode
, label
),
10432 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
  /* NB: Set CCFPmode and check a different CCmode which is a subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
10439 gcc_assert (mode
== CCAmode
|| mode
== CCCmode
10440 || mode
== CCOmode
|| mode
== CCPmode
10441 || mode
== CCSmode
|| mode
== CCZmode
);
10442 set_dst
= gen_rtx_REG (mode
, FLAGS_REG
);
10445 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10446 gen_rtx_fmt_ee (comparison
, QImode
,
10451 emit_label (label
);
10453 return SUBREG_REG (target
);
10456 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10459 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
10463 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10464 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10465 rtx op0
= expand_normal (arg0
);
10466 rtx op1
= expand_normal (arg1
);
10467 enum insn_code icode
= d
->icode
;
10468 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10469 machine_mode mode0
= insn_p
->operand
[0].mode
;
10470 machine_mode mode1
= insn_p
->operand
[1].mode
;
10472 if (VECTOR_MODE_P (mode0
))
10473 op0
= safe_vector_operand (op0
, mode0
);
10474 if (VECTOR_MODE_P (mode1
))
10475 op1
= safe_vector_operand (op1
, mode1
);
10477 enum rtx_code comparison
= d
->comparison
;
10478 rtx const_val
= const0_rtx
;
10480 bool check_unordered
= false;
10481 machine_mode mode
= CCFPmode
;
10482 switch (comparison
)
10484 case LE
: /* -> GE */
10485 case LT
: /* -> GT */
10486 std::swap (op0
, op1
);
10487 comparison
= swap_condition (comparison
);
10493 check_unordered
= true;
10497 check_unordered
= true;
10499 const_val
= const1_rtx
;
10502 gcc_unreachable ();
10505 target
= gen_reg_rtx (SImode
);
10506 emit_move_insn (target
, const_val
);
10507 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10509 if ((optimize
&& !register_operand (op0
, mode0
))
10510 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10511 op0
= copy_to_mode_reg (mode0
, op0
);
10512 if ((optimize
&& !register_operand (op1
, mode1
))
10513 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10514 op1
= copy_to_mode_reg (mode1
, op1
);
10516 pat
= GEN_FCN (icode
) (op0
, op1
);
10520 set_dst
= SET_DEST (pat
);
10522 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
10526 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10529 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
10533 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10534 rtx op1
, op0
= expand_normal (arg0
);
10535 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10536 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10538 if (optimize
|| target
== 0
10539 || GET_MODE (target
) != tmode
10540 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10541 target
= gen_reg_rtx (tmode
);
10543 if (VECTOR_MODE_P (mode0
))
10544 op0
= safe_vector_operand (op0
, mode0
);
10546 if ((optimize
&& !register_operand (op0
, mode0
))
10547 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10548 op0
= copy_to_mode_reg (mode0
, op0
);
10550 op1
= GEN_INT (d
->comparison
);
10552 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
10560 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
10561 tree exp
, rtx target
)
10564 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10565 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10566 rtx op0
= expand_normal (arg0
);
10567 rtx op1
= expand_normal (arg1
);
10569 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10570 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10571 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
10573 if (optimize
|| target
== 0
10574 || GET_MODE (target
) != tmode
10575 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10576 target
= gen_reg_rtx (tmode
);
10578 op0
= safe_vector_operand (op0
, mode0
);
10579 op1
= safe_vector_operand (op1
, mode1
);
10581 if ((optimize
&& !register_operand (op0
, mode0
))
10582 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10583 op0
= copy_to_mode_reg (mode0
, op0
);
10584 if ((optimize
&& !register_operand (op1
, mode1
))
10585 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10586 op1
= copy_to_mode_reg (mode1
, op1
);
10588 op2
= GEN_INT (d
->comparison
);
10590 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
10597 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10600 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
10604 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10605 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10606 rtx op0
= expand_normal (arg0
);
10607 rtx op1
= expand_normal (arg1
);
10608 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
10609 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
10610 enum rtx_code comparison
= d
->comparison
;
10612 /* ptest reg, reg sets the carry flag. */
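  /* Added explanatory note: ptest sets CF when (src & ~dst) == 0, so
     "ptest x, x" always produces CF = 1 because x & ~x == 0.  Hence the
     PTESTC builtins with two identical operands evaluate to a constant 1,
     which the shortcut below materializes directly.  */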
  if (comparison == LTU
      && (d->code == IX86_BUILTIN_PTESTC
	  || d->code == IX86_BUILTIN_PTESTC256)
      && rtx_equal_p (op0, op1))
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const1_rtx);
      return target;
    }
))
10625 op0
= safe_vector_operand (op0
, mode0
);
10626 if (VECTOR_MODE_P (mode1
))
10627 op1
= safe_vector_operand (op1
, mode1
);
10629 target
= gen_reg_rtx (SImode
);
10630 emit_move_insn (target
, const0_rtx
);
10631 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10633 if ((optimize
&& !register_operand (op0
, mode0
))
10634 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10635 op0
= copy_to_mode_reg (mode0
, op0
);
10636 if ((optimize
&& !register_operand (op1
, mode1
))
10637 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10638 op1
= copy_to_mode_reg (mode1
, op1
);
10640 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
10644 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10645 gen_rtx_fmt_ee (comparison
, QImode
,
10649 return SUBREG_REG (target
);
10652 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10655 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
10656 tree exp
, rtx target
)
10659 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10660 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10661 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10662 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10663 tree arg4
= CALL_EXPR_ARG (exp
, 4);
10664 rtx scratch0
, scratch1
;
10665 rtx op0
= expand_normal (arg0
);
10666 rtx op1
= expand_normal (arg1
);
10667 rtx op2
= expand_normal (arg2
);
10668 rtx op3
= expand_normal (arg3
);
10669 rtx op4
= expand_normal (arg4
);
10670 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
10672 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10673 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10674 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10675 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
10676 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
10677 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
10678 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
10680 if (VECTOR_MODE_P (modev2
))
10681 op0
= safe_vector_operand (op0
, modev2
);
10682 if (VECTOR_MODE_P (modev4
))
10683 op2
= safe_vector_operand (op2
, modev4
);
10685 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10686 op0
= copy_to_mode_reg (modev2
, op0
);
10687 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
10688 op1
= copy_to_mode_reg (modei3
, op1
);
10689 if ((optimize
&& !register_operand (op2
, modev4
))
10690 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
10691 op2
= copy_to_mode_reg (modev4
, op2
);
10692 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
10693 op3
= copy_to_mode_reg (modei5
, op3
);
10695 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
10697 error ("the fifth argument must be an 8-bit immediate");
10701 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
10703 if (optimize
|| !target
10704 || GET_MODE (target
) != tmode0
10705 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10706 target
= gen_reg_rtx (tmode0
);
10708 scratch1
= gen_reg_rtx (tmode1
);
10710 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10712 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
10714 if (optimize
|| !target
10715 || GET_MODE (target
) != tmode1
10716 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10717 target
= gen_reg_rtx (tmode1
);
10719 scratch0
= gen_reg_rtx (tmode0
);
10721 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
10725 gcc_assert (d
->flag
);
10727 scratch0
= gen_reg_rtx (tmode0
);
10728 scratch1
= gen_reg_rtx (tmode1
);
10730 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10740 target
= gen_reg_rtx (SImode
);
10741 emit_move_insn (target
, const0_rtx
);
10742 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10745 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10746 gen_rtx_fmt_ee (EQ
, QImode
,
10747 gen_rtx_REG ((machine_mode
) d
->flag
,
10750 return SUBREG_REG (target
);
10757 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10760 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
10761 tree exp
, rtx target
)
10764 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10765 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10766 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10767 rtx scratch0
, scratch1
;
10768 rtx op0
= expand_normal (arg0
);
10769 rtx op1
= expand_normal (arg1
);
10770 rtx op2
= expand_normal (arg2
);
10771 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
10773 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10774 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10775 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10776 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
10777 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
10779 if (VECTOR_MODE_P (modev2
))
10780 op0
= safe_vector_operand (op0
, modev2
);
10781 if (VECTOR_MODE_P (modev3
))
10782 op1
= safe_vector_operand (op1
, modev3
);
10784 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10785 op0
= copy_to_mode_reg (modev2
, op0
);
10786 if ((optimize
&& !register_operand (op1
, modev3
))
10787 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
10788 op1
= copy_to_mode_reg (modev3
, op1
);
10790 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
10792 error ("the third argument must be an 8-bit immediate");
10796 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
10798 if (optimize
|| !target
10799 || GET_MODE (target
) != tmode0
10800 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10801 target
= gen_reg_rtx (tmode0
);
10803 scratch1
= gen_reg_rtx (tmode1
);
10805 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
10807 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
10809 if (optimize
|| !target
10810 || GET_MODE (target
) != tmode1
10811 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10812 target
= gen_reg_rtx (tmode1
);
10814 scratch0
= gen_reg_rtx (tmode0
);
10816 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
10820 gcc_assert (d
->flag
);
10822 scratch0
= gen_reg_rtx (tmode0
);
10823 scratch1
= gen_reg_rtx (tmode1
);
10825 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
10835 target
= gen_reg_rtx (SImode
);
10836 emit_move_insn (target
, const0_rtx
);
10837 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10840 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10841 gen_rtx_fmt_ee (EQ
, QImode
,
10842 gen_rtx_REG ((machine_mode
) d
->flag
,
10845 return SUBREG_REG (target
);
/* Fix up modeless constants to fit the required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of insns with
   a variable number of operands.  */
10865 ix86_expand_args_builtin (const struct builtin_description
*d
,
10866 tree exp
, rtx target
)
10868 rtx pat
, real_target
;
10869 unsigned int i
, nargs
;
10870 unsigned int nargs_constant
= 0;
10871 unsigned int mask_pos
= 0;
10872 int num_memory
= 0;
10874 bool second_arg_count
= false;
10875 enum insn_code icode
= d
->icode
;
10876 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10877 machine_mode tmode
= insn_p
->operand
[0].mode
;
10878 machine_mode rmode
= VOIDmode
;
10880 enum rtx_code comparison
= d
->comparison
;
10882 switch ((enum ix86_builtin_func_type
) d
->flag
)
10884 case V2DF_FTYPE_V2DF_ROUND
:
10885 case V4DF_FTYPE_V4DF_ROUND
:
10886 case V8DF_FTYPE_V8DF_ROUND
:
10887 case V4SF_FTYPE_V4SF_ROUND
:
10888 case V8SF_FTYPE_V8SF_ROUND
:
10889 case V16SF_FTYPE_V16SF_ROUND
:
10890 case V8HF_FTYPE_V8HF_ROUND
:
10891 case V16HF_FTYPE_V16HF_ROUND
:
10892 case V32HF_FTYPE_V32HF_ROUND
:
10893 case V4SI_FTYPE_V4SF_ROUND
:
10894 case V8SI_FTYPE_V8SF_ROUND
:
10895 case V16SI_FTYPE_V16SF_ROUND
:
10896 return ix86_expand_sse_round (d
, exp
, target
);
10897 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
10898 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
10899 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
10900 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
10901 case INT_FTYPE_V8SF_V8SF_PTEST
:
10902 case INT_FTYPE_V4DI_V4DI_PTEST
:
10903 case INT_FTYPE_V4DF_V4DF_PTEST
:
10904 case INT_FTYPE_V4SF_V4SF_PTEST
:
10905 case INT_FTYPE_V2DI_V2DI_PTEST
:
10906 case INT_FTYPE_V2DF_V2DF_PTEST
:
10907 return ix86_expand_sse_ptest (d
, exp
, target
);
10908 case FLOAT128_FTYPE_FLOAT128
:
10909 case FLOAT_FTYPE_FLOAT
:
10910 case FLOAT_FTYPE_BFLOAT16
:
10911 case INT_FTYPE_INT
:
10912 case UINT_FTYPE_UINT
:
10913 case UINT16_FTYPE_UINT16
:
10914 case UINT64_FTYPE_INT
:
10915 case UINT64_FTYPE_UINT64
:
10916 case INT64_FTYPE_INT64
:
10917 case INT64_FTYPE_V4SF
:
10918 case INT64_FTYPE_V2DF
:
10919 case INT_FTYPE_V16QI
:
10920 case INT_FTYPE_V8QI
:
10921 case INT_FTYPE_V8SF
:
10922 case INT_FTYPE_V4DF
:
10923 case INT_FTYPE_V4SF
:
10924 case INT_FTYPE_V2DF
:
10925 case INT_FTYPE_V32QI
:
10926 case V16QI_FTYPE_V16QI
:
10927 case V8SI_FTYPE_V8SF
:
10928 case V8SI_FTYPE_V4SI
:
10929 case V8HI_FTYPE_V8HI
:
10930 case V8HI_FTYPE_V16QI
:
10931 case V8QI_FTYPE_V8QI
:
10932 case V8SF_FTYPE_V8SF
:
10933 case V8SF_FTYPE_V8SI
:
10934 case V8SF_FTYPE_V4SF
:
10935 case V8SF_FTYPE_V8HI
:
10936 case V4SI_FTYPE_V4SI
:
10937 case V4SI_FTYPE_V16QI
:
10938 case V4SI_FTYPE_V4SF
:
10939 case V4SI_FTYPE_V8SI
:
10940 case V4SI_FTYPE_V8HI
:
10941 case V4SI_FTYPE_V4DF
:
10942 case V4SI_FTYPE_V2DF
:
10943 case V4HI_FTYPE_V4HI
:
10944 case V4DF_FTYPE_V4DF
:
10945 case V4DF_FTYPE_V4SI
:
10946 case V4DF_FTYPE_V4SF
:
10947 case V4DF_FTYPE_V2DF
:
10948 case V4SF_FTYPE_V4SF
:
10949 case V4SF_FTYPE_V4SI
:
10950 case V4SF_FTYPE_V8SF
:
10951 case V4SF_FTYPE_V4DF
:
10952 case V4SF_FTYPE_V8HI
:
10953 case V4SF_FTYPE_V2DF
:
10954 case V2DI_FTYPE_V2DI
:
10955 case V2DI_FTYPE_V16QI
:
10956 case V2DI_FTYPE_V8HI
:
10957 case V2DI_FTYPE_V4SI
:
10958 case V2DF_FTYPE_V2DF
:
10959 case V2DF_FTYPE_V4SI
:
10960 case V2DF_FTYPE_V4DF
:
10961 case V2DF_FTYPE_V4SF
:
10962 case V2DF_FTYPE_V2SI
:
10963 case V2SI_FTYPE_V2SI
:
10964 case V2SI_FTYPE_V4SF
:
10965 case V2SI_FTYPE_V2SF
:
10966 case V2SI_FTYPE_V2DF
:
10967 case V2SF_FTYPE_V2SF
:
10968 case V2SF_FTYPE_V2SI
:
10969 case V32QI_FTYPE_V32QI
:
10970 case V32QI_FTYPE_V16QI
:
10971 case V16HI_FTYPE_V16HI
:
10972 case V16HI_FTYPE_V8HI
:
10973 case V8SI_FTYPE_V8SI
:
10974 case V16HI_FTYPE_V16QI
:
10975 case V8SI_FTYPE_V16QI
:
10976 case V4DI_FTYPE_V16QI
:
10977 case V8SI_FTYPE_V8HI
:
10978 case V4DI_FTYPE_V8HI
:
10979 case V4DI_FTYPE_V4SI
:
10980 case V4DI_FTYPE_V2DI
:
10981 case UQI_FTYPE_UQI
:
10982 case UHI_FTYPE_UHI
:
10983 case USI_FTYPE_USI
:
10984 case USI_FTYPE_UQI
:
10985 case USI_FTYPE_UHI
:
10986 case UDI_FTYPE_UDI
:
10987 case UHI_FTYPE_V16QI
:
10988 case USI_FTYPE_V32QI
:
10989 case UDI_FTYPE_V64QI
:
10990 case V16QI_FTYPE_UHI
:
10991 case V32QI_FTYPE_USI
:
10992 case V64QI_FTYPE_UDI
:
10993 case V8HI_FTYPE_UQI
:
10994 case V16HI_FTYPE_UHI
:
10995 case V32HI_FTYPE_USI
:
10996 case V4SI_FTYPE_UQI
:
10997 case V8SI_FTYPE_UQI
:
10998 case V4SI_FTYPE_UHI
:
10999 case V8SI_FTYPE_UHI
:
11000 case UQI_FTYPE_V8HI
:
11001 case UHI_FTYPE_V16HI
:
11002 case USI_FTYPE_V32HI
:
11003 case UQI_FTYPE_V4SI
:
11004 case UQI_FTYPE_V8SI
:
11005 case UHI_FTYPE_V16SI
:
11006 case UQI_FTYPE_V2DI
:
11007 case UQI_FTYPE_V4DI
:
11008 case UQI_FTYPE_V8DI
:
11009 case V16SI_FTYPE_UHI
:
11010 case V2DI_FTYPE_UQI
:
11011 case V4DI_FTYPE_UQI
:
11012 case V16SI_FTYPE_INT
:
11013 case V16SF_FTYPE_V8SF
:
11014 case V16SI_FTYPE_V8SI
:
11015 case V16SF_FTYPE_V4SF
:
11016 case V16SI_FTYPE_V4SI
:
11017 case V16SI_FTYPE_V16SF
:
11018 case V16SI_FTYPE_V16SI
:
11019 case V64QI_FTYPE_V64QI
:
11020 case V32HI_FTYPE_V32HI
:
11021 case V16SF_FTYPE_V16SF
:
11022 case V8DI_FTYPE_UQI
:
11023 case V8DI_FTYPE_V8DI
:
11024 case V8DF_FTYPE_V4DF
:
11025 case V8DF_FTYPE_V2DF
:
11026 case V8DF_FTYPE_V8DF
:
11027 case V4DI_FTYPE_V4DI
:
11028 case V16BF_FTYPE_V16SF
:
11029 case V8BF_FTYPE_V8SF
:
11030 case V8BF_FTYPE_V4SF
:
11033 case V4SF_FTYPE_V4SF_VEC_MERGE
:
11034 case V2DF_FTYPE_V2DF_VEC_MERGE
:
11035 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
11036 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
11037 case V16QI_FTYPE_V16QI_V16QI
:
11038 case V16QI_FTYPE_V8HI_V8HI
:
11039 case V16HF_FTYPE_V16HF_V16HF
:
11040 case V16SF_FTYPE_V16SF_V16SF
:
11041 case V8QI_FTYPE_V8QI_V8QI
:
11042 case V8QI_FTYPE_V4HI_V4HI
:
11043 case V8HI_FTYPE_V8HI_V8HI
:
11044 case V8HI_FTYPE_V16QI_V16QI
:
11045 case V8HI_FTYPE_V4SI_V4SI
:
11046 case V8HF_FTYPE_V8HF_V8HF
:
11047 case V8SF_FTYPE_V8SF_V8SF
:
11048 case V8SF_FTYPE_V8SF_V8SI
:
11049 case V8DF_FTYPE_V8DF_V8DF
:
11050 case V4SI_FTYPE_V4SI_V4SI
:
11051 case V4SI_FTYPE_V8HI_V8HI
:
11052 case V4SI_FTYPE_V2DF_V2DF
:
11053 case V4HI_FTYPE_V4HI_V4HI
:
11054 case V4HI_FTYPE_V8QI_V8QI
:
11055 case V4HI_FTYPE_V2SI_V2SI
:
11056 case V4DF_FTYPE_V4DF_V4DF
:
11057 case V4DF_FTYPE_V4DF_V4DI
:
11058 case V4SF_FTYPE_V4SF_V4SF
:
11059 case V4SF_FTYPE_V4SF_V4SI
:
11060 case V4SF_FTYPE_V4SF_V2SI
:
11061 case V4SF_FTYPE_V4SF_V2DF
:
11062 case V4SF_FTYPE_V4SF_UINT
:
11063 case V4SF_FTYPE_V4SF_DI
:
11064 case V4SF_FTYPE_V4SF_SI
:
11065 case V4DI_FTYPE_V4DI_V2DI
:
11066 case V2DI_FTYPE_V2DI_V2DI
:
11067 case V2DI_FTYPE_V16QI_V16QI
:
11068 case V2DI_FTYPE_V4SI_V4SI
:
11069 case V2DI_FTYPE_V2DI_V16QI
:
11070 case V2SI_FTYPE_V2SI_V2SI
:
11071 case V2SI_FTYPE_V4HI_V4HI
:
11072 case V2SI_FTYPE_V2SF_V2SF
:
11073 case V2DF_FTYPE_V2DF_V2DF
:
11074 case V2DF_FTYPE_V2DF_V4SF
:
11075 case V2DF_FTYPE_V2DF_V2DI
:
11076 case V2DF_FTYPE_V2DF_DI
:
11077 case V2DF_FTYPE_V2DF_SI
:
11078 case V2DF_FTYPE_V2DF_UINT
:
11079 case V2SF_FTYPE_V2SF_V2SF
:
11080 case V1DI_FTYPE_V1DI_V1DI
:
11081 case V1DI_FTYPE_V8QI_V8QI
:
11082 case V1DI_FTYPE_V2SI_V2SI
:
11083 case V32QI_FTYPE_V16HI_V16HI
:
11084 case V16HI_FTYPE_V8SI_V8SI
:
11085 case V64QI_FTYPE_V64QI_V64QI
:
11086 case V32QI_FTYPE_V32QI_V32QI
:
11087 case V16HI_FTYPE_V32QI_V32QI
:
11088 case V16HI_FTYPE_V16HI_V16HI
:
11089 case V8SI_FTYPE_V4DF_V4DF
:
11090 case V8SI_FTYPE_V8SI_V8SI
:
11091 case V8SI_FTYPE_V16HI_V16HI
:
11092 case V4DI_FTYPE_V4DI_V4DI
:
11093 case V4DI_FTYPE_V8SI_V8SI
:
11094 case V4DI_FTYPE_V32QI_V32QI
:
11095 case V8DI_FTYPE_V64QI_V64QI
:
11096 if (comparison
== UNKNOWN
)
11097 return ix86_expand_binop_builtin (icode
, exp
, target
);
11100 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
11101 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
11102 gcc_assert (comparison
!= UNKNOWN
);
11106 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
11107 case V16HI_FTYPE_V16HI_SI_COUNT
:
11108 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
11109 case V8SI_FTYPE_V8SI_SI_COUNT
:
11110 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
11111 case V4DI_FTYPE_V4DI_INT_COUNT
:
11112 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
11113 case V8HI_FTYPE_V8HI_SI_COUNT
:
11114 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
11115 case V4SI_FTYPE_V4SI_SI_COUNT
:
11116 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
11117 case V4HI_FTYPE_V4HI_SI_COUNT
:
11118 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
11119 case V2DI_FTYPE_V2DI_SI_COUNT
:
11120 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
11121 case V2SI_FTYPE_V2SI_SI_COUNT
:
11122 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
11123 case V1DI_FTYPE_V1DI_SI_COUNT
:
11125 second_arg_count
= true;
11127 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
11128 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
11129 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
11130 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
11131 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
11132 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
11133 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
11134 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
11135 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
11136 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
11137 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
11138 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
11139 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
11140 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
11141 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
11142 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
11143 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
11144 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
11146 second_arg_count
= true;
11148 case UINT64_FTYPE_UINT64_UINT64
:
11149 case UINT_FTYPE_UINT_UINT
:
11150 case UINT_FTYPE_UINT_USHORT
:
11151 case UINT_FTYPE_UINT_UCHAR
:
11152 case UINT16_FTYPE_UINT16_INT
:
11153 case UINT8_FTYPE_UINT8_INT
:
11154 case UQI_FTYPE_UQI_UQI
:
11155 case UHI_FTYPE_UHI_UHI
:
11156 case USI_FTYPE_USI_USI
:
11157 case UDI_FTYPE_UDI_UDI
:
11158 case V16SI_FTYPE_V8DF_V8DF
:
11159 case V32BF_FTYPE_V16SF_V16SF
:
11160 case V16BF_FTYPE_V8SF_V8SF
:
11161 case V8BF_FTYPE_V4SF_V4SF
:
11162 case V16BF_FTYPE_V16SF_UHI
:
11163 case V8BF_FTYPE_V8SF_UQI
:
11164 case V8BF_FTYPE_V4SF_UQI
:
11167 case V2DI_FTYPE_V2DI_INT_CONVERT
:
11170 nargs_constant
= 1;
11172 case V4DI_FTYPE_V4DI_INT_CONVERT
:
11175 nargs_constant
= 1;
11177 case V8DI_FTYPE_V8DI_INT_CONVERT
:
11180 nargs_constant
= 1;
11182 case V8HI_FTYPE_V8HI_INT
:
11183 case V8HI_FTYPE_V8SF_INT
:
11184 case V16HI_FTYPE_V16SF_INT
:
11185 case V8HI_FTYPE_V4SF_INT
:
11186 case V8SF_FTYPE_V8SF_INT
:
11187 case V4SF_FTYPE_V16SF_INT
:
11188 case V16SF_FTYPE_V16SF_INT
:
11189 case V4SI_FTYPE_V4SI_INT
:
11190 case V4SI_FTYPE_V8SI_INT
:
11191 case V4HI_FTYPE_V4HI_INT
:
11192 case V4DF_FTYPE_V4DF_INT
:
11193 case V4DF_FTYPE_V8DF_INT
:
11194 case V4SF_FTYPE_V4SF_INT
:
11195 case V4SF_FTYPE_V8SF_INT
:
11196 case V2DI_FTYPE_V2DI_INT
:
11197 case V2DF_FTYPE_V2DF_INT
:
11198 case V2DF_FTYPE_V4DF_INT
:
11199 case V16HI_FTYPE_V16HI_INT
:
11200 case V8SI_FTYPE_V8SI_INT
:
11201 case V16SI_FTYPE_V16SI_INT
:
11202 case V4SI_FTYPE_V16SI_INT
:
11203 case V4DI_FTYPE_V4DI_INT
:
11204 case V2DI_FTYPE_V4DI_INT
:
11205 case V4DI_FTYPE_V8DI_INT
:
11206 case UQI_FTYPE_UQI_UQI_CONST
:
11207 case UHI_FTYPE_UHI_UQI
:
11208 case USI_FTYPE_USI_UQI
:
11209 case UDI_FTYPE_UDI_UQI
:
11211 nargs_constant
= 1;
11213 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
11214 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
11215 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
11216 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
11217 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
11218 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
11219 case UHI_FTYPE_V16SI_V16SI_UHI
:
11220 case UQI_FTYPE_V8DI_V8DI_UQI
:
11221 case V16HI_FTYPE_V16SI_V16HI_UHI
:
11222 case V16QI_FTYPE_V16SI_V16QI_UHI
:
11223 case V16QI_FTYPE_V8DI_V16QI_UQI
:
11224 case V32HF_FTYPE_V32HF_V32HF_USI
:
11225 case V16SF_FTYPE_V16SF_V16SF_UHI
:
11226 case V16SF_FTYPE_V4SF_V16SF_UHI
:
11227 case V16SI_FTYPE_SI_V16SI_UHI
:
11228 case V16SI_FTYPE_V16HI_V16SI_UHI
:
11229 case V16SI_FTYPE_V16QI_V16SI_UHI
:
11230 case V8SF_FTYPE_V4SF_V8SF_UQI
:
11231 case V4DF_FTYPE_V2DF_V4DF_UQI
:
11232 case V8SI_FTYPE_V4SI_V8SI_UQI
:
11233 case V8SI_FTYPE_SI_V8SI_UQI
:
11234 case V4SI_FTYPE_V4SI_V4SI_UQI
:
11235 case V4SI_FTYPE_SI_V4SI_UQI
:
11236 case V4DI_FTYPE_V2DI_V4DI_UQI
:
11237 case V4DI_FTYPE_DI_V4DI_UQI
:
11238 case V2DI_FTYPE_V2DI_V2DI_UQI
:
11239 case V2DI_FTYPE_DI_V2DI_UQI
:
11240 case V64QI_FTYPE_V64QI_V64QI_UDI
:
11241 case V64QI_FTYPE_V16QI_V64QI_UDI
:
11242 case V64QI_FTYPE_QI_V64QI_UDI
:
11243 case V32QI_FTYPE_V32QI_V32QI_USI
:
11244 case V32QI_FTYPE_V16QI_V32QI_USI
:
11245 case V32QI_FTYPE_QI_V32QI_USI
:
11246 case V16QI_FTYPE_V16QI_V16QI_UHI
:
11247 case V16QI_FTYPE_QI_V16QI_UHI
:
11248 case V32HI_FTYPE_V8HI_V32HI_USI
:
11249 case V32HI_FTYPE_HI_V32HI_USI
:
11250 case V16HI_FTYPE_V8HI_V16HI_UHI
:
11251 case V16HI_FTYPE_HI_V16HI_UHI
:
11252 case V8HI_FTYPE_V8HI_V8HI_UQI
:
11253 case V8HI_FTYPE_HI_V8HI_UQI
:
11254 case V16HF_FTYPE_V16HF_V16HF_UHI
:
11255 case V8SF_FTYPE_V8HI_V8SF_UQI
:
11256 case V4SF_FTYPE_V8HI_V4SF_UQI
:
11257 case V8SI_FTYPE_V8HF_V8SI_UQI
:
11258 case V8SF_FTYPE_V8HF_V8SF_UQI
:
11259 case V8SI_FTYPE_V8SF_V8SI_UQI
:
11260 case V4SI_FTYPE_V4SF_V4SI_UQI
:
11261 case V4SI_FTYPE_V8HF_V4SI_UQI
:
11262 case V4SF_FTYPE_V8HF_V4SF_UQI
:
11263 case V4DI_FTYPE_V8HF_V4DI_UQI
:
11264 case V4DI_FTYPE_V4SF_V4DI_UQI
:
11265 case V2DI_FTYPE_V8HF_V2DI_UQI
:
11266 case V2DI_FTYPE_V4SF_V2DI_UQI
:
11267 case V8HF_FTYPE_V8HF_V8HF_UQI
:
11268 case V8HF_FTYPE_V8HF_V8HF_V8HF
:
11269 case V8HF_FTYPE_V8HI_V8HF_UQI
:
11270 case V8HF_FTYPE_V8SI_V8HF_UQI
:
11271 case V8HF_FTYPE_V8SF_V8HF_UQI
:
11272 case V8HF_FTYPE_V4SI_V8HF_UQI
:
11273 case V8HF_FTYPE_V4SF_V8HF_UQI
:
11274 case V8HF_FTYPE_V4DI_V8HF_UQI
:
11275 case V8HF_FTYPE_V4DF_V8HF_UQI
:
11276 case V8HF_FTYPE_V2DI_V8HF_UQI
:
11277 case V8HF_FTYPE_V2DF_V8HF_UQI
:
11278 case V4SF_FTYPE_V4DI_V4SF_UQI
:
11279 case V4SF_FTYPE_V2DI_V4SF_UQI
:
11280 case V4DF_FTYPE_V4DI_V4DF_UQI
:
11281 case V4DF_FTYPE_V8HF_V4DF_UQI
:
11282 case V2DF_FTYPE_V8HF_V2DF_UQI
:
11283 case V2DF_FTYPE_V2DI_V2DF_UQI
:
11284 case V16QI_FTYPE_V8HI_V16QI_UQI
:
11285 case V16QI_FTYPE_V16HI_V16QI_UHI
:
11286 case V16QI_FTYPE_V4SI_V16QI_UQI
:
11287 case V16QI_FTYPE_V8SI_V16QI_UQI
:
11288 case V8HI_FTYPE_V8HF_V8HI_UQI
:
11289 case V8HI_FTYPE_V4SI_V8HI_UQI
:
11290 case V8HI_FTYPE_V8SI_V8HI_UQI
:
11291 case V16QI_FTYPE_V2DI_V16QI_UQI
:
11292 case V16QI_FTYPE_V4DI_V16QI_UQI
:
11293 case V8HI_FTYPE_V2DI_V8HI_UQI
:
11294 case V8HI_FTYPE_V4DI_V8HI_UQI
:
11295 case V4SI_FTYPE_V2DI_V4SI_UQI
:
11296 case V4SI_FTYPE_V4DI_V4SI_UQI
:
11297 case V32QI_FTYPE_V32HI_V32QI_USI
:
11298 case UHI_FTYPE_V16QI_V16QI_UHI
:
11299 case USI_FTYPE_V32QI_V32QI_USI
:
11300 case UDI_FTYPE_V64QI_V64QI_UDI
:
11301 case UQI_FTYPE_V8HI_V8HI_UQI
:
11302 case UHI_FTYPE_V16HI_V16HI_UHI
:
11303 case USI_FTYPE_V32HI_V32HI_USI
:
11304 case UQI_FTYPE_V4SI_V4SI_UQI
:
11305 case UQI_FTYPE_V8SI_V8SI_UQI
:
11306 case UQI_FTYPE_V2DI_V2DI_UQI
:
11307 case UQI_FTYPE_V4DI_V4DI_UQI
:
11308 case V4SF_FTYPE_V2DF_V4SF_UQI
:
11309 case V4SF_FTYPE_V4DF_V4SF_UQI
:
11310 case V16SI_FTYPE_V16SI_V16SI_UHI
:
11311 case V16SI_FTYPE_V4SI_V16SI_UHI
:
11312 case V2DI_FTYPE_V4SI_V2DI_UQI
:
11313 case V2DI_FTYPE_V8HI_V2DI_UQI
:
11314 case V2DI_FTYPE_V16QI_V2DI_UQI
:
11315 case V4DI_FTYPE_V4DI_V4DI_UQI
:
11316 case V4DI_FTYPE_V4SI_V4DI_UQI
:
11317 case V4DI_FTYPE_V8HI_V4DI_UQI
:
11318 case V4DI_FTYPE_V16QI_V4DI_UQI
:
11319 case V4DI_FTYPE_V4DF_V4DI_UQI
:
11320 case V2DI_FTYPE_V2DF_V2DI_UQI
:
11321 case V4SI_FTYPE_V4DF_V4SI_UQI
:
11322 case V4SI_FTYPE_V2DF_V4SI_UQI
:
11323 case V4SI_FTYPE_V8HI_V4SI_UQI
:
11324 case V4SI_FTYPE_V16QI_V4SI_UQI
:
11325 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
11326 case V8DF_FTYPE_V2DF_V8DF_UQI
:
11327 case V8DF_FTYPE_V4DF_V8DF_UQI
:
11328 case V8DF_FTYPE_V8DF_V8DF_UQI
:
11329 case V8SF_FTYPE_V8SF_V8SF_UQI
:
11330 case V8SF_FTYPE_V8SI_V8SF_UQI
:
11331 case V4DF_FTYPE_V4DF_V4DF_UQI
:
11332 case V4SF_FTYPE_V4SF_V4SF_UQI
:
11333 case V2DF_FTYPE_V2DF_V2DF_UQI
:
11334 case V2DF_FTYPE_V4SF_V2DF_UQI
:
11335 case V2DF_FTYPE_V4SI_V2DF_UQI
:
11336 case V4SF_FTYPE_V4SI_V4SF_UQI
:
11337 case V4DF_FTYPE_V4SF_V4DF_UQI
:
11338 case V4DF_FTYPE_V4SI_V4DF_UQI
:
11339 case V8SI_FTYPE_V8SI_V8SI_UQI
:
11340 case V8SI_FTYPE_V8HI_V8SI_UQI
:
11341 case V8SI_FTYPE_V16QI_V8SI_UQI
:
11342 case V8DF_FTYPE_V8SI_V8DF_UQI
:
11343 case V8DI_FTYPE_DI_V8DI_UQI
:
11344 case V16SF_FTYPE_V8SF_V16SF_UHI
:
11345 case V16SI_FTYPE_V8SI_V16SI_UHI
:
11346 case V16HF_FTYPE_V16HI_V16HF_UHI
:
11347 case V16HF_FTYPE_V16HF_V16HF_V16HF
:
11348 case V16HI_FTYPE_V16HF_V16HI_UHI
:
11349 case V16HI_FTYPE_V16HI_V16HI_UHI
:
11350 case V8HI_FTYPE_V16QI_V8HI_UQI
:
11351 case V16HI_FTYPE_V16QI_V16HI_UHI
:
11352 case V32HI_FTYPE_V32HI_V32HI_USI
:
11353 case V32HI_FTYPE_V32QI_V32HI_USI
:
11354 case V8DI_FTYPE_V16QI_V8DI_UQI
:
11355 case V8DI_FTYPE_V2DI_V8DI_UQI
:
11356 case V8DI_FTYPE_V4DI_V8DI_UQI
:
11357 case V8DI_FTYPE_V8DI_V8DI_UQI
:
11358 case V8DI_FTYPE_V8HI_V8DI_UQI
:
11359 case V8DI_FTYPE_V8SI_V8DI_UQI
:
11360 case V8HI_FTYPE_V8DI_V8HI_UQI
:
11361 case V8SI_FTYPE_V8DI_V8SI_UQI
:
11362 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
11363 case V4DI_FTYPE_V4DI_V4DI_V2DI
:
11364 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
11365 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
11366 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
11367 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
11368 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
11369 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
11370 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
11371 case V32BF_FTYPE_V16SF_V16SF_USI
:
11372 case V16BF_FTYPE_V8SF_V8SF_UHI
:
11373 case V8BF_FTYPE_V4SF_V4SF_UQI
:
11374 case V16BF_FTYPE_V16SF_V16BF_UHI
:
11375 case V8BF_FTYPE_V8SF_V8BF_UQI
:
11376 case V8BF_FTYPE_V4SF_V8BF_UQI
:
11377 case V16SF_FTYPE_V16SF_V32BF_V32BF
:
11378 case V8SF_FTYPE_V8SF_V16BF_V16BF
:
11379 case V4SF_FTYPE_V4SF_V8BF_V8BF
:
11382 case V32QI_FTYPE_V32QI_V32QI_INT
:
11383 case V16HI_FTYPE_V16HI_V16HI_INT
:
11384 case V16QI_FTYPE_V16QI_V16QI_INT
:
11385 case V4DI_FTYPE_V4DI_V4DI_INT
:
11386 case V8HI_FTYPE_V8HI_V8HI_INT
:
11387 case V8SI_FTYPE_V8SI_V8SI_INT
:
11388 case V8SI_FTYPE_V8SI_V4SI_INT
:
11389 case V8SF_FTYPE_V8SF_V8SF_INT
:
11390 case V8SF_FTYPE_V8SF_V4SF_INT
:
11391 case V4SI_FTYPE_V4SI_V4SI_INT
:
11392 case V4DF_FTYPE_V4DF_V4DF_INT
:
11393 case V16SF_FTYPE_V16SF_V16SF_INT
:
11394 case V16SF_FTYPE_V16SF_V4SF_INT
:
11395 case V16SI_FTYPE_V16SI_V4SI_INT
:
11396 case V4DF_FTYPE_V4DF_V2DF_INT
:
11397 case V4SF_FTYPE_V4SF_V4SF_INT
:
11398 case V2DI_FTYPE_V2DI_V2DI_INT
:
11399 case V4DI_FTYPE_V4DI_V2DI_INT
:
11400 case V2DF_FTYPE_V2DF_V2DF_INT
:
11401 case UQI_FTYPE_V8DI_V8UDI_INT
:
11402 case UQI_FTYPE_V8DF_V8DF_INT
:
11403 case UQI_FTYPE_V2DF_V2DF_INT
:
11404 case UQI_FTYPE_V4SF_V4SF_INT
:
11405 case UHI_FTYPE_V16SI_V16SI_INT
:
11406 case UHI_FTYPE_V16SF_V16SF_INT
:
11407 case V64QI_FTYPE_V64QI_V64QI_INT
:
11408 case V32HI_FTYPE_V32HI_V32HI_INT
:
11409 case V16SI_FTYPE_V16SI_V16SI_INT
:
11410 case V8DI_FTYPE_V8DI_V8DI_INT
:
11412 nargs_constant
= 1;
11414 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
11417 nargs_constant
= 1;
11419 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
11422 nargs_constant
= 1;
11424 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
11427 nargs_constant
= 1;
11429 case V2DI_FTYPE_V2DI_UINT_UINT
:
11431 nargs_constant
= 2;
11433 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
11436 nargs_constant
= 1;
11438 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
11442 nargs_constant
= 1;
11444 case QI_FTYPE_V8DF_INT_UQI
:
11445 case QI_FTYPE_V4DF_INT_UQI
:
11446 case QI_FTYPE_V2DF_INT_UQI
:
11447 case HI_FTYPE_V16SF_INT_UHI
:
11448 case QI_FTYPE_V8SF_INT_UQI
:
11449 case QI_FTYPE_V4SF_INT_UQI
:
11450 case QI_FTYPE_V8HF_INT_UQI
:
11451 case HI_FTYPE_V16HF_INT_UHI
:
11452 case SI_FTYPE_V32HF_INT_USI
:
11453 case V4SI_FTYPE_V4SI_V4SI_UHI
:
11454 case V8SI_FTYPE_V8SI_V8SI_UHI
:
11457 nargs_constant
= 1;
11459 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
11463 nargs_constant
= 1;
11465 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
11469 nargs_constant
= 1;
11471 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
11472 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
11473 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
11474 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
11475 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
11476 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
11477 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
11478 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
11479 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
11480 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
11481 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
11482 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
11483 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
11484 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
11485 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
11486 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
11487 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI
:
11488 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
11489 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
11490 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
11491 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
11492 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
11493 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
11494 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
11495 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
11496 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
11497 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
11498 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
11499 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
11500 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
11501 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
11502 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
11503 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
11504 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
11505 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI
:
11506 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI
:
11507 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
11508 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
11509 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
11510 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
11511 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
11512 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
11513 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
11514 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI
:
11515 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
11516 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
11517 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
11518 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
11519 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
11520 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
11521 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
11522 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
11523 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
11524 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
11525 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
11526 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI
:
11527 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI
:
11528 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI
:
11531 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
11532 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
11533 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
11534 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
11535 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
11536 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT
:
11538 nargs_constant
= 1;
11540 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
11541 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
11542 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
11543 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
11544 case UHI_FTYPE_V16HF_V16HF_INT_UHI
:
11545 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
11546 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
11547 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
11548 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
11549 case UQI_FTYPE_V8HF_V8HF_INT_UQI
:
11550 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
11551 case USI_FTYPE_V32QI_V32QI_INT_USI
:
11552 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
11553 case USI_FTYPE_V32HI_V32HI_INT_USI
:
11554 case USI_FTYPE_V32HF_V32HF_INT_USI
:
11555 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
11556 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
11559 nargs_constant
= 1;
11561 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
11563 nargs_constant
= 2;
11565 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
11566 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
11567 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI
:
11568 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI
:
11569 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI
:
11572 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
11573 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
11576 nargs_constant
= 1;
11578 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
11579 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
11580 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
11581 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
11582 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
11583 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
11584 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
11585 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
11586 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
11587 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
11588 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
11589 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
11590 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
11591 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
11592 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
11593 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
11594 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
11595 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
11596 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
11597 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
11598 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
11599 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
11600 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
11601 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
11602 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
11603 case V16HF_FTYPE_V16HF_INT_V16HF_UHI
:
11604 case V8HF_FTYPE_V8HF_INT_V8HF_UQI
:
11605 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
11606 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
11607 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
11608 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
11609 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
11612 nargs_constant
= 1;
11614 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
11615 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
11616 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
11617 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
11618 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
11619 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
11620 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
11621 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
11622 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
11623 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
11624 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
11625 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
11626 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
11627 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
11628 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
11629 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
11630 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
11631 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
11632 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
11633 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
11634 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
11635 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
11636 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
11637 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
11638 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
11639 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
11640 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
11643 nargs_constant
= 1;
11645 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
11646 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
11647 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
11648 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
11649 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
11650 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
11651 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
11652 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
11653 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
11654 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
11657 nargs_constant
= 1;
11659 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
11660 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
11661 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
11662 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
11663 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
11664 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
11665 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
11666 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
11667 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
11668 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
11669 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
11670 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
      nargs_constant = 2;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }

  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
	  || target == 0
	  || GET_MODE (target) != tmode
	  || !insn_p->operand[0].predicate (target, tmode))
	target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
	num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
	{
	  /* SIMD shift insns take either an 8-bit immediate or a
	     register as the count, but the builtin functions take an
	     int as the count.  If the count doesn't match, put it in
	     a register.  The instructions use a 64-bit count; if op is
	     only 32-bit, zero-extend it, since negative shift counts
	     are undefined behavior and zero-extension is cheaper.  */
	  if (!CONST_INT_P (op))
	    {
	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
		op = convert_modes (mode, GET_MODE (op), op, 1);
	      else
		op = lowpart_subreg (mode, op, GET_MODE (op));
	      if (!insn_p->operand[i + 1].predicate (op, mode))
		op = copy_to_reg (op);
	    }
	}
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
	       || (!mask_pos && (nargs - i) <= nargs_constant))
	{
	  if (!match)
	    switch (icode)
	      {
	      case CODE_FOR_avx_vinsertf128v4di:
	      case CODE_FOR_avx_vextractf128v4di:
		error ("the last argument must be a 1-bit immediate");
		return const0_rtx;
11742 case CODE_FOR_avx512f_cmpv8di3_mask
:
11743 case CODE_FOR_avx512f_cmpv16si3_mask
:
11744 case CODE_FOR_avx512f_ucmpv8di3_mask
:
11745 case CODE_FOR_avx512f_ucmpv16si3_mask
:
11746 case CODE_FOR_avx512vl_cmpv4di3_mask
:
11747 case CODE_FOR_avx512vl_cmpv8si3_mask
:
11748 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
11749 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
11750 case CODE_FOR_avx512vl_cmpv2di3_mask
:
11751 case CODE_FOR_avx512vl_cmpv4si3_mask
:
11752 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
11753 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
11754 error ("the last argument must be a 3-bit immediate");
11757 case CODE_FOR_sse4_1_roundsd
:
11758 case CODE_FOR_sse4_1_roundss
:
11760 case CODE_FOR_sse4_1_roundpd
:
11761 case CODE_FOR_sse4_1_roundps
:
11762 case CODE_FOR_avx_roundpd256
:
11763 case CODE_FOR_avx_roundps256
:
11765 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
11766 case CODE_FOR_sse4_1_roundps_sfix
:
11767 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
11768 case CODE_FOR_avx_roundps_sfix256
:
11770 case CODE_FOR_sse4_1_blendps
:
11771 case CODE_FOR_avx_blendpd256
:
11772 case CODE_FOR_avx_vpermilv4df
:
11773 case CODE_FOR_avx_vpermilv4df_mask
:
11774 case CODE_FOR_avx512f_getmantv8df_mask
:
11775 case CODE_FOR_avx512f_getmantv16sf_mask
:
11776 case CODE_FOR_avx512vl_getmantv16hf_mask
:
11777 case CODE_FOR_avx512vl_getmantv8sf_mask
:
11778 case CODE_FOR_avx512vl_getmantv4df_mask
:
11779 case CODE_FOR_avx512fp16_getmantv8hf_mask
:
11780 case CODE_FOR_avx512vl_getmantv4sf_mask
:
11781 case CODE_FOR_avx512vl_getmantv2df_mask
:
11782 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
11783 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
11784 case CODE_FOR_avx512dq_rangepv4df_mask
:
11785 case CODE_FOR_avx512dq_rangepv8sf_mask
:
11786 case CODE_FOR_avx512dq_rangepv2df_mask
:
11787 case CODE_FOR_avx512dq_rangepv4sf_mask
:
11788 case CODE_FOR_avx_shufpd256_mask
:
11789 error ("the last argument must be a 4-bit immediate");
11792 case CODE_FOR_sha1rnds4
:
11793 case CODE_FOR_sse4_1_blendpd
:
11794 case CODE_FOR_avx_vpermilv2df
:
11795 case CODE_FOR_avx_vpermilv2df_mask
:
11796 case CODE_FOR_xop_vpermil2v2df3
:
11797 case CODE_FOR_xop_vpermil2v4sf3
:
11798 case CODE_FOR_xop_vpermil2v4df3
:
11799 case CODE_FOR_xop_vpermil2v8sf3
:
11800 case CODE_FOR_avx512f_vinsertf32x4_mask
:
11801 case CODE_FOR_avx512f_vinserti32x4_mask
:
11802 case CODE_FOR_avx512f_vextractf32x4_mask
:
11803 case CODE_FOR_avx512f_vextracti32x4_mask
:
11804 case CODE_FOR_sse2_shufpd
:
11805 case CODE_FOR_sse2_shufpd_mask
:
11806 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
11807 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
11808 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
11809 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
11810 error ("the last argument must be a 2-bit immediate");
11813 case CODE_FOR_avx_vextractf128v4df
:
11814 case CODE_FOR_avx_vextractf128v8sf
:
11815 case CODE_FOR_avx_vextractf128v8si
:
11816 case CODE_FOR_avx_vinsertf128v4df
:
11817 case CODE_FOR_avx_vinsertf128v8sf
:
11818 case CODE_FOR_avx_vinsertf128v8si
:
11819 case CODE_FOR_avx512f_vinsertf64x4_mask
:
11820 case CODE_FOR_avx512f_vinserti64x4_mask
:
11821 case CODE_FOR_avx512f_vextractf64x4_mask
:
11822 case CODE_FOR_avx512f_vextracti64x4_mask
:
11823 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
11824 case CODE_FOR_avx512dq_vinserti32x8_mask
:
11825 case CODE_FOR_avx512vl_vinsertv4df
:
11826 case CODE_FOR_avx512vl_vinsertv4di
:
11827 case CODE_FOR_avx512vl_vinsertv8sf
:
11828 case CODE_FOR_avx512vl_vinsertv8si
:
11829 error ("the last argument must be a 1-bit immediate");
11832 case CODE_FOR_avx_vmcmpv2df3
:
11833 case CODE_FOR_avx_vmcmpv4sf3
:
11834 case CODE_FOR_avx_cmpv2df3
:
11835 case CODE_FOR_avx_cmpv4sf3
:
11836 case CODE_FOR_avx_cmpv4df3
:
11837 case CODE_FOR_avx_cmpv8sf3
:
11838 case CODE_FOR_avx512f_cmpv8df3_mask
:
11839 case CODE_FOR_avx512f_cmpv16sf3_mask
:
11840 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
11841 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
11842 case CODE_FOR_avx512bw_cmpv32hf3_mask
:
11843 case CODE_FOR_avx512vl_cmpv16hf3_mask
:
11844 case CODE_FOR_avx512fp16_cmpv8hf3_mask
:
11845 error ("the last argument must be a 5-bit immediate");
11849 switch (nargs_constant
)
11852 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11853 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
11855 error ("the next to last argument must be an 8-bit immediate");
11860 error ("the last argument must be an 8-bit immediate");
11863 gcc_unreachable ();
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to
	     be generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match || num_memory > 1)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }
11898 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
11901 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
11904 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
11907 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11911 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11912 xops
[2], xops
[3], xops
[4]);
11915 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11916 xops
[2], xops
[3], xops
[4], xops
[5]);
11919 gcc_unreachable ();
/* Transform a pattern of the following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set A B)  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
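/* Illustrative sketch (not part of the original source): given a pattern
   such as

     (set (reg:V4SF 100)
	  (unspec:V4SF [(reg:V4SF 101) (const_int 8)]
		       UNSPEC_EMBEDDED_ROUNDING))

   the helper above returns the plain

     (set (reg:V4SF 100) (reg:V4SF 101))

   i.e. the rounding wrapper is dropped and only the first unspec operand
   is kept.  The register numbers and modes here are made up for the
   example.  */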
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };
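  /* Illustrative note (not part of the original source): the three tables
     above are indexed by the _CMP_* predicate immediates from avxintrin.h.
     For example, assuming the usual encoding,

	_CMP_EQ_OQ (0)  ->  EQ, ordered,  non-signaling
	_CMP_LT_OS (1)  ->  LT, ordered,  signaling

     which matches the first two entries of comparisons[], ordereds[] and
     non_signalings[].  */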
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be a comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
12030 if (!non_signaling
)
12036 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
12046 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
12053 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
12054 if (!non_signaling
)
12061 case LE
: /* -> GE */
12062 case LT
: /* -> GT */
12063 case UNGE
: /* -> UNLE */
12064 case UNGT
: /* -> UNLT */
12065 std::swap (op0
, op1
);
12066 comparison
= swap_condition (comparison
);
12074 /* These are supported by CCFPmode. NB: Use ordered/signaling
12075 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
12076 with NAN operands. */
12077 if (ordered
== non_signaling
)
12078 ordered
= !ordered
;
12081 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
12082 _CMP_EQ_OQ/_CMP_EQ_OS. */
12083 check_unordered
= true;
12087 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
12088 _CMP_NEQ_UQ/_CMP_NEQ_US. */
12089 gcc_assert (!ordered
);
12090 check_unordered
= true;
12092 const_val
= const1_rtx
;
12095 gcc_unreachable ();
12098 target
= gen_reg_rtx (SImode
);
12099 emit_move_insn (target
, const_val
);
12100 target
= gen_rtx_SUBREG (QImode
, target
, 0);
12102 if ((optimize
&& !register_operand (op0
, mode0
))
12103 || !insn_p
->operand
[0].predicate (op0
, mode0
))
12104 op0
= copy_to_mode_reg (mode0
, op0
);
12105 if ((optimize
&& !register_operand (op1
, mode1
))
12106 || !insn_p
->operand
[1].predicate (op1
, mode1
))
12107 op1
= copy_to_mode_reg (mode1
, op1
);
12110 1. COMI: ordered and signaling.
12111 2. UCOMI: unordered and non-signaling.
12114 icode
= (icode
== CODE_FOR_sse_comi_round
12115 ? CODE_FOR_sse_ucomi_round
12116 : CODE_FOR_sse2_ucomi_round
);
12118 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
12122 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
12123 if (INTVAL (op3
) == NO_ROUND
)
12125 pat
= ix86_erase_embedded_rounding (pat
);
12129 set_dst
= SET_DEST (pat
);
12133 gcc_assert (GET_CODE (pat
) == SET
);
12134 set_dst
= SET_DEST (pat
);
12139 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
12144 ix86_expand_round_builtin (const struct builtin_description
*d
,
12145 tree exp
, rtx target
)
12148 unsigned int i
, nargs
;
12150 enum insn_code icode
= d
->icode
;
12151 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
12152 machine_mode tmode
= insn_p
->operand
[0].mode
;
12153 unsigned int nargs_constant
= 0;
12154 unsigned int redundant_embed_rnd
= 0;
12156 switch ((enum ix86_builtin_func_type
) d
->flag
)
12158 case UINT64_FTYPE_V2DF_INT
:
12159 case UINT64_FTYPE_V4SF_INT
:
12160 case UINT64_FTYPE_V8HF_INT
:
12161 case UINT_FTYPE_V2DF_INT
:
12162 case UINT_FTYPE_V4SF_INT
:
12163 case UINT_FTYPE_V8HF_INT
:
12164 case INT64_FTYPE_V2DF_INT
:
12165 case INT64_FTYPE_V4SF_INT
:
12166 case INT64_FTYPE_V8HF_INT
:
12167 case INT_FTYPE_V2DF_INT
:
12168 case INT_FTYPE_V4SF_INT
:
12169 case INT_FTYPE_V8HF_INT
:
12172 case V32HF_FTYPE_V32HF_V32HF_INT
:
12173 case V8HF_FTYPE_V8HF_V8HF_INT
:
12174 case V8HF_FTYPE_V8HF_INT_INT
:
12175 case V8HF_FTYPE_V8HF_UINT_INT
:
12176 case V8HF_FTYPE_V8HF_INT64_INT
:
12177 case V8HF_FTYPE_V8HF_UINT64_INT
:
12178 case V4SF_FTYPE_V4SF_UINT_INT
:
12179 case V4SF_FTYPE_V4SF_UINT64_INT
:
12180 case V2DF_FTYPE_V2DF_UINT64_INT
:
12181 case V4SF_FTYPE_V4SF_INT_INT
:
12182 case V4SF_FTYPE_V4SF_INT64_INT
:
12183 case V2DF_FTYPE_V2DF_INT64_INT
:
12184 case V4SF_FTYPE_V4SF_V4SF_INT
:
12185 case V2DF_FTYPE_V2DF_V2DF_INT
:
12186 case V4SF_FTYPE_V4SF_V2DF_INT
:
12187 case V2DF_FTYPE_V2DF_V4SF_INT
:
12190 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
12191 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
12192 case V32HI_FTYPE_V32HF_V32HI_USI_INT
:
12193 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
12194 case V8DI_FTYPE_V8HF_V8DI_UQI_INT
:
12195 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
12196 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
12197 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
12198 case V8DF_FTYPE_V8HF_V8DF_UQI_INT
:
12199 case V16SF_FTYPE_V16HF_V16SF_UHI_INT
:
12200 case V32HF_FTYPE_V32HI_V32HF_USI_INT
:
12201 case V32HF_FTYPE_V32HF_V32HF_USI_INT
:
12202 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT
:
12203 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
12204 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
12205 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
12206 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
12207 case V16SI_FTYPE_V16HF_V16SI_UHI_INT
:
12208 case V16HF_FTYPE_V16SI_V16HF_UHI_INT
:
12209 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
12210 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
12211 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
12212 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
12213 case V8HF_FTYPE_V8DI_V8HF_UQI_INT
:
12214 case V8HF_FTYPE_V8DF_V8HF_UQI_INT
:
12215 case V16HF_FTYPE_V16SF_V16HF_UHI_INT
:
12216 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT
:
12219 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
12220 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
12221 nargs_constant
= 2;
12224 case INT_FTYPE_V4SF_V4SF_INT_INT
:
12225 case INT_FTYPE_V2DF_V2DF_INT_INT
:
12226 return ix86_expand_sse_comi_round (d
, exp
, target
);
12227 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
12228 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
12229 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
12230 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT
:
12231 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
12232 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT
:
12233 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT
:
12234 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT
:
12235 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
12236 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
12237 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
12238 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
12239 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
12240 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
12241 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT
:
12242 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT
:
12243 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT
:
12246 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT
:
12247 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
12248 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
12249 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
12250 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
12251 nargs_constant
= 4;
12254 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
12255 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
12256 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
12257 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
12258 case USI_FTYPE_V32HF_V32HF_INT_USI_INT
:
12259 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT
:
12260 nargs_constant
= 3;
12263 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
12264 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
12265 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
12266 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
12267 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
12268 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
12269 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT
:
12271 nargs_constant
= 4;
12273 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
12274 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
12275 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
12276 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
12278 nargs_constant
= 3;
12281 gcc_unreachable ();
12283 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
12287 || GET_MODE (target
) != tmode
12288 || !insn_p
->operand
[0].predicate (target
, tmode
))
12289 target
= gen_reg_rtx (tmode
);
12291 for (i
= 0; i
< nargs
; i
++)
12293 tree arg
= CALL_EXPR_ARG (exp
, i
);
12294 rtx op
= expand_normal (arg
);
12295 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
12296 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
12298 if (i
== nargs
- nargs_constant
)
12304 case CODE_FOR_avx512f_getmantv8df_mask_round
:
12305 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
12306 case CODE_FOR_avx512bw_getmantv32hf_mask_round
:
12307 case CODE_FOR_avx512f_vgetmantv2df_round
:
12308 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
12309 case CODE_FOR_avx512f_vgetmantv4sf_round
:
12310 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
12311 case CODE_FOR_avx512f_vgetmantv8hf_mask_round
:
12312 error ("the immediate argument must be a 4-bit immediate");
12314 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
12315 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
12316 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
12317 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
12318 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round
:
12319 case CODE_FOR_avx512bw_cmpv32hf3_mask_round
:
12320 error ("the immediate argument must be a 5-bit immediate");
12323 error ("the immediate argument must be an 8-bit immediate");
12328 else if (i
== nargs
-1)
12330 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
12332 error ("incorrect rounding operand");
	  /* If there is no rounding, use the normal version of the
	     pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing the embedded rounding for the expanders
		 below, which generate multiple insns.  In
		 ix86_erase_embedded_rounding the pattern would be
		 transformed into a single set, and emit_insn appends
		 that set instead of inserting it into the chain, so
		 the insns emitted inside the define_expand would be
		 lost.  */
	      switch (icode)
		{
12346 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round
:
12347 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round
:
12348 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round
:
12349 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round
:
12350 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round
:
12351 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round
:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
12362 if (VECTOR_MODE_P (mode
))
12363 op
= safe_vector_operand (op
, mode
);
12365 op
= fixup_modeless_constant (op
, mode
);
12367 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
12369 if (optimize
|| !match
)
12370 op
= copy_to_mode_reg (mode
, op
);
12374 op
= copy_to_reg (op
);
12375 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
12385 pat
= GEN_FCN (icode
) (target
, xops
[0]);
12388 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
12391 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
12394 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
12398 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
12399 xops
[2], xops
[3], xops
[4]);
12402 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
12403 xops
[2], xops
[3], xops
[4], xops
[5]);
12406 gcc_unreachable ();
12412 if (redundant_embed_rnd
)
12413 pat
= ix86_erase_embedded_rounding (pat
);
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  unsigned int constant = 100;
  bool aligned_mem = false;
  rtx xops[3];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
12442 case VOID_FTYPE_UINT64
:
12443 case VOID_FTYPE_UNSIGNED
:
12449 case INT_FTYPE_VOID
:
12450 case USHORT_FTYPE_VOID
:
12451 case UINT64_FTYPE_VOID
:
12452 case UINT_FTYPE_VOID
:
12453 case UINT8_FTYPE_VOID
:
12454 case UNSIGNED_FTYPE_VOID
:
12459 case UINT64_FTYPE_PUNSIGNED
:
12460 case V2DI_FTYPE_PV2DI
:
12461 case V4DI_FTYPE_PV4DI
:
12462 case V32QI_FTYPE_PCCHAR
:
12463 case V16QI_FTYPE_PCCHAR
:
12464 case V8SF_FTYPE_PCV4SF
:
12465 case V8SF_FTYPE_PCFLOAT
:
12466 case V4SF_FTYPE_PCFLOAT
:
12467 case V4SF_FTYPE_PCFLOAT16
:
12468 case V4SF_FTYPE_PCBFLOAT16
:
12469 case V4SF_FTYPE_PCV8BF
:
12470 case V4SF_FTYPE_PCV8HF
:
12471 case V8SF_FTYPE_PCFLOAT16
:
12472 case V8SF_FTYPE_PCBFLOAT16
:
12473 case V8SF_FTYPE_PCV16HF
:
12474 case V8SF_FTYPE_PCV16BF
:
12475 case V4DF_FTYPE_PCV2DF
:
12476 case V4DF_FTYPE_PCDOUBLE
:
12477 case V2DF_FTYPE_PCDOUBLE
:
12478 case VOID_FTYPE_PVOID
:
12479 case V8DI_FTYPE_PV8DI
:
12485 case CODE_FOR_sse4_1_movntdqa
:
12486 case CODE_FOR_avx2_movntdqa
:
12487 case CODE_FOR_avx512f_movntdqa
:
12488 aligned_mem
= true;
12494 case VOID_FTYPE_PV2SF_V4SF
:
12495 case VOID_FTYPE_PV8DI_V8DI
:
12496 case VOID_FTYPE_PV4DI_V4DI
:
12497 case VOID_FTYPE_PV2DI_V2DI
:
12498 case VOID_FTYPE_PCHAR_V32QI
:
12499 case VOID_FTYPE_PCHAR_V16QI
:
12500 case VOID_FTYPE_PFLOAT_V16SF
:
12501 case VOID_FTYPE_PFLOAT_V8SF
:
12502 case VOID_FTYPE_PFLOAT_V4SF
:
12503 case VOID_FTYPE_PDOUBLE_V8DF
:
12504 case VOID_FTYPE_PDOUBLE_V4DF
:
12505 case VOID_FTYPE_PDOUBLE_V2DF
:
12506 case VOID_FTYPE_PLONGLONG_LONGLONG
:
12507 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
12508 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
12509 case VOID_FTYPE_PINT_INT
:
12512 /* Reserve memory operand for target. */
12513 memory
= ARRAY_SIZE (xops
);
12516 /* These builtins and instructions require the memory
12517 to be properly aligned. */
12518 case CODE_FOR_avx_movntv4di
:
12519 case CODE_FOR_sse2_movntv2di
:
12520 case CODE_FOR_avx_movntv8sf
:
12521 case CODE_FOR_sse_movntv4sf
:
12522 case CODE_FOR_sse4a_vmmovntv4sf
:
12523 case CODE_FOR_avx_movntv4df
:
12524 case CODE_FOR_sse2_movntv2df
:
12525 case CODE_FOR_sse4a_vmmovntv2df
:
12526 case CODE_FOR_sse2_movntidi
:
12527 case CODE_FOR_sse_movntq
:
12528 case CODE_FOR_sse2_movntisi
:
12529 case CODE_FOR_avx512f_movntv16sf
:
12530 case CODE_FOR_avx512f_movntv8df
:
12531 case CODE_FOR_avx512f_movntv8di
:
12532 aligned_mem
= true;
12538 case VOID_FTYPE_PVOID_PCVOID
:
12544 case V4SF_FTYPE_V4SF_PCV2SF
:
12545 case V2DF_FTYPE_V2DF_PCDOUBLE
:
12550 case V8SF_FTYPE_PCV8SF_V8SI
:
12551 case V4DF_FTYPE_PCV4DF_V4DI
:
12552 case V4SF_FTYPE_PCV4SF_V4SI
:
12553 case V2DF_FTYPE_PCV2DF_V2DI
:
12554 case V8SI_FTYPE_PCV8SI_V8SI
:
12555 case V4DI_FTYPE_PCV4DI_V4DI
:
12556 case V4SI_FTYPE_PCV4SI_V4SI
:
12557 case V2DI_FTYPE_PCV2DI_V2DI
:
12558 case VOID_FTYPE_INT_INT64
:
12563 case VOID_FTYPE_PV8DF_V8DF_UQI
:
12564 case VOID_FTYPE_PV4DF_V4DF_UQI
:
12565 case VOID_FTYPE_PV2DF_V2DF_UQI
:
12566 case VOID_FTYPE_PV16SF_V16SF_UHI
:
12567 case VOID_FTYPE_PV8SF_V8SF_UQI
:
12568 case VOID_FTYPE_PV4SF_V4SF_UQI
:
12569 case VOID_FTYPE_PV8DI_V8DI_UQI
:
12570 case VOID_FTYPE_PV4DI_V4DI_UQI
:
12571 case VOID_FTYPE_PV2DI_V2DI_UQI
:
12572 case VOID_FTYPE_PV16SI_V16SI_UHI
:
12573 case VOID_FTYPE_PV8SI_V8SI_UQI
:
12574 case VOID_FTYPE_PV4SI_V4SI_UQI
:
12575 case VOID_FTYPE_PV64QI_V64QI_UDI
:
12576 case VOID_FTYPE_PV32HI_V32HI_USI
:
12577 case VOID_FTYPE_PV32QI_V32QI_USI
:
12578 case VOID_FTYPE_PV16QI_V16QI_UHI
:
12579 case VOID_FTYPE_PV16HI_V16HI_UHI
:
12580 case VOID_FTYPE_PV8HI_V8HI_UQI
:
12583 /* These builtins and instructions require the memory
12584 to be properly aligned. */
12585 case CODE_FOR_avx512f_storev16sf_mask
:
12586 case CODE_FOR_avx512f_storev16si_mask
:
12587 case CODE_FOR_avx512f_storev8df_mask
:
12588 case CODE_FOR_avx512f_storev8di_mask
:
12589 case CODE_FOR_avx512vl_storev8sf_mask
:
12590 case CODE_FOR_avx512vl_storev8si_mask
:
12591 case CODE_FOR_avx512vl_storev4df_mask
:
12592 case CODE_FOR_avx512vl_storev4di_mask
:
12593 case CODE_FOR_avx512vl_storev4sf_mask
:
12594 case CODE_FOR_avx512vl_storev4si_mask
:
12595 case CODE_FOR_avx512vl_storev2df_mask
:
12596 case CODE_FOR_avx512vl_storev2di_mask
:
12597 aligned_mem
= true;
12603 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
12604 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
12605 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
12606 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
12607 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
12608 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
12609 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
12610 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
12611 case VOID_FTYPE_PV8SI_V8DI_UQI
:
12612 case VOID_FTYPE_PV8HI_V8DI_UQI
:
12613 case VOID_FTYPE_PV16HI_V16SI_UHI
:
12614 case VOID_FTYPE_PUDI_V8DI_UQI
:
12615 case VOID_FTYPE_PV16QI_V16SI_UHI
:
12616 case VOID_FTYPE_PV4SI_V4DI_UQI
:
12617 case VOID_FTYPE_PUDI_V2DI_UQI
:
12618 case VOID_FTYPE_PUDI_V4DI_UQI
:
12619 case VOID_FTYPE_PUSI_V2DI_UQI
:
12620 case VOID_FTYPE_PV8HI_V8SI_UQI
:
12621 case VOID_FTYPE_PUDI_V4SI_UQI
:
12622 case VOID_FTYPE_PUSI_V4DI_UQI
:
12623 case VOID_FTYPE_PUHI_V2DI_UQI
:
12624 case VOID_FTYPE_PUDI_V8SI_UQI
:
12625 case VOID_FTYPE_PUSI_V4SI_UQI
:
12626 case VOID_FTYPE_PCHAR_V64QI_UDI
:
12627 case VOID_FTYPE_PCHAR_V32QI_USI
:
12628 case VOID_FTYPE_PCHAR_V16QI_UHI
:
12629 case VOID_FTYPE_PSHORT_V32HI_USI
:
12630 case VOID_FTYPE_PSHORT_V16HI_UHI
:
12631 case VOID_FTYPE_PSHORT_V8HI_UQI
:
12632 case VOID_FTYPE_PINT_V16SI_UHI
:
12633 case VOID_FTYPE_PINT_V8SI_UQI
:
12634 case VOID_FTYPE_PINT_V4SI_UQI
:
12635 case VOID_FTYPE_PINT64_V8DI_UQI
:
12636 case VOID_FTYPE_PINT64_V4DI_UQI
:
12637 case VOID_FTYPE_PINT64_V2DI_UQI
:
12638 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
12639 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
12640 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
12641 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
12642 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
12643 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
12644 case VOID_FTYPE_PCFLOAT16_V8HF_UQI
:
12645 case VOID_FTYPE_PV32QI_V32HI_USI
:
12646 case VOID_FTYPE_PV16QI_V16HI_UHI
:
12647 case VOID_FTYPE_PUDI_V8HI_UQI
:
12650 /* Reserve memory operand for target. */
12651 memory
= ARRAY_SIZE (xops
);
12653 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
12654 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
12655 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
12656 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
12657 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
12658 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
12659 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
12660 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
12661 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
12662 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
12663 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
12664 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
12665 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
12666 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
12667 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
12668 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
12669 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
12670 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
12673 /* These builtins and instructions require the memory
12674 to be properly aligned. */
12675 case CODE_FOR_avx512f_loadv16sf_mask
:
12676 case CODE_FOR_avx512f_loadv16si_mask
:
12677 case CODE_FOR_avx512f_loadv8df_mask
:
12678 case CODE_FOR_avx512f_loadv8di_mask
:
12679 case CODE_FOR_avx512vl_loadv8sf_mask
:
12680 case CODE_FOR_avx512vl_loadv8si_mask
:
12681 case CODE_FOR_avx512vl_loadv4df_mask
:
12682 case CODE_FOR_avx512vl_loadv4di_mask
:
12683 case CODE_FOR_avx512vl_loadv4sf_mask
:
12684 case CODE_FOR_avx512vl_loadv4si_mask
:
12685 case CODE_FOR_avx512vl_loadv2df_mask
:
12686 case CODE_FOR_avx512vl_loadv2di_mask
:
12687 case CODE_FOR_avx512bw_loadv64qi_mask
:
12688 case CODE_FOR_avx512vl_loadv32qi_mask
:
12689 case CODE_FOR_avx512vl_loadv16qi_mask
:
12690 case CODE_FOR_avx512bw_loadv32hi_mask
:
12691 case CODE_FOR_avx512vl_loadv16hi_mask
:
12692 case CODE_FOR_avx512vl_loadv8hi_mask
:
12693 aligned_mem
= true;
12699 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
12700 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
12701 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
12702 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
12703 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
12704 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
12705 case V16SI_FTYPE_PCINT_V16SI_UHI
:
12706 case V8SI_FTYPE_PCINT_V8SI_UQI
:
12707 case V4SI_FTYPE_PCINT_V4SI_UQI
:
12708 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
12709 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
12710 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
12711 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
12712 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
12713 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
12714 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
12715 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
12716 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
12717 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI
:
12722 case INT_FTYPE_PINT_INT_INT_INT
:
12723 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT
:
12730 gcc_unreachable ();
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else if (i == constant)
	{
	  /* This must be the constant.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
	      return const0_rtx;
	    }
	}
      else
	{
	  /* This must be a register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: a 3-operand load implies it's a mask load or v{p}expand*,
	     and the mask operand should be at the end.
	     Keep an all-ones mask, which will be simplified by the
	     expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode)
	      && insn_p->operand[i].predicate (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  emit_insn (pat);
  return klass == store ? 0 : target;
}
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
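/* Illustrative sketch (not part of the original source): the mmintrin.h
   wrappers funnel into the expander above roughly like

     __m64 x = (__m64) __builtin_ia32_vec_init_v2si (1, 2);

   i.e. one initializer per element, which becomes an
   ix86_expand_vector_init of a two-element PARALLEL.  The builtin name is
   shown only as an example.  */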
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  rtx op0;
  unsigned HOST_WIDE_INT elt;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
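/* Illustrative sketch (not part of the original source): the MMX element
   accessors expand through the helper above, roughly

     __v2si v = ...;
     int lo = __builtin_ia32_vec_ext_v2si (v, 0);   // extract element 0

   where the selector must be a constant accepted by get_element_number.
   The builtin name is shown only as an example.  */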
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  unsigned HOST_WIDE_INT elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */

bool
ix86_check_builtin_isa_match (unsigned int fcode,
			      HOST_WIDE_INT* pbisa,
			      HOST_WIDE_INT* pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */

#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
	  || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
    { \
      tmp_isa |= (A1) | (B1); \
      tmp_isa2 |= (A2) | (B2); \
    }

  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
		 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
		 OPTION_MASK_ISA2_VAES);
13035 if ((bisa
& OPTION_MASK_ISA_MMX
) && !TARGET_MMX
&& TARGET_MMX_WITH_SSE
13036 /* __builtin_ia32_maskmovq requires MMX registers. */
13037 && fcode
!= IX86_BUILTIN_MASKMOVQ
)
13039 bisa
&= ~OPTION_MASK_ISA_MMX
;
13040 bisa
|= OPTION_MASK_ISA_SSE2
;
13048 return (bisa
& isa
) == bisa
&& (bisa2
& isa2
) == bisa2
;
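/* For example, a builtin whose isa requirement is
   OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL is also accepted
   when only AVXVNNI is enabled: the corresponding SHARE_BUILTIN invocation
   ors both alternatives into tmp_isa/tmp_isa2, so the final subset check
   above succeeds.  */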
/* Emit instructions to set the carry flag from ARG.  */

void
ix86_expand_carry (rtx arg)
{
  if (!CONST_INT_P (arg) || arg == const0_rtx)
    {
      arg = convert_to_mode (QImode, arg, 1);
      arg = copy_to_mode_reg (QImode, arg);
      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
    }
  else
    emit_insn (gen_x86_stc ());
}
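/* For a run-time ARG the carry comes from adding -1 in QImode: the addition
   carries out exactly when ARG is nonzero.  A nonzero constant ARG can
   simply use STC.  */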
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */
rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  HOST_WIDE_INT bisa, bisa2;
  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    default:
      break;
    }
  if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0,
				       (enum prefer_vector_width) 0,
				       PVW_NONE, PVW_NONE,
				       false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }
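  /* The diagnostic above does not stop expansion: falling back to
     expand_call keeps a valid (if useless) call in the IL, so one missing
     ISA option does not cascade into further errors.  */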
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
	return 0;
      emit_insn (pat);
      return 0;
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
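      /* Both MXCSR patterns only accept a memory operand, so the value is
	 staged through the 32-bit SLOT_TEMP stack slot in either direction.  */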
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;
    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;
    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;
13261 case IX86_BUILTIN_UMWAIT
:
13262 case IX86_BUILTIN_TPAUSE
:
13263 arg0
= CALL_EXPR_ARG (exp
, 0);
13264 arg1
= CALL_EXPR_ARG (exp
, 1);
13265 op0
= expand_normal (arg0
);
13266 op1
= expand_normal (arg1
);
13269 op0
= copy_to_mode_reg (SImode
, op0
);
13271 op1
= force_reg (DImode
, op1
);
13275 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13276 NULL
, 1, OPTAB_DIRECT
);
13279 case IX86_BUILTIN_UMWAIT
:
13280 icode
= CODE_FOR_umwait_rex64
;
13282 case IX86_BUILTIN_TPAUSE
:
13283 icode
= CODE_FOR_tpause_rex64
;
13286 gcc_unreachable ();
13289 op2
= gen_lowpart (SImode
, op2
);
13290 op1
= gen_lowpart (SImode
, op1
);
13291 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13297 case IX86_BUILTIN_UMWAIT
:
13298 icode
= CODE_FOR_umwait
;
13300 case IX86_BUILTIN_TPAUSE
:
13301 icode
= CODE_FOR_tpause
;
13304 gcc_unreachable ();
13306 pat
= GEN_FCN (icode
) (op0
, op1
);
13315 || !register_operand (target
, QImode
))
13316 target
= gen_reg_rtx (QImode
);
13318 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13320 emit_insn (gen_rtx_SET (target
, pat
));
13324 case IX86_BUILTIN_TESTUI
:
13325 emit_insn (gen_testui ());
13328 || !register_operand (target
, QImode
))
13329 target
= gen_reg_rtx (QImode
);
13331 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13333 emit_insn (gen_rtx_SET (target
, pat
));
13337 case IX86_BUILTIN_CLZERO
:
13338 arg0
= CALL_EXPR_ARG (exp
, 0);
13339 op0
= expand_normal (arg0
);
13341 op0
= ix86_zero_extend_to_Pmode (op0
);
13342 emit_insn (gen_clzero (Pmode
, op0
));
13345 case IX86_BUILTIN_CLDEMOTE
:
13346 arg0
= CALL_EXPR_ARG (exp
, 0);
13347 op0
= expand_normal (arg0
);
13348 icode
= CODE_FOR_cldemote
;
13349 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
13350 op0
= ix86_zero_extend_to_Pmode (op0
);
13352 emit_insn (gen_cldemote (op0
));
13355 case IX86_BUILTIN_LOADIWKEY
:
13357 arg0
= CALL_EXPR_ARG (exp
, 0);
13358 arg1
= CALL_EXPR_ARG (exp
, 1);
13359 arg2
= CALL_EXPR_ARG (exp
, 2);
13360 arg3
= CALL_EXPR_ARG (exp
, 3);
13362 op0
= expand_normal (arg0
);
13363 op1
= expand_normal (arg1
);
13364 op2
= expand_normal (arg2
);
13365 op3
= expand_normal (arg3
);
13368 op0
= copy_to_mode_reg (V2DImode
, op0
);
13370 op1
= copy_to_mode_reg (V2DImode
, op1
);
13372 op2
= copy_to_mode_reg (V2DImode
, op2
);
13374 op3
= copy_to_mode_reg (SImode
, op3
);
13376 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
13381 case IX86_BUILTIN_AESDEC128KLU8
:
13382 icode
= CODE_FOR_aesdec128klu8
;
13383 goto aesdecenc_expand
;
13385 case IX86_BUILTIN_AESDEC256KLU8
:
13386 icode
= CODE_FOR_aesdec256klu8
;
13387 goto aesdecenc_expand
;
13389 case IX86_BUILTIN_AESENC128KLU8
:
13390 icode
= CODE_FOR_aesenc128klu8
;
13391 goto aesdecenc_expand
;
13393 case IX86_BUILTIN_AESENC256KLU8
:
13394 icode
= CODE_FOR_aesenc256klu8
;
13398 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i *odata
13399 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i idata
13400 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
13402 op0
= expand_normal (arg0
);
13403 op1
= expand_normal (arg1
);
13404 op2
= expand_normal (arg2
);
13406 if (!address_operand (op0
, V2DImode
))
13408 op0
= convert_memory_address (Pmode
, op0
);
13409 op0
= copy_addr_to_reg (op0
);
13411 op0
= gen_rtx_MEM (V2DImode
, op0
);
13414 op1
= copy_to_mode_reg (V2DImode
, op1
);
13416 if (!address_operand (op2
, VOIDmode
))
13418 op2
= convert_memory_address (Pmode
, op2
);
13419 op2
= copy_addr_to_reg (op2
);
13421 op2
= gen_rtx_MEM (BLKmode
, op2
);
13423 emit_insn (GEN_FCN (icode
) (op1
, op1
, op2
));
13426 target
= gen_reg_rtx (QImode
);
13428 /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
13429 error occurs. Then the output should be cleared for safety. */
13430 rtx_code_label
*ok_label
;
13433 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
13434 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
13435 ok_label
= gen_label_rtx ();
13436 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
      /* The runtime error seldom occurs, so predict the OK path as the
	 hot one and lay it out as the fallthrough block.  */
13440 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
13442 emit_insn (gen_rtx_SET (op1
, const0_rtx
));
13444 emit_label (ok_label
);
13445 emit_insn (gen_rtx_SET (target
, pat
));
13446 emit_insn (gen_rtx_SET (op0
, op1
));
13450 case IX86_BUILTIN_AESDECWIDE128KLU8
:
13451 icode
= CODE_FOR_aesdecwide128klu8
;
13452 goto wideaesdecenc_expand
;
13454 case IX86_BUILTIN_AESDECWIDE256KLU8
:
13455 icode
= CODE_FOR_aesdecwide256klu8
;
13456 goto wideaesdecenc_expand
;
13458 case IX86_BUILTIN_AESENCWIDE128KLU8
:
13459 icode
= CODE_FOR_aesencwide128klu8
;
13460 goto wideaesdecenc_expand
;
13462 case IX86_BUILTIN_AESENCWIDE256KLU8
:
13463 icode
= CODE_FOR_aesencwide256klu8
;
13465 wideaesdecenc_expand
:
13470 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
13471 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
13472 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
13474 op0
= expand_normal (arg0
);
13475 op1
= expand_normal (arg1
);
13476 op2
= expand_normal (arg2
);
13478 if (!address_operand (op2
, VOIDmode
))
13480 op2
= convert_memory_address (Pmode
, op2
);
13481 op2
= copy_addr_to_reg (op2
);
13483 op2
= gen_rtx_MEM (BLKmode
, op2
);
13485 for (i
= 0; i
< 8; i
++)
13487 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13489 op
= gen_rtx_MEM (V2DImode
,
13490 plus_constant (Pmode
, op1
, (i
* 16)));
13492 emit_move_insn (xmm_regs
[i
], op
);
13495 emit_insn (GEN_FCN (icode
) (op2
));
13498 target
= gen_reg_rtx (QImode
);
13500 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
13501 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
13502 ok_label
= gen_label_rtx ();
13503 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
13505 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
13507 for (i
= 0; i
< 8; i
++)
13508 emit_insn (gen_rtx_SET (xmm_regs
[i
], const0_rtx
));
13510 emit_label (ok_label
);
13511 emit_insn (gen_rtx_SET (target
, pat
));
13513 for (i
= 0; i
< 8; i
++)
13515 op
= gen_rtx_MEM (V2DImode
,
13516 plus_constant (Pmode
, op0
, (i
* 16)));
13517 emit_move_insn (op
, xmm_regs
[i
]);
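    /* The ENCODEKEY{128,256} cases below return the instruction's 32-bit
       result and copy the produced key handle out of xmm0..xmm2 (xmm0..xmm3
       for the 256-bit variant) to the caller-supplied buffer, 16 bytes at
       a time.  */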
13522 case IX86_BUILTIN_ENCODEKEY128U32
:
13524 rtx op
, xmm_regs
[7];
13526 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
13527 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i key
13528 arg2
= CALL_EXPR_ARG (exp
, 2); // void *h
13530 op0
= expand_normal (arg0
);
13531 op1
= expand_normal (arg1
);
13532 op2
= expand_normal (arg2
);
13535 op0
= copy_to_mode_reg (SImode
, op0
);
13537 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
13538 emit_move_insn (op
, op1
);
13540 for (i
= 0; i
< 3; i
++)
13541 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13544 target
= gen_reg_rtx (SImode
);
13546 emit_insn (gen_encodekey128u32 (target
, op0
));
13548 for (i
= 0; i
< 3; i
++)
13550 op
= gen_rtx_MEM (V2DImode
,
13551 plus_constant (Pmode
, op2
, (i
* 16)));
13552 emit_move_insn (op
, xmm_regs
[i
]);
13557 case IX86_BUILTIN_ENCODEKEY256U32
:
13559 rtx op
, xmm_regs
[7];
13561 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
13562 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i keylow
13563 arg2
= CALL_EXPR_ARG (exp
, 2); // __m128i keyhi
13564 arg3
= CALL_EXPR_ARG (exp
, 3); // void *h
13566 op0
= expand_normal (arg0
);
13567 op1
= expand_normal (arg1
);
13568 op2
= expand_normal (arg2
);
13569 op3
= expand_normal (arg3
);
13572 op0
= copy_to_mode_reg (SImode
, op0
);
	/* Force to use xmm0, xmm1 for keylow, keyhi.  */
13575 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
13576 emit_move_insn (op
, op1
);
13577 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (1));
13578 emit_move_insn (op
, op2
);
13580 for (i
= 0; i
< 4; i
++)
13581 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13584 target
= gen_reg_rtx (SImode
);
13586 emit_insn (gen_encodekey256u32 (target
, op0
));
13588 for (i
= 0; i
< 4; i
++)
13590 op
= gen_rtx_MEM (V2DImode
,
13591 plus_constant (Pmode
, op3
, (i
* 16)));
13592 emit_move_insn (op
, xmm_regs
[i
]);
13598 case IX86_BUILTIN_PREFETCH
:
13600 arg0
= CALL_EXPR_ARG (exp
, 0); // const void *
13601 arg1
= CALL_EXPR_ARG (exp
, 1); // const int
13602 arg2
= CALL_EXPR_ARG (exp
, 2); // const int
13603 arg3
= CALL_EXPR_ARG (exp
, 3); // const int
13605 op0
= expand_normal (arg0
);
13606 op1
= expand_normal (arg1
);
13607 op2
= expand_normal (arg2
);
13608 op3
= expand_normal (arg3
);
13610 if (!CONST_INT_P (op1
) || !CONST_INT_P (op2
) || !CONST_INT_P (op3
))
13612 error ("second, third and fourth argument must be a const");
13616 if (INTVAL (op3
) == 1)
13618 if (INTVAL (op2
) < 2 || INTVAL (op2
) > 3)
13620 error ("invalid third argument");
13624 if (TARGET_64BIT
&& TARGET_PREFETCHI
13625 && local_func_symbolic_operand (op0
, GET_MODE (op0
)))
13626 emit_insn (gen_prefetchi (op0
, op2
));
13629 warning (0, "instruction prefetch applies when in 64-bit mode"
13630 " with RIP-relative addressing and"
13631 " option %<-mprefetchi%>;"
13632 " they stay NOPs otherwise");
13633 emit_insn (gen_nop ());
13638 if (!address_operand (op0
, VOIDmode
))
13640 op0
= convert_memory_address (Pmode
, op0
);
13641 op0
= copy_addr_to_reg (op0
);
13644 if (INTVAL (op2
) < 0 || INTVAL (op2
) > 3)
13646 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13650 if (TARGET_3DNOW
|| TARGET_PREFETCH_SSE
13651 || TARGET_PRFCHW
|| TARGET_PREFETCHWT1
)
13652 emit_insn (gen_prefetch (op0
, op1
, op2
));
13653 else if (!MEM_P (op0
) && side_effects_p (op0
))
13654 /* Don't do anything with direct references to volatile memory,
13655 but generate code to handle other side effects. */
13662 case IX86_BUILTIN_PREFETCHI
:
13664 arg0
= CALL_EXPR_ARG (exp
, 0); // const void *
13665 arg1
= CALL_EXPR_ARG (exp
, 1); // const int
13667 op0
= expand_normal (arg0
);
13668 op1
= expand_normal (arg1
);
13670 if (!CONST_INT_P (op1
))
13672 error ("second argument must be a const");
      /* GOT/PLT_PIC should not be available for instruction prefetch.
	 It must be a real instruction address.  */
13679 && local_func_symbolic_operand (op0
, GET_MODE (op0
)))
13680 emit_insn (gen_prefetchi (op0
, op1
));
13683 /* Ignore the hint. */
13684 warning (0, "instruction prefetch applies when in 64-bit mode"
13685 " with RIP-relative addressing and"
13686 " option %<-mprefetchi%>;"
13687 " they stay NOPs otherwise");
13688 emit_insn (gen_nop ());
13694 case IX86_BUILTIN_URDMSR
:
13695 case IX86_BUILTIN_UWRMSR
:
13697 arg0
= CALL_EXPR_ARG (exp
, 0);
13698 op0
= expand_normal (arg0
);
13700 if (CONST_INT_P (op0
))
13702 unsigned HOST_WIDE_INT val
= UINTVAL (op0
);
13703 if (val
> 0xffffffff)
13704 op0
= force_reg (DImode
, op0
);
13707 op0
= force_reg (DImode
, op0
);
13709 if (fcode
== IX86_BUILTIN_UWRMSR
)
13711 arg1
= CALL_EXPR_ARG (exp
, 1);
13712 op1
= expand_normal (arg1
);
13713 op1
= force_reg (DImode
, op1
);
13714 icode
= CODE_FOR_uwrmsr
;
13720 target
= gen_reg_rtx (DImode
);
13721 icode
= CODE_FOR_urdmsr
;
13725 emit_insn (GEN_FCN (icode
) (op0
, op1
));
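	/* urdmsr/uwrmsr accept the MSR index as an immediate only when it
	   fits in 32 bits; otherwise it was forced into a DImode register
	   above.  For URDMSR the value read is returned in TARGET.  */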
13729 case IX86_BUILTIN_VEC_INIT_V2SI
:
13730 case IX86_BUILTIN_VEC_INIT_V4HI
:
13731 case IX86_BUILTIN_VEC_INIT_V8QI
:
13732 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
13734 case IX86_BUILTIN_VEC_EXT_V2DF
:
13735 case IX86_BUILTIN_VEC_EXT_V2DI
:
13736 case IX86_BUILTIN_VEC_EXT_V4SF
:
13737 case IX86_BUILTIN_VEC_EXT_V4SI
:
13738 case IX86_BUILTIN_VEC_EXT_V8HI
:
13739 case IX86_BUILTIN_VEC_EXT_V2SI
:
13740 case IX86_BUILTIN_VEC_EXT_V4HI
:
13741 case IX86_BUILTIN_VEC_EXT_V16QI
:
13742 return ix86_expand_vec_ext_builtin (exp
, target
);
13744 case IX86_BUILTIN_VEC_SET_V2DI
:
13745 case IX86_BUILTIN_VEC_SET_V4SF
:
13746 case IX86_BUILTIN_VEC_SET_V4SI
:
13747 case IX86_BUILTIN_VEC_SET_V8HI
:
13748 case IX86_BUILTIN_VEC_SET_V4HI
:
13749 case IX86_BUILTIN_VEC_SET_V16QI
:
13750 return ix86_expand_vec_set_builtin (exp
);
13752 case IX86_BUILTIN_NANQ
:
13753 case IX86_BUILTIN_NANSQ
:
13754 return expand_call (exp
, target
, ignore
);
13756 case IX86_BUILTIN_RDPID
:
13758 op0
= gen_reg_rtx (word_mode
);
13762 insn
= gen_rdpid_rex64 (op0
);
13763 op0
= convert_to_mode (SImode
, op0
, 1);
13766 insn
= gen_rdpid (op0
);
13771 || !register_operand (target
, SImode
))
13772 target
= gen_reg_rtx (SImode
);
13774 emit_move_insn (target
, op0
);
13777 case IX86_BUILTIN_2INTERSECTD512
:
13778 case IX86_BUILTIN_2INTERSECTQ512
:
13779 case IX86_BUILTIN_2INTERSECTD256
:
13780 case IX86_BUILTIN_2INTERSECTQ256
:
13781 case IX86_BUILTIN_2INTERSECTD128
:
13782 case IX86_BUILTIN_2INTERSECTQ128
:
13783 arg0
= CALL_EXPR_ARG (exp
, 0);
13784 arg1
= CALL_EXPR_ARG (exp
, 1);
13785 arg2
= CALL_EXPR_ARG (exp
, 2);
13786 arg3
= CALL_EXPR_ARG (exp
, 3);
13787 op0
= expand_normal (arg0
);
13788 op1
= expand_normal (arg1
);
13789 op2
= expand_normal (arg2
);
13790 op3
= expand_normal (arg3
);
13792 if (!address_operand (op0
, VOIDmode
))
13794 op0
= convert_memory_address (Pmode
, op0
);
13795 op0
= copy_addr_to_reg (op0
);
13797 if (!address_operand (op1
, VOIDmode
))
13799 op1
= convert_memory_address (Pmode
, op1
);
13800 op1
= copy_addr_to_reg (op1
);
13805 case IX86_BUILTIN_2INTERSECTD512
:
13807 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
13809 case IX86_BUILTIN_2INTERSECTQ512
:
13811 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
13813 case IX86_BUILTIN_2INTERSECTD256
:
13815 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
13817 case IX86_BUILTIN_2INTERSECTQ256
:
13819 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
13821 case IX86_BUILTIN_2INTERSECTD128
:
13823 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
13825 case IX86_BUILTIN_2INTERSECTQ128
:
13827 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
13830 gcc_unreachable ();
13833 mode2
= insn_data
[icode
].operand
[1].mode
;
13834 mode3
= insn_data
[icode
].operand
[2].mode
;
13835 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
13836 op2
= copy_to_mode_reg (mode2
, op2
);
13837 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
13838 op3
= copy_to_mode_reg (mode3
, op3
);
13840 op4
= gen_reg_rtx (mode4
);
13841 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
13842 mode0
= mode4
== P2HImode
? HImode
: QImode
;
13843 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
13844 gen_lowpart (mode0
, op4
));
13845 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
13846 gen_highpart (mode0
, op4
));
13850 case IX86_BUILTIN_RDPMC
:
13851 case IX86_BUILTIN_RDTSC
:
13852 case IX86_BUILTIN_RDTSCP
:
13853 case IX86_BUILTIN_XGETBV
:
13855 op0
= gen_reg_rtx (DImode
);
13856 op1
= gen_reg_rtx (DImode
);
13858 if (fcode
== IX86_BUILTIN_RDPMC
)
13860 arg0
= CALL_EXPR_ARG (exp
, 0);
13861 op2
= expand_normal (arg0
);
13862 if (!register_operand (op2
, SImode
))
13863 op2
= copy_to_mode_reg (SImode
, op2
);
13865 insn
= (TARGET_64BIT
13866 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
13867 : gen_rdpmc (op0
, op2
));
13870 else if (fcode
== IX86_BUILTIN_XGETBV
)
13872 arg0
= CALL_EXPR_ARG (exp
, 0);
13873 op2
= expand_normal (arg0
);
13874 if (!register_operand (op2
, SImode
))
13875 op2
= copy_to_mode_reg (SImode
, op2
);
13877 insn
= (TARGET_64BIT
13878 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
13879 : gen_xgetbv (op0
, op2
));
13882 else if (fcode
== IX86_BUILTIN_RDTSC
)
13884 insn
= (TARGET_64BIT
13885 ? gen_rdtsc_rex64 (op0
, op1
)
13886 : gen_rdtsc (op0
));
13891 op2
= gen_reg_rtx (SImode
);
13893 insn
= (TARGET_64BIT
13894 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
13895 : gen_rdtscp (op0
, op2
));
13898 arg0
= CALL_EXPR_ARG (exp
, 0);
13899 op4
= expand_normal (arg0
);
13900 if (!address_operand (op4
, VOIDmode
))
13902 op4
= convert_memory_address (Pmode
, op4
);
13903 op4
= copy_addr_to_reg (op4
);
13905 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
13909 || !register_operand (target
, DImode
))
13910 target
= gen_reg_rtx (DImode
);
13914 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
13915 op1
, 1, OPTAB_DIRECT
);
13916 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
13917 op0
, 1, OPTAB_DIRECT
);
13920 emit_move_insn (target
, op0
);
13923 case IX86_BUILTIN_ENQCMD
:
13924 case IX86_BUILTIN_ENQCMDS
:
13925 case IX86_BUILTIN_MOVDIR64B
:
13927 arg0
= CALL_EXPR_ARG (exp
, 0);
13928 arg1
= CALL_EXPR_ARG (exp
, 1);
13929 op0
= expand_normal (arg0
);
13930 op1
= expand_normal (arg1
);
13932 op0
= ix86_zero_extend_to_Pmode (op0
);
13933 if (!address_operand (op1
, VOIDmode
))
13935 op1
= convert_memory_address (Pmode
, op1
);
13936 op1
= copy_addr_to_reg (op1
);
13938 op1
= gen_rtx_MEM (XImode
, op1
);
13940 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
13942 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
13948 || !register_operand (target
, SImode
))
13949 target
= gen_reg_rtx (SImode
);
13951 emit_move_insn (target
, const0_rtx
);
13952 target
= gen_rtx_SUBREG (QImode
, target
, 0);
13954 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
13956 : UNSPECV_ENQCMDS
);
13957 icode
= code_for_enqcmd (unspecv
, Pmode
);
13958 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13961 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
13962 gen_rtx_fmt_ee (EQ
, QImode
,
13963 gen_rtx_REG (CCZmode
, FLAGS_REG
),
13965 return SUBREG_REG (target
);
13968 case IX86_BUILTIN_FXSAVE
:
13969 case IX86_BUILTIN_FXRSTOR
:
13970 case IX86_BUILTIN_FXSAVE64
:
13971 case IX86_BUILTIN_FXRSTOR64
:
13972 case IX86_BUILTIN_FNSTENV
:
13973 case IX86_BUILTIN_FLDENV
:
13977 case IX86_BUILTIN_FXSAVE
:
13978 icode
= CODE_FOR_fxsave
;
13980 case IX86_BUILTIN_FXRSTOR
:
13981 icode
= CODE_FOR_fxrstor
;
13983 case IX86_BUILTIN_FXSAVE64
:
13984 icode
= CODE_FOR_fxsave64
;
13986 case IX86_BUILTIN_FXRSTOR64
:
13987 icode
= CODE_FOR_fxrstor64
;
13989 case IX86_BUILTIN_FNSTENV
:
13990 icode
= CODE_FOR_fnstenv
;
13992 case IX86_BUILTIN_FLDENV
:
13993 icode
= CODE_FOR_fldenv
;
13996 gcc_unreachable ();
13999 arg0
= CALL_EXPR_ARG (exp
, 0);
14000 op0
= expand_normal (arg0
);
14002 if (!address_operand (op0
, VOIDmode
))
14004 op0
= convert_memory_address (Pmode
, op0
);
14005 op0
= copy_addr_to_reg (op0
);
14007 op0
= gen_rtx_MEM (mode0
, op0
);
14009 pat
= GEN_FCN (icode
) (op0
);
14014 case IX86_BUILTIN_XSETBV
:
14015 arg0
= CALL_EXPR_ARG (exp
, 0);
14016 arg1
= CALL_EXPR_ARG (exp
, 1);
14017 op0
= expand_normal (arg0
);
14018 op1
= expand_normal (arg1
);
14021 op0
= copy_to_mode_reg (SImode
, op0
);
14023 op1
= force_reg (DImode
, op1
);
14027 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
14028 NULL
, 1, OPTAB_DIRECT
);
14030 icode
= CODE_FOR_xsetbv_rex64
;
14032 op2
= gen_lowpart (SImode
, op2
);
14033 op1
= gen_lowpart (SImode
, op1
);
14034 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
14038 icode
= CODE_FOR_xsetbv
;
14040 pat
= GEN_FCN (icode
) (op0
, op1
);
14046 case IX86_BUILTIN_XSAVE
:
14047 case IX86_BUILTIN_XRSTOR
:
14048 case IX86_BUILTIN_XSAVE64
:
14049 case IX86_BUILTIN_XRSTOR64
:
14050 case IX86_BUILTIN_XSAVEOPT
:
14051 case IX86_BUILTIN_XSAVEOPT64
:
14052 case IX86_BUILTIN_XSAVES
:
14053 case IX86_BUILTIN_XRSTORS
:
14054 case IX86_BUILTIN_XSAVES64
:
14055 case IX86_BUILTIN_XRSTORS64
:
14056 case IX86_BUILTIN_XSAVEC
:
14057 case IX86_BUILTIN_XSAVEC64
:
14058 arg0
= CALL_EXPR_ARG (exp
, 0);
14059 arg1
= CALL_EXPR_ARG (exp
, 1);
14060 op0
= expand_normal (arg0
);
14061 op1
= expand_normal (arg1
);
14063 if (!address_operand (op0
, VOIDmode
))
14065 op0
= convert_memory_address (Pmode
, op0
);
14066 op0
= copy_addr_to_reg (op0
);
14068 op0
= gen_rtx_MEM (BLKmode
, op0
);
14070 op1
= force_reg (DImode
, op1
);
14074 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
14075 NULL
, 1, OPTAB_DIRECT
);
14078 case IX86_BUILTIN_XSAVE
:
14079 icode
= CODE_FOR_xsave_rex64
;
14081 case IX86_BUILTIN_XRSTOR
:
14082 icode
= CODE_FOR_xrstor_rex64
;
14084 case IX86_BUILTIN_XSAVE64
:
14085 icode
= CODE_FOR_xsave64
;
14087 case IX86_BUILTIN_XRSTOR64
:
14088 icode
= CODE_FOR_xrstor64
;
14090 case IX86_BUILTIN_XSAVEOPT
:
14091 icode
= CODE_FOR_xsaveopt_rex64
;
14093 case IX86_BUILTIN_XSAVEOPT64
:
14094 icode
= CODE_FOR_xsaveopt64
;
14096 case IX86_BUILTIN_XSAVES
:
14097 icode
= CODE_FOR_xsaves_rex64
;
14099 case IX86_BUILTIN_XRSTORS
:
14100 icode
= CODE_FOR_xrstors_rex64
;
14102 case IX86_BUILTIN_XSAVES64
:
14103 icode
= CODE_FOR_xsaves64
;
14105 case IX86_BUILTIN_XRSTORS64
:
14106 icode
= CODE_FOR_xrstors64
;
14108 case IX86_BUILTIN_XSAVEC
:
14109 icode
= CODE_FOR_xsavec_rex64
;
14111 case IX86_BUILTIN_XSAVEC64
:
14112 icode
= CODE_FOR_xsavec64
;
14115 gcc_unreachable ();
14118 op2
= gen_lowpart (SImode
, op2
);
14119 op1
= gen_lowpart (SImode
, op1
);
14120 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
14126 case IX86_BUILTIN_XSAVE
:
14127 icode
= CODE_FOR_xsave
;
14129 case IX86_BUILTIN_XRSTOR
:
14130 icode
= CODE_FOR_xrstor
;
14132 case IX86_BUILTIN_XSAVEOPT
:
14133 icode
= CODE_FOR_xsaveopt
;
14135 case IX86_BUILTIN_XSAVES
:
14136 icode
= CODE_FOR_xsaves
;
14138 case IX86_BUILTIN_XRSTORS
:
14139 icode
= CODE_FOR_xrstors
;
14141 case IX86_BUILTIN_XSAVEC
:
14142 icode
= CODE_FOR_xsavec
;
14145 gcc_unreachable ();
14147 pat
= GEN_FCN (icode
) (op0
, op1
);
14154 case IX86_BUILTIN_LDTILECFG
:
14155 case IX86_BUILTIN_STTILECFG
:
14156 arg0
= CALL_EXPR_ARG (exp
, 0);
14157 op0
= expand_normal (arg0
);
14159 if (!address_operand (op0
, VOIDmode
))
14161 op0
= convert_memory_address (Pmode
, op0
);
14162 op0
= copy_addr_to_reg (op0
);
14164 op0
= gen_rtx_MEM (XImode
, op0
);
14165 if (fcode
== IX86_BUILTIN_LDTILECFG
)
14166 icode
= CODE_FOR_ldtilecfg
;
14168 icode
= CODE_FOR_sttilecfg
;
14169 pat
= GEN_FCN (icode
) (op0
);
14173 case IX86_BUILTIN_LLWPCB
:
14174 arg0
= CALL_EXPR_ARG (exp
, 0);
14175 op0
= expand_normal (arg0
);
14177 if (!register_operand (op0
, Pmode
))
14178 op0
= ix86_zero_extend_to_Pmode (op0
);
14179 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
14182 case IX86_BUILTIN_SLWPCB
:
14184 || !register_operand (target
, Pmode
))
14185 target
= gen_reg_rtx (Pmode
);
14186 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
14189 case IX86_BUILTIN_LWPVAL32
:
14190 case IX86_BUILTIN_LWPVAL64
:
14191 case IX86_BUILTIN_LWPINS32
:
14192 case IX86_BUILTIN_LWPINS64
:
14193 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
14194 || fcode
== IX86_BUILTIN_LWPINS32
)
14195 ? SImode
: DImode
);
14197 if (fcode
== IX86_BUILTIN_LWPVAL32
14198 || fcode
== IX86_BUILTIN_LWPVAL64
)
14199 icode
= code_for_lwp_lwpval (mode
);
14201 icode
= code_for_lwp_lwpins (mode
);
14203 arg0
= CALL_EXPR_ARG (exp
, 0);
14204 arg1
= CALL_EXPR_ARG (exp
, 1);
14205 arg2
= CALL_EXPR_ARG (exp
, 2);
14206 op0
= expand_normal (arg0
);
14207 op1
= expand_normal (arg1
);
14208 op2
= expand_normal (arg2
);
14209 mode0
= insn_data
[icode
].operand
[0].mode
;
14211 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14212 op0
= copy_to_mode_reg (mode0
, op0
);
14213 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
14214 op1
= copy_to_mode_reg (SImode
, op1
);
14216 if (!CONST_INT_P (op2
))
14218 error ("the last argument must be a 32-bit immediate");
14222 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
14224 if (fcode
== IX86_BUILTIN_LWPINS32
14225 || fcode
== IX86_BUILTIN_LWPINS64
)
14228 || !nonimmediate_operand (target
, QImode
))
14229 target
= gen_reg_rtx (QImode
);
14231 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
14233 emit_insn (gen_rtx_SET (target
, pat
));
14240 case IX86_BUILTIN_BEXTRI32
:
14241 case IX86_BUILTIN_BEXTRI64
:
14242 mode
= (fcode
== IX86_BUILTIN_BEXTRI32
? SImode
: DImode
);
14244 arg0
= CALL_EXPR_ARG (exp
, 0);
14245 arg1
= CALL_EXPR_ARG (exp
, 1);
14246 op0
= expand_normal (arg0
);
14247 op1
= expand_normal (arg1
);
14249 if (!CONST_INT_P (op1
))
14251 error ("last argument must be an immediate");
14256 unsigned char lsb_index
= UINTVAL (op1
);
14257 unsigned char length
= UINTVAL (op1
) >> 8;
14259 unsigned char bitsize
= GET_MODE_BITSIZE (mode
);
14261 icode
= code_for_tbm_bextri (mode
);
14263 mode1
= insn_data
[icode
].operand
[1].mode
;
14264 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
14265 op0
= copy_to_mode_reg (mode1
, op0
);
14267 mode0
= insn_data
[icode
].operand
[0].mode
;
14269 || !register_operand (target
, mode0
))
14270 target
= gen_reg_rtx (mode0
);
14272 if (length
== 0 || lsb_index
>= bitsize
)
14274 emit_move_insn (target
, const0_rtx
);
14278 if (length
+ lsb_index
> bitsize
)
14279 length
= bitsize
- lsb_index
;
14281 op1
= GEN_INT (length
);
14282 op2
= GEN_INT (lsb_index
);
14284 emit_insn (GEN_FCN (icode
) (target
, op0
, op1
, op2
));
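	  /* The control word packs the start bit in its low byte and the
	     field length in the next byte; e.g. 0x0804 extracts an 8-bit
	     field starting at bit 4.  Out-of-range selections were clamped
	     or folded to zero above.  */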
14288 case IX86_BUILTIN_RDRAND16_STEP
:
14292 case IX86_BUILTIN_RDRAND32_STEP
:
14296 case IX86_BUILTIN_RDRAND64_STEP
:
14300 arg0
= CALL_EXPR_ARG (exp
, 0);
14301 op1
= expand_normal (arg0
);
14302 if (!address_operand (op1
, VOIDmode
))
14304 op1
= convert_memory_address (Pmode
, op1
);
14305 op1
= copy_addr_to_reg (op1
);
14308 op0
= gen_reg_rtx (mode
);
14309 emit_insn (gen_rdrand (mode
, op0
));
14311 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
14313 op1
= force_reg (SImode
, const1_rtx
);
14315 /* Emit SImode conditional move. */
14316 if (mode
== HImode
)
14318 if (TARGET_ZERO_EXTEND_WITH_AND
14319 && optimize_function_for_speed_p (cfun
))
14321 op2
= force_reg (SImode
, const0_rtx
);
14323 emit_insn (gen_movstricthi
14324 (gen_lowpart (HImode
, op2
), op0
));
14328 op2
= gen_reg_rtx (SImode
);
14330 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
14333 else if (mode
== SImode
)
14336 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
14339 || !register_operand (target
, SImode
))
14340 target
= gen_reg_rtx (SImode
);
14342 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
14344 emit_insn (gen_rtx_SET (target
,
14345 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
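      /* This relies on rdrand zeroing its destination when it fails (CF
	 clear): the conditional move then yields that zero on failure and
	 the constant 1 on success, giving the 0/1 status without a setcc.  */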
14348 case IX86_BUILTIN_RDSEED16_STEP
:
14352 case IX86_BUILTIN_RDSEED32_STEP
:
14356 case IX86_BUILTIN_RDSEED64_STEP
:
14360 arg0
= CALL_EXPR_ARG (exp
, 0);
14361 op1
= expand_normal (arg0
);
14362 if (!address_operand (op1
, VOIDmode
))
14364 op1
= convert_memory_address (Pmode
, op1
);
14365 op1
= copy_addr_to_reg (op1
);
14368 op0
= gen_reg_rtx (mode
);
14369 emit_insn (gen_rdseed (mode
, op0
));
14371 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
14373 op2
= gen_reg_rtx (QImode
);
14375 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
14377 emit_insn (gen_rtx_SET (op2
, pat
));
14380 || !register_operand (target
, SImode
))
14381 target
= gen_reg_rtx (SImode
);
14383 emit_insn (gen_zero_extendqisi2 (target
, op2
));
14386 case IX86_BUILTIN_SBB32
:
14387 icode
= CODE_FOR_subborrowsi
;
14388 icode2
= CODE_FOR_subborrowsi_0
;
14394 case IX86_BUILTIN_SBB64
:
14395 icode
= CODE_FOR_subborrowdi
;
14396 icode2
= CODE_FOR_subborrowdi_0
;
14402 case IX86_BUILTIN_ADDCARRYX32
:
14403 icode
= CODE_FOR_addcarrysi
;
14404 icode2
= CODE_FOR_addcarrysi_0
;
14410 case IX86_BUILTIN_ADDCARRYX64
:
14411 icode
= CODE_FOR_addcarrydi
;
14412 icode2
= CODE_FOR_addcarrydi_0
;
14418 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
14419 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
14420 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
14421 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
14423 op1
= expand_normal (arg0
);
14425 op2
= expand_normal (arg1
);
14426 if (!register_operand (op2
, mode0
))
14427 op2
= copy_to_mode_reg (mode0
, op2
);
14429 op3
= expand_normal (arg2
);
14430 if (!register_operand (op3
, mode0
))
14431 op3
= copy_to_mode_reg (mode0
, op3
);
14433 op4
= expand_normal (arg3
);
14434 if (!address_operand (op4
, VOIDmode
))
14436 op4
= convert_memory_address (Pmode
, op4
);
14437 op4
= copy_addr_to_reg (op4
);
14440 op0
= gen_reg_rtx (mode0
);
14441 if (op1
== const0_rtx
)
14443 /* If arg0 is 0, optimize right away into add or sub
14444 instruction that sets CCCmode flags. */
14445 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
14446 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
14450 /* Generate CF from input operand. */
14451 ix86_expand_carry (op1
);
14453 /* Generate instruction that consumes CF. */
14454 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
14455 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
14456 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
14457 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
14460 /* Return current CF value. */
14462 target
= gen_reg_rtx (QImode
);
14464 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
14465 emit_insn (gen_rtx_SET (target
, pat
));
14467 /* Store the result. */
14468 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
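      /* This is the expansion behind the _addcarry_u32/_64 and
	 _subborrow_u32/_64 style intrinsics: the incoming carry is
	 materialized in CF, the add/sub consumes it, the sum is stored
	 through the pointer argument, and the new CF is the return value.  */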
14472 case IX86_BUILTIN_READ_FLAGS
:
14476 emit_insn (gen_pushfl ());
14479 || target
== NULL_RTX
14480 || !nonimmediate_operand (target
, word_mode
)
14481 || GET_MODE (target
) != word_mode
)
14482 target
= gen_reg_rtx (word_mode
);
14484 emit_insn (gen_pop (target
));
14487 case IX86_BUILTIN_WRITE_FLAGS
:
14489 arg0
= CALL_EXPR_ARG (exp
, 0);
14490 op0
= expand_normal (arg0
);
14491 if (!general_no_elim_operand (op0
, word_mode
))
14492 op0
= copy_to_mode_reg (word_mode
, op0
);
14494 emit_insn (gen_push (op0
));
14495 emit_insn (gen_popfl ());
14498 case IX86_BUILTIN_KTESTC8
:
14499 icode
= CODE_FOR_ktestqi
;
14503 case IX86_BUILTIN_KTESTZ8
:
14504 icode
= CODE_FOR_ktestqi
;
14508 case IX86_BUILTIN_KTESTC16
:
14509 icode
= CODE_FOR_ktesthi
;
14513 case IX86_BUILTIN_KTESTZ16
:
14514 icode
= CODE_FOR_ktesthi
;
14518 case IX86_BUILTIN_KTESTC32
:
14519 icode
= CODE_FOR_ktestsi
;
14523 case IX86_BUILTIN_KTESTZ32
:
14524 icode
= CODE_FOR_ktestsi
;
14528 case IX86_BUILTIN_KTESTC64
:
14529 icode
= CODE_FOR_ktestdi
;
14533 case IX86_BUILTIN_KTESTZ64
:
14534 icode
= CODE_FOR_ktestdi
;
14538 case IX86_BUILTIN_KORTESTC8
:
14539 icode
= CODE_FOR_kortestqi
;
14543 case IX86_BUILTIN_KORTESTZ8
:
14544 icode
= CODE_FOR_kortestqi
;
14548 case IX86_BUILTIN_KORTESTC16
:
14549 icode
= CODE_FOR_kortesthi
;
14553 case IX86_BUILTIN_KORTESTZ16
:
14554 icode
= CODE_FOR_kortesthi
;
14558 case IX86_BUILTIN_KORTESTC32
:
14559 icode
= CODE_FOR_kortestsi
;
14563 case IX86_BUILTIN_KORTESTZ32
:
14564 icode
= CODE_FOR_kortestsi
;
14568 case IX86_BUILTIN_KORTESTC64
:
14569 icode
= CODE_FOR_kortestdi
;
14573 case IX86_BUILTIN_KORTESTZ64
:
14574 icode
= CODE_FOR_kortestdi
;
14578 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
14579 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
14580 op0
= expand_normal (arg0
);
14581 op1
= expand_normal (arg1
);
14583 mode0
= insn_data
[icode
].operand
[0].mode
;
14584 mode1
= insn_data
[icode
].operand
[1].mode
;
14586 if (GET_MODE (op0
) != VOIDmode
)
14587 op0
= force_reg (GET_MODE (op0
), op0
);
14589 op0
= gen_lowpart (mode0
, op0
);
14591 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14592 op0
= copy_to_mode_reg (mode0
, op0
);
14594 if (GET_MODE (op1
) != VOIDmode
)
14595 op1
= force_reg (GET_MODE (op1
), op1
);
14597 op1
= gen_lowpart (mode1
, op1
);
14599 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14600 op1
= copy_to_mode_reg (mode1
, op1
);
14602 target
= gen_reg_rtx (QImode
);
14604 /* Emit kortest. */
14605 emit_insn (GEN_FCN (icode
) (op0
, op1
));
14606 /* And use setcc to return result from flags. */
14607 ix86_expand_setcc (target
, EQ
,
14608 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
14611 case IX86_BUILTIN_GATHERSIV2DF
:
14612 icode
= CODE_FOR_avx2_gathersiv2df
;
14614 case IX86_BUILTIN_GATHERSIV4DF
:
14615 icode
= CODE_FOR_avx2_gathersiv4df
;
14617 case IX86_BUILTIN_GATHERDIV2DF
:
14618 icode
= CODE_FOR_avx2_gatherdiv2df
;
14620 case IX86_BUILTIN_GATHERDIV4DF
:
14621 icode
= CODE_FOR_avx2_gatherdiv4df
;
14623 case IX86_BUILTIN_GATHERSIV4SF
:
14624 icode
= CODE_FOR_avx2_gathersiv4sf
;
14626 case IX86_BUILTIN_GATHERSIV8SF
:
14627 icode
= CODE_FOR_avx2_gathersiv8sf
;
14629 case IX86_BUILTIN_GATHERDIV4SF
:
14630 icode
= CODE_FOR_avx2_gatherdiv4sf
;
14632 case IX86_BUILTIN_GATHERDIV8SF
:
14633 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14635 case IX86_BUILTIN_GATHERSIV2DI
:
14636 icode
= CODE_FOR_avx2_gathersiv2di
;
14638 case IX86_BUILTIN_GATHERSIV4DI
:
14639 icode
= CODE_FOR_avx2_gathersiv4di
;
14641 case IX86_BUILTIN_GATHERDIV2DI
:
14642 icode
= CODE_FOR_avx2_gatherdiv2di
;
14644 case IX86_BUILTIN_GATHERDIV4DI
:
14645 icode
= CODE_FOR_avx2_gatherdiv4di
;
14647 case IX86_BUILTIN_GATHERSIV4SI
:
14648 icode
= CODE_FOR_avx2_gathersiv4si
;
14650 case IX86_BUILTIN_GATHERSIV8SI
:
14651 icode
= CODE_FOR_avx2_gathersiv8si
;
14653 case IX86_BUILTIN_GATHERDIV4SI
:
14654 icode
= CODE_FOR_avx2_gatherdiv4si
;
14656 case IX86_BUILTIN_GATHERDIV8SI
:
14657 icode
= CODE_FOR_avx2_gatherdiv8si
;
14659 case IX86_BUILTIN_GATHERALTSIV4DF
:
14660 icode
= CODE_FOR_avx2_gathersiv4df
;
14662 case IX86_BUILTIN_GATHERALTDIV8SF
:
14663 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14665 case IX86_BUILTIN_GATHERALTSIV4DI
:
14666 icode
= CODE_FOR_avx2_gathersiv4di
;
14668 case IX86_BUILTIN_GATHERALTDIV8SI
:
14669 icode
= CODE_FOR_avx2_gatherdiv8si
;
14671 case IX86_BUILTIN_GATHER3SIV16SF
:
14672 icode
= CODE_FOR_avx512f_gathersiv16sf
;
14674 case IX86_BUILTIN_GATHER3SIV8DF
:
14675 icode
= CODE_FOR_avx512f_gathersiv8df
;
14677 case IX86_BUILTIN_GATHER3DIV16SF
:
14678 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14680 case IX86_BUILTIN_GATHER3DIV8DF
:
14681 icode
= CODE_FOR_avx512f_gatherdiv8df
;
14683 case IX86_BUILTIN_GATHER3SIV16SI
:
14684 icode
= CODE_FOR_avx512f_gathersiv16si
;
14686 case IX86_BUILTIN_GATHER3SIV8DI
:
14687 icode
= CODE_FOR_avx512f_gathersiv8di
;
14689 case IX86_BUILTIN_GATHER3DIV16SI
:
14690 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14692 case IX86_BUILTIN_GATHER3DIV8DI
:
14693 icode
= CODE_FOR_avx512f_gatherdiv8di
;
14695 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14696 icode
= CODE_FOR_avx512f_gathersiv8df
;
14698 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14699 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14701 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14702 icode
= CODE_FOR_avx512f_gathersiv8di
;
14704 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14705 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14707 case IX86_BUILTIN_GATHER3SIV2DF
:
14708 icode
= CODE_FOR_avx512vl_gathersiv2df
;
14710 case IX86_BUILTIN_GATHER3SIV4DF
:
14711 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14713 case IX86_BUILTIN_GATHER3DIV2DF
:
14714 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
14716 case IX86_BUILTIN_GATHER3DIV4DF
:
14717 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
14719 case IX86_BUILTIN_GATHER3SIV4SF
:
14720 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
14722 case IX86_BUILTIN_GATHER3SIV8SF
:
14723 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
14725 case IX86_BUILTIN_GATHER3DIV4SF
:
14726 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
14728 case IX86_BUILTIN_GATHER3DIV8SF
:
14729 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14731 case IX86_BUILTIN_GATHER3SIV2DI
:
14732 icode
= CODE_FOR_avx512vl_gathersiv2di
;
14734 case IX86_BUILTIN_GATHER3SIV4DI
:
14735 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14737 case IX86_BUILTIN_GATHER3DIV2DI
:
14738 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
14740 case IX86_BUILTIN_GATHER3DIV4DI
:
14741 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
14743 case IX86_BUILTIN_GATHER3SIV4SI
:
14744 icode
= CODE_FOR_avx512vl_gathersiv4si
;
14746 case IX86_BUILTIN_GATHER3SIV8SI
:
14747 icode
= CODE_FOR_avx512vl_gathersiv8si
;
14749 case IX86_BUILTIN_GATHER3DIV4SI
:
14750 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
14752 case IX86_BUILTIN_GATHER3DIV8SI
:
14753 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14755 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14756 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14758 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14759 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14761 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14762 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14764 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14765 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14767 case IX86_BUILTIN_SCATTERSIV16SF
:
14768 icode
= CODE_FOR_avx512f_scattersiv16sf
;
14770 case IX86_BUILTIN_SCATTERSIV8DF
:
14771 icode
= CODE_FOR_avx512f_scattersiv8df
;
14773 case IX86_BUILTIN_SCATTERDIV16SF
:
14774 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14776 case IX86_BUILTIN_SCATTERDIV8DF
:
14777 icode
= CODE_FOR_avx512f_scatterdiv8df
;
14779 case IX86_BUILTIN_SCATTERSIV16SI
:
14780 icode
= CODE_FOR_avx512f_scattersiv16si
;
14782 case IX86_BUILTIN_SCATTERSIV8DI
:
14783 icode
= CODE_FOR_avx512f_scattersiv8di
;
14785 case IX86_BUILTIN_SCATTERDIV16SI
:
14786 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14788 case IX86_BUILTIN_SCATTERDIV8DI
:
14789 icode
= CODE_FOR_avx512f_scatterdiv8di
;
14791 case IX86_BUILTIN_SCATTERSIV8SF
:
14792 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
14794 case IX86_BUILTIN_SCATTERSIV4SF
:
14795 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
14797 case IX86_BUILTIN_SCATTERSIV4DF
:
14798 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14800 case IX86_BUILTIN_SCATTERSIV2DF
:
14801 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14803 case IX86_BUILTIN_SCATTERDIV8SF
:
14804 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14806 case IX86_BUILTIN_SCATTERDIV4SF
:
14807 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14809 case IX86_BUILTIN_SCATTERDIV4DF
:
14810 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
14812 case IX86_BUILTIN_SCATTERDIV2DF
:
14813 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
14815 case IX86_BUILTIN_SCATTERSIV8SI
:
14816 icode
= CODE_FOR_avx512vl_scattersiv8si
;
14818 case IX86_BUILTIN_SCATTERSIV4SI
:
14819 icode
= CODE_FOR_avx512vl_scattersiv4si
;
14821 case IX86_BUILTIN_SCATTERSIV4DI
:
14822 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14824 case IX86_BUILTIN_SCATTERSIV2DI
:
14825 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14827 case IX86_BUILTIN_SCATTERDIV8SI
:
14828 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14830 case IX86_BUILTIN_SCATTERDIV4SI
:
14831 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14833 case IX86_BUILTIN_SCATTERDIV4DI
:
14834 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
14836 case IX86_BUILTIN_SCATTERDIV2DI
:
14837 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
14839 case IX86_BUILTIN_GATHERPFDPD
:
14840 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
14841 goto vec_prefetch_gen
;
14842 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14843 icode
= CODE_FOR_avx512f_scattersiv8df
;
14845 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14846 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14848 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14849 icode
= CODE_FOR_avx512f_scattersiv8di
;
14851 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14852 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14854 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14855 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14857 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14858 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14860 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14861 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14863 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14864 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14866 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14867 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14869 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14870 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14872 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14873 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14875 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14876 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14878 case IX86_BUILTIN_GATHERPFDPS
:
14879 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
14880 goto vec_prefetch_gen
;
14881 case IX86_BUILTIN_GATHERPFQPD
:
14882 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
14883 goto vec_prefetch_gen
;
14884 case IX86_BUILTIN_GATHERPFQPS
:
14885 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
14886 goto vec_prefetch_gen
;
14887 case IX86_BUILTIN_SCATTERPFDPD
:
14888 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
14889 goto vec_prefetch_gen
;
14890 case IX86_BUILTIN_SCATTERPFDPS
:
14891 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
14892 goto vec_prefetch_gen
;
14893 case IX86_BUILTIN_SCATTERPFQPD
:
14894 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
14895 goto vec_prefetch_gen
;
14896 case IX86_BUILTIN_SCATTERPFQPS
:
14897 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
14898 goto vec_prefetch_gen
;
14902 rtx (*gen
) (rtx
, rtx
);
14904 arg0
= CALL_EXPR_ARG (exp
, 0);
14905 arg1
= CALL_EXPR_ARG (exp
, 1);
14906 arg2
= CALL_EXPR_ARG (exp
, 2);
14907 arg3
= CALL_EXPR_ARG (exp
, 3);
14908 arg4
= CALL_EXPR_ARG (exp
, 4);
14909 op0
= expand_normal (arg0
);
14910 op1
= expand_normal (arg1
);
14911 op2
= expand_normal (arg2
);
14912 op3
= expand_normal (arg3
);
14913 op4
= expand_normal (arg4
);
14914 /* Note the arg order is different from the operand order. */
14915 mode0
= insn_data
[icode
].operand
[1].mode
;
14916 mode2
= insn_data
[icode
].operand
[3].mode
;
14917 mode3
= insn_data
[icode
].operand
[4].mode
;
14918 mode4
= insn_data
[icode
].operand
[5].mode
;
14920 if (target
== NULL_RTX
14921 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
14922 || !insn_data
[icode
].operand
[0].predicate (target
,
14923 GET_MODE (target
)))
14924 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
14926 subtarget
= target
;
14930 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14931 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14932 half
= gen_reg_rtx (V8SImode
);
14933 if (!nonimmediate_operand (op2
, V16SImode
))
14934 op2
= copy_to_mode_reg (V16SImode
, op2
);
14935 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14938 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14939 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14940 case IX86_BUILTIN_GATHERALTSIV4DF
:
14941 case IX86_BUILTIN_GATHERALTSIV4DI
:
14942 half
= gen_reg_rtx (V4SImode
);
14943 if (!nonimmediate_operand (op2
, V8SImode
))
14944 op2
= copy_to_mode_reg (V8SImode
, op2
);
14945 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14948 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14949 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14950 half
= gen_reg_rtx (mode0
);
14951 if (mode0
== V8SFmode
)
14952 gen
= gen_vec_extract_lo_v16sf
;
14954 gen
= gen_vec_extract_lo_v16si
;
14955 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14956 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14957 emit_insn (gen (half
, op0
));
14959 op3
= lowpart_subreg (QImode
, op3
, HImode
);
14961 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14962 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14963 case IX86_BUILTIN_GATHERALTDIV8SF
:
14964 case IX86_BUILTIN_GATHERALTDIV8SI
:
14965 half
= gen_reg_rtx (mode0
);
14966 if (mode0
== V4SFmode
)
14967 gen
= gen_vec_extract_lo_v8sf
;
14969 gen
= gen_vec_extract_lo_v8si
;
14970 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14971 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14972 emit_insn (gen (half
, op0
));
14974 if (VECTOR_MODE_P (GET_MODE (op3
)))
14976 half
= gen_reg_rtx (mode0
);
14977 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14978 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14979 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
14990 op1
= ix86_zero_extend_to_Pmode (op1
);
14992 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
14993 op0
= copy_to_mode_reg (mode0
, op0
);
14994 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
14995 op1
= copy_to_mode_reg (Pmode
, op1
);
14996 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
14997 op2
= copy_to_mode_reg (mode2
, op2
);
14999 op3
= fixup_modeless_constant (op3
, mode3
);
15001 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
15003 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
15004 op3
= copy_to_mode_reg (mode3
, op3
);
15008 op3
= copy_to_reg (op3
);
15009 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
15011 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
15013 error ("the last argument must be scale 1, 2, 4, 8");
15017 /* Optimize. If mask is known to have all high bits set,
15018 replace op0 with pc_rtx to signal that the instruction
15019 overwrites the whole destination and doesn't use its
15020 previous contents. */
15023 if (TREE_CODE (arg3
) == INTEGER_CST
)
15025 if (integer_all_onesp (arg3
))
15028 else if (TREE_CODE (arg3
) == VECTOR_CST
)
15030 unsigned int negative
= 0;
15031 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
15033 tree cst
= VECTOR_CST_ELT (arg3
, i
);
15034 if (TREE_CODE (cst
) == INTEGER_CST
15035 && tree_int_cst_sign_bit (cst
))
15037 else if (TREE_CODE (cst
) == REAL_CST
15038 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
15041 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
15044 else if (TREE_CODE (arg3
) == SSA_NAME
15045 && VECTOR_TYPE_P (TREE_TYPE (arg3
)))
15047 /* Recognize also when mask is like:
15048 __v2df src = _mm_setzero_pd ();
15049 __v2df mask = _mm_cmpeq_pd (src, src);
15051 __v8sf src = _mm256_setzero_ps ();
15052 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
15053 as that is a cheaper way to load all ones into
15054 a register than having to load a constant from
15056 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
15057 if (is_gimple_call (def_stmt
))
15059 tree fndecl
= gimple_call_fndecl (def_stmt
);
15061 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
15062 switch (DECL_MD_FUNCTION_CODE (fndecl
))
15064 case IX86_BUILTIN_CMPPD
:
15065 case IX86_BUILTIN_CMPPS
:
15066 case IX86_BUILTIN_CMPPD256
:
15067 case IX86_BUILTIN_CMPPS256
:
15068 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
15071 case IX86_BUILTIN_CMPEQPD
:
15072 case IX86_BUILTIN_CMPEQPS
:
15073 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
15074 && initializer_zerop (gimple_call_arg (def_stmt
,
15085 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
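      /* For the *ALT* gather variants the instruction produces a vector
	 that is wider than the builtin's result type, so only the low half
	 of SUBTARGET is copied into TARGET below.  */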
15092 case IX86_BUILTIN_GATHER3DIV16SF
:
15093 if (target
== NULL_RTX
)
15094 target
= gen_reg_rtx (V8SFmode
);
15095 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
15097 case IX86_BUILTIN_GATHER3DIV16SI
:
15098 if (target
== NULL_RTX
)
15099 target
= gen_reg_rtx (V8SImode
);
15100 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
15102 case IX86_BUILTIN_GATHER3DIV8SF
:
15103 case IX86_BUILTIN_GATHERDIV8SF
:
15104 if (target
== NULL_RTX
)
15105 target
= gen_reg_rtx (V4SFmode
);
15106 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
15108 case IX86_BUILTIN_GATHER3DIV8SI
:
15109 case IX86_BUILTIN_GATHERDIV8SI
:
15110 if (target
== NULL_RTX
)
15111 target
= gen_reg_rtx (V4SImode
);
15112 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
15115 target
= subtarget
;
15121 arg0
= CALL_EXPR_ARG (exp
, 0);
15122 arg1
= CALL_EXPR_ARG (exp
, 1);
15123 arg2
= CALL_EXPR_ARG (exp
, 2);
15124 arg3
= CALL_EXPR_ARG (exp
, 3);
15125 arg4
= CALL_EXPR_ARG (exp
, 4);
15126 op0
= expand_normal (arg0
);
15127 op1
= expand_normal (arg1
);
15128 op2
= expand_normal (arg2
);
15129 op3
= expand_normal (arg3
);
15130 op4
= expand_normal (arg4
);
15131 mode1
= insn_data
[icode
].operand
[1].mode
;
15132 mode2
= insn_data
[icode
].operand
[2].mode
;
15133 mode3
= insn_data
[icode
].operand
[3].mode
;
15134 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
15142 case IX86_BUILTIN_SCATTERALTSIV8DF
:
15143 case IX86_BUILTIN_SCATTERALTSIV8DI
:
15144 half
= gen_reg_rtx (V8SImode
);
15145 if (!nonimmediate_operand (op2
, V16SImode
))
15146 op2
= copy_to_mode_reg (V16SImode
, op2
);
15147 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
15150 case IX86_BUILTIN_SCATTERALTDIV16SF
:
15151 case IX86_BUILTIN_SCATTERALTDIV16SI
:
15152 half
= gen_reg_rtx (mode3
);
15153 if (mode3
== V8SFmode
)
15154 gen
= gen_vec_extract_lo_v16sf
;
15156 gen
= gen_vec_extract_lo_v16si
;
15157 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
15158 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
15159 emit_insn (gen (half
, op3
));
15162 case IX86_BUILTIN_SCATTERALTSIV4DF
:
15163 case IX86_BUILTIN_SCATTERALTSIV4DI
:
15164 half
= gen_reg_rtx (V4SImode
);
15165 if (!nonimmediate_operand (op2
, V8SImode
))
15166 op2
= copy_to_mode_reg (V8SImode
, op2
);
15167 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
15170 case IX86_BUILTIN_SCATTERALTDIV8SF
:
15171 case IX86_BUILTIN_SCATTERALTDIV8SI
:
15172 half
= gen_reg_rtx (mode3
);
15173 if (mode3
== V4SFmode
)
15174 gen
= gen_vec_extract_lo_v8sf
;
15176 gen
= gen_vec_extract_lo_v8si
;
15177 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
15178 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
15179 emit_insn (gen (half
, op3
));
15182 case IX86_BUILTIN_SCATTERALTSIV2DF
:
15183 case IX86_BUILTIN_SCATTERALTSIV2DI
:
15184 if (!nonimmediate_operand (op2
, V4SImode
))
15185 op2
= copy_to_mode_reg (V4SImode
, op2
);
15187 case IX86_BUILTIN_SCATTERALTDIV4SF
:
15188 case IX86_BUILTIN_SCATTERALTDIV4SI
:
15189 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
15190 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
15199 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
15201 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
15202 op0
= copy_to_mode_reg (Pmode
, op0
);
15204 op1
= fixup_modeless_constant (op1
, mode1
);
15206 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
15208 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
15209 op1
= copy_to_mode_reg (mode1
, op1
);
15213 op1
= copy_to_reg (op1
);
15214 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
15217 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
15218 op2
= copy_to_mode_reg (mode2
, op2
);
15220 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
15221 op3
= copy_to_mode_reg (mode3
, op3
);
15223 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
15225 error ("the last argument must be scale 1, 2, 4, 8");
15229 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
15237 arg0
= CALL_EXPR_ARG (exp
, 0);
15238 arg1
= CALL_EXPR_ARG (exp
, 1);
15239 arg2
= CALL_EXPR_ARG (exp
, 2);
15240 arg3
= CALL_EXPR_ARG (exp
, 3);
15241 arg4
= CALL_EXPR_ARG (exp
, 4);
15242 op0
= expand_normal (arg0
);
15243 op1
= expand_normal (arg1
);
15244 op2
= expand_normal (arg2
);
15245 op3
= expand_normal (arg3
);
15246 op4
= expand_normal (arg4
);
15247 mode0
= insn_data
[icode
].operand
[0].mode
;
15248 mode1
= insn_data
[icode
].operand
[1].mode
;
15249 mode3
= insn_data
[icode
].operand
[3].mode
;
15250 mode4
= insn_data
[icode
].operand
[4].mode
;
15252 op0
= fixup_modeless_constant (op0
, mode0
);
15254 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
15256 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
15257 op0
= copy_to_mode_reg (mode0
, op0
);
15261 op0
= copy_to_reg (op0
);
15262 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
15265 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
15266 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
15271 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
15273 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
15274 op2
= copy_to_mode_reg (Pmode
, op2
);
15276 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
	  error ("the fourth argument must be scale 1, 2, 4, 8");
15282 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
15284 error ("incorrect hint operand");
15288 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
15296 case IX86_BUILTIN_XABORT
:
15297 icode
= CODE_FOR_xabort
;
15298 arg0
= CALL_EXPR_ARG (exp
, 0);
15299 op0
= expand_normal (arg0
);
15300 mode0
= insn_data
[icode
].operand
[0].mode
;
15301 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
15303 error ("the argument to %<xabort%> intrinsic must "
15304 "be an 8-bit immediate");
15307 emit_insn (gen_xabort (op0
));
15310 case IX86_BUILTIN_RDSSPD
:
15311 case IX86_BUILTIN_RDSSPQ
:
15312 mode
= (fcode
== IX86_BUILTIN_RDSSPD
? SImode
: DImode
);
15315 || !register_operand (target
, mode
))
15316 target
= gen_reg_rtx (mode
);
15318 op0
= force_reg (mode
, const0_rtx
);
15320 emit_insn (gen_rdssp (mode
, target
, op0
));
    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
    case IX86_BUILTIN_HRESET:
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      mode = ((fcode == IX86_BUILTIN_WRSSD
	       || fcode == IX86_BUILTIN_WRUSSD)
	      ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
		|| fcode == IX86_BUILTIN_WRSSQ)
	       ? code_for_wrss (mode)
	       : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));
15393 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
15394 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
15396 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
15397 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
15401 if (fcode
>= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
15402 && fcode
<= IX86_BUILTIN__BDESC_PURE_ARGS_LAST
)
15404 i
= fcode
- IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
;
15405 return ix86_expand_special_args_builtin (bdesc_pure_args
+ i
, exp
,
15409 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
15410 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
15412 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
15413 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
15414 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
15415 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
15417 machine_mode mode
, wide_mode
, nar_mode
;
15419 nar_mode
= V4SFmode
;
15421 wide_mode
= V64SFmode
;
15422 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
15423 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
15427 case IX86_BUILTIN_4FMAPS
:
15428 fcn
= gen_avx5124fmaddps_4fmaddps
;
15432 case IX86_BUILTIN_4DPWSSD
:
15433 nar_mode
= V4SImode
;
15435 wide_mode
= V64SImode
;
15436 fcn
= gen_avx5124vnniw_vp4dpwssd
;
15440 case IX86_BUILTIN_4DPWSSDS
:
15441 nar_mode
= V4SImode
;
15443 wide_mode
= V64SImode
;
15444 fcn
= gen_avx5124vnniw_vp4dpwssds
;
15448 case IX86_BUILTIN_4FNMAPS
:
15449 fcn
= gen_avx5124fmaddps_4fnmaddps
;
15453 case IX86_BUILTIN_4FNMAPS_MASK
:
15454 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
15455 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
15458 case IX86_BUILTIN_4DPWSSD_MASK
:
15459 nar_mode
= V4SImode
;
15461 wide_mode
= V64SImode
;
15462 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
15463 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
15466 case IX86_BUILTIN_4DPWSSDS_MASK
:
15467 nar_mode
= V4SImode
;
15469 wide_mode
= V64SImode
;
15470 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
15471 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
15474 case IX86_BUILTIN_4FMAPS_MASK
:
15484 wide_reg
= gen_reg_rtx (wide_mode
);
15485 for (i
= 0; i
< 4; i
++)
15487 args
[i
] = CALL_EXPR_ARG (exp
, i
);
15488 ops
[i
] = expand_normal (args
[i
]);
15490 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
15494 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
15495 accum
= force_reg (mode
, accum
);
15497 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
15498 addr
= force_reg (Pmode
, addr
);
15500 mem
= gen_rtx_MEM (nar_mode
, addr
);
15502 target
= gen_reg_rtx (mode
);
15504 emit_move_insn (target
, accum
);
15507 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15511 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15513 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15515 if (CONST_INT_P (mask
))
15516 mask
= fixup_modeless_constant (mask
, HImode
);
15518 mask
= force_reg (HImode
, mask
);
15520 if (GET_MODE (mask
) != HImode
)
15521 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
15523 /* If merge is 0 then we're about to emit z-masked variant. */
15524 if (const0_operand (merge
, mode
))
15525 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
15526 /* If merge is the same as accum then emit merge-masked variant. */
15527 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15529 merge
= force_reg (mode
, merge
);
15530 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
15532 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15535 target
= gen_reg_rtx (mode
);
15536 emit_move_insn (target
, merge
);
15537 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15543 case IX86_BUILTIN_4FNMASS
:
15544 fcn
= gen_avx5124fmaddps_4fnmaddss
;
15548 case IX86_BUILTIN_4FMASS
:
15549 fcn
= gen_avx5124fmaddps_4fmaddss
;
15553 case IX86_BUILTIN_4FNMASS_MASK
:
15554 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
15555 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
15558 case IX86_BUILTIN_4FMASS_MASK
:
15567 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
15568 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
15572 wide_reg
= gen_reg_rtx (V64SFmode
);
15573 for (i
= 0; i
< 4; i
++)
15576 args
[i
] = CALL_EXPR_ARG (exp
, i
);
15577 ops
[i
] = expand_normal (args
[i
]);
15579 tmp
= gen_reg_rtx (SFmode
);
15580 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
15582 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
15583 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
15586 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
15587 accum
= force_reg (V4SFmode
, accum
);
15589 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
15590 addr
= force_reg (Pmode
, addr
);
15592 mem
= gen_rtx_MEM (V4SFmode
, addr
);
15594 target
= gen_reg_rtx (V4SFmode
);
15596 emit_move_insn (target
, accum
);
15599 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15603 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15605 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15607 if (CONST_INT_P (mask
))
15608 mask
= fixup_modeless_constant (mask
, QImode
);
15610 mask
= force_reg (QImode
, mask
);
15612 if (GET_MODE (mask
) != QImode
)
15613 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
15615 /* If merge is 0 then we're about to emit z-masked variant. */
15616 if (const0_operand (merge
, mode
))
15617 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
	  /* If merge is the same as accum then emit merge-masked variant.  */
15620 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15622 merge
= force_reg (mode
, merge
);
15623 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
	  /* Merge with something unknown might happen if we z-mask w/ -O0.  */
15629 target
= gen_reg_rtx (mode
);
15630 emit_move_insn (target
, merge
);
15631 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15636 case IX86_BUILTIN_RDPID
:
15637 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
15639 case IX86_BUILTIN_FABSQ
:
15640 case IX86_BUILTIN_COPYSIGNQ
:
15642 /* Emit a normal call if SSE isn't available. */
15643 return expand_call (exp
, target
, ignore
);
15646 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
15650 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
15651 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
15653 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
15654 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
15657 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15658 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
15660 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
15661 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
15664 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15665 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
15667 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
15668 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
15671 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15672 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
15674 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
15675 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
15678 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15679 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
15681 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
15682 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
15683 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
15684 (enum ix86_builtin_func_type
)
15685 d
->flag
, d
->comparison
);
15688 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
15689 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
15691 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
15692 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
15696 gcc_unreachable ();
15699 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15700 fill target with val via vec_duplicate. */
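/* Sketch of the fast path (modes here are only for illustration): the
   routine first emits

     (set (reg:V4SI target) (vec_duplicate:V4SI val))

   with VAL as-is, and only if that SET is not recognized does it force
   VAL into a register of the inner mode and retry the duplicate.  */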
15703 ix86_vector_duplicate_value (machine_mode mode
, rtx target
, rtx val
)
15708 /* Save/restore recog_data in case this is called from splitters
15709 or other routines where recog_data needs to stay valid across
15710 force_reg. See PR106577. */
15711 recog_data_d recog_data_save
= recog_data
;
15713 /* First attempt to recognize VAL as-is. */
15714 dup
= gen_vec_duplicate (mode
, val
);
15715 insn
= emit_insn (gen_rtx_SET (target
, dup
));
15716 if (recog_memoized (insn
) < 0)
15719 machine_mode innermode
= GET_MODE_INNER (mode
);
15722 /* If that fails, force VAL into a register. */
15725 reg
= force_reg (innermode
, val
);
15726 if (GET_MODE (reg
) != innermode
)
15727 reg
= gen_lowpart (innermode
, reg
);
15728 SET_SRC (PATTERN (insn
)) = gen_vec_duplicate (mode
, reg
);
15729 seq
= get_insns ();
15732 emit_insn_before (seq
, insn
);
15734 ok
= recog_memoized (insn
) >= 0;
15737 recog_data
= recog_data_save
;
15741 /* Get a vector mode of the same size as the original but with elements
15742 twice as wide. This is only guaranteed to apply to integral vectors. */
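/* For example, V16QImode maps to V8HImode and V8HImode to V4SImode:
   the element count halves while the total vector size stays the same
   (this is what the asserts below check).  */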
15744 static machine_mode
15745 get_mode_wider_vector (machine_mode o
)
15747 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15748 machine_mode n
= GET_MODE_NEXT_MODE (o
).require ();
15749 gcc_assert (GET_MODE_NUNITS (o
) == GET_MODE_NUNITS (n
) * 2);
15750 gcc_assert (GET_MODE_SIZE (o
) == GET_MODE_SIZE (n
));
15754 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
);
15755 static bool expand_vec_perm_1 (struct expand_vec_perm_d
*d
);
15757 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15758 with all elements equal to VAR. Return true if successful. */
15761 ix86_expand_vector_init_duplicate (bool mmx_ok
, machine_mode mode
,
15762 rtx target
, rtx val
)
15769 if (CONST_INT_P (val
))
15771 int tmp
= (int)INTVAL (val
);
15772 if (tmp
== (int)(INTVAL (val
) >> 32))
15774 rtx reg
= gen_reg_rtx (V4SImode
);
15775 ok
= ix86_vector_duplicate_value (V4SImode
, reg
,
15779 emit_move_insn (target
, gen_lowpart (V2DImode
, reg
));
15784 return ix86_vector_duplicate_value (mode
, target
, val
);
15787 if (CONST_INT_P (val
))
15789 int tmp
= (int)INTVAL (val
);
15790 if (tmp
== (int)(INTVAL (val
) >> 32))
15792 rtx reg
= gen_reg_rtx (V8SImode
);
15793 ok
= ix86_vector_duplicate_value (V8SImode
, reg
,
15797 emit_move_insn (target
, gen_lowpart (V4DImode
, reg
));
15802 return ix86_vector_duplicate_value (mode
, target
, val
);
15820 return ix86_vector_duplicate_value (mode
, target
, val
);
15825 if (TARGET_SSE
|| TARGET_3DNOW_A
)
15829 val
= gen_lowpart (SImode
, val
);
15830 if (CONST_INT_P (val
))
15832 x
= gen_rtx_TRUNCATE (HImode
, val
);
15833 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15834 emit_insn (gen_rtx_SET (target
, x
));
15841 if (TARGET_MMX_WITH_SSE
)
15843 val
= force_reg (GET_MODE_INNER (mode
), val
);
15844 rtx x
= gen_rtx_VEC_DUPLICATE (mode
, val
);
15845 emit_insn (gen_rtx_SET (target
, x
));
15855 val
= gen_lowpart (SImode
, val
);
15856 if (CONST_INT_P (val
))
15858 x
= gen_rtx_TRUNCATE (HImode
, val
);
15859 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15860 emit_insn (gen_rtx_SET (target
, x
));
15869 val
= force_reg (GET_MODE_INNER (mode
), val
);
15870 rtx x
= gen_rtx_VEC_DUPLICATE (mode
, val
);
15871 emit_insn (gen_rtx_SET (target
, x
));
15883 if (CONST_INT_P (val
))
15890 return ix86_vector_duplicate_value (mode
, target
, val
);
15894 struct expand_vec_perm_d dperm
;
15898 memset (&dperm
, 0, sizeof (dperm
));
15899 dperm
.target
= target
;
15900 dperm
.vmode
= mode
;
15901 dperm
.nelt
= GET_MODE_NUNITS (mode
);
15902 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
15903 dperm
.one_operand_p
= true;
15905 if (mode
== V8HFmode
|| mode
== V8BFmode
)
15907 tmp1
= force_reg (GET_MODE_INNER (mode
), val
);
15908 tmp2
= gen_reg_rtx (mode
);
15909 emit_insn (gen_vec_set_0 (mode
, tmp2
, CONST0_RTX (mode
), tmp1
));
15910 tmp1
= gen_lowpart (mode
, tmp2
);
15914 /* Extend to SImode using a paradoxical SUBREG. */
15915 tmp1
= gen_reg_rtx (SImode
);
15916 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
15918 /* Insert the SImode value as
15919 low element of a V4SImode vector. */
15920 tmp2
= gen_reg_rtx (V4SImode
);
15921 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
15922 tmp1
= gen_lowpart (mode
, tmp2
);
15925 emit_move_insn (dperm
.op0
, tmp1
);
15926 ok
= (expand_vec_perm_1 (&dperm
)
15927 || expand_vec_perm_broadcast_1 (&dperm
));
15934 if (CONST_INT_P (val
))
15937 return ix86_vector_duplicate_value (mode
, target
, val
);
15944 /* Replicate the value once into the next wider mode and recurse. */
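      /* Sketch of the idea, e.g. broadcasting a QImode value into
	 V16QImode: the scalar is first widened to HImode as

	   wide = (val << 8) | val;

	 and that HImode value is then broadcast into V8HImode, whose
	 bits are reinterpreted as the requested V16QImode vector.  */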
15946 machine_mode smode
, wsmode
, wvmode
;
15949 smode
= GET_MODE_INNER (mode
);
15950 wvmode
= get_mode_wider_vector (mode
);
15951 wsmode
= GET_MODE_INNER (wvmode
);
15953 val
= convert_modes (wsmode
, smode
, val
, true);
15955 if (CONST_INT_P (val
))
15957 x
= simplify_binary_operation (ASHIFT
, wsmode
, val
,
15958 GEN_INT (GET_MODE_BITSIZE (smode
)));
15959 val
= simplify_binary_operation (IOR
, wsmode
, val
, x
);
15961 else if (smode
== QImode
&& !TARGET_PARTIAL_REG_STALL
)
15962 emit_insn (gen_insv_1 (wsmode
, val
, val
));
15965 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
15966 GEN_INT (GET_MODE_BITSIZE (smode
)),
15967 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15968 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1,
15972 x
= gen_reg_rtx (wvmode
);
15973 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
15976 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
15982 if (CONST_INT_P (val
))
15989 return ix86_vector_duplicate_value (mode
, target
, val
);
15992 machine_mode hvmode
;
16005 hvmode
= V16QImode
;
16008 gcc_unreachable ();
16010 rtx x
= gen_reg_rtx (hvmode
);
16012 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
16016 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
16017 emit_insn (gen_rtx_SET (target
, x
));
16025 gcc_assert (TARGET_EVEX512
);
16026 if (TARGET_AVX512BW
)
16027 return ix86_vector_duplicate_value (mode
, target
, val
);
16030 machine_mode hvmode
;
16034 hvmode
= V16HImode
;
16037 hvmode
= V16HFmode
;
16040 hvmode
= V16BFmode
;
16043 hvmode
= V32QImode
;
16046 gcc_unreachable ();
16048 rtx x
= gen_reg_rtx (hvmode
);
16050 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
16054 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
16055 emit_insn (gen_rtx_SET (target
, x
));
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */
16069 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
16070 rtx target
, rtx var
, int one_var
)
16072 machine_mode vsimode
;
16075 bool use_vector_set
= false;
16076 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
16078 if (GET_MODE_SIZE (mode
) == 64 && !TARGET_EVEX512
)
16084 /* For SSE4.1, we normally use vector set. But if the second
16085 element is zero and inter-unit moves are OK, we use movq
16087 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
16088 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
16094 use_vector_set
= TARGET_SSE4_1
;
16097 use_vector_set
= TARGET_SSE2
;
16098 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
16099 ? gen_vec_setv8hi_0
: NULL
;
16102 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16107 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
16110 use_vector_set
= TARGET_SSE4_1
;
16113 use_vector_set
= TARGET_AVX
;
16116 use_vector_set
= TARGET_AVX
;
16117 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
16118 ? gen_vec_setv16hi_0
: NULL
;
16121 use_vector_set
= TARGET_AVX
;
16122 gen_vec_set_0
= gen_vec_setv8si_0
;
16125 use_vector_set
= TARGET_AVX
;
16126 gen_vec_set_0
= gen_vec_setv8sf_0
;
16129 use_vector_set
= TARGET_AVX
;
16130 gen_vec_set_0
= gen_vec_setv4df_0
;
16133 /* Use ix86_expand_vector_set in 64bit mode only. */
16134 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
16135 gen_vec_set_0
= gen_vec_setv4di_0
;
16138 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
16139 gen_vec_set_0
= gen_vec_setv16si_0
;
16142 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
16143 gen_vec_set_0
= gen_vec_setv16sf_0
;
16146 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
16147 gen_vec_set_0
= gen_vec_setv8df_0
;
16150 /* Use ix86_expand_vector_set in 64bit mode only. */
16151 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
16152 gen_vec_set_0
= gen_vec_setv8di_0
;
16155 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16156 gen_vec_set_0
= gen_vec_setv8hf_0
;
16159 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16160 gen_vec_set_0
= gen_vec_setv16hf_0
;
16163 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16164 gen_vec_set_0
= gen_vec_setv32hf_0
;
16167 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16168 gen_vec_set_0
= gen_vec_setv8bf_0
;
16171 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16172 gen_vec_set_0
= gen_vec_setv16bf_0
;
16175 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16176 gen_vec_set_0
= gen_vec_setv32bf_0
;
16179 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
16180 gen_vec_set_0
= gen_vec_setv32hi_0
;
16185 if (use_vector_set
)
16187 if (gen_vec_set_0
&& one_var
== 0)
16189 var
= force_reg (GET_MODE_INNER (mode
), var
);
16190 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
16193 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
16194 var
= force_reg (GET_MODE_INNER (mode
), var
);
16195 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
16211 var
= force_reg (GET_MODE_INNER (mode
), var
);
16212 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
16213 emit_insn (gen_rtx_SET (target
, x
));
16218 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
16219 new_target
= gen_reg_rtx (mode
);
16221 new_target
= target
;
16222 var
= force_reg (GET_MODE_INNER (mode
), var
);
16223 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
16224 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
16225 emit_insn (gen_rtx_SET (new_target
, x
));
16228 /* We need to shuffle the value to the correct position, so
16229 create a new pseudo to store the intermediate result. */
16231 /* With SSE2, we can use the integer shuffle insns. */
16232 if (mode
!= V4SFmode
&& TARGET_SSE2
)
16234 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
16236 GEN_INT (one_var
== 1 ? 0 : 1),
16237 GEN_INT (one_var
== 2 ? 0 : 1),
16238 GEN_INT (one_var
== 3 ? 0 : 1)));
16239 if (target
!= new_target
)
16240 emit_move_insn (target
, new_target
);
16244 /* Otherwise convert the intermediate result to V4SFmode and
16245 use the SSE1 shuffle instructions. */
16246 if (mode
!= V4SFmode
)
16248 tmp
= gen_reg_rtx (V4SFmode
);
16249 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
16254 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
16256 GEN_INT (one_var
== 1 ? 0 : 1),
16257 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
16258 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
16260 if (mode
!= V4SFmode
)
16261 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
16262 else if (tmp
!= target
)
16263 emit_move_insn (target
, tmp
);
16265 else if (target
!= new_target
)
16266 emit_move_insn (target
, new_target
);
16271 vsimode
= V4SImode
;
16277 vsimode
= V2SImode
;
16283 /* Zero extend the variable element to SImode and recurse. */
16284 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
16286 x
= gen_reg_rtx (vsimode
);
16287 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
16289 gcc_unreachable ();
16291 emit_move_insn (target
, gen_lowpart (mode
, x
));
16299 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
16300 consisting of the values in VALS. It is known that all elements
16301 except ONE_VAR are constants. Return true if successful. */
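/* Sketch of the strategy used below: a copy of VALS with the ONE_VAR slot
   zeroed is loaded from the constant pool, and the single variable element
   is then overwritten, roughly

     emit_move_insn (target, const_vec);
     ix86_expand_vector_set (mmx_ok, target, var, one_var);  */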
16304 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
16305 rtx target
, rtx vals
, int one_var
)
16307 rtx var
= XVECEXP (vals
, 0, one_var
);
16308 machine_mode wmode
;
16311 const_vec
= copy_rtx (vals
);
16312 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
16313 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
16321 /* For the two element vectors, it's just as easy to use
16322 the general case. */
16326 /* Use ix86_expand_vector_set in 64bit mode only. */
16353 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
16362 /* There's no way to set one QImode entry easily. Combine
16363 the variable value with its adjacent constant value, and
16364 promote to an HImode set. */
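      /* Sketch (names are illustrative only): one byte of the pair is
	 shifted into the high half of an HImode word and IORed with the
	 other byte, e.g.

	   hi_word = (high_byte << 8) | low_byte;

	 and the combined HImode value is then inserted with a single
	 HImode vector set at position ONE_VAR >> 1.  */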
16365 x
= XVECEXP (vals
, 0, one_var
^ 1);
16368 var
= convert_modes (HImode
, QImode
, var
, true);
16369 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
16370 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16371 x
= GEN_INT (INTVAL (x
) & 0xff);
16375 var
= convert_modes (HImode
, QImode
, var
, true);
16376 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
16378 if (x
!= const0_rtx
)
16379 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
16380 1, OPTAB_LIB_WIDEN
);
16382 x
= gen_reg_rtx (wmode
);
16383 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
16384 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
16386 emit_move_insn (target
, gen_lowpart (mode
, x
));
16393 emit_move_insn (target
, const_vec
);
16394 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
16398 /* A subroutine of ix86_expand_vector_init_general. Use vector
16399 concatenate to handle the most general case: all values variable,
16400 and none identical. */
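/* For example (a sketch of the recursion below), a V8SImode vector built
   from eight variable SImode values is assembled as two V4SImode halves,
   which are then joined with a single

     (set (reg:V8SI target)
	  (vec_concat:V8SI (reg:V4SI half0) (reg:V4SI half1)))  */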
16403 ix86_expand_vector_init_concat (machine_mode mode
,
16404 rtx target
, rtx
*ops
, int n
)
16406 machine_mode half_mode
= VOIDmode
;
16417 half_mode
= V16HFmode
;
16420 half_mode
= V16BFmode
;
16423 half_mode
= V8SImode
;
16426 half_mode
= V8SFmode
;
16429 half_mode
= V4DImode
;
16432 half_mode
= V4DFmode
;
16435 half_mode
= V8HFmode
;
16438 half_mode
= V8BFmode
;
16441 half_mode
= V4SImode
;
16444 half_mode
= V4SFmode
;
16447 half_mode
= V2DImode
;
16450 half_mode
= V2DFmode
;
16453 half_mode
= V2SImode
;
16456 half_mode
= V2SFmode
;
16459 half_mode
= DImode
;
16462 half_mode
= SImode
;
16465 half_mode
= DFmode
;
16468 half_mode
= SFmode
;
16471 gcc_unreachable ();
16474 if (!register_operand (ops
[1], half_mode
))
16475 ops
[1] = force_reg (half_mode
, ops
[1]);
16476 if (!register_operand (ops
[0], half_mode
))
16477 ops
[0] = force_reg (half_mode
, ops
[0]);
16478 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
16486 half_mode
= V2DImode
;
16489 half_mode
= V2DFmode
;
16492 half_mode
= V2SImode
;
16495 half_mode
= V2SFmode
;
16498 gcc_unreachable ();
16506 half_mode
= V4DImode
;
16509 half_mode
= V4DFmode
;
16512 half_mode
= V4SImode
;
16515 half_mode
= V4SFmode
;
16518 gcc_unreachable ();
16526 half_mode
= V8SImode
;
16529 half_mode
= V8SFmode
;
16532 gcc_unreachable ();
16537 /* FIXME: We process inputs backward to help RA. PR 36222. */
16539 for (j
= 1; j
!= -1; j
--)
16541 half
[j
] = gen_reg_rtx (half_mode
);
16545 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
16549 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
16553 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
16554 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
16558 gcc_unreachable ();
16560 ix86_expand_vector_init (false, half
[j
],
16561 gen_rtx_PARALLEL (half_mode
, v
));
16564 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
16568 gcc_unreachable ();
16572 /* A subroutine of ix86_expand_vector_init_general. Use vector
16573 interleave to handle the most general case: all values variable,
16574 and none identical. */
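/* Sketch, e.g. for V16QImode: adjacent element pairs are first combined
   into small vectors, which are then repeatedly low-interleaved at
   progressively wider element widths (V8HImode, then V4SImode, then
   V2DImode) until the full vector is assembled; see the first_imode,
   second_imode and third_imode selections below.  */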
16577 ix86_expand_vector_init_interleave (machine_mode mode
,
16578 rtx target
, rtx
*ops
, int n
)
16580 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
16583 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
16584 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
16585 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
16590 gen_load_even
= gen_vec_interleave_lowv8hf
;
16591 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16592 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16593 inner_mode
= HFmode
;
16594 first_imode
= V4SImode
;
16595 second_imode
= V2DImode
;
16596 third_imode
= VOIDmode
;
16599 gen_load_even
= gen_vec_interleave_lowv8bf
;
16600 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16601 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16602 inner_mode
= BFmode
;
16603 first_imode
= V4SImode
;
16604 second_imode
= V2DImode
;
16605 third_imode
= VOIDmode
;
16608 gen_load_even
= gen_vec_setv8hi
;
16609 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16610 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16611 inner_mode
= HImode
;
16612 first_imode
= V4SImode
;
16613 second_imode
= V2DImode
;
16614 third_imode
= VOIDmode
;
16617 gen_load_even
= gen_vec_setv16qi
;
16618 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
16619 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
16620 inner_mode
= QImode
;
16621 first_imode
= V8HImode
;
16622 second_imode
= V4SImode
;
16623 third_imode
= V2DImode
;
16626 gcc_unreachable ();
16629 for (i
= 0; i
< n
; i
++)
16632 if (inner_mode
== HFmode
|| inner_mode
== BFmode
)
	  /* Use vpunpcklwd to pack two HFmode or BFmode values.  */
16636 machine_mode vec_mode
=
16637 (inner_mode
== HFmode
) ? V8HFmode
: V8BFmode
;
16638 op0
= gen_reg_rtx (vec_mode
);
16639 even
= lowpart_subreg (vec_mode
,
16640 force_reg (inner_mode
, op
), inner_mode
);
16641 odd
= lowpart_subreg (vec_mode
,
16642 force_reg (inner_mode
, ops
[i
+ i
+ 1]),
16644 emit_insn (gen_load_even (op0
, even
, odd
));
	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
16649 op0
= gen_reg_rtx (SImode
);
16650 emit_move_insn (op0
, gen_lowpart (SImode
, op
));
16652 /* Insert the SImode value as low element of V4SImode vector. */
16653 op1
= gen_reg_rtx (V4SImode
);
16654 op0
= gen_rtx_VEC_MERGE (V4SImode
,
16655 gen_rtx_VEC_DUPLICATE (V4SImode
,
16657 CONST0_RTX (V4SImode
),
16659 emit_insn (gen_rtx_SET (op1
, op0
));
	  /* Cast the V4SImode vector back to a vector in the original mode.  */
16662 op0
= gen_reg_rtx (mode
);
16663 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
16665 /* Load even elements into the second position. */
16666 emit_insn (gen_load_even (op0
,
16667 force_reg (inner_mode
,
16672 /* Cast vector to FIRST_IMODE vector. */
16673 ops
[i
] = gen_reg_rtx (first_imode
);
16674 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
16677 /* Interleave low FIRST_IMODE vectors. */
16678 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
16680 op0
= gen_reg_rtx (first_imode
);
16681 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
16683 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16684 ops
[j
] = gen_reg_rtx (second_imode
);
16685 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
16688 /* Interleave low SECOND_IMODE vectors. */
16689 switch (second_imode
)
16692 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
16694 op0
= gen_reg_rtx (second_imode
);
16695 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE vector.  */
16700 ops
[j
] = gen_reg_rtx (third_imode
);
16701 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
16703 second_imode
= V2DImode
;
16704 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16708 op0
= gen_reg_rtx (second_imode
);
16709 emit_insn (gen_interleave_second_low (op0
, ops
[0],
      /* Cast the SECOND_IMODE vector back to a vector in the original mode.  */
16714 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
16718 gcc_unreachable ();
16722 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16723 all values variable, and none identical. */
16726 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
16727 rtx target
, rtx vals
)
16729 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
16730 machine_mode half_mode
= VOIDmode
;
16731 machine_mode quarter_mode
= VOIDmode
;
16732 machine_mode int_inner_mode
= VOIDmode
;
16739 if (!mmx_ok
&& !TARGET_SSE
)
16755 n
= GET_MODE_NUNITS (mode
);
16756 for (i
= 0; i
< n
; i
++)
16757 ops
[i
] = XVECEXP (vals
, 0, i
);
16758 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
16762 for (i
= 0; i
< 2; i
++)
16763 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16764 op0
= gen_reg_rtx (V4DImode
);
16765 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
16766 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16770 for (i
= 0; i
< 4; i
++)
16771 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16772 ops
[4] = gen_reg_rtx (V4DImode
);
16773 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
16774 ops
[5] = gen_reg_rtx (V4DImode
);
16775 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
16776 op0
= gen_reg_rtx (V8DImode
);
16777 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
16778 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16782 half_mode
= V16QImode
;
16786 half_mode
= V8HImode
;
16790 half_mode
= V8HFmode
;
16794 half_mode
= V8BFmode
;
16798 n
= GET_MODE_NUNITS (mode
);
16799 for (i
= 0; i
< n
; i
++)
16800 ops
[i
] = XVECEXP (vals
, 0, i
);
16801 op0
= gen_reg_rtx (half_mode
);
16802 op1
= gen_reg_rtx (half_mode
);
16803 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
16805 ix86_expand_vector_init_interleave (half_mode
, op1
,
16806 &ops
[n
>> 1], n
>> 2);
16807 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
16811 quarter_mode
= V16QImode
;
16812 half_mode
= V32QImode
;
16816 quarter_mode
= V8HImode
;
16817 half_mode
= V16HImode
;
16821 quarter_mode
= V8HFmode
;
16822 half_mode
= V16HFmode
;
16826 quarter_mode
= V8BFmode
;
16827 half_mode
= V16BFmode
;
16831 n
= GET_MODE_NUNITS (mode
);
16832 for (i
= 0; i
< n
; i
++)
16833 ops
[i
] = XVECEXP (vals
, 0, i
);
16834 op0
= gen_reg_rtx (quarter_mode
);
16835 op1
= gen_reg_rtx (quarter_mode
);
16836 op2
= gen_reg_rtx (quarter_mode
);
16837 op3
= gen_reg_rtx (quarter_mode
);
16838 op4
= gen_reg_rtx (half_mode
);
16839 op5
= gen_reg_rtx (half_mode
);
16840 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
16842 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
16843 &ops
[n
>> 2], n
>> 3);
16844 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
16845 &ops
[n
>> 1], n
>> 3);
16846 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
16847 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
16848 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
16849 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
16850 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
16854 if (!TARGET_SSE4_1
)
16862 /* Don't use ix86_expand_vector_init_interleave if we can't
16863 move from GPR to SSE register directly. */
16864 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
16871 n
= GET_MODE_NUNITS (mode
);
16872 for (i
= 0; i
< n
; i
++)
16873 ops
[i
] = XVECEXP (vals
, 0, i
);
16874 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
16881 int_inner_mode
= HImode
;
16892 gcc_unreachable ();
16896 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
16897 machine_mode tmp_mode
, inner_mode
;
16898 rtx words
[4], shift
;
16900 tmp_mode
= (GET_MODE_SIZE (mode
) < UNITS_PER_WORD
) ? SImode
: word_mode
;
16902 inner_mode
= GET_MODE_INNER (mode
);
16903 n_elts
= GET_MODE_NUNITS (mode
);
16904 n_words
= GET_MODE_SIZE (mode
) / GET_MODE_SIZE (tmp_mode
);
16905 n_elt_per_word
= n_elts
/ n_words
;
16906 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
16908 for (i
= 0; i
< n_words
; ++i
)
16910 rtx word
= NULL_RTX
;
16912 for (j
= 0; j
< n_elt_per_word
; ++j
)
16914 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
16915 if (int_inner_mode
!= E_VOIDmode
)
16917 gcc_assert (TARGET_SSE2
&& int_inner_mode
== HImode
);
16918 rtx tmp
= gen_reg_rtx (int_inner_mode
);
16919 elt
= lowpart_subreg (int_inner_mode
,
16920 force_reg (inner_mode
, elt
),
16922 emit_move_insn (tmp
, elt
);
16925 elt
= convert_modes (tmp_mode
, inner_mode
, elt
, true);
16931 word
= expand_simple_binop (tmp_mode
, ASHIFT
, word
, shift
,
16932 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16933 word
= expand_simple_binop (tmp_mode
, IOR
, word
, elt
,
16934 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16942 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
16943 else if (n_words
== 2)
16945 gcc_assert (tmp_mode
== DImode
|| tmp_mode
== SImode
);
16946 machine_mode concat_mode
= tmp_mode
== DImode
? V2DImode
: V2SImode
;
16947 rtx tmp
= gen_reg_rtx (concat_mode
);
16948 vals
= gen_rtx_PARALLEL (concat_mode
, gen_rtvec_v (2, words
));
16949 ix86_expand_vector_init_general (mmx_ok
, concat_mode
, tmp
, vals
);
16950 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
16952 else if (n_words
== 4)
16954 rtx tmp
= gen_reg_rtx (V4SImode
);
16955 gcc_assert (tmp_mode
== SImode
);
16956 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
16957 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
16958 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
16961 gcc_unreachable ();
16965 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16966 instructions unless MMX_OK is true. */
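/* Sketch of the dispatch below: if all elements are identical the value is
   broadcast via ix86_expand_vector_init_duplicate; if every element is
   constant the vector is loaded from the constant pool; if exactly one
   element is variable the constant part is loaded and that element is then
   overwritten; otherwise the fully general expansion is used.  */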
16969 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
16971 machine_mode mode
= GET_MODE (target
);
16972 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16973 int n_elts
= GET_MODE_NUNITS (mode
);
16974 int n_var
= 0, one_var
= -1;
16975 bool all_same
= true, all_const_zero
= true;
16979 /* Handle first initialization from vector elts. */
16980 if (n_elts
!= XVECLEN (vals
, 0))
16982 rtx subtarget
= target
;
16983 x
= XVECEXP (vals
, 0, 0);
16984 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
16985 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
16987 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
16988 if (inner_mode
== QImode
16989 || inner_mode
== HImode
16990 || inner_mode
== TImode
16991 || inner_mode
== HFmode
16992 || inner_mode
== BFmode
)
16994 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
16995 scalar_mode elt_mode
= inner_mode
== TImode
? DImode
: SImode
;
16996 n_bits
/= GET_MODE_SIZE (elt_mode
);
16997 mode
= mode_for_vector (elt_mode
, n_bits
).require ();
16998 inner_mode
= mode_for_vector (elt_mode
, n_bits
/ 2).require ();
16999 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
17000 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
17001 subtarget
= gen_reg_rtx (mode
);
17003 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
17004 if (subtarget
!= target
)
17005 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
17008 gcc_unreachable ();
17011 for (i
= 0; i
< n_elts
; ++i
)
17013 x
= XVECEXP (vals
, 0, i
);
17014 if (!(CONST_SCALAR_INT_P (x
)
17015 || CONST_DOUBLE_P (x
)
17016 || CONST_FIXED_P (x
)))
17017 n_var
++, one_var
= i
;
17018 else if (x
!= CONST0_RTX (inner_mode
))
17019 all_const_zero
= false;
17020 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
17024 /* If all values are identical, broadcast the value. */
17026 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
17027 XVECEXP (vals
, 0, 0)))
17030 /* Constants are best loaded from the constant pool. */
17033 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
17037 /* Values where only one field is non-constant are best loaded from
17038 the pool and overwritten via move later. */
17042 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
17043 XVECEXP (vals
, 0, one_var
),
17047 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
17051 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }
17064 ix86_expand_vector_set_var (rtx target
, rtx val
, rtx idx
)
17067 machine_mode mode
= GET_MODE (target
);
17068 machine_mode cmp_mode
= mode
;
17069 int n_elts
= GET_MODE_NUNITS (mode
);
17070 rtx valv
,idxv
,constv
,idx_tmp
;
  /* 512-bit vector byte/word broadcast and comparison are only available
     under TARGET_AVX512BW; without TARGET_AVX512BW, break the 512-bit
     vector into two 256-bit vectors.  */
17076 if ((mode
== V32HImode
|| mode
== V32HFmode
|| mode
== V32BFmode
17077 || mode
== V64QImode
)
17078 && !TARGET_AVX512BW
)
17080 gcc_assert (TARGET_AVX512F
);
17081 rtx vhi
, vlo
, idx_hi
;
17082 machine_mode half_mode
;
17083 rtx (*extract_hi
)(rtx
, rtx
);
17084 rtx (*extract_lo
)(rtx
, rtx
);
17086 if (mode
== V32HImode
)
17088 half_mode
= V16HImode
;
17089 extract_hi
= gen_vec_extract_hi_v32hi
;
17090 extract_lo
= gen_vec_extract_lo_v32hi
;
17092 else if (mode
== V32HFmode
)
17094 half_mode
= V16HFmode
;
17095 extract_hi
= gen_vec_extract_hi_v32hf
;
17096 extract_lo
= gen_vec_extract_lo_v32hf
;
17098 else if (mode
== V32BFmode
)
17100 half_mode
= V16BFmode
;
17101 extract_hi
= gen_vec_extract_hi_v32bf
;
17102 extract_lo
= gen_vec_extract_lo_v32bf
;
17106 half_mode
= V32QImode
;
17107 extract_hi
= gen_vec_extract_hi_v64qi
;
17108 extract_lo
= gen_vec_extract_lo_v64qi
;
17111 vhi
= gen_reg_rtx (half_mode
);
17112 vlo
= gen_reg_rtx (half_mode
);
17113 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
17114 emit_insn (extract_hi (vhi
, target
));
17115 emit_insn (extract_lo (vlo
, target
));
17118 vec
[2] = GEN_INT (n_elts
/2);
17119 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
17120 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
17121 ix86_expand_vector_set_var (vlo
, val
, idx
);
17122 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
17126 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
17131 cmp_mode
= V2DImode
;
17134 cmp_mode
= V4DImode
;
17137 cmp_mode
= V8DImode
;
17140 cmp_mode
= V2SImode
;
17143 cmp_mode
= V4SImode
;
17146 cmp_mode
= V8SImode
;
17149 cmp_mode
= V16SImode
;
17153 cmp_mode
= V2HImode
;
17157 cmp_mode
= V4HImode
;
17160 cmp_mode
= V8HImode
;
17163 cmp_mode
= V16HImode
;
17166 cmp_mode
= V32HImode
;
17169 cmp_mode
= V8HImode
;
17172 cmp_mode
= V16HImode
;
17175 cmp_mode
= V32HImode
;
17178 gcc_unreachable ();
17182 for (int i
= 0; i
!= n_elts
; i
++)
17183 vec
[i
] = GEN_INT (i
);
17184 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
17185 valv
= gen_reg_rtx (mode
);
17186 idxv
= gen_reg_rtx (cmp_mode
);
17187 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
17189 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
17192 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
17193 cmp_mode
, idxv
, idx_tmp
);
17198 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
17201 ok
= ix86_expand_int_vcond (vec
);
17206 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
17208 machine_mode mode
= GET_MODE (target
);
17209 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17210 machine_mode half_mode
;
17211 bool use_vec_merge
= false;
17212 bool blendm_const
= false;
17214 static rtx (*gen_extract
[8][2]) (rtx
, rtx
)
17216 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
17217 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
17218 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
17219 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
17220 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
17221 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
},
17222 { gen_vec_extract_lo_v16hf
, gen_vec_extract_hi_v16hf
},
17223 { gen_vec_extract_lo_v16bf
, gen_vec_extract_hi_v16bf
}
17225 static rtx (*gen_insert
[8][2]) (rtx
, rtx
, rtx
)
17227 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
17228 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
17229 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
17230 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
17231 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
17232 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
},
17233 { gen_vec_set_lo_v16hf
, gen_vec_set_hi_v16hf
},
17234 { gen_vec_set_lo_v16bf
, gen_vec_set_hi_v16bf
},
17237 machine_mode mmode
= VOIDmode
;
17238 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
17243 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17251 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
17252 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
17254 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
17256 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
17257 emit_insn (gen_rtx_SET (target
, tmp
));
17263 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
17267 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
17268 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
17270 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
17272 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
17273 emit_insn (gen_rtx_SET (target
, tmp
));
17277 /* NB: For ELT == 0, use standard scalar operation patterns which
17278 preserve the rest of the vector for combiner:
17281 (vec_duplicate:V2DF (reg:DF))
17291 /* For the two element vectors, we implement a VEC_CONCAT with
17292 the extraction of the other element. */
17294 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
17295 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
17298 op0
= val
, op1
= tmp
;
17300 op0
= tmp
, op1
= val
;
17302 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
17303 emit_insn (gen_rtx_SET (target
, tmp
));
17308 use_vec_merge
= TARGET_SSE4_1
;
17315 use_vec_merge
= true;
17319 /* tmp = target = A B C D */
17320 tmp
= copy_to_reg (target
);
17321 /* target = A A B B */
17322 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
17323 /* target = X A B B */
17324 ix86_expand_vector_set (false, target
, val
, 0);
17325 /* target = A X C D */
17326 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
17327 const1_rtx
, const0_rtx
,
17328 GEN_INT (2+4), GEN_INT (3+4)));
17332 /* tmp = target = A B C D */
17333 tmp
= copy_to_reg (target
);
17334 /* tmp = X B C D */
17335 ix86_expand_vector_set (false, tmp
, val
, 0);
17336 /* target = A B X D */
17337 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
17338 const0_rtx
, const1_rtx
,
17339 GEN_INT (0+4), GEN_INT (3+4)));
17343 /* tmp = target = A B C D */
17344 tmp
= copy_to_reg (target
);
17345 /* tmp = X B C D */
17346 ix86_expand_vector_set (false, tmp
, val
, 0);
17347 /* target = A B X D */
17348 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
17349 const0_rtx
, const1_rtx
,
17350 GEN_INT (2+4), GEN_INT (0+4)));
17354 gcc_unreachable ();
17359 use_vec_merge
= TARGET_SSE4_1
;
17363 /* Element 0 handled by vec_merge below. */
17366 use_vec_merge
= true;
17372 /* With SSE2, use integer shuffles to swap element 0 and ELT,
17373 store into element 0, then shuffle them back. */
17377 order
[0] = GEN_INT (elt
);
17378 order
[1] = const1_rtx
;
17379 order
[2] = const2_rtx
;
17380 order
[3] = GEN_INT (3);
17381 order
[elt
] = const0_rtx
;
17383 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
17384 order
[1], order
[2], order
[3]));
17386 ix86_expand_vector_set (false, target
, val
, 0);
17388 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
17389 order
[1], order
[2], order
[3]));
17393 /* For SSE1, we have to reuse the V4SF code. */
17394 rtx t
= gen_reg_rtx (V4SFmode
);
17395 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
17396 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
17397 emit_move_insn (target
, gen_lowpart (mode
, t
));
17407 use_vec_merge
= TARGET_SSE2
;
17412 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
17417 use_vec_merge
= TARGET_SSE4_1
;
17421 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17425 half_mode
= V16QImode
;
17432 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
17433 if (TARGET_AVX2
&& elt
!= 0)
17436 gen_blendm
= ((mode
== E_V16HFmode
) ? gen_avx2_pblendph_1
17437 : gen_avx2_pblendbf_1
);
17438 blendm_const
= true;
17443 half_mode
= ((mode
== E_V16HFmode
) ? V8HFmode
: V8BFmode
);
17444 j
= ((mode
== E_V16HFmode
) ? 6 : 7);
17450 half_mode
= V8HImode
;
17456 half_mode
= V4SImode
;
17462 half_mode
= V2DImode
;
17468 half_mode
= V4SFmode
;
17474 half_mode
= V2DFmode
;
17480 /* Compute offset. */
17484 gcc_assert (i
<= 1);
17486 /* Extract the half. */
17487 tmp
= gen_reg_rtx (half_mode
);
17488 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
17490 /* Put val in tmp at elt. */
17491 ix86_expand_vector_set (false, tmp
, val
, elt
);
17494 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
17498 if (TARGET_AVX512F
)
17501 gen_blendm
= gen_avx512f_blendmv8df
;
17506 if (TARGET_AVX512F
)
17509 gen_blendm
= gen_avx512f_blendmv8di
;
17514 if (TARGET_AVX512F
)
17517 gen_blendm
= gen_avx512f_blendmv16sf
;
17522 if (TARGET_AVX512F
)
17525 gen_blendm
= gen_avx512f_blendmv16si
;
17530 if (TARGET_AVX512BW
)
17533 gen_blendm
= gen_avx512bw_blendmv32hf
;
17537 if (TARGET_AVX512BW
)
17540 gen_blendm
= gen_avx512bw_blendmv32bf
;
17544 if (TARGET_AVX512BW
)
17547 gen_blendm
= gen_avx512bw_blendmv32hi
;
17549 else if (TARGET_AVX512F
)
17551 half_mode
= E_V8HImode
;
17558 if (TARGET_AVX512BW
)
17561 gen_blendm
= gen_avx512bw_blendmv64qi
;
17563 else if (TARGET_AVX512F
)
17565 half_mode
= E_V16QImode
;
17572 /* Compute offset. */
17576 gcc_assert (i
<= 3);
17579 /* Extract the quarter. */
17580 tmp
= gen_reg_rtx (V4SImode
);
17581 rtx tmp2
= gen_lowpart (V16SImode
, target
);
17582 rtx mask
= gen_reg_rtx (QImode
);
17584 emit_move_insn (mask
, constm1_rtx
);
17585 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
17588 tmp2
= gen_reg_rtx (half_mode
);
17589 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
17592 /* Put val in tmp at elt. */
17593 ix86_expand_vector_set (false, tmp
, val
, elt
);
17596 tmp2
= gen_reg_rtx (V16SImode
);
17597 rtx tmp3
= gen_lowpart (V16SImode
, target
);
17598 mask
= gen_reg_rtx (HImode
);
17599 emit_move_insn (mask
, constm1_rtx
);
17600 tmp
= gen_lowpart (V4SImode
, tmp
);
17601 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
17603 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
17611 if (mmode
!= VOIDmode
)
17613 tmp
= gen_reg_rtx (mode
);
17614 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
17615 rtx merge_mask
= gen_int_mode (HOST_WIDE_INT_1U
<< elt
, mmode
);
      /* The avx512*_blendm<mode> expanders have a different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and the second input operand
	 otherwise; in {sse,avx}*_*blend* the first input operand is used
	 for elements where the mask is clear and the second input operand
	 otherwise.  */
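      /* Sketch of the difference described above:

	   (vec_merge:M A B mask)	    ; A where mask bit set, else B
	   gen_blendm (dest, B, A, mask)    ; A where mask bit set, else B

	 which is why TARGET is passed as the first input (mask-clear
	 elements) and the broadcast value TMP as the second.  */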
17622 merge_mask
= force_reg (mmode
, merge_mask
);
17623 emit_insn (gen_blendm (target
, target
, tmp
, merge_mask
));
17625 else if (use_vec_merge
)
17628 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
17629 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
17630 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
17631 emit_insn (gen_rtx_SET (target
, tmp
));
17635 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17637 emit_move_insn (mem
, target
);
17639 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
17640 emit_move_insn (tmp
, val
);
17642 emit_move_insn (target
, mem
);
17647 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
17649 machine_mode mode
= GET_MODE (vec
);
17650 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17651 bool use_vec_extr
= false;
17657 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17671 use_vec_extr
= true;
17675 use_vec_extr
= TARGET_SSE4_1
;
17687 tmp
= gen_reg_rtx (mode
);
17688 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
17689 GEN_INT (elt
), GEN_INT (elt
),
17690 GEN_INT (elt
+4), GEN_INT (elt
+4)));
17694 tmp
= gen_reg_rtx (mode
);
17695 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
17699 gcc_unreachable ();
17702 use_vec_extr
= true;
17707 use_vec_extr
= TARGET_SSE4_1
;
17721 tmp
= gen_reg_rtx (mode
);
17722 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
17723 GEN_INT (elt
), GEN_INT (elt
),
17724 GEN_INT (elt
), GEN_INT (elt
)));
17728 tmp
= gen_reg_rtx (mode
);
17729 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
17733 gcc_unreachable ();
17736 use_vec_extr
= true;
17741 /* For SSE1, we have to reuse the V4SF code. */
17742 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
17743 gen_lowpart (V4SFmode
, vec
), elt
);
17754 use_vec_extr
= TARGET_SSE2
;
17759 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
17763 use_vec_extr
= TARGET_SSE4_1
;
17767 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
17769 tmp
= gen_reg_rtx (SImode
);
17770 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
17772 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
17777 use_vec_extr
= TARGET_SSE4_1
;
17783 tmp
= gen_reg_rtx (V4SFmode
);
17785 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
17787 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
17788 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17796 tmp
= gen_reg_rtx (V2DFmode
);
17798 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
17800 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
17801 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17809 tmp
= gen_reg_rtx (V16QImode
);
17811 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
17813 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
17814 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17822 tmp
= gen_reg_rtx (V8HImode
);
17824 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
17826 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
17827 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17835 tmp
= gen_reg_rtx (V4SImode
);
17837 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
17839 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
17840 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17848 tmp
= gen_reg_rtx (V2DImode
);
17850 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
17852 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
17853 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17859 if (TARGET_AVX512BW
)
17861 tmp
= gen_reg_rtx (V16HImode
);
17863 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
17865 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
17866 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17872 if (TARGET_AVX512BW
)
17874 tmp
= gen_reg_rtx (V32QImode
);
17876 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
17878 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
17879 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
17885 tmp
= gen_reg_rtx (V8SFmode
);
17887 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
17889 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
17890 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17894 tmp
= gen_reg_rtx (V4DFmode
);
17896 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
17898 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
17899 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17903 tmp
= gen_reg_rtx (V8SImode
);
17905 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
17907 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
17908 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17912 tmp
= gen_reg_rtx (V4DImode
);
17914 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
17916 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
17917 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17922 if (TARGET_AVX512BW
)
17924 tmp
= (mode
== E_V32HFmode
17925 ? gen_reg_rtx (V16HFmode
)
17926 : gen_reg_rtx (V16BFmode
));
17928 emit_insn (gen_vec_extract_lo (mode
, tmp
, vec
));
17930 emit_insn (gen_vec_extract_hi (mode
, tmp
, vec
));
17931 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17940 tmp
= (mode
== E_V16HFmode
17941 ? gen_reg_rtx (V8HFmode
)
17942 : gen_reg_rtx (V8BFmode
));
17944 emit_insn (gen_vec_extract_lo (mode
, tmp
, vec
));
17946 emit_insn (gen_vec_extract_hi (mode
, tmp
, vec
));
17947 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;

  switch (GET_MODE (src))
    {
18003 tem
= gen_sse_movhlps (dest
, src
, src
);
18005 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
18006 GEN_INT (1 + 4), GEN_INT (1 + 4));
18009 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
18012 d
= gen_reg_rtx (V1SImode
);
18013 tem
= gen_mmx_lshrv1si3 (d
, gen_lowpart (V1SImode
, src
),
18018 d
= gen_reg_rtx (V1DImode
);
18019 tem
= gen_mmx_lshrv1di3 (d
, gen_lowpart (V1DImode
, src
),
18027 d
= gen_reg_rtx (V1TImode
);
18028 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
18033 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
18035 tem
= gen_avx_shufps256 (dest
, src
, src
,
18036 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
18040 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
18042 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
18051 if (GET_MODE (dest
) != V4DImode
)
18052 d
= gen_reg_rtx (V4DImode
);
18053 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
18054 gen_lowpart (V4DImode
, src
),
18059 d
= gen_reg_rtx (V2TImode
);
18060 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
18069 d
= gen_reg_rtx (V4TImode
);
18070 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
18080 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
18081 gen_lowpart (V16SImode
, src
),
18082 gen_lowpart (V16SImode
, src
),
18083 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
18084 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
18085 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
18086 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
18087 GEN_INT (0xC), GEN_INT (0xD),
18088 GEN_INT (0xE), GEN_INT (0xF),
18089 GEN_INT (0x10), GEN_INT (0x11),
18090 GEN_INT (0x12), GEN_INT (0x13),
18091 GEN_INT (0x14), GEN_INT (0x15),
18092 GEN_INT (0x16), GEN_INT (0x17));
18094 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
18095 gen_lowpart (V16SImode
, src
),
18096 GEN_INT (i
== 128 ? 0x2 : 0x1),
18100 GEN_INT (i
== 128 ? 0x6 : 0x5),
18104 GEN_INT (i
== 128 ? 0xA : 0x9),
18108 GEN_INT (i
== 128 ? 0xE : 0xD),
18114 gcc_unreachable ();
18118 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }
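
  /* The loop below implements the generic reduction: each iteration uses
     emit_reduc_half to move the upper half of the still-live bits of the
     vector down by i / 2 and combines the shifted copy with the previous
     partial result using FN, so a vector of N elements needs log2 (N)
     combining steps (for V8HImode the iterations run with i = 128, 64
     and 32).  */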
  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);

      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);

      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */
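
/* Implementation note: the fnstsw result has the FPU condition code C2 in
   bit 10.  On the SAHF path the high byte of the status word is copied
   into the flags register, where C2 ends up in PF, and the UNORDERED test
   on CCmode checks exactly that flag.  On the non-SAHF path a TEST of 0x04
   against the high byte of the status word inspects the same bit
   directly.  */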
static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */
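
/* Implementation note: with u = expm1 (|x|) = exp (|x|) - 1 we have
   u / (u + 1.0) + u = (1 - exp (-|x|)) + (exp (|x|) - 1) = 2 * sinh (|x|),
   so the final multiply by 0.5 yields sinh (|x|), and the sign of the
   input is restored from the fxam bits.  Going through expm1 avoids the
   cancellation that exp (|x|) - exp (-|x|) would suffer for small |x|.  */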
void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a cosh XFmode calculation.  */
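
/* Implementation note: cosh (x) = 0.5 * (exp (x) + exp (-x)); exp (-x) is
   obtained as 1.0 / exp (x), so only one exponential has to be expanded.  */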
void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */
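
/* Implementation note: with u = expm1 (-|2 * x|) = exp (-2 * |x|) - 1 we
   have u / (u + 2.0) = -tanh (|x|), which keeps full precision for small
   |x|; the sign of the result is fixed up afterwards from the fxam bits.  */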
void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an asinh XFmode calculation.  */
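
/* Implementation note: asinh (x) = sign (x) * log (|x| + sqrt (x*x + 1.0)).
   Since sqrt (x*x + 1.0) - 1.0 equals x*x / (sqrt (x*x + 1.0) + 1.0), the
   whole argument can be fed to log1p, avoiding the cancellation a plain
   log would suffer for tiny |x|.  */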
void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */
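
/* Implementation note: acosh (x) = log (x + sqrt (x - 1.0) * sqrt (x + 1.0))
   for x >= 1.0; computing the two square roots separately avoids forming
   x*x - 1.0, which could overflow for very large x.  */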
void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
/* Output code to perform an atanh XFmode calculation.  */
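
/* Implementation note: with t = |x|, log1p (-2.0 * t / (t + 1.0))
   = log ((1.0 - t) / (1.0 + t)) = -2.0 * atanh (t); the final multiply by
   0.5 and the sign fix-up from the fxam bits then produce atanh (x).  */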
void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a log1p XFmode calculation.  */
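
/* Implementation note: fyl2xp1 computes y * log2 (x + 1.0) but is only
   specified for |x| smaller than 1 - sqrt (2) / 2 (about 0.2929), which is
   the constant tested below.  Small arguments use fyl2xp1 directly, larger
   ones fall back to fyl2x on op1 + 1.0; with y loaded as fldln2 both paths
   produce the natural logarithm.  */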
void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
/* Emit code for round calculation.  */
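
/* Implementation note: the expansion below follows
   round (a) = sgn (a) * floor (fabs (a) + 0.5), i.e. halfway cases are
   rounded away from zero as the C round family requires, unlike frndint,
   which honors the current rounding mode.  OUTMODE may be a floating
   point or an integer mode, hence the selection of floor and negation
   patterns in the switch below.  */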
18489 ix86_emit_i387_round (rtx op0
, rtx op1
)
18491 machine_mode inmode
= GET_MODE (op1
);
18492 machine_mode outmode
= GET_MODE (op0
);
18493 rtx e1
= gen_reg_rtx (XFmode
);
18494 rtx e2
= gen_reg_rtx (XFmode
);
18495 rtx scratch
= gen_reg_rtx (HImode
);
18496 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
18497 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
18498 rtx res
= gen_reg_rtx (outmode
);
18499 rtx_code_label
*jump_label
= gen_label_rtx ();
18500 rtx (*floor_insn
) (rtx
, rtx
);
18501 rtx (*neg_insn
) (rtx
, rtx
);
18509 tmp
= gen_reg_rtx (XFmode
);
18511 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
18517 gcc_unreachable ();
18523 floor_insn
= gen_frndintxf2_floor
;
18524 neg_insn
= gen_negsf2
;
18527 floor_insn
= gen_frndintxf2_floor
;
18528 neg_insn
= gen_negdf2
;
18531 floor_insn
= gen_frndintxf2_floor
;
18532 neg_insn
= gen_negxf2
;
18535 floor_insn
= gen_lfloorxfhi2
;
18536 neg_insn
= gen_neghi2
;
18539 floor_insn
= gen_lfloorxfsi2
;
18540 neg_insn
= gen_negsi2
;
18543 floor_insn
= gen_lfloorxfdi2
;
18544 neg_insn
= gen_negdi2
;
18547 gcc_unreachable ();
18550 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
18552 /* scratch = fxam(op1) */
18553 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
18555 /* e1 = fabs(op1) */
18556 emit_insn (gen_absxf2 (e1
, op1
));
18558 /* e2 = e1 + 0.5 */
18559 half
= force_reg (XFmode
, half
);
18560 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
18562 /* res = floor(e2) */
18568 tmp
= gen_reg_rtx (XFmode
);
18570 emit_insn (floor_insn (tmp
, e2
));
18571 emit_insn (gen_rtx_SET (res
,
18572 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
18573 UNSPEC_TRUNC_NOOP
)));
18577 emit_insn (floor_insn (res
, e2
));
18580 /* flags = signbit(a) */
18581 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
18583 /* if (flags) then res = -res */
18584 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
18585 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
18586 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
18588 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
18589 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
18590 JUMP_LABEL (insn
) = jump_label
;
18592 emit_insn (neg_insn (res
, res
));
18594 emit_label (jump_label
);
18595 LABEL_NUSES (jump_label
) = 1;
18597 emit_move_insn (op0
, res
);
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
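
/* Implementation note: the sequence below performs one Newton-Raphson step
   for the reciprocal, x1 = x0 * (2.0 - b * x0) written as
   (x0 + x0) - b * x0 * x0, starting from the hardware reciprocal estimate;
   the step roughly squares the relative error of that estimate before the
   final multiply by a.  */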
void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
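
/* Implementation note: one Newton-Raphson step for f (x) = 1/(x*x) - a
   refines the hardware rsqrt estimate as x1 = 0.5 * x0 * (3.0 - a * x0 * x0).
   The expansion below folds the factor as -0.5 and, for the non-reciprocal
   case, multiplies by a once more, since sqrt (a) = a * rsqrt (a).  */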
18656 ix86_emit_swsqrtsf (rtx res
, rtx a
, machine_mode mode
, bool recip
)
18658 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
18662 x0
= gen_reg_rtx (mode
);
18663 e0
= gen_reg_rtx (mode
);
18664 e1
= gen_reg_rtx (mode
);
18665 e2
= gen_reg_rtx (mode
);
18666 e3
= gen_reg_rtx (mode
);
18668 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
18671 /* res = rsqrt28(a) estimate */
18672 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
18676 /* x0 = rsqrt28(a) estimate */
18677 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
18679 /* res = rcp28(x0) estimate */
18680 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
18686 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
18687 mthree
= const_double_from_real_value (r
, SFmode
);
18689 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
18690 mhalf
= const_double_from_real_value (r
, SFmode
);
18691 unspec
= UNSPEC_RSQRT
;
18693 if (VECTOR_MODE_P (mode
))
18695 mthree
= ix86_build_const_vector (mode
, true, mthree
);
18696 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
18697 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18698 if (GET_MODE_SIZE (mode
) == 64)
18699 unspec
= UNSPEC_RSQRT14
;
18702 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18703 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18705 a
= force_reg (mode
, a
);
18707 /* x0 = rsqrt(a) estimate */
18708 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
18711 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
18714 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
18717 /* Handle masked compare. */
18718 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
18720 mask
= gen_reg_rtx (HImode
);
18721 /* Imm value 0x4 corresponds to not-equal comparison. */
18722 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
18723 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
18727 mask
= gen_reg_rtx (mode
);
18728 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
18729 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
18733 mthree
= force_reg (mode
, mthree
);
18736 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
18738 unsigned vector_size
= GET_MODE_SIZE (mode
);
18740 || (TARGET_AVX512F
&& TARGET_EVEX512
&& vector_size
== 64)
18741 || (TARGET_AVX512VL
&& (vector_size
== 32 || vector_size
== 16)))
18742 emit_insn (gen_rtx_SET (e2
,
18743 gen_rtx_FMA (mode
, e0
, x0
, mthree
)));
18747 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
18750 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
18753 mhalf
= force_reg (mode
, mhalf
);
18755 /* e3 = -.5 * x0 */
18756 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
18758 /* e3 = -.5 * e0 */
18759 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
18760 /* ret = e2 * e3 */
18761 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else if (mode == HFmode)
	vmode = V8HFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
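
/* The lround and round expansions below add copysign (nextafter (0.5, 0.0), x)
   rather than a plain 0.5 before converting to integer: for the largest
   representable value just below 0.5, adding exactly 0.5 would round the sum
   up to 1.0 and give a wrong result, while adding the predecessor of 0.5
   keeps the sum below 1.0.  */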
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
       return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
18924 /* C code for the stuff we're doing below (for do_floor):
18926 xi -= (double)xi > op1 ? 1 : 0;
18929 machine_mode fmode
= GET_MODE (op1
);
18930 machine_mode imode
= GET_MODE (op0
);
18931 rtx ireg
, freg
, tmp
;
18932 rtx_code_label
*label
;
18934 /* reg = (long)op1 */
18935 ireg
= gen_reg_rtx (imode
);
18936 expand_fix (ireg
, op1
, 0);
18938 /* freg = (double)reg */
18939 freg
= gen_reg_rtx (fmode
);
18940 expand_float (freg
, ireg
, 0);
18942 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18943 label
= ix86_expand_sse_compare_and_jump (UNLE
,
18944 freg
, op1
, !do_floor
);
18945 tmp
= expand_simple_binop (imode
, do_floor
? MINUS
: PLUS
,
18946 ireg
, const1_rtx
, NULL_RTX
, 0, OPTAB_DIRECT
);
18947 emit_move_insn (ireg
, tmp
);
18949 emit_label (label
);
18950 LABEL_NUSES (label
) = 1;
18952 emit_move_insn (op0
, ireg
);
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
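
/* The floor/ceil/trunc/rint/round expansions below all rely on the classic
   2**(p-1) trick: for |x| below this constant, x + TWO52 - TWO52 discards
   the fraction bits and yields x rounded to an integer according to the
   active rounding mode, while values at or above the constant are already
   integral and are handled by the early-exit branch.  */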
18973 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18976 ix86_expand_rint (rtx operand0
, rtx operand1
)
18978 /* C code for the stuff we're doing below:
18979 xa = fabs (operand1);
18980 if (!isless (xa, 2**52))
18983 if (flag_rounding_math)
18985 two52 = copysign (two52, operand1);
18988 xa = xa + two52 - two52;
18989 return copysign (xa, operand1);
18991 machine_mode mode
= GET_MODE (operand0
);
18992 rtx res
, xa
, TWO52
, mask
;
18993 rtx_code_label
*label
;
18995 TWO52
= ix86_gen_TWO52 (mode
);
18997 /* Temporary for holding the result, initialized to the input
18998 operand to ease control flow. */
18999 res
= copy_to_reg (operand1
);
19001 /* xa = abs (operand1) */
19002 xa
= ix86_expand_sse_fabs (res
, &mask
);
19004 /* if (!isless (xa, TWO52)) goto label; */
19005 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19007 if (flag_rounding_math
)
19009 ix86_sse_copysign_to_positive (TWO52
, TWO52
, res
, mask
);
19013 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
19014 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
19016 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19017 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
19018 xa
= ix86_expand_sse_fabs (xa
, NULL
);
19020 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
19022 emit_label (label
);
19023 LABEL_NUSES (label
) = 1;
19025 emit_move_insn (operand0
, res
);
19028 /* Expand SSE2 sequence for computing floor or ceil
19029 from OPERAND1 storing into OPERAND0. */
19031 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
19033 /* C code for the stuff we expand below.
19034 double xa = fabs (x), x2;
19035 if (!isless (xa, TWO52))
19037 x2 = (double)(long)x;
19046 if (HONOR_SIGNED_ZEROS (mode))
19047 return copysign (x2, x);
19050 machine_mode mode
= GET_MODE (operand0
);
19051 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
19052 rtx_code_label
*label
;
19054 TWO52
= ix86_gen_TWO52 (mode
);
19056 /* Temporary for holding the result, initialized to the input
19057 operand to ease control flow. */
19058 res
= copy_to_reg (operand1
);
19060 /* xa = abs (operand1) */
19061 xa
= ix86_expand_sse_fabs (res
, &mask
);
19063 /* if (!isless (xa, TWO52)) goto label; */
19064 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19066 /* xa = (double)(long)x */
19067 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
19068 expand_fix (xi
, res
, 0);
19069 expand_float (xa
, xi
, 0);
19072 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
19074 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
19075 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
19076 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
19077 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
19078 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
19079 if (HONOR_SIGNED_ZEROS (mode
))
19081 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19082 if (do_floor
&& flag_rounding_math
)
19083 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
19085 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
19087 emit_move_insn (res
, tmp
);
19089 emit_label (label
);
19090 LABEL_NUSES (label
) = 1;
19092 emit_move_insn (operand0
, res
);
19095 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
19096 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19097 that is only available on 64bit targets. */
19099 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
19101 /* C code for the stuff we expand below.
19102 double xa = fabs (x), x2;
19103 if (!isless (xa, TWO52))
19105 xa = xa + TWO52 - TWO52;
19106 x2 = copysign (xa, x);
19115 if (HONOR_SIGNED_ZEROS (mode))
19116 x2 = copysign (x2, x);
19119 machine_mode mode
= GET_MODE (operand0
);
19120 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
19121 rtx_code_label
*label
;
19123 TWO52
= ix86_gen_TWO52 (mode
);
19125 /* Temporary for holding the result, initialized to the input
19126 operand to ease control flow. */
19127 res
= copy_to_reg (operand1
);
19129 /* xa = abs (operand1) */
19130 xa
= ix86_expand_sse_fabs (res
, &mask
);
19132 /* if (!isless (xa, TWO52)) goto label; */
19133 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19135 /* xa = xa + TWO52 - TWO52; */
19136 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
19137 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
19139 /* xa = copysign (xa, operand1) */
19140 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
19143 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
19145 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
19146 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
19147 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
19148 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
19149 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
19150 if (HONOR_SIGNED_ZEROS (mode
))
19152 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19153 if (do_floor
&& flag_rounding_math
)
19154 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
19156 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
19158 emit_move_insn (res
, tmp
);
19160 emit_label (label
);
19161 LABEL_NUSES (label
) = 1;
19163 emit_move_insn (operand0
, res
);
19166 /* Expand SSE sequence for computing trunc
19167 from OPERAND1 storing into OPERAND0. */
19169 ix86_expand_trunc (rtx operand0
, rtx operand1
)
19171 /* C code for SSE variant we expand below.
19172 double xa = fabs (x), x2;
19173 if (!isless (xa, TWO52))
19175 x2 = (double)(long)x;
19176 if (HONOR_SIGNED_ZEROS (mode))
19177 return copysign (x2, x);
19180 machine_mode mode
= GET_MODE (operand0
);
19181 rtx xa
, xi
, TWO52
, res
, mask
;
19182 rtx_code_label
*label
;
19184 TWO52
= ix86_gen_TWO52 (mode
);
19186 /* Temporary for holding the result, initialized to the input
19187 operand to ease control flow. */
19188 res
= copy_to_reg (operand1
);
19190 /* xa = abs (operand1) */
19191 xa
= ix86_expand_sse_fabs (res
, &mask
);
19193 /* if (!isless (xa, TWO52)) goto label; */
19194 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19196 /* xa = (double)(long)x */
19197 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
19198 expand_fix (xi
, res
, 0);
19199 expand_float (xa
, xi
, 0);
19201 if (HONOR_SIGNED_ZEROS (mode
))
19202 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
19204 emit_move_insn (res
, xa
);
19206 emit_label (label
);
19207 LABEL_NUSES (label
) = 1;
19209 emit_move_insn (operand0
, res
);
19212 /* Expand SSE sequence for computing trunc from OPERAND1 storing
19213 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19214 that is only available on 64bit targets. */
19216 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
19218 machine_mode mode
= GET_MODE (operand0
);
19219 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
19220 rtx_code_label
*label
;
19222 /* C code for SSE variant we expand below.
19223 double xa = fabs (x), x2;
19224 if (!isless (xa, TWO52))
19226 xa2 = xa + TWO52 - TWO52;
19230 x2 = copysign (xa2, x);
19234 TWO52
= ix86_gen_TWO52 (mode
);
19236 /* Temporary for holding the result, initialized to the input
19237 operand to ease control flow. */
  res = copy_to_reg (operand1);
19240 /* xa = abs (operand1) */
19241 xa
= ix86_expand_sse_fabs (res
, &mask
);
19243 /* if (!isless (xa, TWO52)) goto label; */
19244 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19246 /* xa2 = xa + TWO52 - TWO52; */
19247 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
19248 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
19251 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
19253 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
19254 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
19255 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
19256 tmp
= expand_simple_binop (mode
, MINUS
,
19257 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
19258 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19259 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
19260 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
19262 /* res = copysign (xa2, operand1) */
19263 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
19265 emit_label (label
);
19266 LABEL_NUSES (label
) = 1;
19268 emit_move_insn (operand0
, res
);
19271 /* Expand SSE sequence for computing round
19272 from OPERAND1 storing into OPERAND0. */
19274 ix86_expand_round (rtx operand0
, rtx operand1
)
19276 /* C code for the stuff we're doing below:
19277 double xa = fabs (x);
19278 if (!isless (xa, TWO52))
19280 xa = (double)(long)(xa + nextafter (0.5, 0.0));
19281 return copysign (xa, x);
19283 machine_mode mode
= GET_MODE (operand0
);
19284 rtx res
, TWO52
, xa
, xi
, half
, mask
;
19285 rtx_code_label
*label
;
19286 const struct real_format
*fmt
;
19287 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
19289 /* Temporary for holding the result, initialized to the input
19290 operand to ease control flow. */
19291 res
= copy_to_reg (operand1
);
19293 TWO52
= ix86_gen_TWO52 (mode
);
19294 xa
= ix86_expand_sse_fabs (res
, &mask
);
19295 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19297 /* load nextafter (0.5, 0.0) */
19298 fmt
= REAL_MODE_FORMAT (mode
);
19299 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
19300 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
19302 /* xa = xa + 0.5 */
19303 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
19304 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
19306 /* xa = (double)(int64_t)xa */
19307 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
19308 expand_fix (xi
, xa
, 0);
19309 expand_float (xa
, xi
, 0);
19311 /* res = copysign (xa, operand1) */
19312 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
19314 emit_label (label
);
19315 LABEL_NUSES (label
) = 1;
19317 emit_move_insn (operand0
, res
);
19320 /* Expand SSE sequence for computing round from OPERAND1 storing
19321 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19322 that is only available on 64bit targets. */
19324 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
19326 /* C code for the stuff we expand below.
19327 double xa = fabs (x), xa2, x2;
19328 if (!isless (xa, TWO52))
19330 Using the absolute value and copying back sign makes
19331 -0.0 -> -0.0 correct.
19332 xa2 = xa + TWO52 - TWO52;
19337 else if (dxa > 0.5)
19339 x2 = copysign (xa2, x);
19342 machine_mode mode
= GET_MODE (operand0
);
19343 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
19344 rtx_code_label
*label
;
19346 TWO52
= ix86_gen_TWO52 (mode
);
19348 /* Temporary for holding the result, initialized to the input
19349 operand to ease control flow. */
19350 res
= copy_to_reg (operand1
);
19352 /* xa = abs (operand1) */
19353 xa
= ix86_expand_sse_fabs (res
, &mask
);
19355 /* if (!isless (xa, TWO52)) goto label; */
19356 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
19358 /* xa2 = xa + TWO52 - TWO52; */
19359 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
19360 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
19362 /* dxa = xa2 - xa; */
19363 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
19365 /* generate 0.5, 1.0 and -0.5 */
19366 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
19367 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
19368 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
19372 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
19373 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
19374 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
19375 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
19376 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
19377 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
19378 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
19379 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
19381 /* res = copysign (xa2, operand1) */
19382 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
19384 emit_label (label
);
19385 LABEL_NUSES (label
) = 1;
19387 emit_move_insn (operand0
, res
);
19390 /* Expand SSE sequence for computing round
19391 from OP1 storing into OP0 using sse4 round insn. */
19393 ix86_expand_round_sse4 (rtx op0
, rtx op1
)
19395 machine_mode mode
= GET_MODE (op0
);
19396 rtx e1
, e2
, res
, half
;
19397 const struct real_format
*fmt
;
19398 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
19399 rtx (*gen_copysign
) (rtx
, rtx
, rtx
);
19400 rtx (*gen_round
) (rtx
, rtx
, rtx
);
19405 gen_copysign
= gen_copysignhf3
;
19406 gen_round
= gen_sse4_1_roundhf2
;
19409 gen_copysign
= gen_copysignsf3
;
19410 gen_round
= gen_sse4_1_roundsf2
;
19413 gen_copysign
= gen_copysigndf3
;
19414 gen_round
= gen_sse4_1_rounddf2
;
19417 gcc_unreachable ();
19420 /* round (a) = trunc (a + copysign (0.5, a)) */
19422 /* load nextafter (0.5, 0.0) */
19423 fmt
= REAL_MODE_FORMAT (mode
);
19424 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
19425 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
19426 half
= const_double_from_real_value (pred_half
, mode
);
19428 /* e1 = copysign (0.5, op1) */
19429 e1
= gen_reg_rtx (mode
);
19430 emit_insn (gen_copysign (e1
, half
, op1
));
19432 /* e2 = op1 + e1 */
19433 e2
= expand_simple_binop (mode
, PLUS
, op1
, e1
, NULL_RTX
, 0, OPTAB_DIRECT
);
19435 /* res = trunc (e2) */
19436 res
= gen_reg_rtx (mode
);
19437 emit_insn (gen_round (res
, e2
, GEN_INT (ROUND_TRUNC
)));
19439 emit_move_insn (op0
, res
);
19442 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
19443 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
19444 insn every time. */
19446 static GTY(()) rtx_insn
*vselect_insn
;
19448 /* Initialize vselect_insn. */
19451 init_vselect_insn (void)
19456 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
19457 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
19458 XVECEXP (x
, 0, i
) = const0_rtx
;
19459 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
19461 x
= gen_rtx_SET (const0_rtx
, x
);
19463 vselect_insn
= emit_insn (x
);
19467 /* Construct (set target (vec_select op0 (parallel perm))) and
19468 return true if that's a valid instruction in the active ISA. */
19471 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
19472 unsigned nelt
, bool testing_p
)
19475 rtx x
, save_vconcat
;
19478 if (vselect_insn
== NULL_RTX
)
19479 init_vselect_insn ();
19481 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
19482 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
19483 for (i
= 0; i
< nelt
; ++i
)
19484 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
19485 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
19486 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
19487 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
19488 SET_DEST (PATTERN (vselect_insn
)) = target
;
19489 icode
= recog_memoized (vselect_insn
);
19491 if (icode
>= 0 && !testing_p
)
19492 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
19494 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
19495 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
19496 INSN_CODE (vselect_insn
) = -1;
19501 /* Similar, but generate a vec_concat from op0 and op1 as well. */
19504 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
19505 const unsigned char *perm
, unsigned nelt
,
19508 machine_mode v2mode
;
19512 if (vselect_insn
== NULL_RTX
)
19513 init_vselect_insn ();
19515 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
19517 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
19518 PUT_MODE (x
, v2mode
);
19521 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
19522 XEXP (x
, 0) = const0_rtx
;
19523 XEXP (x
, 1) = const0_rtx
;
19527 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19528 using movss or movsd. */
19530 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
19532 machine_mode vmode
= d
->vmode
;
19533 unsigned i
, nelt
= d
->nelt
;
19536 if (d
->one_operand_p
)
19539 if (!(TARGET_SSE
&& (vmode
== V4SFmode
|| vmode
== V4SImode
))
19540 && !(TARGET_MMX_WITH_SSE
&& (vmode
== V2SFmode
|| vmode
== V2SImode
))
19541 && !(TARGET_SSE2
&& (vmode
== V2DFmode
|| vmode
== V2DImode
)))
19544 /* Only the first element is changed. */
19545 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
19547 for (i
= 1; i
< nelt
; ++i
)
19548 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
19554 if (d
->perm
[0] == nelt
)
19555 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
19557 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
19559 emit_insn (gen_rtx_SET (d
->target
, x
));
19564 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19567 expand_vec_perm_insertps (struct expand_vec_perm_d
*d
)
19569 machine_mode vmode
= d
->vmode
;
19570 unsigned i
, cnt_s
, nelt
= d
->nelt
;
19574 if (d
->one_operand_p
)
19577 if (!(TARGET_SSE4_1
19578 && (vmode
== V4SFmode
|| vmode
== V4SImode
19579 || (TARGET_MMX_WITH_SSE
19580 && (vmode
== V2SFmode
|| vmode
== V2SImode
)))))
19583 for (i
= 0; i
< nelt
; ++i
)
19585 if (d
->perm
[i
] == i
)
19597 for (i
= 0; i
< nelt
; ++i
)
19599 if (d
->perm
[i
] == i
+ nelt
)
19613 gcc_assert (cnt_d
!= -1);
19615 cnt_s
= d
->perm
[cnt_d
];
19627 gcc_assert (cnt_s
< nelt
);
19629 rtx x
= gen_sse4_1_insertps (vmode
, d
->target
, dst
, src
,
19630 GEN_INT (cnt_s
<< 6 | cnt_d
<< 4));
19636 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19637 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19640 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
19642 machine_mode mmode
, vmode
= d
->vmode
;
19643 unsigned i
, nelt
= d
->nelt
;
19644 unsigned HOST_WIDE_INT mask
;
19645 rtx target
, op0
, op1
, maskop
, x
;
19646 rtx rperm
[32], vperm
;
19648 if (d
->one_operand_p
)
19650 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
19651 && (TARGET_AVX512BW
19652 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
19654 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
19656 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
19658 else if (TARGET_SSE4_1
19659 && (GET_MODE_SIZE (vmode
) == 16
19660 || (TARGET_MMX_WITH_SSE
&& GET_MODE_SIZE (vmode
) == 8)
19661 || GET_MODE_SIZE (vmode
) == 4))
19666 /* This is a blend, not a permute. Elements must stay in their
19667 respective lanes. */
19668 for (i
= 0; i
< nelt
; ++i
)
19670 unsigned e
= d
->perm
[i
];
19671 if (!(e
== i
|| e
== i
+ nelt
))
19678 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19679 decision should be extracted elsewhere, so that we only try that
19680 sequence once all budget==3 options have been tried. */
19681 target
= d
->target
;
19703 for (i
= 0; i
< nelt
; ++i
)
19704 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
19708 for (i
= 0; i
< 2; ++i
)
19709 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
19714 for (i
= 0; i
< 2; ++i
)
19715 mask
|= (d
->perm
[i
] >= 2 ? 3 : 0) << (i
* 2);
19722 /* Use vpblendd instead of vpblendw. */
19723 for (i
= 0; i
< nelt
; ++i
)
19724 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
19729 for (i
= 0; i
< 4; ++i
)
19730 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19736 /* See if bytes move in pairs so we can use pblendw with
19737 an immediate argument, rather than pblendvb with a vector
19739 for (i
= 0; i
< 16; i
+= 2)
19740 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19743 for (i
= 0; i
< nelt
; ++i
)
19744 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
19747 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
19748 vperm
= force_reg (vmode
, vperm
);
19750 if (GET_MODE_SIZE (vmode
) == 4)
19751 emit_insn (gen_mmx_pblendvb_v4qi (target
, op0
, op1
, vperm
));
19752 else if (GET_MODE_SIZE (vmode
) == 8)
19753 emit_insn (gen_mmx_pblendvb_v8qi (target
, op0
, op1
, vperm
));
19754 else if (GET_MODE_SIZE (vmode
) == 16)
19755 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
19757 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
19758 if (target
!= d
->target
)
19759 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19763 for (i
= 0; i
< 8; ++i
)
19764 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19769 target
= gen_reg_rtx (vmode
);
19770 op0
= gen_lowpart (vmode
, op0
);
19771 op1
= gen_lowpart (vmode
, op1
);
19775 for (i
= 0; i
< 8; i
+= 2)
19776 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19779 for (i
= 0; i
< 4; ++i
)
19780 mask
|= (d
->perm
[i
* 2] >= 8) << i
;
19785 for (i
= 0; i
< 4; i
+= 2)
19786 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19789 for (i
= 0; i
< 2; ++i
)
19790 mask
|= (d
->perm
[i
* 2] >= 4) << i
;
19795 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19796 for (i
= 0; i
< 32; i
+= 2)
19797 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19799 /* See if bytes move in quadruplets. If yes, vpblendd
19800 with immediate can be used. */
19801 for (i
= 0; i
< 32; i
+= 4)
19802 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
19806 /* See if bytes move the same in both lanes. If yes,
19807 vpblendw with immediate can be used. */
19808 for (i
= 0; i
< 16; i
+= 2)
19809 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
19812 /* Use vpblendw. */
19813 for (i
= 0; i
< 16; ++i
)
19814 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
19819 /* Use vpblendd. */
19820 for (i
= 0; i
< 8; ++i
)
19821 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
19826 /* See if words move in pairs. If yes, vpblendd can be used. */
19827 for (i
= 0; i
< 16; i
+= 2)
19828 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19832 /* See if words move the same in both lanes. If not,
19833 vpblendvb must be used. */
19834 for (i
= 0; i
< 8; i
++)
19835 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
19837 /* Use vpblendvb. */
19838 for (i
= 0; i
< 32; ++i
)
19839 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
19843 target
= gen_reg_rtx (vmode
);
19844 op0
= gen_lowpart (vmode
, op0
);
19845 op1
= gen_lowpart (vmode
, op1
);
19846 goto finish_pblendvb
;
19849 /* Use vpblendw. */
19850 for (i
= 0; i
< 16; ++i
)
19851 mask
|= (d
->perm
[i
] >= 16) << i
;
19855 /* Use vpblendd. */
19856 for (i
= 0; i
< 8; ++i
)
19857 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19862 /* Use vpblendd. */
19863 for (i
= 0; i
< 4; ++i
)
19864 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19869 gcc_unreachable ();
19892 /* Canonicalize vec_merge. */
19893 if (swap_commutative_operands_p (op1
, op0
)
19894 /* Two operands have same precedence, then
19895 first bit of mask select first operand. */
19896 || (!swap_commutative_operands_p (op0
, op1
)
19899 unsigned n_elts
= GET_MODE_NUNITS (vmode
);
19900 std::swap (op0
, op1
);
19901 unsigned HOST_WIDE_INT mask_all
= HOST_WIDE_INT_1U
;
19902 if (n_elts
== HOST_BITS_PER_WIDE_INT
)
19905 mask_all
= (HOST_WIDE_INT_1U
<< n_elts
) - 1;
19906 mask
= ~mask
& mask_all
;
19909 if (mmode
!= VOIDmode
)
19910 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
19912 maskop
= GEN_INT (mask
);
19914 /* This matches five different patterns with the different modes. */
19915 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
19916 x
= gen_rtx_SET (target
, x
);
19918 if (target
!= d
->target
)
19919 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19924 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19925 in terms of the variable form of vpermilps.
19927 Note that we will have already failed the immediate input vpermilps,
19928 which requires that the high and low part shuffle be identical; the
19929 variable form doesn't require that. */
19932 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
19934 rtx rperm
[8], vperm
;
19937 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
19940 /* We can only permute within the 128-bit lane. */
19941 for (i
= 0; i
< 8; ++i
)
19943 unsigned e
= d
->perm
[i
];
19944 if (i
< 4 ? e
>= 4 : e
< 4)
19951 for (i
= 0; i
< 8; ++i
)
19953 unsigned e
= d
->perm
[i
];
19955 /* Within each 128-bit lane, the elements of op0 are numbered
19956 from 0 and the elements of op1 are numbered from 4. */
19962 rperm
[i
] = GEN_INT (e
);
19965 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
19966 vperm
= force_reg (V8SImode
, vperm
);
19967 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
19972 /* For V*[QHS]Imode permutations, check if the same permutation
19973 can't be performed in a 2x, 4x or 8x wider inner mode. */
19976 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
19977 struct expand_vec_perm_d
*nd
)
19980 machine_mode mode
= VOIDmode
;
19984 case E_V8QImode
: mode
= V4HImode
; break;
19985 case E_V16QImode
: mode
= V8HImode
; break;
19986 case E_V32QImode
: mode
= V16HImode
; break;
19987 case E_V64QImode
: mode
= V32HImode
; break;
19988 case E_V4HImode
: mode
= V2SImode
; break;
19989 case E_V8HImode
: mode
= V4SImode
; break;
19990 case E_V16HImode
: mode
= V8SImode
; break;
19991 case E_V32HImode
: mode
= V16SImode
; break;
19992 case E_V4SImode
: mode
= V2DImode
; break;
19993 case E_V8SImode
: mode
= V4DImode
; break;
19994 case E_V16SImode
: mode
= V8DImode
; break;
19995 default: return false;
19997 for (i
= 0; i
< d
->nelt
; i
+= 2)
19998 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
20001 nd
->nelt
= d
->nelt
/ 2;
20002 for (i
= 0; i
< nd
->nelt
; i
++)
20003 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
20004 if (GET_MODE_INNER (mode
) != DImode
)
20005 canonicalize_vector_int_perm (nd
, nd
);
20008 nd
->one_operand_p
= d
->one_operand_p
;
20009 nd
->testing_p
= d
->testing_p
;
20010 if (d
->op0
== d
->op1
)
20011 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
20014 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
20015 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
20018 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
20020 nd
->target
= gen_reg_rtx (nd
->vmode
);
20025 /* Return true if permutation D can be performed as VMODE permutation
20029 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
20031 unsigned int i
, j
, chunk
;
20033 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
20034 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
20035 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
20038 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
20041 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
20042 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
20043 if (d
->perm
[i
] & (chunk
- 1))
20046 for (j
= 1; j
< chunk
; ++j
)
20047 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
20053 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20054 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
20057 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
20059 unsigned i
, nelt
, eltsz
, mask
;
20060 unsigned char perm
[64];
20061 machine_mode vmode
;
20062 struct expand_vec_perm_d nd
;
20063 rtx rperm
[64], vperm
, target
, op0
, op1
;
20067 if (!d
->one_operand_p
)
20068 switch (GET_MODE_SIZE (d
->vmode
))
20092 if (valid_perm_using_mode_p (V2TImode
, d
))
20097 /* Use vperm2i128 insn. The pattern uses
20098 V4DImode instead of V2TImode. */
20099 target
= d
->target
;
20100 if (d
->vmode
!= V4DImode
)
20101 target
= gen_reg_rtx (V4DImode
);
20102 op0
= gen_lowpart (V4DImode
, d
->op0
);
20103 op1
= gen_lowpart (V4DImode
, d
->op1
);
20105 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
20106 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
20107 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
20108 if (target
!= d
->target
)
20109 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
20118 switch (GET_MODE_SIZE (d
->vmode
))
20142 /* V4DImode should be already handled through
20143 expand_vselect by vpermq instruction. */
20144 gcc_assert (d
->vmode
!= V4DImode
);
20147 if (d
->vmode
== V8SImode
20148 || d
->vmode
== V16HImode
20149 || d
->vmode
== V32QImode
)
20151 /* First see if vpermq can be used for
20152 V8SImode/V16HImode/V32QImode. */
20153 if (valid_perm_using_mode_p (V4DImode
, d
))
20155 for (i
= 0; i
< 4; i
++)
20156 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
20159 target
= gen_reg_rtx (V4DImode
);
20160 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
20163 emit_move_insn (d
->target
,
20164 gen_lowpart (d
->vmode
, target
));
20170 /* Next see if vpermd can be used. */
20171 if (valid_perm_using_mode_p (V8SImode
, d
))
20174 /* Or if vpermps can be used. */
20175 else if (d
->vmode
== V8SFmode
)
20178 if (vmode
== V32QImode
)
20180 /* vpshufb only works intra lanes, it is not
20181 possible to shuffle bytes in between the lanes. */
20182 for (i
= 0; i
< nelt
; ++i
)
20183 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
20189 if (!TARGET_AVX512BW
)
20192 /* If vpermq didn't work, vpshufb won't work either. */
20193 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
20197 if (d
->vmode
== V16SImode
20198 || d
->vmode
== V32HImode
20199 || d
->vmode
== V64QImode
)
20201 /* First see if vpermq can be used for
20202 V16SImode/V32HImode/V64QImode. */
20203 if (valid_perm_using_mode_p (V8DImode
, d
))
20205 for (i
= 0; i
< 8; i
++)
20206 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
20209 target
= gen_reg_rtx (V8DImode
);
20210 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
20213 emit_move_insn (d
->target
,
20214 gen_lowpart (d
->vmode
, target
));
20220 /* Next see if vpermd can be used. */
20221 if (valid_perm_using_mode_p (V16SImode
, d
))
20224 /* Or if vpermps can be used. */
20225 else if (d
->vmode
== V16SFmode
)
20228 if (vmode
== V64QImode
)
20230 /* vpshufb only works intra lanes, it is not
20231 possible to shuffle bytes in between the lanes. */
20232 for (i
= 0; i
< nelt
; ++i
)
20233 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
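
/* Illustrative note (not in the original sources): canonicalization can
   turn e.g. the V16QImode selector { 8 9 10 11  12 13 14 15  0 1 2 3
   4 5 6 7 } into the V4SImode selector { 2 3 0 1 }, which the recursive
   expand_vec_perm_1 call then matches with a single pshufd.  */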
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
        mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
        mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
        mask = nelt / 2 - 1;
      else
        mask = nelt - 1;

      for (i = 0; i < nelt; ++i)
        {
          unsigned j, e = d->perm[i] & mask;
          for (j = 0; j < eltsz; ++j)
            rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
        }
    }
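
/* Worked example (illustrative, not in the original sources): for a
   one-operand V8HImode shuffle, eltsz is 2, so a halfword selector entry
   e == 5 expands to the pshufb byte selectors 10 and 11 at byte positions
   2 * i and 2 * i + 1 of the control vector.  */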
  machine_mode vpmode = vmode;

  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
        for (i = 0; i < nelt; ++i)
          {
            unsigned ival = UINTVAL (rperm[i]);
            if (ival >= nelt)
              rperm[i] = GEN_INT (ival + 16 - nelt);
          }

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
        rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);
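
/* Illustrative note (not in the original sources): in the pshufb forms the
   control vector built above is interpreted per byte; an index selects a
   byte from the source (within its 128-bit lane for the 256/512-bit
   variants), and a value with bit 7 set, such as the -128 fill used for
   the narrow-mode emulation, forces the result byte to zero.  */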
  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
        gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
        gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
        gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
        gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
        gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
        gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
        gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
        gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
        gen = gen_avx512f_permvarv16si;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
        gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
        gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
        gen = gen_xop_pperm;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
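
/* Illustrative note (not in the original sources): unlike pshufb, the XOP
   pperm paths take both operands, so a selector byte in the range 0..15
   picks from op0 and 16..31 picks from op1; this is why the mask computed
   earlier is 2 * nelt - 1 in the two-operand case.  */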
/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;
20401 gen
= gen_avx512f_permvarv16si
;
20404 gen
= gen_avx512f_permvarv16sf
;
20405 maskmode
= V16SImode
;
20408 gen
= gen_avx512f_permvarv8di
;
20411 gen
= gen_avx512f_permvarv8df
;
20412 maskmode
= V8DImode
;
20415 gen
= gen_avx512bw_permvarv32hi
;
20418 gen
= gen_avx512vl_permvarv16hi
;
20421 gen
= gen_avx512vl_permvarv8hi
;
20424 gen
= gen_avx512bw_permvarv64qi
;
20427 gen
= gen_avx512vl_permvarv32qi
;
20430 gen
= gen_avx512vl_permvarv16qi
;
  target = d->target;
  op0 = force_reg (mode, d->op0);
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
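
/* Illustrative note (not in the original sources): for a V32HImode
   permutation this path loads the selector into a V32HImode constant and
   emits a single vpermw under AVX512BW, e.g. the full reversal
   { 31 30 ... 1 0 } needs no further decomposition.  */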
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
        {
          nd.perm[i] = d->perm[i] & mask;
          if (nd.perm[i] != i)
            identity_perm = false;
          if (nd.perm[i])
            broadcast_perm = false;
        }

      if (identity_perm)
        {
          if (!d->testing_p)
            emit_move_insn (d->target, d->op0);
          return true;
        }
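
      /* Illustrative note (not in the original sources): the & mask above
	 folds indices modulo nelt, so for a one-operand V4SImode shuffle
	 the selector { 4 5 6 7 } reduces to { 0 1 2 3 } and is handled by
	 the identity move, while { 4 4 4 4 } reduces to { 0 0 0 0 } and
	 leaves broadcast_perm set.  */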
20484 else if (broadcast_perm
&& TARGET_AVX2
)
20486 /* Use vpbroadcast{b,w,d}. */
20487 rtx (*gen
) (rtx
, rtx
) = NULL
;
20491 if (TARGET_AVX512BW
)
20492 gen
= gen_avx512bw_vec_dupv64qi_1
;
20495 gen
= gen_avx2_pbroadcastv32qi_1
;
20498 if (TARGET_AVX512BW
)
20499 gen
= gen_avx512bw_vec_dupv32hi_1
;
20502 gen
= gen_avx2_pbroadcastv16hi_1
;
20505 if (TARGET_AVX512F
)
20506 gen
= gen_avx512f_vec_dupv16si_1
;
20509 gen
= gen_avx2_pbroadcastv8si_1
;
20512 gen
= gen_avx2_pbroadcastv16qi
;
20515 gen
= gen_avx2_pbroadcastv8hi
;
20518 if (TARGET_AVX512F
)
20519 gen
= gen_avx512f_vec_dupv16sf_1
;
20522 gen
= gen_avx2_vec_dupv8sf_1
;
20525 if (TARGET_AVX512F
)
20526 gen
= gen_avx512f_vec_dupv8df_1
;
20529 if (TARGET_AVX512F
)
20530 gen
= gen_avx512f_vec_dupv8di_1
;
20532 /* For other modes prefer other shuffles this function creates. */
20538 emit_insn (gen (d
->target
, d
->op0
));
20543 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
20546 /* There are plenty of patterns in sse.md that are written for
20547 SEL+CONCAT and are not replicated for a single op. Perhaps
20548 that should be changed, to avoid the nastiness here. */
20550 /* Recognize interleave style patterns, which means incrementing
20551 every other permutation operand. */
20552 for (i
= 0; i
< nelt
; i
+= 2)
20554 nd
.perm
[i
] = d
->perm
[i
] & mask
;
20555 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
20557 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
20561 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
20564 for (i
= 0; i
< nelt
; i
+= 4)
20566 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
20567 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
20568 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
20569 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
20572 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
20578 /* Try the SSE4.1 blend variable merge instructions. */
20579 if (expand_vec_perm_blend (d
))
20582 /* Try movss/movsd instructions. */
20583 if (expand_vec_perm_movs (d
))
20586 /* Try the SSE4.1 insertps instruction. */
20587 if (expand_vec_perm_insertps (d
))
20590 /* Try the fully general two operand permute. */
20591 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
20595 /* Recognize interleave style patterns with reversed operands. */
20596 if (!d
->one_operand_p
)
20598 for (i
= 0; i
< nelt
; ++i
)
20600 unsigned e
= d
->perm
[i
];
20608 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
20613 /* Try one of the AVX vpermil variable permutations. */
20614 if (expand_vec_perm_vpermil (d
))
20617 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20618 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20619 if (expand_vec_perm_pshufb (d
))
20622 /* Try the AVX2 vpalignr instruction. */
20623 if (expand_vec_perm_palignr (d
, true))
20626 /* Try the AVX512F vperm{w,b,s,d} instructions */
20627 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
20630 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20631 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
20634 /* See if we can get the same permutation in different vector integer
20636 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
20639 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
/* Canonicalize the vec_perm index so that the first index always
   comes from the first vector.  */
static void
ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
{
  unsigned nelt = d->nelt;
  if (d->perm[0] < nelt)
    return;

  for (unsigned i = 0; i != nelt; i++)
    d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);

  std::swap (d->op0, d->op1);
}
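
/* Worked example (illustrative, not in the original sources): for V4SFmode
   (nelt == 4) the selector { 6 2 7 3 } starts in the second vector, so the
   operands are swapped and each index is rotated by nelt modulo 2 * nelt,
   giving { 2 6 3 7 } with the first index now in the (new) first vector.  */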
20661 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20662 in terms of a pair of shufps+ shufps/pshufd instructions. */
20664 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d
*d
)
20666 unsigned char perm1
[4];
20667 machine_mode vmode
= d
->vmode
;
20669 unsigned i
, j
, k
, count
= 0;
20671 if (d
->one_operand_p
20672 || (vmode
!= V4SImode
&& vmode
!= V4SFmode
))
20678 ix86_vec_perm_index_canon (d
);
20679 for (i
= 0; i
< 4; ++i
)
20680 count
+= d
->perm
[i
] > 3 ? 1 : 0;
20682 gcc_assert (count
& 3);
20684 rtx tmp
= gen_reg_rtx (vmode
);
20685 /* 2 from op0 and 2 from op1. */
20688 unsigned char perm2
[4];
20689 for (i
= 0, j
= 0, k
= 2; i
< 4; ++i
)
20690 if (d
->perm
[i
] & 4)
20692 perm1
[k
++] = d
->perm
[i
];
20697 perm1
[j
++] = d
->perm
[i
];
20702 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
20703 perm1
, d
->nelt
, false);
20705 if (vmode
== V4SImode
&& TARGET_SSE2
)
20707 ok
= expand_vselect (d
->target
, tmp
,
20708 perm2
, d
->nelt
, false);
20714 ok
= expand_vselect_vconcat (d
->target
, tmp
, tmp
,
20715 perm2
, d
->nelt
, false);
20719 /* 3 from one op and 1 from another. */
20722 unsigned pair_idx
= 8, lone_idx
= 8, shift
;
20724 /* Find the lone index. */
20725 for (i
= 0; i
< 4; ++i
)
20726 if ((d
->perm
[i
] > 3 && count
== 1)
20727 || (d
->perm
[i
] < 4 && count
== 3))
20730 /* When lone_idx is not 0, it must from second op(count == 1). */
20731 gcc_assert (count
== (lone_idx
? 1 : 3));
20733 /* Find the pair index that sits in the same half as the lone index. */
20734 shift
= lone_idx
& 2;
20735 pair_idx
= 1 - lone_idx
+ 2 * shift
;
20737 /* First permutate lone index and pair index into the same vector as
20738 [ lone, lone, pair, pair ]. */
20739 perm1
[1] = perm1
[0]
20740 = (count
== 3) ? d
->perm
[lone_idx
] : d
->perm
[lone_idx
] - 4;
20741 perm1
[3] = perm1
[2]
20742 = (count
== 3) ? d
->perm
[pair_idx
] : d
->perm
[pair_idx
] + 4;
20744 /* Alway put the vector contains lone indx at the first. */
20746 std::swap (d
->op0
, d
->op1
);
20749 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
20750 perm1
, d
->nelt
, false);
20753 /* Refine lone and pair index to original order. */
20754 perm1
[shift
] = lone_idx
<< 1;
20755 perm1
[shift
+ 1] = pair_idx
<< 1;
20757 /* Select the remaining 2 elements in another vector. */
20758 for (i
= 2 - shift
; i
< 4 - shift
; ++i
)
20759 perm1
[i
] = lone_idx
== 1 ? d
->perm
[i
] + 4 : d
->perm
[i
];
20761 /* Adjust to original selector. */
20763 std::swap (tmp
, d
->op1
);
20766 ok
= expand_vselect_vconcat (d
->target
, tmp
, d
->op1
,
20767 perm1
, d
->nelt
, false);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
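
/* Worked example (illustrative, not in the original sources): the
   one-operand V8HImode selector { 3 1 2 0  7 5 6 4 } stays within 64-bit
   halves, so it is emitted as pshuflw with { 3 1 2 0  4 5 6 7 } followed
   by pshufhw with { 0 1 2 3  7 5 6 4 } on the intermediate result.  */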
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if the
   vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;
20832 /* Even with AVX, palignr only operates on 128-bit vectors,
20833 in AVX2 palignr operates on both 128-bit lanes. */
20834 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
20835 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
20840 minswap
= 2 * nelt
;
20842 for (i
= 0; i
< nelt
; ++i
)
20844 unsigned e
= d
->perm
[i
];
20845 unsigned eswap
= d
->perm
[i
] ^ nelt
;
20846 if (GET_MODE_SIZE (d
->vmode
) == 32)
20848 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
20849 eswap
= e
^ (nelt
/ 2);
20855 if (eswap
< minswap
)
20857 if (eswap
> maxswap
)
20861 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
20863 if (d
->one_operand_p
20865 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
20866 ? nelt
/ 2 : nelt
))
20873 /* Given that we have SSSE3, we know we'll be able to implement the
20874 single operand permutation after the palignr with pshufb for
20875 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20877 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
20883 dcopy
.op0
= d
->op1
;
20884 dcopy
.op1
= d
->op0
;
20885 for (i
= 0; i
< nelt
; ++i
)
20886 dcopy
.perm
[i
] ^= nelt
;
20890 for (i
= 0; i
< nelt
; ++i
)
20892 unsigned e
= dcopy
.perm
[i
];
20893 if (GET_MODE_SIZE (d
->vmode
) == 32
20895 && (e
& (nelt
/ 2 - 1)) < min
)
20896 e
= e
- min
- (nelt
/ 2);
20903 dcopy
.one_operand_p
= true;
20905 if (single_insn_only_p
&& !in_order
)
20908 /* For AVX2, test whether we can permute the result in one instruction. */
20913 dcopy
.op1
= dcopy
.op0
;
20914 return expand_vec_perm_1 (&dcopy
);
20917 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
20918 if (GET_MODE_SIZE (d
->vmode
) == 16)
20920 target
= gen_reg_rtx (V1TImode
);
20921 emit_insn (gen_ssse3_palignrv1ti (target
,
20922 gen_lowpart (V1TImode
, dcopy
.op1
),
20923 gen_lowpart (V1TImode
, dcopy
.op0
),
20928 target
= gen_reg_rtx (V2TImode
);
20929 emit_insn (gen_avx2_palignrv2ti (target
,
20930 gen_lowpart (V2TImode
, dcopy
.op1
),
20931 gen_lowpart (V2TImode
, dcopy
.op0
),
20935 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
20937 /* Test for the degenerate case where the alignment by itself
20938 produces the desired permutation. */
20941 emit_move_insn (d
->target
, dcopy
.op0
);
20945 ok
= expand_vec_perm_1 (&dcopy
);
20946 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
20951 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20952 the permutation using the SSE4_1 pblendv instruction. Potentially
20953 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
20956 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
20958 unsigned i
, which
, nelt
= d
->nelt
;
20959 struct expand_vec_perm_d dcopy
, dcopy1
;
20960 machine_mode vmode
= d
->vmode
;
20963 /* Use the same checks as in expand_vec_perm_blend. */
20964 if (d
->one_operand_p
)
20966 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
20968 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
20970 else if (TARGET_SSE4_1
20971 && (GET_MODE_SIZE (vmode
) == 16
20972 || (TARGET_MMX_WITH_SSE
&& GET_MODE_SIZE (vmode
) == 8)
20973 || GET_MODE_SIZE (vmode
) == 4))
20978 /* Figure out where permutation elements stay not in their
20979 respective lanes. */
20980 for (i
= 0, which
= 0; i
< nelt
; ++i
)
20982 unsigned e
= d
->perm
[i
];
20984 which
|= (e
< nelt
? 1 : 2);
20986 /* We can pblend the part where elements stay not in their
20987 respective lanes only when these elements are all in one
20988 half of a permutation.
20989 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
20990 lanes, but both 8 and 9 >= 8
20991 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
20992 respective lanes and 8 >= 8, but 2 not. */
20993 if (which
!= 1 && which
!= 2)
20995 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
20998 /* First we apply one operand permutation to the part where
20999 elements stay not in their respective lanes. */
21002 dcopy
.op0
= dcopy
.op1
= d
->op1
;
21004 dcopy
.op0
= dcopy
.op1
= d
->op0
;
21006 dcopy
.target
= gen_reg_rtx (vmode
);
21007 dcopy
.one_operand_p
= true;
21009 for (i
= 0; i
< nelt
; ++i
)
21010 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
21012 ok
= expand_vec_perm_1 (&dcopy
);
21013 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
21020 /* Next we put permuted elements into their positions. */
21023 dcopy1
.op1
= dcopy
.target
;
21025 dcopy1
.op0
= dcopy
.target
;
21027 for (i
= 0; i
< nelt
; ++i
)
21028 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
21030 ok
= expand_vec_perm_blend (&dcopy1
);
21036 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;
21052 if (GET_MODE_SIZE (d
->vmode
) == 4
21053 || GET_MODE_SIZE (d
->vmode
) == 8
21054 || GET_MODE_SIZE (d
->vmode
) == 16)
21056 if (d
->one_operand_p
)
21059 else if (GET_MODE_SIZE (d
->vmode
) == 32)
21063 /* For 32-byte modes allow even d->one_operand_p.
21064 The lack of cross-lane shuffling in some instructions
21065 might prevent a single insn shuffle. */
21067 dfinal
.testing_p
= true;
21068 /* If expand_vec_perm_interleave3 can expand this into
21069 a 3 insn sequence, give up and let it be expanded as
21070 3 insn sequence. While that is one insn longer,
21071 it doesn't need a memory operand and in the common
21072 case that both interleave low and high permutations
21073 with the same operands are adjacent needs 4 insns
21074 for both after CSE. */
21075 if (expand_vec_perm_interleave3 (&dfinal
))
21081 /* Examine from whence the elements come. */
21083 for (i
= 0; i
< nelt
; ++i
)
21084 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
21086 memset (remap
, 0xff, sizeof (remap
));
21089 if (GET_MODE_SIZE (d
->vmode
) == 4
21090 || GET_MODE_SIZE (d
->vmode
) == 8)
21092 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
21094 /* Split the two input vectors into 4 halves. */
21095 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
21100 /* If the elements from the low halves use interleave low,
21101 and similarly for interleave high. */
21102 if ((contents
& (h1
| h3
)) == contents
)
21105 for (i
= 0; i
< nelt2
; ++i
)
21108 remap
[i
+ nelt
] = i
* 2 + 1;
21109 dremap
.perm
[i
* 2] = i
;
21110 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
21113 else if ((contents
& (h2
| h4
)) == contents
)
21116 for (i
= 0; i
< nelt2
; ++i
)
21118 remap
[i
+ nelt2
] = i
* 2;
21119 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
21120 dremap
.perm
[i
* 2] = i
+ nelt2
;
21121 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
21127 else if (GET_MODE_SIZE (d
->vmode
) == 16)
21129 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
21131 /* Split the two input vectors into 4 halves. */
21132 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
21137 /* If the elements from the low halves use interleave low, and similarly
21138 for interleave high. If the elements are from mis-matched halves, we
21139 can use shufps for V4SF/V4SI or do a DImode shuffle. */
21140 if ((contents
& (h1
| h3
)) == contents
)
21143 for (i
= 0; i
< nelt2
; ++i
)
21146 remap
[i
+ nelt
] = i
* 2 + 1;
21147 dremap
.perm
[i
* 2] = i
;
21148 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
21150 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
21151 dremap
.vmode
= V4SFmode
;
21153 else if ((contents
& (h2
| h4
)) == contents
)
21156 for (i
= 0; i
< nelt2
; ++i
)
21158 remap
[i
+ nelt2
] = i
* 2;
21159 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
21160 dremap
.perm
[i
* 2] = i
+ nelt2
;
21161 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
21163 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
21164 dremap
.vmode
= V4SFmode
;
21166 else if ((contents
& (h1
| h4
)) == contents
)
21169 for (i
= 0; i
< nelt2
; ++i
)
21172 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
21173 dremap
.perm
[i
] = i
;
21174 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
21179 dremap
.vmode
= V2DImode
;
21181 dremap
.perm
[0] = 0;
21182 dremap
.perm
[1] = 3;
21185 else if ((contents
& (h2
| h3
)) == contents
)
21188 for (i
= 0; i
< nelt2
; ++i
)
21190 remap
[i
+ nelt2
] = i
;
21191 remap
[i
+ nelt
] = i
+ nelt2
;
21192 dremap
.perm
[i
] = i
+ nelt2
;
21193 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
21198 dremap
.vmode
= V2DImode
;
21200 dremap
.perm
[0] = 1;
21201 dremap
.perm
[1] = 2;
21209 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
21210 unsigned HOST_WIDE_INT q
[8];
21211 unsigned int nonzero_halves
[4];
21213 /* Split the two input vectors into 8 quarters. */
21214 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
21215 for (i
= 1; i
< 8; ++i
)
21216 q
[i
] = q
[0] << (nelt4
* i
);
21217 for (i
= 0; i
< 4; ++i
)
21218 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
21220 nonzero_halves
[nzcnt
] = i
;
21226 gcc_assert (d
->one_operand_p
);
21227 nonzero_halves
[1] = nonzero_halves
[0];
21228 same_halves
= true;
21230 else if (d
->one_operand_p
)
21232 gcc_assert (nonzero_halves
[0] == 0);
21233 gcc_assert (nonzero_halves
[1] == 1);
21238 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
21240 /* Attempt to increase the likelihood that dfinal
21241 shuffle will be intra-lane. */
21242 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
21245 /* vperm2f128 or vperm2i128. */
21246 for (i
= 0; i
< nelt2
; ++i
)
21248 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
21249 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
21250 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
21251 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
21254 if (d
->vmode
!= V8SFmode
21255 && d
->vmode
!= V4DFmode
21256 && d
->vmode
!= V8SImode
)
21258 dremap
.vmode
= V8SImode
;
21260 for (i
= 0; i
< 4; ++i
)
21262 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
21263 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
21267 else if (d
->one_operand_p
)
21269 else if (TARGET_AVX2
21270 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
21273 for (i
= 0; i
< nelt4
; ++i
)
21276 remap
[i
+ nelt
] = i
* 2 + 1;
21277 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
21278 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
21279 dremap
.perm
[i
* 2] = i
;
21280 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
21281 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
21282 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
21285 else if (TARGET_AVX2
21286 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
21289 for (i
= 0; i
< nelt4
; ++i
)
21291 remap
[i
+ nelt4
] = i
* 2;
21292 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
21293 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
21294 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
21295 dremap
.perm
[i
* 2] = i
+ nelt4
;
21296 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
21297 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
21298 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
21305 /* Use the remapping array set up above to move the elements from their
21306 swizzled locations into their final destinations. */
21308 for (i
= 0; i
< nelt
; ++i
)
21310 unsigned e
= remap
[d
->perm
[i
]];
21311 gcc_assert (e
< nelt
);
21312 /* If same_halves is true, both halves of the remapped vector are the
21313 same. Avoid cross-lane accesses if possible. */
21314 if (same_halves
&& i
>= nelt2
)
21316 gcc_assert (e
< nelt2
);
21317 dfinal
.perm
[i
] = e
+ nelt2
;
21320 dfinal
.perm
[i
] = e
;
21324 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
21325 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
21327 dfinal
.op1
= dfinal
.op0
;
21328 dfinal
.one_operand_p
= true;
21330 /* Test if the final remap can be done with a single insn. For V4SFmode or
21331 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
21333 ok
= expand_vec_perm_1 (&dfinal
);
21334 seq
= get_insns ();
21343 if (dremap
.vmode
!= dfinal
.vmode
)
21345 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
21346 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
21349 ok
= expand_vec_perm_1 (&dremap
);
21356 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21357 a single vector cross-lane permutation into vpermq followed
21358 by any of the single insn permutations. */
21361 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
21363 struct expand_vec_perm_d dremap
, dfinal
;
21364 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
21365 unsigned contents
[2];
21369 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
21370 && d
->one_operand_p
))
21375 for (i
= 0; i
< nelt2
; ++i
)
21377 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
21378 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
21381 for (i
= 0; i
< 2; ++i
)
21383 unsigned int cnt
= 0;
21384 for (j
= 0; j
< 4; ++j
)
21385 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
21393 dremap
.vmode
= V4DImode
;
21395 dremap
.target
= gen_reg_rtx (V4DImode
);
21396 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
21397 dremap
.op1
= dremap
.op0
;
21398 dremap
.one_operand_p
= true;
21399 for (i
= 0; i
< 2; ++i
)
21401 unsigned int cnt
= 0;
21402 for (j
= 0; j
< 4; ++j
)
21403 if ((contents
[i
] & (1u << j
)) != 0)
21404 dremap
.perm
[2 * i
+ cnt
++] = j
;
21405 for (; cnt
< 2; ++cnt
)
21406 dremap
.perm
[2 * i
+ cnt
] = 0;
21410 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
21411 dfinal
.op1
= dfinal
.op0
;
21412 dfinal
.one_operand_p
= true;
21413 for (i
= 0, j
= 0; i
< nelt
; ++i
)
21417 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
21418 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
21420 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
21421 dfinal
.perm
[i
] |= nelt4
;
21423 gcc_unreachable ();
21426 ok
= expand_vec_perm_1 (&dremap
);
21429 ok
= expand_vec_perm_1 (&dfinal
);
21435 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
21437 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
21438 a vector permutation using two instructions, vperm2f128 resp.
21439 vperm2i128 followed by any single in-lane permutation. */
21442 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
21444 struct expand_vec_perm_d dfirst
, dsecond
;
21445 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
21449 || GET_MODE_SIZE (d
->vmode
) != 32
21450 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
21454 dsecond
.one_operand_p
= false;
21455 dsecond
.testing_p
= true;
21457 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
21458 immediate. For perm < 16 the second permutation uses
21459 d->op0 as first operand, for perm >= 16 it uses d->op1
21460 as first operand. The second operand is the result of
21462 for (perm
= 0; perm
< 32; perm
++)
21464 /* Ignore permutations which do not move anything cross-lane. */
21467 /* The second shuffle for e.g. V4DFmode has
21468 0123 and ABCD operands.
21469 Ignore AB23, as 23 is already in the second lane
21470 of the first operand. */
21471 if ((perm
& 0xc) == (1 << 2)) continue;
21472 /* And 01CD, as 01 is in the first lane of the first
21474 if ((perm
& 3) == 0) continue;
21475 /* And 4567, as then the vperm2[fi]128 doesn't change
21476 anything on the original 4567 second operand. */
21477 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
21481 /* The second shuffle for e.g. V4DFmode has
21482 4567 and ABCD operands.
21483 Ignore AB67, as 67 is already in the second lane
21484 of the first operand. */
21485 if ((perm
& 0xc) == (3 << 2)) continue;
21486 /* And 45CD, as 45 is in the first lane of the first
21488 if ((perm
& 3) == 2) continue;
21489 /* And 0123, as then the vperm2[fi]128 doesn't change
21490 anything on the original 0123 first operand. */
21491 if ((perm
& 0xf) == (1 << 2)) continue;
21494 for (i
= 0; i
< nelt
; i
++)
21496 j
= d
->perm
[i
] / nelt2
;
21497 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
21498 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
21499 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
21500 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
21508 ok
= expand_vec_perm_1 (&dsecond
);
21519 /* Found a usable second shuffle. dfirst will be
21520 vperm2f128 on d->op0 and d->op1. */
21521 dsecond
.testing_p
= false;
21523 dfirst
.target
= gen_reg_rtx (d
->vmode
);
21524 for (i
= 0; i
< nelt
; i
++)
21525 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
21526 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
21528 canonicalize_perm (&dfirst
);
21529 ok
= expand_vec_perm_1 (&dfirst
);
21532 /* And dsecond is some single insn shuffle, taking
21533 d->op0 and result of vperm2f128 (if perm < 16) or
21534 d->op1 and result of vperm2f128 (otherwise). */
21536 dsecond
.op0
= dsecond
.op1
;
21537 dsecond
.op1
= dfirst
.target
;
21539 ok
= expand_vec_perm_1 (&dsecond
);
21545 /* For one operand, the only useful vperm2f128 permutation is 0x01
21547 if (d
->one_operand_p
)
21554 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21555 a two vector permutation using 2 intra-lane interleave insns
21556 and cross-lane shuffle for 32-byte vectors. */
21559 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
21562 rtx (*gen
) (rtx
, rtx
, rtx
);
21564 if (d
->one_operand_p
)
21566 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
21568 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
21574 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
21576 for (i
= 0; i
< nelt
; i
+= 2)
21577 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
21578 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
21588 gen
= gen_vec_interleave_highv32qi
;
21590 gen
= gen_vec_interleave_lowv32qi
;
21594 gen
= gen_vec_interleave_highv16hi
;
21596 gen
= gen_vec_interleave_lowv16hi
;
21600 gen
= gen_vec_interleave_highv8si
;
21602 gen
= gen_vec_interleave_lowv8si
;
21606 gen
= gen_vec_interleave_highv4di
;
21608 gen
= gen_vec_interleave_lowv4di
;
21612 gen
= gen_vec_interleave_highv8sf
;
21614 gen
= gen_vec_interleave_lowv8sf
;
21618 gen
= gen_vec_interleave_highv4df
;
21620 gen
= gen_vec_interleave_lowv4df
;
21623 gcc_unreachable ();
21626 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
21630 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21631 a single vector permutation using a single intra-lane vector
21632 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21633 the non-swapped and swapped vectors together. */
21636 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
21638 struct expand_vec_perm_d dfirst
, dsecond
;
21639 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
21642 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
21646 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
21647 || !d
->one_operand_p
)
21651 for (i
= 0; i
< nelt
; i
++)
21652 dfirst
.perm
[i
] = 0xff;
21653 for (i
= 0, msk
= 0; i
< nelt
; i
++)
21655 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
21656 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
21658 dfirst
.perm
[j
] = d
->perm
[i
];
21662 for (i
= 0; i
< nelt
; i
++)
21663 if (dfirst
.perm
[i
] == 0xff)
21664 dfirst
.perm
[i
] = i
;
21667 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21670 ok
= expand_vec_perm_1 (&dfirst
);
21671 seq
= get_insns ();
21683 dsecond
.op0
= dfirst
.target
;
21684 dsecond
.op1
= dfirst
.target
;
21685 dsecond
.one_operand_p
= true;
21686 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21687 for (i
= 0; i
< nelt
; i
++)
21688 dsecond
.perm
[i
] = i
^ nelt2
;
21690 ok
= expand_vec_perm_1 (&dsecond
);
21693 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
21694 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
21698 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21699 a two vector permutation using two single vector permutations and
21700 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21701 of dfirst or dsecond is identity permutation. */
21704 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d
*d
, bool two_insn
)
21706 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, lane
= nelt
;
21707 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
21708 bool ident1
= true, ident2
= true;
21710 if (d
->one_operand_p
)
21713 if (GET_MODE_SIZE (d
->vmode
) == 16)
21717 if (d
->vmode
!= V4SFmode
&& d
->vmode
!= V2DFmode
&& !TARGET_SSE2
)
21720 else if (GET_MODE_SIZE (d
->vmode
) == 32)
21724 if (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
)
21731 for (i
= 1; i
< nelt
; i
++)
21732 if ((d
->perm
[i
] >= nelt
) != ((d
->perm
[0] >= nelt
) ^ (i
& 1)))
21738 dfirst
.op1
= dfirst
.op0
;
21739 dfirst
.one_operand_p
= true;
21740 dsecond
.op0
= dsecond
.op1
;
21741 dsecond
.one_operand_p
= true;
21743 for (i
= 0; i
< nelt
; i
++)
21744 if (d
->perm
[i
] >= nelt
)
21746 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
] - nelt
;
21747 if (d
->perm
[i
] - nelt
!= i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
21749 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)]
21750 = d
->perm
[i
] - nelt
;
21754 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
];
21755 if (d
->perm
[i
] != i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
21757 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)] = d
->perm
[i
];
21760 if (two_insn
&& !ident1
&& !ident2
)
21766 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
21768 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
21769 if (d
->perm
[0] >= nelt
)
21770 std::swap (dfinal
.op0
, dfinal
.op1
);
21774 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
21779 ok
= expand_vec_perm_1 (&dfirst
);
21780 seq1
= get_insns ();
21790 ok
= expand_vec_perm_1 (&dsecond
);
21791 seq2
= get_insns ();
21801 for (i
= 0; i
< nelt
; i
++)
21803 dfinal
.perm
[i
] = i
/ 2;
21805 dfinal
.perm
[i
] += lane
/ 2;
21807 dfinal
.perm
[i
] += nelt
;
21811 ok
= expand_vselect_vconcat (dfinal
.target
, dfinal
.op0
, dfinal
.op1
,
21812 dfinal
.perm
, dfinal
.nelt
, false);
21817 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21818 the permutation using two single vector permutations and the SSE4_1 pblendv
21819 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21820 identity permutation. */
21823 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d
*d
, bool two_insn
)
21825 unsigned i
, nelt
= d
->nelt
;
21826 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
21827 machine_mode vmode
= d
->vmode
;
21828 bool ident1
= true, ident2
= true;
21830 /* Use the same checks as in expand_vec_perm_blend. */
21831 if (d
->one_operand_p
)
21833 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
21835 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
21837 else if (TARGET_SSE4_1
21838 && (GET_MODE_SIZE (vmode
) == 16
21839 || (TARGET_MMX_WITH_SSE
&& GET_MODE_SIZE (vmode
) == 8)
21840 || GET_MODE_SIZE (vmode
) == 4))
21848 dfirst
.op1
= dfirst
.op0
;
21849 dfirst
.one_operand_p
= true;
21850 dsecond
.op0
= dsecond
.op1
;
21851 dsecond
.one_operand_p
= true;
21853 for (i
= 0; i
< nelt
; ++i
)
21854 if (d
->perm
[i
] >= nelt
)
21856 dfirst
.perm
[i
] = 0xff;
21857 dsecond
.perm
[i
] = d
->perm
[i
] - nelt
;
21858 if (d
->perm
[i
] != i
+ nelt
)
21863 dsecond
.perm
[i
] = 0xff;
21864 dfirst
.perm
[i
] = d
->perm
[i
];
21865 if (d
->perm
[i
] != i
)
21869 if (two_insn
&& !ident1
&& !ident2
)
21872 /* For now. Ideally treat 0xff as a wildcard. */
21873 for (i
= 0; i
< nelt
; ++i
)
21874 if (dfirst
.perm
[i
] == 0xff)
21876 if (GET_MODE_SIZE (vmode
) == 32
21877 && dfirst
.perm
[i
^ (nelt
/ 2)] != 0xff)
21878 dfirst
.perm
[i
] = dfirst
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
21880 dfirst
.perm
[i
] = i
;
21884 if (GET_MODE_SIZE (vmode
) == 32
21885 && dsecond
.perm
[i
^ (nelt
/ 2)] != 0xff)
21886 dsecond
.perm
[i
] = dsecond
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
21888 dsecond
.perm
[i
] = i
;
21894 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
21896 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
21900 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
21905 ok
= expand_vec_perm_1 (&dfirst
);
21906 seq1
= get_insns ();
21916 ok
= expand_vec_perm_1 (&dsecond
);
21917 seq2
= get_insns ();
21927 for (i
= 0; i
< nelt
; ++i
)
21928 dfinal
.perm
[i
] = (d
->perm
[i
] >= nelt
? i
+ nelt
: i
);
21932 ok
= expand_vec_perm_blend (&dfinal
);
21937 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21938 permutation using two vperm2f128, followed by a vshufpd insn blending
21939 the two vectors together. */
21942 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
21944 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
21947 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
21957 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
21958 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
21959 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
21960 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
21961 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
21962 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
21963 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
21964 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
21965 dthird
.perm
[0] = (d
->perm
[0] % 2);
21966 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
21967 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
21968 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
21970 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21971 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21972 dthird
.op0
= dfirst
.target
;
21973 dthird
.op1
= dsecond
.target
;
21974 dthird
.one_operand_p
= false;
21976 canonicalize_perm (&dfirst
);
21977 canonicalize_perm (&dsecond
);
21979 ok
= expand_vec_perm_1 (&dfirst
)
21980 && expand_vec_perm_1 (&dsecond
)
21981 && expand_vec_perm_1 (&dthird
);
21988 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
21990 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21991 a two vector permutation using two intra-lane vector
21992 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21993 the non-swapped and swapped vectors together. */
21996 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
21998 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
21999 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
22000 rtx_insn
*seq1
, *seq2
;
22002 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
22006 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
22007 || d
->one_operand_p
)
22012 for (i
= 0; i
< nelt
; i
++)
22014 dfirst
.perm
[i
] = 0xff;
22015 dsecond
.perm
[i
] = 0xff;
22017 for (i
= 0, msk
= 0; i
< nelt
; i
++)
22019 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
22022 dfirst
.perm
[j
] = d
->perm
[i
];
22023 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
22027 dsecond
.perm
[j
] = d
->perm
[i
];
22028 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
22032 if (msk
== 0 || msk
== (1U << nelt
) - 1)
22037 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
22038 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
22041 for (i
= 0; i
< nelt
; i
++)
22043 if (dfirst
.perm
[i
] == 0xff)
22044 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
22045 if (dsecond
.perm
[i
] == 0xff)
22046 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
22048 canonicalize_perm (&dfirst
);
22050 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
22051 seq1
= get_insns ();
22057 canonicalize_perm (&dsecond
);
22059 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
22060 seq2
= get_insns ();
22073 dthird
.op0
= dsecond
.target
;
22074 dthird
.op1
= dsecond
.target
;
22075 dthird
.one_operand_p
= true;
22076 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
22077 for (i
= 0; i
< nelt
; i
++)
22078 dthird
.perm
[i
] = i
^ nelt2
;
22080 ok
= expand_vec_perm_1 (&dthird
);
22083 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
22084 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
22088 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
22089 permutation with two pshufb insns and an ior. We should have already
22090 failed all two instruction sequences. */
22093 expand_vec_perm_pshufb2 (struct expand_vec_perm_d
*d
)
22095 rtx rperm
[2][16], vperm
, l
, h
, op
, m128
;
22096 unsigned int i
, nelt
, eltsz
;
22098 rtx (*gen
) (rtx
, rtx
, rtx
);
22100 if (!TARGET_SSSE3
|| (GET_MODE_SIZE (d
->vmode
) != 16
22101 && GET_MODE_SIZE (d
->vmode
) != 8
22102 && GET_MODE_SIZE (d
->vmode
) != 4))
22104 gcc_assert (!d
->one_operand_p
);
22109 switch (GET_MODE_SIZE (d
->vmode
))
22113 gen
= gen_mmx_pshufbv4qi3
;
22117 gen
= gen_mmx_pshufbv8qi3
;
22121 gen
= gen_ssse3_pshufbv16qi3
;
22124 gcc_unreachable ();
22128 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
22130 /* Generate two permutation masks. If the required element is within
22131 the given vector it is shuffled into the proper lane. If the required
22132 element is in the other vector, force a zero into the lane by setting
22133 bit 7 in the permutation mask. */
22134 m128
= GEN_INT (-128);
22135 for (i
= 0; i
< nelt
; ++i
)
22137 unsigned j
, k
, e
= d
->perm
[i
];
22138 unsigned which
= (e
>= nelt
);
22142 for (j
= 0; j
< eltsz
; ++j
)
22144 rperm
[which
][i
*eltsz
+ j
] = GEN_INT (e
*eltsz
+ j
);
22145 rperm
[1-which
][i
*eltsz
+ j
] = m128
;
22148 for (k
= i
*eltsz
+ j
; k
< 16; ++k
)
22149 rperm
[0][k
] = rperm
[1][k
] = m128
;
22152 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[0]));
22153 vperm
= force_reg (V16QImode
, vperm
);
22155 l
= gen_reg_rtx (mode
);
22156 op
= gen_lowpart (mode
, d
->op0
);
22157 emit_insn (gen (l
, op
, vperm
));
22159 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[1]));
22160 vperm
= force_reg (V16QImode
, vperm
);
22162 h
= gen_reg_rtx (mode
);
22163 op
= gen_lowpart (mode
, d
->op1
);
22164 emit_insn (gen (h
, op
, vperm
));
22167 if (d
->vmode
!= mode
)
22168 op
= gen_reg_rtx (mode
);
22169 ix86_emit_vec_binop (IOR
, mode
, op
, l
, h
);
22170 if (op
!= d
->target
)
22171 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
22176 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
22177 with two vpshufb insns, vpermq and vpor. We should have already failed
22178 all two or three instruction sequences. */
22181 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d
*d
)
22183 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
22184 unsigned int i
, nelt
, eltsz
;
22187 || !d
->one_operand_p
22188 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
22195 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
22197 /* Generate two permutation masks. If the required element is within
22198 the same lane, it is shuffled in. If the required element from the
22199 other lane, force a zero by setting bit 7 in the permutation mask.
22200 In the other mask the mask has non-negative elements if element
22201 is requested from the other lane, but also moved to the other lane,
22202 so that the result of vpshufb can have the two V2TImode halves
22204 m128
= GEN_INT (-128);
22205 for (i
= 0; i
< nelt
; ++i
)
22207 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
22208 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
22210 for (j
= 0; j
< eltsz
; ++j
)
22212 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
22213 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
22217 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
22218 vperm
= force_reg (V32QImode
, vperm
);
22220 h
= gen_reg_rtx (V32QImode
);
22221 op
= gen_lowpart (V32QImode
, d
->op0
);
22222 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
22224 /* Swap the 128-byte lanes of h into hp. */
22225 hp
= gen_reg_rtx (V4DImode
);
22226 op
= gen_lowpart (V4DImode
, h
);
22227 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
22230 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
22231 vperm
= force_reg (V32QImode
, vperm
);
22233 l
= gen_reg_rtx (V32QImode
);
22234 op
= gen_lowpart (V32QImode
, d
->op0
);
22235 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
22238 if (d
->vmode
!= V32QImode
)
22239 op
= gen_reg_rtx (V32QImode
);
22240 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
22241 if (op
!= d
->target
)
22242 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
22247 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22248 and extract-odd permutations of two V32QImode and V16QImode operand
22249 with two vpshufb insns, vpor and vpermq. We should have already
22250 failed all two or three instruction sequences. */
22253 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d
*d
)
22255 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
22256 unsigned int i
, nelt
, eltsz
;
22259 || d
->one_operand_p
22260 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
22263 for (i
= 0; i
< d
->nelt
; ++i
)
22264 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
22271 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
22273 /* Generate two permutation masks. In the first permutation mask
22274 the first quarter will contain indexes for the first half
22275 of the op0, the second quarter will contain bit 7 set, third quarter
22276 will contain indexes for the second half of the op0 and the
22277 last quarter bit 7 set. In the second permutation mask
22278 the first quarter will contain bit 7 set, the second quarter
22279 indexes for the first half of the op1, the third quarter bit 7 set
22280 and last quarter indexes for the second half of the op1.
22281 I.e. the first mask e.g. for V32QImode extract even will be:
22282 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
22283 (all values masked with 0xf except for -128) and second mask
22284 for extract even will be
22285 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
22286 m128
= GEN_INT (-128);
22287 for (i
= 0; i
< nelt
; ++i
)
22289 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
22290 unsigned which
= d
->perm
[i
] >= nelt
;
22291 unsigned xorv
= (i
>= nelt
/ 4 && i
< 3 * nelt
/ 4) ? 24 : 0;
22293 for (j
= 0; j
< eltsz
; ++j
)
22295 rperm
[which
][(i
* eltsz
+ j
) ^ xorv
] = GEN_INT (e
* eltsz
+ j
);
22296 rperm
[1 - which
][(i
* eltsz
+ j
) ^ xorv
] = m128
;
22300 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
22301 vperm
= force_reg (V32QImode
, vperm
);
22303 l
= gen_reg_rtx (V32QImode
);
22304 op
= gen_lowpart (V32QImode
, d
->op0
);
22305 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
22307 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
22308 vperm
= force_reg (V32QImode
, vperm
);
22310 h
= gen_reg_rtx (V32QImode
);
22311 op
= gen_lowpart (V32QImode
, d
->op1
);
22312 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
22314 ior
= gen_reg_rtx (V32QImode
);
22315 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
22317 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
22318 op
= gen_reg_rtx (V4DImode
);
22319 ior
= gen_lowpart (V4DImode
, ior
);
22320 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
22321 const1_rtx
, GEN_INT (3)));
22322 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
22327 /* Implement permutation with pslldq + psrldq + por when pshufb is not
22330 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d
*d
, bool pandn
)
22332 unsigned i
, nelt
= d
->nelt
;
22333 unsigned start1
, end1
= -1;
22334 machine_mode vmode
= d
->vmode
, imode
;
22336 bool clear_op0
, clear_op1
;
22337 unsigned inner_size
;
22338 rtx op0
, op1
, dop1
;
22339 rtx (*gen_vec_shr
) (rtx
, rtx
, rtx
);
22340 rtx (*gen_vec_shl
) (rtx
, rtx
, rtx
);
22342 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
22343 if (!TARGET_SSE2
|| (vmode
!= E_V16QImode
&& vmode
!= E_V8HImode
))
22346 start1
= d
->perm
[0];
22347 for (i
= 1; i
< nelt
; i
++)
22349 if (d
->perm
[i
] != d
->perm
[i
-1] + 1
22350 || d
->perm
[i
] == nelt
)
22354 start2
= d
->perm
[i
];
22355 end1
= d
->perm
[i
-1];
22362 clear_op0
= end1
!= nelt
- 1;
22363 clear_op1
= start2
% nelt
!= 0;
22364 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
22365 if (!pandn
&& (clear_op0
|| clear_op1
))
22371 gen_vec_shr
= vmode
== E_V16QImode
? gen_vec_shr_v16qi
: gen_vec_shr_v8hi
;
22372 gen_vec_shl
= vmode
== E_V16QImode
? gen_vec_shl_v16qi
: gen_vec_shl_v8hi
;
22373 imode
= GET_MODE_INNER (vmode
);
22374 inner_size
= GET_MODE_BITSIZE (imode
);
22375 op0
= gen_reg_rtx (vmode
);
22376 op1
= gen_reg_rtx (vmode
);
22379 emit_insn (gen_vec_shr (op0
, d
->op0
, GEN_INT (start1
* inner_size
)));
22381 emit_move_insn (op0
, d
->op0
);
22384 if (d
->one_operand_p
)
22387 int shl_offset
= end1
- start1
+ 1 - start2
% nelt
;
22389 emit_insn (gen_vec_shl (op1
, dop1
, GEN_INT (shl_offset
* inner_size
)));
22391 emit_move_insn (op1
, dop1
);
22393 /* Clear lower/upper bits for op0/op1. */
22394 if (clear_op0
|| clear_op1
)
22399 for (i
= 0; i
!= nelt
; i
++)
22401 if (i
< (end1
- start1
+ 1))
22402 vec
[i
] = gen_int_mode ((HOST_WIDE_INT_1U
<< inner_size
) - 1, imode
);
22404 vec
[i
] = CONST0_RTX (imode
);
22406 const_vec
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, vec
));
22407 const_vec
= validize_mem (force_const_mem (vmode
, const_vec
));
22408 clear
= force_reg (vmode
, const_vec
);
22411 emit_move_insn (op0
, gen_rtx_AND (vmode
, op0
, clear
));
22413 emit_move_insn (op1
, gen_rtx_AND (vmode
,
22414 gen_rtx_NOT (vmode
, clear
),
22418 emit_move_insn (d
->target
, gen_rtx_IOR (vmode
, op0
, op1
));
22422 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22423 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
22424 operands with two "and" and "pack" or two "shift" and "pack" insns.
22425 We should have already failed all two instruction sequences. */
22428 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d
*d
)
22430 rtx op
, dop0
, dop1
, t
;
22431 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
22432 bool end_perm
= false;
22433 machine_mode half_mode
;
22434 rtx (*gen_and
) (rtx
, rtx
, rtx
);
22435 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
22436 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
22438 if (d
->one_operand_p
)
22444 /* Required for "pack". */
22445 if (!TARGET_SSE4_1
)
22449 half_mode
= V2SImode
;
22450 gen_and
= gen_andv2si3
;
22451 gen_pack
= gen_mmx_packusdw
;
22452 gen_shift
= gen_lshrv2si3
;
22455 /* Required for "pack". */
22456 if (!TARGET_SSE4_1
)
22460 half_mode
= V4SImode
;
22461 gen_and
= gen_andv4si3
;
22462 gen_pack
= gen_sse4_1_packusdw
;
22463 gen_shift
= gen_lshrv4si3
;
22466 /* No check as all instructions are SSE2. */
22469 half_mode
= V4HImode
;
22470 gen_and
= gen_andv4hi3
;
22471 gen_pack
= gen_mmx_packuswb
;
22472 gen_shift
= gen_lshrv4hi3
;
22475 /* No check as all instructions are SSE2. */
22478 half_mode
= V8HImode
;
22479 gen_and
= gen_andv8hi3
;
22480 gen_pack
= gen_sse2_packuswb
;
22481 gen_shift
= gen_lshrv8hi3
;
22488 half_mode
= V8SImode
;
22489 gen_and
= gen_andv8si3
;
22490 gen_pack
= gen_avx2_packusdw
;
22491 gen_shift
= gen_lshrv8si3
;
22499 half_mode
= V16HImode
;
22500 gen_and
= gen_andv16hi3
;
22501 gen_pack
= gen_avx2_packuswb
;
22502 gen_shift
= gen_lshrv16hi3
;
22506 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
22507 are more profitable than general shuffles. */
22511 /* Check that permutation is even or odd. */
22516 for (i
= 1; i
< nelt
; ++i
)
22517 if (d
->perm
[i
] != 2 * i
+ odd
)
22523 dop0
= gen_reg_rtx (half_mode
);
22524 dop1
= gen_reg_rtx (half_mode
);
22527 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
22528 t
= force_reg (half_mode
, t
);
22529 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
22530 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
22534 emit_insn (gen_shift (dop0
,
22535 gen_lowpart (half_mode
, d
->op0
),
22537 emit_insn (gen_shift (dop1
,
22538 gen_lowpart (half_mode
, d
->op1
),
22541 /* In AVX2 for 256 bit case we need to permute pack result. */
22542 if (TARGET_AVX2
&& end_perm
)
22544 op
= gen_reg_rtx (d
->vmode
);
22545 t
= gen_reg_rtx (V4DImode
);
22546 emit_insn (gen_pack (op
, dop0
, dop1
));
22547 emit_insn (gen_avx2_permv4di_1 (t
,
22548 gen_lowpart (V4DImode
, op
),
22553 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, t
));
22556 emit_insn (gen_pack (d
->target
, dop0
, dop1
));
22561 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22562 and extract-odd permutations of two V64QI operands
22563 with two "shifts", two "truncs" and one "concat" insns for "odd"
22564 and two "truncs" and one concat insn for "even."
22565 Have already failed all two instruction sequences. */
22568 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d
*d
)
22570 rtx t1
, t2
, t3
, t4
;
22571 unsigned i
, odd
, nelt
= d
->nelt
;
22573 if (!TARGET_AVX512BW
22574 || d
->one_operand_p
22575 || d
->vmode
!= V64QImode
)
22578 /* Check that permutation is even or odd. */
22583 for (i
= 1; i
< nelt
; ++i
)
22584 if (d
->perm
[i
] != 2 * i
+ odd
)
22593 t1
= gen_reg_rtx (V32HImode
);
22594 t2
= gen_reg_rtx (V32HImode
);
22595 emit_insn (gen_lshrv32hi3 (t1
,
22596 gen_lowpart (V32HImode
, d
->op0
),
22598 emit_insn (gen_lshrv32hi3 (t2
,
22599 gen_lowpart (V32HImode
, d
->op1
),
22604 t1
= gen_lowpart (V32HImode
, d
->op0
);
22605 t2
= gen_lowpart (V32HImode
, d
->op1
);
22608 t3
= gen_reg_rtx (V32QImode
);
22609 t4
= gen_reg_rtx (V32QImode
);
22610 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3
, t1
));
22611 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4
, t2
));
22612 emit_insn (gen_avx_vec_concatv64qi (d
->target
, t3
, t4
));
22617 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22618 and extract-odd permutations. */
22621 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d
*d
, unsigned odd
)
22623 rtx t1
, t2
, t3
, t4
, t5
;
22630 t1
= gen_reg_rtx (V4DFmode
);
22631 t2
= gen_reg_rtx (V4DFmode
);
22633 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22634 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
22635 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
22637 /* Now an unpck[lh]pd will produce the result required. */
22639 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
22641 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
22647 int mask
= odd
? 0xdd : 0x88;
22651 t1
= gen_reg_rtx (V8SFmode
);
22652 t2
= gen_reg_rtx (V8SFmode
);
22653 t3
= gen_reg_rtx (V8SFmode
);
22655 /* Shuffle within the 128-bit lanes to produce:
22656 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22657 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
22660 /* Shuffle the lanes around to produce:
22661 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22662 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
22665 /* Shuffle within the 128-bit lanes to produce:
22666 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22667 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
22669 /* Shuffle within the 128-bit lanes to produce:
22670 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22671 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
22673 /* Shuffle the lanes around to produce:
22674 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22675 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
22686 /* These are always directly implementable by expand_vec_perm_1. */
22687 gcc_unreachable ();
22690 gcc_assert (TARGET_MMX_WITH_SSE
);
22691 /* We have no suitable instructions. */
22697 if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
22698 return expand_vec_perm_pshufb2 (d
);
22703 /* We need 2*log2(N)-1 operations to achieve odd/even
22704 with interleave. */
22705 t1
= gen_reg_rtx (V4QImode
);
22706 emit_insn (gen_mmx_punpckhbw_low (t1
, d
->op0
, d
->op1
));
22707 emit_insn (gen_mmx_punpcklbw_low (d
->target
, d
->op0
, d
->op1
));
22709 t2
= gen_mmx_punpckhbw_low (d
->target
, d
->target
, t1
);
22711 t2
= gen_mmx_punpcklbw_low (d
->target
, d
->target
, t1
);
22718 return expand_vec_perm_even_odd_pack (d
);
22719 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
22720 return expand_vec_perm_pshufb2 (d
);
22725 /* We need 2*log2(N)-1 operations to achieve odd/even
22726 with interleave. */
22727 t1
= gen_reg_rtx (V4HImode
);
22728 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
22729 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
22731 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
22733 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
22740 return expand_vec_perm_even_odd_pack (d
);
22741 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
22742 return expand_vec_perm_pshufb2 (d
);
22747 /* We need 2*log2(N)-1 operations to achieve odd/even
22748 with interleave. */
22749 t1
= gen_reg_rtx (V8HImode
);
22750 t2
= gen_reg_rtx (V8HImode
);
22751 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
22752 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
22753 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
22754 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
22756 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
22758 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
22765 return expand_vec_perm_even_odd_pack (d
);
22769 return expand_vec_perm_even_odd_pack (d
);
22772 return expand_vec_perm_even_odd_trunc (d
);
22777 struct expand_vec_perm_d d_copy
= *d
;
22778 d_copy
.vmode
= V4DFmode
;
22780 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
22782 d_copy
.target
= gen_reg_rtx (V4DFmode
);
22783 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
22784 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
22785 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
22788 emit_move_insn (d
->target
,
22789 gen_lowpart (V4DImode
, d_copy
.target
));
22798 t1
= gen_reg_rtx (V4DImode
);
22799 t2
= gen_reg_rtx (V4DImode
);
22801 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22802 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
22803 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
22805 /* Now an vpunpck[lh]qdq will produce the result required. */
22807 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
22809 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
22816 struct expand_vec_perm_d d_copy
= *d
;
22817 d_copy
.vmode
= V8SFmode
;
22819 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
22821 d_copy
.target
= gen_reg_rtx (V8SFmode
);
22822 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
22823 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
22824 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
22827 emit_move_insn (d
->target
,
22828 gen_lowpart (V8SImode
, d_copy
.target
));
22837 t1
= gen_reg_rtx (V8SImode
);
22838 t2
= gen_reg_rtx (V8SImode
);
22839 t3
= gen_reg_rtx (V4DImode
);
22840 t4
= gen_reg_rtx (V4DImode
);
22841 t5
= gen_reg_rtx (V4DImode
);
22843 /* Shuffle the lanes around into
22844 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22845 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
22846 gen_lowpart (V4DImode
, d
->op1
),
22848 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
22849 gen_lowpart (V4DImode
, d
->op1
),
22852 /* Swap the 2nd and 3rd position in each lane into
22853 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22854 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
22855 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22856 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
22857 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22859 /* Now an vpunpck[lh]qdq will produce
22860 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22862 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
22863 gen_lowpart (V4DImode
, t2
));
22865 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
22866 gen_lowpart (V4DImode
, t2
));
22868 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
22872 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->vmode == E_V32HImode
      && d->testing_p
      && !TARGET_AVX512BW)
    return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
22902 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22903 permutations. We assume that expand_vec_perm_1 has already failed. */
22906 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
)
22908 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
22909 machine_mode vmode
= d
->vmode
;
22910 rtx (*gen
) (rtx
, rtx
, rtx
);
22911 unsigned char perm2
[4];
22912 rtx op0
= d
->op0
, dest
;
22919 /* These are special-cased in sse.md so that we can optionally
22920 use the vbroadcast instruction. They expand to two insns
22921 if the input happens to be in a register. */
22922 gcc_unreachable ();
22932 /* These are always implementable using standard shuffle patterns. */
22933 gcc_unreachable ();
22936 /* This can be implemented via interleave and pshuflw. */
22942 gen
= gen_mmx_punpckhbw_low
;
22946 gen
= gen_mmx_punpcklbw_low
;
22948 dest
= gen_reg_rtx (vmode
);
22949 emit_insn (gen (dest
, op0
, op0
));
22950 vmode
= get_mode_wider_vector (vmode
);
22951 op0
= gen_lowpart (vmode
, dest
);
22953 memset (perm2
, elt
, 2);
22954 dest
= gen_reg_rtx (vmode
);
22955 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22958 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22962 /* This can be implemented via interleave. We save one insn by
22963 stopping once we have promoted to V2SImode and then use pshufd. */
22970 gen
= vmode
== V8QImode
? gen_mmx_punpckhbw
22971 : gen_mmx_punpckhwd
;
22975 gen
= vmode
== V8QImode
? gen_mmx_punpcklbw
22976 : gen_mmx_punpcklwd
;
22979 dest
= gen_reg_rtx (vmode
);
22980 emit_insn (gen (dest
, op0
, op0
));
22981 vmode
= get_mode_wider_vector (vmode
);
22982 op0
= gen_lowpart (vmode
, dest
);
22984 while (vmode
!= V2SImode
);
22986 memset (perm2
, elt
, 2);
22987 dest
= gen_reg_rtx (vmode
);
22988 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22991 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22996 /* These can be implemented via interleave. We save one insn by
22997 stopping once we have promoted to V4SImode and then use pshufd. */
23004 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
23005 : gen_vec_interleave_highv8hi
;
23009 gen
= vmode
== V16QImode
? gen_vec_interleave_lowv16qi
23010 : gen_vec_interleave_lowv8hi
;
23013 dest
= gen_reg_rtx (vmode
);
23014 emit_insn (gen (dest
, op0
, op0
));
23015 vmode
= get_mode_wider_vector (vmode
);
23016 op0
= gen_lowpart (vmode
, dest
);
23018 while (vmode
!= V4SImode
);
23020 memset (perm2
, elt
, 4);
23021 dest
= gen_reg_rtx (vmode
);
23022 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
23025 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
23030 /* This can be implemented via interleave and pshufd. */
23034 rtx (*gen_interleave
) (machine_mode
, int, rtx
, rtx
, rtx
);
23037 gen_interleave
= gen_vec_interleave_high
;
23041 gen_interleave
= gen_vec_interleave_low
;
23044 dest
= gen_reg_rtx (vmode
);
23045 emit_insn (gen_interleave (vmode
, 1, dest
, op0
, op0
));
23048 op0
= gen_lowpart (vmode
, dest
);
23050 memset (perm2
, elt
, 4);
23051 dest
= gen_reg_rtx (vmode
);
23052 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
23055 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
23062 /* For AVX2 broadcasts of the first element vpbroadcast* or
23063 vpermq should be used by expand_vec_perm_1. */
23064 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
23068 gcc_assert (!TARGET_AVX512BW
|| d
->perm
[0]);
23072 gcc_assert (!TARGET_AVX512BW
);
23076 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW
      || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
23171 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
23172 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
23173 all the shorter instruction sequences. */
23176 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d
*d
)
23178 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
23179 unsigned int i
, nelt
, eltsz
;
23183 || d
->one_operand_p
23184 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
23191 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
23193 /* Generate 4 permutation masks. If the required element is within
23194 the same lane, it is shuffled in. If the required element from the
23195 other lane, force a zero by setting bit 7 in the permutation mask.
23196 In the other mask the mask has non-negative elements if element
23197 is requested from the other lane, but also moved to the other lane,
23198 so that the result of vpshufb can have the two V2TImode halves
23200 m128
= GEN_INT (-128);
23201 for (i
= 0; i
< 32; ++i
)
23203 rperm
[0][i
] = m128
;
23204 rperm
[1][i
] = m128
;
23205 rperm
[2][i
] = m128
;
23206 rperm
[3][i
] = m128
;
23212 for (i
= 0; i
< nelt
; ++i
)
23214 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
23215 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
23216 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
23218 for (j
= 0; j
< eltsz
; ++j
)
23219 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
23220 used
[which
] = true;
23223 for (i
= 0; i
< 2; ++i
)
23225 if (!used
[2 * i
+ 1])
23230 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
23231 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
23232 vperm
= force_reg (V32QImode
, vperm
);
23233 h
[i
] = gen_reg_rtx (V32QImode
);
23234 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
23235 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
23238 /* Swap the 128-byte lanes of h[X]. */
23239 for (i
= 0; i
< 2; ++i
)
23241 if (h
[i
] == NULL_RTX
)
23243 op
= gen_reg_rtx (V4DImode
);
23244 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
23245 const2_rtx
, GEN_INT (3), const0_rtx
,
23247 h
[i
] = gen_lowpart (V32QImode
, op
);
23250 for (i
= 0; i
< 2; ++i
)
23257 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
23258 vperm
= force_reg (V32QImode
, vperm
);
23259 l
[i
] = gen_reg_rtx (V32QImode
);
23260 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
23261 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
23264 for (i
= 0; i
< 2; ++i
)
23268 op
= gen_reg_rtx (V32QImode
);
23269 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
23276 gcc_assert (l
[0] && l
[1]);
23278 if (d
->vmode
!= V32QImode
)
23279 op
= gen_reg_rtx (V32QImode
);
23280 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
23281 if (op
!= d
->target
)
23282 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */
  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;
  if (expand_vec_perm_palignr (d, false))
    return true;
  if (expand_vec_perm_interleave2 (d))
    return true;
  if (expand_vec_perm_broadcast (d))
    return true;
  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;
  if (expand_vec_perm_vperm2f128 (d))
    return true;
  if (expand_vec_perm_pblendv (d))
    return true;
  if (expand_vec_perm_2perm_interleave (d, true))
    return true;
  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;
  if (expand_vec_perm_shufps_shufps (d))
    return true;

  /* Try sequences of three instructions.  */
  if (expand_vec_perm_even_odd_pack (d))
    return true;
  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;
  if (expand_vec_perm_pshufb2 (d))
    return true;
  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;
  if (expand_vec_perm_interleave3 (d))
    return true;
  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;
  if (expand_vec_perm_2perm_interleave (d, false))
    return true;
  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  /* Try sequences of four instructions.  */
  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;
  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions.  */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
23448 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
23451 ix86_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
23452 rtx target
, rtx op0
, rtx op1
,
23453 const vec_perm_indices
&sel
)
23455 if (vmode
!= op_mode
)
23458 struct expand_vec_perm_d d
;
23459 unsigned char perm
[MAX_VECT_LEN
];
23460 unsigned int i
, nelt
, which
;
23463 if (GET_MODE_SIZE (vmode
) == 64 && !TARGET_EVEX512
)
23466 /* For HF mode vector, convert it to HI using subreg. */
23467 if (GET_MODE_INNER (vmode
) == HFmode
)
23469 machine_mode orig_mode
= vmode
;
23470 vmode
= mode_for_vector (HImode
,
23471 GET_MODE_NUNITS (vmode
)).require ();
23473 target
= lowpart_subreg (vmode
, target
, orig_mode
);
23475 op0
= lowpart_subreg (vmode
, op0
, orig_mode
);
23477 op1
= lowpart_subreg (vmode
, op1
, orig_mode
);
23485 gcc_assert (VECTOR_MODE_P (d
.vmode
));
23486 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
23487 d
.testing_p
= !target
;
23489 gcc_assert (sel
.length () == nelt
);
23490 gcc_checking_assert (sizeof (d
.perm
) == sizeof (perm
));
23492 /* Given sufficient ISA support we can just return true here
23493 for selected vector modes. */
23500 if (!TARGET_AVX512F
)
23502 /* All implementable with a single vperm[it]2 insn. */
23507 if (!TARGET_AVX512F
)
23509 if (d
.testing_p
&& TARGET_AVX512BW
)
23510 /* All implementable with a single vperm[it]2 insn. */
23514 if (!TARGET_AVX512F
)
23516 if (d
.testing_p
&& TARGET_AVX512BW
)
23517 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
23526 if (d
.testing_p
&& TARGET_AVX512VL
)
23527 /* All implementable with a single vperm[it]2 insn. */
23533 if (d
.testing_p
&& TARGET_AVX2
)
23534 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23540 if (d
.testing_p
&& TARGET_AVX2
)
23541 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23548 /* Fall through. */
23553 /* All implementable with a single vpperm insn. */
23554 if (d
.testing_p
&& TARGET_XOP
)
23556 /* All implementable with 2 pshufb + 1 ior. */
23557 if (d
.testing_p
&& TARGET_SSSE3
)
23564 if (!TARGET_MMX_WITH_SSE
)
23570 /* All implementable with *punpckwd. */
23582 /* All implementable with shufpd or unpck[lh]pd. */
23590 for (i
= which
= 0; i
< nelt
; ++i
)
23592 unsigned char e
= sel
[i
];
23593 gcc_assert (e
< 2 * nelt
);
23596 which
|= (e
< nelt
? 1 : 2);
23601 /* For all elements from second vector, fold the elements to first. */
23603 for (i
= 0; i
< nelt
; ++i
)
23606 /* Check whether the mask can be applied to the vector type. */
23607 d
.one_operand_p
= (which
!= 3);
23609 /* Implementable with shufps, pshufd or pshuflw. */
23610 if (d
.one_operand_p
23611 && (d
.vmode
== V4SFmode
|| d
.vmode
== V2SFmode
23612 || d
.vmode
== V4SImode
|| d
.vmode
== V2SImode
23613 || d
.vmode
== V4HImode
|| d
.vmode
== V2HImode
))
23616 /* Otherwise we have to go through the motions and see if we can
23617 figure out how to generate the requested permutation. */
23618 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
23619 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
23620 if (!d
.one_operand_p
)
23621 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
23624 bool ret
= ix86_expand_vec_perm_const_1 (&d
);
23630 two_args
= canonicalize_perm (&d
);
23632 /* If one of the operands is a zero vector, try to match pmovzx. */
23633 if (two_args
&& (d
.op0
== CONST0_RTX (vmode
) || d
.op1
== CONST0_RTX (vmode
)))
23635 struct expand_vec_perm_d dzero
= d
;
23636 if (d
.op0
== CONST0_RTX (vmode
))
23638 d
.op1
= dzero
.op1
= force_reg (vmode
, d
.op1
);
23639 std::swap (dzero
.op0
, dzero
.op1
);
23640 for (i
= 0; i
< nelt
; ++i
)
23641 dzero
.perm
[i
] ^= nelt
;
23644 d
.op0
= dzero
.op0
= force_reg (vmode
, d
.op0
);
23646 if (expand_vselect_vconcat (dzero
.target
, dzero
.op0
, dzero
.op1
,
23647 dzero
.perm
, nelt
, dzero
.testing_p
))
23651 /* Force operands into registers. */
23652 rtx nop0
= force_reg (vmode
, d
.op0
);
23653 if (d
.op0
== d
.op1
)
23656 d
.op1
= force_reg (vmode
, d
.op1
);
23658 if (ix86_expand_vec_perm_const_1 (&d
))
23661 /* If the selector says both arguments are needed, but the operands are the
23662 same, the above tried to expand with one_operand_p and flattened selector.
23663 If that didn't work, retry without one_operand_p; we succeeded with that
23665 if (two_args
&& d
.one_operand_p
)
23667 d
.one_operand_p
= false;
23668 memcpy (d
.perm
, perm
, sizeof (perm
));
23669 return ix86_expand_vec_perm_const_1 (&d
);
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */

static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);
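  /* For example, a logical right shift of a V*QImode vector by 3 is emitted
     as a V*HImode vpsrlw by 3 followed by an AND with
     and_constant = (1 << 5) - 1 = 0x1f in every byte, clearing the bits that
     leaked in from the neighbouring byte.  For an arithmetic shift the extra
     step below additionally XORs and subtracts xor_constant = 1 << 4 = 0x10,
     which sign-extends the 5-bit result back to 8 bits.  */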
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift = ((code == ASHIFT) ? gen_ashlv8hi3
		   : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift = ((code == ASHIFT) ? gen_ashlv16hi3
		   : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift = ((code == ASHIFT) ? gen_ashlv32hi3
		   : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
23828 ix86_expand_vecop_qihi_partial (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
23830 machine_mode qimode
= GET_MODE (dest
);
23831 rtx qop1
, qop2
, hop1
, hop2
, qdest
, hdest
;
23832 bool op2vec
= GET_MODE_CLASS (GET_MODE (op2
)) == MODE_VECTOR_INT
;
23833 bool uns_p
= code
!= ASHIFTRT
;
23841 gcc_unreachable ();
23844 qop1
= lowpart_subreg (V16QImode
, force_reg (qimode
, op1
), qimode
);
23847 qop2
= lowpart_subreg (V16QImode
, force_reg (qimode
, op2
), qimode
);
23851 qdest
= gen_reg_rtx (V16QImode
);
23853 if (CONST_INT_P (op2
)
23854 && (code
== ASHIFT
|| code
== LSHIFTRT
|| code
== ASHIFTRT
)
23855 && ix86_expand_vec_shift_qihi_constant (code
, qdest
, qop1
, qop2
))
23857 emit_move_insn (dest
, gen_lowpart (qimode
, qdest
));
23864 gcc_assert (op2vec
);
23865 if (!TARGET_SSE4_1
)
23867 /* Unpack data such that we've got a source byte in each low byte
23868 of each word. We don't care what goes into the high byte of
23869 each word. Rather than trying to get zero in there, most
23870 convenient is to let it be a copy of the low byte. */
23871 hop1
= copy_to_reg (qop1
);
23872 hop2
= copy_to_reg (qop2
);
23873 emit_insn (gen_vec_interleave_lowv16qi (hop1
, hop1
, hop1
));
23874 emit_insn (gen_vec_interleave_lowv16qi (hop2
, hop2
, hop2
));
23881 hop1
= gen_reg_rtx (V8HImode
);
23882 ix86_expand_sse_unpack (hop1
, qop1
, uns_p
, false);
23883 /* mult/vashr/vlshr/vashl */
23886 hop2
= gen_reg_rtx (V8HImode
);
23887 ix86_expand_sse_unpack (hop2
, qop2
, uns_p
, false);
23894 gcc_unreachable ();
23897 if (code
!= MULT
&& op2vec
)
23899 /* Expand vashr/vlshr/vashl. */
23900 hdest
= gen_reg_rtx (V8HImode
);
23901 emit_insn (gen_rtx_SET (hdest
,
23902 simplify_gen_binary (code
, V8HImode
,
23906 /* Expand mult/ashr/lshr/ashl. */
23907 hdest
= expand_simple_binop (V8HImode
, code
, hop1
, hop2
,
23908 NULL_RTX
, 1, OPTAB_DIRECT
);
23910 if (TARGET_AVX512BW
&& TARGET_AVX512VL
)
23912 if (qimode
== V8QImode
)
23915 qdest
= gen_reg_rtx (V8QImode
);
23917 emit_insn (gen_truncv8hiv8qi2 (qdest
, hdest
));
23921 struct expand_vec_perm_d d
;
23922 rtx qres
= gen_lowpart (V16QImode
, hdest
);
23926 /* Merge the data back into the right place. */
23928 d
.op0
= d
.op1
= qres
;
23929 d
.vmode
= V16QImode
;
23931 d
.one_operand_p
= false;
23932 d
.testing_p
= false;
23934 for (i
= 0; i
< d
.nelt
; ++i
)
23937 ok
= ix86_expand_vec_perm_const_1 (&d
);
23942 emit_move_insn (dest
, gen_lowpart (qimode
, qdest
));
23945 /* Emit instruction in 2x wider mode. For example, optimize
23946 vector MUL generation like
23948 vpmovzxbw ymm2, xmm0
23949 vpmovzxbw ymm3, xmm1
23950 vpmullw ymm4, ymm2, ymm3
23953 it would take less instructions than ix86_expand_vecop_qihi.
23954 Return true if success. */
23957 ix86_expand_vecop_qihi2 (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
23959 machine_mode himode
, qimode
= GET_MODE (dest
);
23960 machine_mode wqimode
;
23961 rtx qop1
, qop2
, hop1
, hop2
, hdest
;
23962 rtx (*gen_truncate
)(rtx
, rtx
) = NULL
;
23963 bool op2vec
= GET_MODE_CLASS (GET_MODE (op2
)) == MODE_VECTOR_INT
;
23964 bool uns_p
= code
!= ASHIFTRT
;
23966 if ((qimode
== V16QImode
&& !TARGET_AVX2
)
23967 || (qimode
== V32QImode
&& (!TARGET_AVX512BW
|| !TARGET_EVEX512
))
23968 /* There are no V64HImode instructions. */
23969 || qimode
== V64QImode
)
23972 /* Do not generate ymm/zmm instructions when
23973 target prefers 128/256 bit vector width. */
23974 if ((qimode
== V16QImode
&& TARGET_PREFER_AVX128
)
23975 || (qimode
== V32QImode
&& TARGET_PREFER_AVX256
))
23981 himode
= V16HImode
;
23982 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
23983 gen_truncate
= gen_truncv16hiv16qi2
;
23986 himode
= V32HImode
;
23987 gen_truncate
= gen_truncv32hiv32qi2
;
23990 gcc_unreachable ();
23993 wqimode
= GET_MODE_2XWIDER_MODE (qimode
).require ();
23994 qop1
= lowpart_subreg (wqimode
, force_reg (qimode
, op1
), qimode
);
23997 qop2
= lowpart_subreg (wqimode
, force_reg (qimode
, op2
), qimode
);
24001 hop1
= gen_reg_rtx (himode
);
24002 ix86_expand_sse_unpack (hop1
, qop1
, uns_p
, false);
24006 hop2
= gen_reg_rtx (himode
);
24007 ix86_expand_sse_unpack (hop2
, qop2
, uns_p
, false);
24012 if (code
!= MULT
&& op2vec
)
24014 /* Expand vashr/vlshr/vashl. */
24015 hdest
= gen_reg_rtx (himode
);
24016 emit_insn (gen_rtx_SET (hdest
,
24017 simplify_gen_binary (code
, himode
,
24021 /* Expand mult/ashr/lshr/ashl. */
24022 hdest
= expand_simple_binop (himode
, code
, hop1
, hop2
,
24023 NULL_RTX
, 1, OPTAB_DIRECT
);
24026 emit_insn (gen_truncate (dest
, hdest
));
24029 struct expand_vec_perm_d d
;
24030 rtx wqdest
= gen_reg_rtx (wqimode
);
24031 rtx wqres
= gen_lowpart (wqimode
, hdest
);
24035 /* Merge the data back into the right place. */
24037 d
.op0
= d
.op1
= wqres
;
24039 d
.nelt
= GET_MODE_NUNITS (wqimode
);
24040 d
.one_operand_p
= false;
24041 d
.testing_p
= false;
24043 for (i
= 0; i
< d
.nelt
; ++i
)
24046 ok
= ix86_expand_vec_perm_const_1 (&d
);
24049 emit_move_insn (dest
, gen_lowpart (qimode
, wqdest
));
24055 /* Expand a vector operation CODE for a V*QImode in terms of the
24056 same operation on V*HImode. */
24059 ix86_expand_vecop_qihi (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
24061 machine_mode qimode
= GET_MODE (dest
);
24062 machine_mode himode
;
24063 rtx (*gen_il
) (rtx
, rtx
, rtx
);
24064 rtx (*gen_ih
) (rtx
, rtx
, rtx
);
24065 rtx op1_l
, op1_h
, op2_l
, op2_h
, res_l
, res_h
;
24066 bool op2vec
= GET_MODE_CLASS (GET_MODE (op2
)) == MODE_VECTOR_INT
;
24067 struct expand_vec_perm_d d
;
24068 bool full_interleave
= true;
24069 bool uns_p
= code
!= ASHIFTRT
;
24073 if (CONST_INT_P (op2
)
24074 && (code
== ASHIFT
|| code
== LSHIFTRT
|| code
== ASHIFTRT
)
24075 && ix86_expand_vec_shift_qihi_constant (code
, dest
, op1
, op2
))
24078 if (ix86_expand_vecop_qihi2 (code
, dest
, op1
, op2
))
24087 himode
= V16HImode
;
24090 himode
= V32HImode
;
24093 gcc_unreachable ();
24099 gcc_assert (op2vec
);
24100 /* Unpack data such that we've got a source byte in each low byte of
24101 each word. We don't care what goes into the high byte of each word.
24102 Rather than trying to get zero in there, most convenient is to let
24103 it be a copy of the low byte. */
24107 gen_il
= gen_vec_interleave_lowv16qi
;
24108 gen_ih
= gen_vec_interleave_highv16qi
;
24111 gen_il
= gen_avx2_interleave_lowv32qi
;
24112 gen_ih
= gen_avx2_interleave_highv32qi
;
24113 full_interleave
= false;
24116 gen_il
= gen_avx512bw_interleave_lowv64qi
;
24117 gen_ih
= gen_avx512bw_interleave_highv64qi
;
24118 full_interleave
= false;
24121 gcc_unreachable ();
24124 op2_l
= gen_reg_rtx (qimode
);
24125 op2_h
= gen_reg_rtx (qimode
);
24126 emit_insn (gen_il (op2_l
, op2
, op2
));
24127 emit_insn (gen_ih (op2_h
, op2
, op2
));
24129 op1_l
= gen_reg_rtx (qimode
);
24130 op1_h
= gen_reg_rtx (qimode
);
24131 emit_insn (gen_il (op1_l
, op1
, op1
));
24132 emit_insn (gen_ih (op1_h
, op1
, op1
));
24138 op1_l
= gen_reg_rtx (himode
);
24139 op1_h
= gen_reg_rtx (himode
);
24140 ix86_expand_sse_unpack (op1_l
, op1
, uns_p
, false);
24141 ix86_expand_sse_unpack (op1_h
, op1
, uns_p
, true);
24142 /* vashr/vlshr/vashl */
24145 rtx tmp
= force_reg (qimode
, op2
);
24146 op2_l
= gen_reg_rtx (himode
);
24147 op2_h
= gen_reg_rtx (himode
);
24148 ix86_expand_sse_unpack (op2_l
, tmp
, uns_p
, false);
24149 ix86_expand_sse_unpack (op2_h
, tmp
, uns_p
, true);
24152 op2_l
= op2_h
= op2
;
24156 gcc_unreachable ();
24159 if (code
!= MULT
&& op2vec
)
24161 /* Expand vashr/vlshr/vashl. */
24162 res_l
= gen_reg_rtx (himode
);
24163 res_h
= gen_reg_rtx (himode
);
24164 emit_insn (gen_rtx_SET (res_l
,
24165 simplify_gen_binary (code
, himode
,
24167 emit_insn (gen_rtx_SET (res_h
,
24168 simplify_gen_binary (code
, himode
,
24173 /* Expand mult/ashr/lshr/ashl. */
24174 res_l
= expand_simple_binop (himode
, code
, op1_l
, op2_l
, NULL_RTX
,
24176 res_h
= expand_simple_binop (himode
, code
, op1_h
, op2_h
, NULL_RTX
,
24180 gcc_assert (res_l
&& res_h
);
24182 /* Merge the data back into the right place. */
24184 d
.op0
= gen_lowpart (qimode
, res_l
);
24185 d
.op1
= gen_lowpart (qimode
, res_h
);
24187 d
.nelt
= GET_MODE_NUNITS (qimode
);
24188 d
.one_operand_p
= false;
24189 d
.testing_p
= false;
24191 if (full_interleave
)
24193 /* We used the full interleave, the desired
24194 results are in the even elements. */
24195 for (i
= 0; i
< d
.nelt
; ++i
)
24200 /* For AVX, the interleave used above was not cross-lane. So the
24201 extraction is evens but with the second and third quarter swapped.
24202 Happily, that is even one insn shorter than even extraction.
24203 For AVX512BW we have 4 lanes. We extract evens from within a lane,
24204 always first from the first and then from the second source operand,
24205 the index bits above the low 4 bits remains the same.
24206 Thus, for d.nelt == 32 we want permutation
24207 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
24208 and for d.nelt == 64 we want permutation
24209 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
24210 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
24211 for (i
= 0; i
< d
.nelt
; ++i
)
24212 d
.perm
[i
] = ((i
* 2) & 14) + ((i
& 8) ? d
.nelt
: 0) + (i
& ~15);
24215 ok
= ix86_expand_vec_perm_const_1 (&d
);
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */
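      /* The correction used below relies on a = a_u - 2^32 * sign(a) for each
	 signed 32-bit lane, so a * b = a_u * b_u
	 - 2^32 * (sign(a) * b_u + sign(b) * a_u) modulo 2^64.  Multiplying
	 the all-ones comparison mask (0xffffffff for a negative lane) by the
	 other operand and shifting the sum left by 32 performs exactly that
	 subtraction modulo 2^64.  */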
      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1), GEN_INT (0),
				    GEN_INT (3), GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();
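      /* The generic sequence below uses the identity
	 a * b = lo(a) * lo(b) + ((hi(a) * lo(b) + lo(a) * hi(b)) << 32)
	 modulo 2^64, where lo and hi are the 32-bit halves of each 64-bit
	 lane; the hi(a) * hi(b) term only contributes above bit 63 and is
	 dropped.  */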
      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
	 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
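      /* For example, with X = -5: X >> 31 = -1, (-1 ^ -5) = 4 and
	 4 - (-1) = 5; for a non-negative X the shift yields 0, leaving X
	 unchanged.  */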
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
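/* The pextr/pinsr expanders below map a (bit position, field size) pair to a
   vector lane: for example, a 16-bit field at bit position 48 of a 128-bit
   register gives dstmode HImode, srcmode V8HImode and a vec_select of element
   pos / size == 3, i.e. a pextrw of lane 3.  */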
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
24777 /* Expand an insert into a vector register through pinsr insn.
24778 Return true if successful. */
24781 ix86_expand_pinsr (rtx
*operands
)
24783 rtx dst
= operands
[0];
24784 rtx src
= operands
[3];
24786 unsigned int size
= INTVAL (operands
[1]);
24787 unsigned int pos
= INTVAL (operands
[2]);
24789 if (SUBREG_P (dst
))
24791 pos
+= SUBREG_BYTE (dst
) * BITS_PER_UNIT
;
24792 dst
= SUBREG_REG (dst
);
24795 switch (GET_MODE (dst
))
24803 machine_mode srcmode
, dstmode
;
24804 rtx (*pinsr
)(rtx
, rtx
, rtx
, rtx
);
24807 if (!int_mode_for_size (size
, 0).exists (&srcmode
))
24813 if (!TARGET_SSE4_1
)
24815 dstmode
= V16QImode
;
24816 pinsr
= gen_sse4_1_pinsrb
;
24822 dstmode
= V8HImode
;
24823 pinsr
= gen_sse2_pinsrw
;
24827 if (!TARGET_SSE4_1
)
24829 dstmode
= V4SImode
;
24830 pinsr
= gen_sse4_1_pinsrd
;
24834 gcc_assert (TARGET_64BIT
);
24835 if (!TARGET_SSE4_1
)
24837 dstmode
= V2DImode
;
24838 pinsr
= gen_sse4_1_pinsrq
;
24845 /* Reject insertions to misaligned positions. */
24846 if (pos
& (size
-1))
24849 if (SUBREG_P (src
))
24851 unsigned int srcpos
= SUBREG_BYTE (src
);
24857 extr_ops
[0] = gen_reg_rtx (srcmode
);
24858 extr_ops
[1] = gen_lowpart (srcmode
, SUBREG_REG (src
));
24859 extr_ops
[2] = GEN_INT (size
);
24860 extr_ops
[3] = GEN_INT (srcpos
* BITS_PER_UNIT
);
24862 if (!ix86_expand_pextr (extr_ops
))
24868 src
= gen_lowpart (srcmode
, SUBREG_REG (src
));
24871 if (GET_MODE (dst
) == dstmode
)
24874 d
= gen_reg_rtx (dstmode
);
24876 emit_insn (pinsr (d
, gen_lowpart (dstmode
, dst
),
24877 gen_lowpart (srcmode
, src
),
24878 GEN_INT (1 << (pos
/ size
))));
24880 emit_move_insn (dst
, gen_lowpart (GET_MODE (dst
), d
));
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
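/* The routine below emits a classic compare-and-swap loop: it caches the
   memory value, computes old <op> val into a fresh register (copying the old
   or new value to TARGET depending on whether a fetch_op or op_fetch result
   is wanted) and then hands the cached value, the memory operand and the new
   value to ix86_expand_cmpxchg_loop, which retries from loop_label until the
   compare-and-swap succeeds.  */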
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST, SImode),
			    doubleword, loop_label);
}
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */
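/* For example, the BFmode bit pattern 0x3f80 (1.0) becomes 0x3f800000 after
   the shift, which is the SFmode encoding of 1.0f; appending sixteen zero
   mantissa bits never changes the represented value.  */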
rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					    val, BFmode);
      if (ret)
	return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}

#include "gt-i386-expand.h"