i386: Fix a pasto in ix86_expand_int_sse_cmp [PR114339]
[official-gcc.git] gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2024 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
19 #define IN_TARGET_CODE 1
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "shrink-wrap.h"
70 #include "builtins.h"
71 #include "rtl-iter.h"
72 #include "tree-iterator.h"
73 #include "dbgcnt.h"
74 #include "case-cfn-macros.h"
75 #include "dojump.h"
76 #include "fold-const-call.h"
77 #include "tree-vrp.h"
78 #include "tree-ssanames.h"
79 #include "selftest.h"
80 #include "selftest-rtl.h"
81 #include "print-rtl.h"
82 #include "intl.h"
83 #include "ifcvt.h"
84 #include "symbol-summary.h"
85 #include "sreal.h"
86 #include "ipa-cp.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-options.h"
94 #include "i386-builtins.h"
95 #include "i386-expand.h"
96 #include "asan.h"
98 /* Split one or more double-mode RTL references into pairs of half-mode
99 references. The RTL can be REG, offsettable MEM, integer constant, or
100 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
101 split and "num" is its length. lo_half and hi_half are output arrays
102 that parallel "operands". */
104 void
105 split_double_mode (machine_mode mode, rtx operands[],
106 int num, rtx lo_half[], rtx hi_half[])
108 machine_mode half_mode;
109 unsigned int byte;
110 rtx mem_op = NULL_RTX;
111 int mem_num = 0;
113 switch (mode)
115 case E_TImode:
116 half_mode = DImode;
117 break;
118 case E_DImode:
119 half_mode = SImode;
120 break;
121 case E_P2HImode:
122 half_mode = HImode;
123 break;
124 case E_P2QImode:
125 half_mode = QImode;
126 break;
127 default:
128 gcc_unreachable ();
131 byte = GET_MODE_SIZE (half_mode);
133 while (num--)
135 rtx op = operands[num];
137 /* simplify_subreg refuses to split volatile memory addresses,
138 but we still have to handle them. */
139 if (MEM_P (op))
141 if (mem_op && rtx_equal_p (op, mem_op))
143 lo_half[num] = lo_half[mem_num];
144 hi_half[num] = hi_half[mem_num];
146 else
148 mem_op = op;
149 mem_num = num;
150 lo_half[num] = adjust_address (op, half_mode, 0);
151 hi_half[num] = adjust_address (op, half_mode, byte);
154 else
156 lo_half[num] = simplify_gen_subreg (half_mode, op,
157 GET_MODE (op) == VOIDmode
158 ? mode : GET_MODE (op), 0);
160 rtx tmp = simplify_gen_subreg (half_mode, op,
161 GET_MODE (op) == VOIDmode
162 ? mode : GET_MODE (op), byte);
163 /* simplify_gen_subreg will return NULL RTX for the
164 high half of the paradoxical subreg. */
165 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
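/* Illustrative example (not part of the original source): splitting a
   pair of TImode operands for a double-word operation.  Given
   operands[] = { (reg:TI 100), (reg:TI 101) }, a call such as

     rtx lo[2], hi[2];
     split_double_mode (TImode, operands, 2, lo, hi);

   leaves lo[] holding the DImode low halves and hi[] the DImode high
   halves (subregs for registers, adjusted addresses for MEMs), so the
   caller can emit separate low/high DImode instructions.  */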
170 /* Emit the double word assignment DST = { LO, HI }. */
172 void
173 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
175 rtx dlo, dhi;
176 int deleted_move_count = 0;
177 split_double_mode (mode, &dst, 1, &dlo, &dhi);
178 /* Constraints ensure that if both lo and hi are MEMs, then
179 dst has early-clobber and thus addresses of MEMs don't use
180 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
181 dlo/dhi are registers. */
182 if (MEM_P (lo)
183 && rtx_equal_p (dlo, hi)
184 && reg_overlap_mentioned_p (dhi, lo))
186 /* If dlo is same as hi and lo's address uses dhi register,
187 code below would first emit_move_insn (dhi, hi)
188 and then emit_move_insn (dlo, lo). But the former
189 would invalidate lo's address. Load into dhi first,
190 then swap. */
191 emit_move_insn (dhi, lo);
192 lo = dhi;
194 else if (MEM_P (hi)
195 && !MEM_P (lo)
196 && !rtx_equal_p (dlo, lo)
197 && reg_overlap_mentioned_p (dlo, hi))
199 /* In this case, code below would first emit_move_insn (dlo, lo)
200 and then emit_move_insn (dhi, hi). But the former would
201 invalidate hi's address. */
202 if (rtx_equal_p (dhi, lo))
204 /* We can't load into dhi first, so load into dlo
205 first and we'll swap. */
206 emit_move_insn (dlo, hi);
207 hi = dlo;
209 else
211 /* Load into dhi first. */
212 emit_move_insn (dhi, hi);
213 hi = dhi;
216 if (!rtx_equal_p (dlo, hi))
218 if (!rtx_equal_p (dlo, lo))
219 emit_move_insn (dlo, lo);
220 else
221 deleted_move_count++;
222 if (!rtx_equal_p (dhi, hi))
223 emit_move_insn (dhi, hi);
224 else
225 deleted_move_count++;
227 else if (!rtx_equal_p (lo, dhi))
229 if (!rtx_equal_p (dhi, hi))
230 emit_move_insn (dhi, hi);
231 else
232 deleted_move_count++;
233 if (!rtx_equal_p (dlo, lo))
234 emit_move_insn (dlo, lo);
235 else
236 deleted_move_count++;
238 else if (mode == TImode)
239 emit_insn (gen_swapdi (dlo, dhi));
240 else
241 emit_insn (gen_swapsi (dlo, dhi));
243 if (deleted_move_count == 2)
244 emit_note (NOTE_INSN_DELETED);
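/* Illustrative example (not part of the original source): if dst splits
   into dlo/dhi and the caller asks for lo == dhi and hi == dlo, either
   move order would overwrite an input before it is read, so none of the
   move paths above apply and a single register swap (gen_swapdi for
   TImode, gen_swapsi for DImode) is emitted instead.  */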
248 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
249 for the target. */
251 void
252 ix86_expand_clear (rtx dest)
254 rtx tmp;
256 /* We play register width games, which are only valid after reload. */
257 gcc_assert (reload_completed);
259 /* Avoid HImode and its attendant prefix byte. */
260 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
261 dest = gen_rtx_REG (SImode, REGNO (dest));
262 tmp = gen_rtx_SET (dest, const0_rtx);
264 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
266 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
267 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
270 emit_insn (tmp);
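/* Illustrative sketch (not from the original source): for an SImode
   destination this emits either

     xorl  %eax, %eax    ; default: shorter, and the SET is wrapped in a
                         ; PARALLEL with a FLAGS_REG clobber

   or, with TARGET_USE_MOV0 and when not optimizing for size,

     movl  $0, %eax      ; no flags clobber

   Destinations narrower than SImode are first widened to SImode.  */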
273 /* Return true if V can be broadcasted from an integer of WIDTH bits
274 which is returned in VAL_BROADCAST. Otherwise, return false. */
276 static bool
277 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
278 HOST_WIDE_INT &val_broadcast)
280 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
281 val_broadcast = wi::extract_uhwi (val, 0, width);
282 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
284 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
285 if (val_broadcast != each)
286 return false;
288 val_broadcast = sext_hwi (val_broadcast, width);
289 return true;
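/* Worked example (illustrative, not part of the original source): for
   v = 0x1234567812345678 and width = 32, both 32-bit chunks equal
   0x12345678, so ix86_broadcast returns true with val_broadcast =
   0x12345678 (sign-extended from 32 bits); for width = 16 the chunks
   0x5678 and 0x1234 differ, so it returns false.  */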
292 /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
294 static rtx
295 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
297 /* Don't use integer vector broadcast if we can't move from GPR to SSE
298 register directly. */
299 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
300 return nullptr;
302 unsigned int msize = GET_MODE_SIZE (mode);
304 /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm. */
305 if (msize != 16 && msize != 32 && msize != 64)
306 return nullptr;
308 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
309 broadcast only if vector broadcast is available. */
310 if (!TARGET_AVX
311 || !CONST_WIDE_INT_P (op)
312 || standard_sse_constant_p (op, mode)
313 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
314 != GET_MODE_BITSIZE (mode)))
315 return nullptr;
317 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
318 HOST_WIDE_INT val_broadcast;
319 scalar_int_mode broadcast_mode;
320 /* vpbroadcastb zmm requires TARGET_AVX512BW. */
321 if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
322 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
323 val_broadcast))
324 broadcast_mode = QImode;
325 else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
326 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
327 val_broadcast))
328 broadcast_mode = HImode;
329 /* Without AVX2, vbroadcasts[sd] only supports a memory operand.
330 When msize == 16, pshufd is used for vec_duplicate.
331 When msize == 64, vpbroadcastd is used, which requires TARGET_AVX512F. */
332 else if ((msize != 32 || TARGET_AVX2)
333 && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
334 val_broadcast))
335 broadcast_mode = SImode;
336 else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
337 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
338 val_broadcast))
339 broadcast_mode = DImode;
340 else
341 return nullptr;
343 /* Check if OP can be broadcasted from VAL. */
344 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
345 if (val != CONST_WIDE_INT_ELT (op, i))
346 return nullptr;
348 unsigned int nunits = (GET_MODE_SIZE (mode)
349 / GET_MODE_SIZE (broadcast_mode));
350 machine_mode vector_mode;
351 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
352 gcc_unreachable ();
353 rtx target = gen_reg_rtx (vector_mode);
354 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
355 target,
356 GEN_INT (val_broadcast));
357 if (!ok)
358 return nullptr;
359 target = lowpart_subreg (mode, target, vector_mode);
360 return target;
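/* Illustrative example (not part of the original source): an OImode
   CONST_WIDE_INT whose four 64-bit elements are all 0x0101010101010101
   can, with AVX2, be materialized as a V32QI broadcast of the QImode
   value 1 (scalar moved into an XMM register, then vpbroadcastb)
   rather than loaded from the constant pool, and the result is
   re-interpreted in the original mode via lowpart_subreg.  */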
363 void
364 ix86_expand_move (machine_mode mode, rtx operands[])
366 rtx op0, op1;
367 rtx tmp, addend = NULL_RTX;
368 enum tls_model model;
370 op0 = operands[0];
371 op1 = operands[1];
373 /* Avoid complex sets of likely spilled hard registers before reload. */
374 if (!ix86_hardreg_mov_ok (op0, op1))
376 tmp = gen_reg_rtx (mode);
377 operands[0] = tmp;
378 ix86_expand_move (mode, operands);
379 operands[0] = op0;
380 operands[1] = tmp;
381 op1 = tmp;
384 switch (GET_CODE (op1))
386 case CONST:
387 tmp = XEXP (op1, 0);
389 if (GET_CODE (tmp) != PLUS
390 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
391 break;
393 op1 = XEXP (tmp, 0);
394 addend = XEXP (tmp, 1);
395 /* FALLTHRU */
397 case SYMBOL_REF:
398 model = SYMBOL_REF_TLS_MODEL (op1);
400 if (model)
401 op1 = legitimize_tls_address (op1, model, true);
402 else if (ix86_force_load_from_GOT_p (op1))
404 /* Load the external function address via GOT slot to avoid PLT. */
405 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
406 (TARGET_64BIT
407 ? UNSPEC_GOTPCREL
408 : UNSPEC_GOT));
409 op1 = gen_rtx_CONST (Pmode, op1);
410 op1 = gen_const_mem (Pmode, op1);
411 set_mem_alias_set (op1, ix86_GOT_alias_set ());
413 else
415 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
416 if (tmp)
418 op1 = tmp;
419 if (!addend)
420 break;
422 else
424 op1 = operands[1];
425 break;
429 if (addend)
431 op1 = force_operand (op1, NULL_RTX);
432 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
433 op0, 1, OPTAB_DIRECT);
435 else
436 op1 = force_operand (op1, op0);
438 if (op1 == op0)
439 return;
441 op1 = convert_to_mode (mode, op1, 1);
443 default:
444 break;
446 case SUBREG:
447 /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
448 if (TARGET_64BIT
449 && mode == TImode
450 && SUBREG_P (op1)
451 && GET_MODE (SUBREG_REG (op1)) == DImode
452 && SUBREG_BYTE (op1) == 0)
453 op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
454 /* As not all values in XFmode are representable in real_value,
455 we might be called with unfoldable SUBREGs of constants. */
456 if (mode == XFmode
457 && CONSTANT_P (SUBREG_REG (op1))
458 && can_create_pseudo_p ())
460 machine_mode imode = GET_MODE (SUBREG_REG (op1));
461 rtx r = force_const_mem (imode, SUBREG_REG (op1));
462 if (r)
463 r = validize_mem (r);
464 else
465 r = force_reg (imode, SUBREG_REG (op1));
466 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
468 break;
471 if ((flag_pic || MACHOPIC_INDIRECT)
472 && symbolic_operand (op1, mode))
474 #if TARGET_MACHO
475 if (TARGET_MACHO && !TARGET_64BIT)
477 /* dynamic-no-pic */
478 if (MACHOPIC_INDIRECT)
480 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
481 ? op0 : gen_reg_rtx (Pmode);
482 op1 = machopic_indirect_data_reference (op1, temp);
483 if (MACHOPIC_PURE)
484 op1 = machopic_legitimize_pic_address (op1, mode,
485 temp == op1 ? 0 : temp);
487 if (op0 != op1 && GET_CODE (op0) != MEM)
489 rtx insn = gen_rtx_SET (op0, op1);
490 emit_insn (insn);
491 return;
494 #endif
496 if (MEM_P (op0))
497 op1 = force_reg (mode, op1);
498 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
500 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
501 op1 = legitimize_pic_address (op1, reg);
502 if (op0 == op1)
503 return;
504 op1 = convert_to_mode (mode, op1, 1);
507 else
509 if (MEM_P (op0)
510 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
511 || !push_operand (op0, mode))
512 && MEM_P (op1))
513 op1 = force_reg (mode, op1);
515 if (push_operand (op0, mode)
516 && ! general_no_elim_operand (op1, mode))
517 op1 = copy_to_mode_reg (mode, op1);
519 /* Force large constants in 64-bit compilation into a register
520 to get them CSEd. */
521 if (can_create_pseudo_p ()
522 && (mode == DImode) && TARGET_64BIT
523 && immediate_operand (op1, mode)
524 && !x86_64_zext_immediate_operand (op1, VOIDmode)
525 && !register_operand (op0, mode)
526 && optimize)
527 op1 = copy_to_mode_reg (mode, op1);
529 if (can_create_pseudo_p ())
531 if (CONST_DOUBLE_P (op1))
533 /* If we are loading a floating point constant to a
534 register, force the value to memory now, since we'll
535 get better code out of the back end. */
537 op1 = validize_mem (force_const_mem (mode, op1));
538 if (!register_operand (op0, mode))
540 rtx temp = gen_reg_rtx (mode);
541 emit_insn (gen_rtx_SET (temp, op1));
542 emit_move_insn (op0, temp);
543 return;
549 /* Special case inserting 64-bit values into a TImode register. */
550 if (TARGET_64BIT
551 /* Disable for -O0 (see PR110587) unless naked (PR110533). */
552 && (optimize || ix86_function_naked (current_function_decl))
553 && (mode == DImode || mode == DFmode)
554 && SUBREG_P (op0)
555 && GET_MODE (SUBREG_REG (op0)) == TImode
556 && REG_P (SUBREG_REG (op0))
557 && REG_P (op1))
559 /* Use *insvti_lowpart_1 to set lowpart. */
560 if (SUBREG_BYTE (op0) == 0)
562 wide_int mask = wi::mask (64, true, 128);
563 rtx tmp = immed_wide_int_const (mask, TImode);
564 op0 = SUBREG_REG (op0);
565 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
566 if (mode == DFmode)
567 op1 = gen_lowpart (DImode, op1);
568 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
569 op1 = gen_rtx_IOR (TImode, tmp, op1);
571 /* Use *insvti_highpart_1 to set highpart. */
572 else if (SUBREG_BYTE (op0) == 8)
574 wide_int mask = wi::mask (64, false, 128);
575 rtx tmp = immed_wide_int_const (mask, TImode);
576 op0 = SUBREG_REG (op0);
577 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
578 if (mode == DFmode)
579 op1 = gen_lowpart (DImode, op1);
580 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
581 op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
582 op1 = gen_rtx_IOR (TImode, tmp, op1);
586 emit_insn (gen_rtx_SET (op0, op1));
589 /* OP is a memref of CONST_VECTOR, return scalar constant mem
590 if CONST_VECTOR is a vec_duplicate, else return NULL. */
591 static rtx
592 ix86_broadcast_from_constant (machine_mode mode, rtx op)
594 int nunits = GET_MODE_NUNITS (mode);
595 if (nunits < 2)
596 return nullptr;
598 /* Don't use integer vector broadcast if we can't move from GPR to SSE
599 register directly. */
600 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
601 && INTEGRAL_MODE_P (mode))
602 return nullptr;
604 /* Convert CONST_VECTOR to a non-standard SSE constant integer
605 broadcast only if vector broadcast is available. */
606 if (standard_sse_constant_p (op, mode))
607 return nullptr;
609 if (GET_MODE_INNER (mode) == TImode)
610 return nullptr;
612 rtx constant = get_pool_constant (XEXP (op, 0));
613 if (GET_CODE (constant) != CONST_VECTOR)
614 return nullptr;
616 /* There could be some rtx like
617 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
618 but with "*.LC1" referring to a V2DI constant vector. */
619 if (GET_MODE (constant) != mode)
621 constant = simplify_subreg (mode, constant, GET_MODE (constant),
623 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
624 return nullptr;
627 rtx first = XVECEXP (constant, 0, 0);
629 for (int i = 1; i < nunits; ++i)
631 rtx tmp = XVECEXP (constant, 0, i);
632 /* Vector duplicate value. */
633 if (!rtx_equal_p (tmp, first))
634 return nullptr;
637 return first;
640 void
641 ix86_expand_vector_move (machine_mode mode, rtx operands[])
643 rtx op0 = operands[0], op1 = operands[1];
644 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
645 psABI since the biggest alignment is 4 bytes for IA MCU psABI. */
646 unsigned int align = (TARGET_IAMCU
647 ? GET_MODE_BITSIZE (mode)
648 : GET_MODE_ALIGNMENT (mode));
650 if (push_operand (op0, VOIDmode))
651 op0 = emit_move_resolve_push (mode, op0);
653 /* Force constants other than zero into memory. We do not know how
654 the instructions used to build constants modify the upper 64 bits
655 of the register; once we have that information we may be able
656 to handle some of them more efficiently. */
657 if (can_create_pseudo_p ()
658 && (CONSTANT_P (op1)
659 || (SUBREG_P (op1)
660 && CONSTANT_P (SUBREG_REG (op1))))
661 && ((register_operand (op0, mode)
662 && !standard_sse_constant_p (op1, mode))
663 /* ix86_expand_vector_move_misalign() does not like constants. */
664 || (SSE_REG_MODE_P (mode)
665 && MEM_P (op0)
666 && MEM_ALIGN (op0) < align)))
668 if (SUBREG_P (op1))
670 machine_mode imode = GET_MODE (SUBREG_REG (op1));
671 rtx r = force_const_mem (imode, SUBREG_REG (op1));
672 if (r)
673 r = validize_mem (r);
674 else
675 r = force_reg (imode, SUBREG_REG (op1));
676 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
678 else
680 machine_mode mode = GET_MODE (op0);
681 rtx tmp = ix86_convert_const_wide_int_to_broadcast
682 (mode, op1);
683 if (tmp == nullptr)
684 op1 = validize_mem (force_const_mem (mode, op1));
685 else
686 op1 = tmp;
690 if (can_create_pseudo_p ()
691 && GET_MODE_SIZE (mode) >= 16
692 && VECTOR_MODE_P (mode)
693 && (MEM_P (op1)
694 && SYMBOL_REF_P (XEXP (op1, 0))
695 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
697 rtx first = ix86_broadcast_from_constant (mode, op1);
698 if (first != nullptr)
700 /* Broadcast to XMM/YMM/ZMM register from an integer
701 constant or scalar mem. */
702 rtx tmp = gen_reg_rtx (mode);
703 if (FLOAT_MODE_P (mode))
704 first = force_const_mem (GET_MODE_INNER (mode), first);
705 bool ok = ix86_expand_vector_init_duplicate (false, mode,
706 tmp, first);
707 if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
709 first = force_const_mem (GET_MODE_INNER (mode), first);
710 ok = ix86_expand_vector_init_duplicate (false, mode,
711 tmp, first);
713 if (ok)
715 emit_move_insn (op0, tmp);
716 return;
721 /* We need to check memory alignment for SSE mode since an attribute
722 can make operands unaligned. */
723 if (can_create_pseudo_p ()
724 && SSE_REG_MODE_P (mode)
725 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
726 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
728 rtx tmp[2];
730 /* ix86_expand_vector_move_misalign() does not like both
731 arguments in memory. */
732 if (!register_operand (op0, mode)
733 && !register_operand (op1, mode))
735 rtx scratch = gen_reg_rtx (mode);
736 emit_move_insn (scratch, op1);
737 op1 = scratch;
740 tmp[0] = op0; tmp[1] = op1;
741 ix86_expand_vector_move_misalign (mode, tmp);
742 return;
745 /* Special case TImode to 128-bit vector conversions via V2DI. */
746 if (VECTOR_MODE_P (mode)
747 && GET_MODE_SIZE (mode) == 16
748 && SUBREG_P (op1)
749 && GET_MODE (SUBREG_REG (op1)) == TImode
750 && TARGET_64BIT && TARGET_SSE
751 && can_create_pseudo_p ())
753 rtx tmp = gen_reg_rtx (V2DImode);
754 rtx lo = gen_reg_rtx (DImode);
755 rtx hi = gen_reg_rtx (DImode);
756 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
757 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
758 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
759 emit_move_insn (op0, gen_lowpart (mode, tmp));
760 return;
763 /* If operand0 is a hard register, make operand1 a pseudo. */
764 if (can_create_pseudo_p ()
765 && !ix86_hardreg_mov_ok (op0, op1))
767 rtx tmp = gen_reg_rtx (GET_MODE (op0));
768 emit_move_insn (tmp, op1);
769 emit_move_insn (op0, tmp);
770 return;
773 /* Make operand1 a register if it isn't already. */
774 if (can_create_pseudo_p ()
775 && !register_operand (op0, mode)
776 && !register_operand (op1, mode))
778 rtx tmp = gen_reg_rtx (GET_MODE (op0));
779 emit_move_insn (tmp, op1);
780 emit_move_insn (op0, tmp);
781 return;
784 emit_insn (gen_rtx_SET (op0, op1));
787 /* Split 32-byte AVX unaligned load and store if needed. */
789 static void
790 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
792 rtx m;
793 rtx (*extract) (rtx, rtx, rtx);
794 machine_mode mode;
796 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
797 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
799 emit_insn (gen_rtx_SET (op0, op1));
800 return;
803 rtx orig_op0 = NULL_RTX;
804 mode = GET_MODE (op0);
805 switch (GET_MODE_CLASS (mode))
807 case MODE_VECTOR_INT:
808 case MODE_INT:
809 if (mode != V32QImode)
811 if (!MEM_P (op0))
813 orig_op0 = op0;
814 op0 = gen_reg_rtx (V32QImode);
816 else
817 op0 = gen_lowpart (V32QImode, op0);
818 op1 = gen_lowpart (V32QImode, op1);
819 mode = V32QImode;
821 break;
822 case MODE_VECTOR_FLOAT:
823 break;
824 default:
825 gcc_unreachable ();
828 switch (mode)
830 default:
831 gcc_unreachable ();
832 case E_V32QImode:
833 extract = gen_avx_vextractf128v32qi;
834 mode = V16QImode;
835 break;
836 case E_V16BFmode:
837 extract = gen_avx_vextractf128v16bf;
838 mode = V8BFmode;
839 break;
840 case E_V16HFmode:
841 extract = gen_avx_vextractf128v16hf;
842 mode = V8HFmode;
843 break;
844 case E_V8SFmode:
845 extract = gen_avx_vextractf128v8sf;
846 mode = V4SFmode;
847 break;
848 case E_V4DFmode:
849 extract = gen_avx_vextractf128v4df;
850 mode = V2DFmode;
851 break;
854 if (MEM_P (op1))
856 rtx r = gen_reg_rtx (mode);
857 m = adjust_address (op1, mode, 0);
858 emit_move_insn (r, m);
859 m = adjust_address (op1, mode, 16);
860 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
861 emit_move_insn (op0, r);
863 else if (MEM_P (op0))
865 m = adjust_address (op0, mode, 0);
866 emit_insn (extract (m, op1, const0_rtx));
867 m = adjust_address (op0, mode, 16);
868 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
870 else
871 gcc_unreachable ();
873 if (orig_op0)
874 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
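/* Illustrative sketch (not from the original source): with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD an unaligned V8SF load is emitted
   roughly as

     vmovups     (addr), %xmm0
     vinsertf128 $1, 16(addr), %ymm0, %ymm0

   and with TARGET_AVX256_SPLIT_UNALIGNED_STORE an unaligned store is
   split via vextractf128 of the two 128-bit halves.  */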
877 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
878 straight to ix86_expand_vector_move. */
879 /* Code generation for scalar reg-reg moves of single and double precision data:
880 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
881 movaps reg, reg
882 else
883 movss reg, reg
884 if (x86_sse_partial_reg_dependency == true)
885 movapd reg, reg
886 else
887 movsd reg, reg
889 Code generation for scalar loads of double precision data:
890 if (x86_sse_split_regs == true)
891 movlpd mem, reg (gas syntax)
892 else
893 movsd mem, reg
895 Code generation for unaligned packed loads of single precision data
896 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
897 if (x86_sse_unaligned_move_optimal)
898 movups mem, reg
900 if (x86_sse_partial_reg_dependency == true)
902 xorps reg, reg
903 movlps mem, reg
904 movhps mem+8, reg
906 else
908 movlps mem, reg
909 movhps mem+8, reg
912 Code generation for unaligned packed loads of double precision data
913 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
914 if (x86_sse_unaligned_move_optimal)
915 movupd mem, reg
917 if (x86_sse_split_regs == true)
919 movlpd mem, reg
920 movhpd mem+8, reg
922 else
924 movsd mem, reg
925 movhpd mem+8, reg
929 void
930 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
932 rtx op0, op1, m;
934 op0 = operands[0];
935 op1 = operands[1];
937 /* Use unaligned load/store for AVX512 or when optimizing for size. */
938 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
940 emit_insn (gen_rtx_SET (op0, op1));
941 return;
944 if (TARGET_AVX)
946 if (GET_MODE_SIZE (mode) == 32)
947 ix86_avx256_split_vector_move_misalign (op0, op1);
948 else
949 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
950 emit_insn (gen_rtx_SET (op0, op1));
951 return;
954 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
955 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
957 emit_insn (gen_rtx_SET (op0, op1));
958 return;
961 /* ??? If we have typed data, then it would appear that using
962 movdqu is the only way to get unaligned data loaded with
963 integer type. */
964 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
966 emit_insn (gen_rtx_SET (op0, op1));
967 return;
970 if (MEM_P (op1))
972 if (TARGET_SSE2 && mode == V2DFmode)
974 rtx zero;
976 /* When SSE registers are split into halves, we can avoid
977 writing to the top half twice. */
978 if (TARGET_SSE_SPLIT_REGS)
980 emit_clobber (op0);
981 zero = op0;
983 else
985 /* ??? Not sure about the best option for the Intel chips.
986 The following would seem to satisfy; the register is
987 entirely cleared, breaking the dependency chain. We
988 then store to the upper half, with a dependency depth
989 of one. A rumor has it that Intel recommends two movsd
990 followed by an unpacklpd, but this is unconfirmed. And
991 given that the dependency depth of the unpacklpd would
992 still be one, I'm not sure why this would be better. */
993 zero = CONST0_RTX (V2DFmode);
996 m = adjust_address (op1, DFmode, 0);
997 emit_insn (gen_sse2_loadlpd (op0, zero, m));
998 m = adjust_address (op1, DFmode, 8);
999 emit_insn (gen_sse2_loadhpd (op0, op0, m));
1001 else
1003 rtx t;
1005 if (mode != V4SFmode)
1006 t = gen_reg_rtx (V4SFmode);
1007 else
1008 t = op0;
1010 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1011 emit_move_insn (t, CONST0_RTX (V4SFmode));
1012 else
1013 emit_clobber (t);
1015 m = adjust_address (op1, V2SFmode, 0);
1016 emit_insn (gen_sse_loadlps (t, t, m));
1017 m = adjust_address (op1, V2SFmode, 8);
1018 emit_insn (gen_sse_loadhps (t, t, m));
1019 if (mode != V4SFmode)
1020 emit_move_insn (op0, gen_lowpart (mode, t));
1023 else if (MEM_P (op0))
1025 if (TARGET_SSE2 && mode == V2DFmode)
1027 m = adjust_address (op0, DFmode, 0);
1028 emit_insn (gen_sse2_storelpd (m, op1));
1029 m = adjust_address (op0, DFmode, 8);
1030 emit_insn (gen_sse2_storehpd (m, op1));
1032 else
1034 if (mode != V4SFmode)
1035 op1 = gen_lowpart (V4SFmode, op1);
1037 m = adjust_address (op0, V2SFmode, 0);
1038 emit_insn (gen_sse_storelps (m, op1));
1039 m = adjust_address (op0, V2SFmode, 8);
1040 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1043 else
1044 gcc_unreachable ();
1047 /* Move bits 64:95 to bits 32:63. */
1049 void
1050 ix86_move_vector_high_sse_to_mmx (rtx op)
1052 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1053 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1054 GEN_INT (0), GEN_INT (0)));
1055 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1056 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1057 rtx insn = gen_rtx_SET (dest, op);
1058 emit_insn (insn);
1061 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1063 void
1064 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1066 rtx op0 = operands[0];
1067 rtx op1 = operands[1];
1068 rtx op2 = operands[2];
1069 rtx src;
1071 machine_mode dmode = GET_MODE (op0);
1072 machine_mode smode = GET_MODE (op1);
1073 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1074 machine_mode inner_smode = GET_MODE_INNER (smode);
1076 /* Get the corresponding SSE mode for destination. */
1077 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1078 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1079 nunits).require ();
1080 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1081 nunits / 2).require ();
1083 /* Get the corresponding SSE mode for source. */
1084 nunits = 16 / GET_MODE_SIZE (inner_smode);
1085 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1086 nunits).require ();
1088 /* Generate SSE pack with signed/unsigned saturation. */
1089 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1090 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1091 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1093 /* packusdw/packuswb do unsigned saturation of a signed source,
1094 which is different from the generic us_truncate RTX. */
1095 if (code == US_TRUNCATE)
1096 src = gen_rtx_UNSPEC (sse_dmode,
1097 gen_rtvec (2, op1, op2),
1098 UNSPEC_US_TRUNCATE);
1099 else
1101 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1102 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1103 src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
1106 emit_move_insn (dest, src);
1108 ix86_move_vector_high_sse_to_mmx (op0);
1111 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
1112 for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
1113 OPERANDS[0]. */
1115 void
1116 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1118 rtx op0 = operands[0];
1119 rtx op1 = operands[1];
1120 rtx op2 = operands[2];
1121 machine_mode mode = GET_MODE (op1);
1122 rtx mask;
1123 /* The corresponding SSE mode. */
1124 machine_mode sse_mode, double_sse_mode;
1126 switch (mode)
1128 case E_V8QImode:
1129 case E_V4QImode:
1130 case E_V2QImode:
1131 sse_mode = V16QImode;
1132 double_sse_mode = V32QImode;
1133 mask = gen_rtx_PARALLEL (VOIDmode,
1134 gen_rtvec (16,
1135 GEN_INT (0), GEN_INT (16),
1136 GEN_INT (1), GEN_INT (17),
1137 GEN_INT (2), GEN_INT (18),
1138 GEN_INT (3), GEN_INT (19),
1139 GEN_INT (4), GEN_INT (20),
1140 GEN_INT (5), GEN_INT (21),
1141 GEN_INT (6), GEN_INT (22),
1142 GEN_INT (7), GEN_INT (23)));
1143 break;
1145 case E_V4HImode:
1146 case E_V2HImode:
1147 sse_mode = V8HImode;
1148 double_sse_mode = V16HImode;
1149 mask = gen_rtx_PARALLEL (VOIDmode,
1150 gen_rtvec (8,
1151 GEN_INT (0), GEN_INT (8),
1152 GEN_INT (1), GEN_INT (9),
1153 GEN_INT (2), GEN_INT (10),
1154 GEN_INT (3), GEN_INT (11)));
1155 break;
1157 case E_V2SImode:
1158 sse_mode = V4SImode;
1159 double_sse_mode = V8SImode;
1160 mask = gen_rtx_PARALLEL (VOIDmode,
1161 gen_rtvec (4,
1162 GEN_INT (0), GEN_INT (4),
1163 GEN_INT (1), GEN_INT (5)));
1164 break;
1166 case E_V2SFmode:
1167 sse_mode = V4SFmode;
1168 double_sse_mode = V8SFmode;
1169 mask = gen_rtx_PARALLEL (VOIDmode,
1170 gen_rtvec (4,
1171 GEN_INT (0), GEN_INT (4),
1172 GEN_INT (1), GEN_INT (5)));
1173 break;
1175 default:
1176 gcc_unreachable ();
1179 /* Generate SSE punpcklXX. */
1180 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1181 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1182 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1184 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1185 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1186 rtx insn = gen_rtx_SET (dest, op2);
1187 emit_insn (insn);
1189 /* Move high bits to low bits. */
1190 if (high_p)
1192 if (sse_mode == V4SFmode)
1194 mask = gen_rtx_PARALLEL (VOIDmode,
1195 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1196 GEN_INT (4), GEN_INT (5)));
1197 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1198 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1200 else
1202 int sz = GET_MODE_SIZE (mode);
1204 if (sz == 4)
1205 mask = gen_rtx_PARALLEL (VOIDmode,
1206 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1207 GEN_INT (0), GEN_INT (1)));
1208 else if (sz == 8)
1209 mask = gen_rtx_PARALLEL (VOIDmode,
1210 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1211 GEN_INT (0), GEN_INT (1)));
1212 else
1213 gcc_unreachable ();
1215 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1216 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1219 insn = gen_rtx_SET (dest, op1);
1220 emit_insn (insn);
1224 /* Helper function of ix86_fixup_binary_operands to canonicalize
1225 operand order. Returns true if the operands should be swapped. */
1227 static bool
1228 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1229 rtx operands[])
1231 rtx dst = operands[0];
1232 rtx src1 = operands[1];
1233 rtx src2 = operands[2];
1235 /* If the operation is not commutative, we can't do anything. */
1236 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1237 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1238 return false;
1240 /* Highest priority is that src1 should match dst. */
1241 if (rtx_equal_p (dst, src1))
1242 return false;
1243 if (rtx_equal_p (dst, src2))
1244 return true;
1246 /* Next highest priority is that immediate constants come second. */
1247 if (immediate_operand (src2, mode))
1248 return false;
1249 if (immediate_operand (src1, mode))
1250 return true;
1252 /* Lowest priority is that memory references should come second. */
1253 if (MEM_P (src2))
1254 return false;
1255 if (MEM_P (src1))
1256 return true;
1258 return false;
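/* Illustrative example (not part of the original source): for a
   commutative PLUS with dst = (reg 100), src1 = (mem ...) and
   src2 = (reg 100), src1 does not match the destination but src2 does,
   so this returns true and the caller swaps the sources, leaving the
   matching register in the src1 slot and the memory operand in src2.  */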
1261 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1262 destination to use for the operation. If different from the true
1263 destination in operands[0], a copy operation will be required except
1264 under TARGET_APX_NDD. */
1266 rtx
1267 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1268 rtx operands[], bool use_ndd)
1270 rtx dst = operands[0];
1271 rtx src1 = operands[1];
1272 rtx src2 = operands[2];
1274 /* Canonicalize operand order. */
1275 if (ix86_swap_binary_operands_p (code, mode, operands))
1277 /* It is invalid to swap operands of different modes. */
1278 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1280 std::swap (src1, src2);
1283 /* Both source operands cannot be in memory. */
1284 if (MEM_P (src1) && MEM_P (src2))
1286 /* Optimization: Only read from memory once. */
1287 if (rtx_equal_p (src1, src2))
1289 src2 = force_reg (mode, src2);
1290 src1 = src2;
1292 else if (rtx_equal_p (dst, src1))
1293 src2 = force_reg (mode, src2);
1294 else
1295 src1 = force_reg (mode, src1);
1298 /* If the destination is memory, and we do not have matching source
1299 operands, do things in registers. */
1300 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1301 dst = gen_reg_rtx (mode);
1303 /* Source 1 cannot be a constant. */
1304 if (CONSTANT_P (src1))
1305 src1 = force_reg (mode, src1);
1307 /* Source 1 cannot be a non-matching memory. */
1308 if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
1309 src1 = force_reg (mode, src1);
1311 /* Improve address combine. */
1312 if (code == PLUS
1313 && GET_MODE_CLASS (mode) == MODE_INT
1314 && MEM_P (src2))
1315 src2 = force_reg (mode, src2);
1317 operands[1] = src1;
1318 operands[2] = src2;
1319 return dst;
1322 /* Similarly, but assume that the destination has already been
1323 set up properly. */
1325 void
1326 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1327 machine_mode mode, rtx operands[],
1328 bool use_ndd)
1330 rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1331 gcc_assert (dst == operands[0]);
1334 /* Attempt to expand a binary operator. Make the expansion closer to the
1335 actual machine than just general_operand, which would allow 3 separate
1336 memory references (one output, two inputs) in a single insn. */
1338 void
1339 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1340 rtx operands[], bool use_ndd)
1342 rtx src1, src2, dst, op, clob;
1344 dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1345 src1 = operands[1];
1346 src2 = operands[2];
1348 /* Emit the instruction. */
1350 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1352 if (reload_completed
1353 && code == PLUS
1354 && !rtx_equal_p (dst, src1)
1355 && !use_ndd)
1357 /* This is going to be an LEA; avoid splitting it later. */
1358 emit_insn (op);
1360 else
1362 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1363 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1366 /* Fix up the destination if needed. */
1367 if (dst != operands[0])
1368 emit_move_insn (operands[0], dst);
1371 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1372 the given OPERANDS. */
1374 void
1375 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1376 rtx operands[])
1378 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1379 if (SUBREG_P (operands[1]))
1381 op1 = operands[1];
1382 op2 = operands[2];
1384 else if (SUBREG_P (operands[2]))
1386 op1 = operands[2];
1387 op2 = operands[1];
1389 /* Optimize (__m128i) d | (__m128i) e and similar code
1390 when d and e are float vectors into a float vector logical
1391 insn. In C/C++ without using intrinsics there is no other way
1392 to express a vector logical operation on float vectors than
1393 to cast them temporarily to integer vectors. */
1394 if (op1
1395 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1396 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1397 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1398 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1399 && SUBREG_BYTE (op1) == 0
1400 && (GET_CODE (op2) == CONST_VECTOR
1401 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1402 && SUBREG_BYTE (op2) == 0))
1403 && can_create_pseudo_p ())
1405 rtx dst;
1406 switch (GET_MODE (SUBREG_REG (op1)))
1408 case E_V4SFmode:
1409 case E_V8SFmode:
1410 case E_V16SFmode:
1411 case E_V2DFmode:
1412 case E_V4DFmode:
1413 case E_V8DFmode:
1414 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1415 if (GET_CODE (op2) == CONST_VECTOR)
1417 op2 = gen_lowpart (GET_MODE (dst), op2);
1418 op2 = force_reg (GET_MODE (dst), op2);
1420 else
1422 op1 = operands[1];
1423 op2 = SUBREG_REG (operands[2]);
1424 if (!vector_operand (op2, GET_MODE (dst)))
1425 op2 = force_reg (GET_MODE (dst), op2);
1427 op1 = SUBREG_REG (op1);
1428 if (!vector_operand (op1, GET_MODE (dst)))
1429 op1 = force_reg (GET_MODE (dst), op1);
1430 emit_insn (gen_rtx_SET (dst,
1431 gen_rtx_fmt_ee (code, GET_MODE (dst),
1432 op1, op2)));
1433 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1434 return;
1435 default:
1436 break;
1439 if (!vector_operand (operands[1], mode))
1440 operands[1] = force_reg (mode, operands[1]);
1441 if (!vector_operand (operands[2], mode))
1442 operands[2] = force_reg (mode, operands[2]);
1443 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1444 emit_insn (gen_rtx_SET (operands[0],
1445 gen_rtx_fmt_ee (code, mode, operands[1],
1446 operands[2])));
1449 /* Return TRUE or FALSE depending on whether the binary operator meets the
1450 appropriate constraints. */
1452 bool
1453 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1454 rtx operands[3], bool use_ndd)
1456 rtx dst = operands[0];
1457 rtx src1 = operands[1];
1458 rtx src2 = operands[2];
1460 /* Both source operands cannot be in memory. */
1461 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1462 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1463 return false;
1465 /* Canonicalize operand order for commutative operators. */
1466 if (ix86_swap_binary_operands_p (code, mode, operands))
1467 std::swap (src1, src2);
1469 /* If the destination is memory, we must have a matching source operand. */
1470 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1471 return false;
1473 /* Source 1 cannot be a constant. */
1474 if (CONSTANT_P (src1))
1475 return false;
1477 /* Source 1 cannot be a non-matching memory. */
1478 if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
1479 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1480 return (code == AND
1481 && (mode == HImode
1482 || mode == SImode
1483 || (TARGET_64BIT && mode == DImode))
1484 && satisfies_constraint_L (src2));
1486 return true;
1489 /* Attempt to expand a unary operator. Make the expansion closer to the
1490 actual machine than just general_operand, which would allow 2 separate
1491 memory references (one output, one input) in a single insn. */
1493 void
1494 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1495 rtx operands[], bool use_ndd)
1497 bool matching_memory = false;
1498 rtx src, dst, op, clob;
1500 dst = operands[0];
1501 src = operands[1];
1503 /* If the destination is memory, and we do not have matching source
1504 operands, do things in registers. */
1505 if (MEM_P (dst))
1507 if (rtx_equal_p (dst, src))
1508 matching_memory = true;
1509 else
1510 dst = gen_reg_rtx (mode);
1513 /* When source operand is memory, destination must match. */
1514 if (!use_ndd && MEM_P (src) && !matching_memory)
1515 src = force_reg (mode, src);
1517 /* Emit the instruction. */
1519 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1521 if (code == NOT)
1522 emit_insn (op);
1523 else
1525 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1526 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1529 /* Fix up the destination if needed. */
1530 if (dst != operands[0])
1531 emit_move_insn (operands[0], dst);
1534 /* Return TRUE or FALSE depending on whether the unary operator meets the
1535 appropriate constraints. */
1537 bool
1538 ix86_unary_operator_ok (enum rtx_code,
1539 machine_mode,
1540 rtx operands[2],
1541 bool use_ndd)
1543 /* If one of the operands is memory, source and destination must match. */
1544 if ((MEM_P (operands[0])
1545 || (!use_ndd && MEM_P (operands[1])))
1546 && ! rtx_equal_p (operands[0], operands[1]))
1547 return false;
1548 return true;
1551 /* Predict just emitted jump instruction to be taken with probability PROB. */
1553 static void
1554 predict_jump (int prob)
1556 rtx_insn *insn = get_last_insn ();
1557 gcc_assert (JUMP_P (insn));
1558 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1561 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1562 divisor are within the range [0-255]. */
1564 void
1565 ix86_split_idivmod (machine_mode mode, rtx operands[],
1566 bool unsigned_p)
1568 rtx_code_label *end_label, *qimode_label;
1569 rtx div, mod;
1570 rtx_insn *insn;
1571 rtx scratch, tmp0, tmp1, tmp2;
1572 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1574 operands[2] = force_reg (mode, operands[2]);
1575 operands[3] = force_reg (mode, operands[3]);
1577 switch (mode)
1579 case E_SImode:
1580 if (GET_MODE (operands[0]) == SImode)
1582 if (GET_MODE (operands[1]) == SImode)
1583 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1584 else
1585 gen_divmod4_1
1586 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1588 else
1589 gen_divmod4_1
1590 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1591 break;
1593 case E_DImode:
1594 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1595 break;
1597 default:
1598 gcc_unreachable ();
1601 end_label = gen_label_rtx ();
1602 qimode_label = gen_label_rtx ();
1604 scratch = gen_reg_rtx (mode);
1606 /* Use 8bit unsigned divmod if dividend and divisor are within
1607 the range [0-255]. */
1608 emit_move_insn (scratch, operands[2]);
1609 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1610 scratch, 1, OPTAB_DIRECT);
1611 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1612 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1613 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1614 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1615 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1616 pc_rtx);
1617 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1618 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1619 JUMP_LABEL (insn) = qimode_label;
1621 /* Generate the original signed/unsigned divmod. */
1622 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1623 operands[2], operands[3]));
1625 /* Branch to the end. */
1626 emit_jump_insn (gen_jump (end_label));
1627 emit_barrier ();
1629 /* Generate 8bit unsigned divide. */
1630 emit_label (qimode_label);
1631 /* Don't use operands[0] for result of 8bit divide since not all
1632 registers support QImode ZERO_EXTRACT. */
1633 tmp0 = lowpart_subreg (HImode, scratch, mode);
1634 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1635 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1636 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1638 if (unsigned_p)
1640 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1641 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1643 else
1645 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1646 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1648 if (mode == SImode)
1650 if (GET_MODE (operands[0]) != SImode)
1651 div = gen_rtx_ZERO_EXTEND (DImode, div);
1652 if (GET_MODE (operands[1]) != SImode)
1653 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1656 /* Extract remainder from AH. */
1657 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1658 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1659 GEN_INT (8), GEN_INT (8));
1660 insn = emit_move_insn (operands[1], tmp1);
1661 set_unique_reg_note (insn, REG_EQUAL, mod);
1663 /* Zero extend quotient from AL. */
1664 tmp1 = gen_lowpart (QImode, tmp0);
1665 insn = emit_insn (gen_extend_insn
1666 (operands[0], tmp1,
1667 GET_MODE (operands[0]), QImode, 1));
1668 set_unique_reg_note (insn, REG_EQUAL, div);
1670 emit_label (end_label);
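/* Illustrative sketch (not from the original source) of the emitted
   control flow for a 32-bit unsigned division:

       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch     ; any bit above bit 7 set in either?
       je     .Lqimode
       ...ordinary 32-bit divl...
       jmp    .Lend
   .Lqimode:
       divb   divisor              ; AL = quotient, AH = remainder
       ...zero-extend AL into the quotient, extract AH as the remainder...
   .Lend:

   The 8-bit divide is much cheaper than the full-width one; the branch
   is predicted 50/50 and both results carry REG_EQUAL notes describing
   the original DIV/MOD.  */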
1673 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1674 matches destination. RTX includes clobber of FLAGS_REG. */
1676 void
1677 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1678 rtx dst, rtx src)
1680 rtx op, clob;
1682 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1683 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1685 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1688 /* Return true if regno1 def is nearest to the insn. */
1690 static bool
1691 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1693 rtx_insn *prev = insn;
1694 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1696 if (insn == start)
1697 return false;
1698 while (prev && prev != start)
1700 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1702 prev = PREV_INSN (prev);
1703 continue;
1705 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1706 return true;
1707 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1708 return false;
1709 prev = PREV_INSN (prev);
1712 /* None of the regs is defined in the bb. */
1713 return false;
1716 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1717 int ix86_last_zero_store_uid;
1719 /* Split lea instructions into a sequence of instructions
1720 which are executed on ALU to avoid AGU stalls.
1721 It is assumed that it is allowed to clobber flags register
1722 at lea position. */
1724 void
1725 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1727 unsigned int regno0, regno1, regno2;
1728 struct ix86_address parts;
1729 rtx target, tmp;
1730 int ok, adds;
1732 ok = ix86_decompose_address (operands[1], &parts);
1733 gcc_assert (ok);
1735 target = gen_lowpart (mode, operands[0]);
1737 regno0 = true_regnum (target);
1738 regno1 = INVALID_REGNUM;
1739 regno2 = INVALID_REGNUM;
1741 if (parts.base)
1743 parts.base = gen_lowpart (mode, parts.base);
1744 regno1 = true_regnum (parts.base);
1747 if (parts.index)
1749 parts.index = gen_lowpart (mode, parts.index);
1750 regno2 = true_regnum (parts.index);
1753 if (parts.disp)
1754 parts.disp = gen_lowpart (mode, parts.disp);
1756 if (parts.scale > 1)
1758 /* Case r1 = r1 + ... */
1759 if (regno1 == regno0)
1761 /* If we have the case r1 = r1 + C * r2 then we
1762 would have to use multiplication, which is very
1763 expensive. Assume the cost model is wrong if we
1764 have such a case here. */
1765 gcc_assert (regno2 != regno0);
1767 for (adds = parts.scale; adds > 0; adds--)
1768 ix86_emit_binop (PLUS, mode, target, parts.index);
1770 else
1772 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1773 if (regno0 != regno2)
1774 emit_insn (gen_rtx_SET (target, parts.index));
1776 /* Use shift for scaling, but emit it as MULT instead
1777 to avoid it being immediately peephole2 optimized back
1778 into lea. */
1779 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1781 if (parts.base)
1782 ix86_emit_binop (PLUS, mode, target, parts.base);
1784 if (parts.disp && parts.disp != const0_rtx)
1785 ix86_emit_binop (PLUS, mode, target, parts.disp);
1788 else if (!parts.base && !parts.index)
1790 gcc_assert (parts.disp);
1791 emit_insn (gen_rtx_SET (target, parts.disp));
1793 else
1795 if (!parts.base)
1797 if (regno0 != regno2)
1798 emit_insn (gen_rtx_SET (target, parts.index));
1800 else if (!parts.index)
1802 if (regno0 != regno1)
1803 emit_insn (gen_rtx_SET (target, parts.base));
1805 else
1807 if (regno0 == regno1)
1808 tmp = parts.index;
1809 else if (regno0 == regno2)
1810 tmp = parts.base;
1811 else
1813 rtx tmp1;
1815 /* Find better operand for SET instruction, depending
1816 on which definition is farther from the insn. */
1817 if (find_nearest_reg_def (insn, regno1, regno2))
1818 tmp = parts.index, tmp1 = parts.base;
1819 else
1820 tmp = parts.base, tmp1 = parts.index;
1822 emit_insn (gen_rtx_SET (target, tmp));
1824 if (parts.disp && parts.disp != const0_rtx)
1825 ix86_emit_binop (PLUS, mode, target, parts.disp);
1827 ix86_emit_binop (PLUS, mode, target, tmp1);
1828 return;
1831 ix86_emit_binop (PLUS, mode, target, tmp);
1834 if (parts.disp && parts.disp != const0_rtx)
1835 ix86_emit_binop (PLUS, mode, target, parts.disp);
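/* Illustrative example (not part of the original source): an address
   like 8(%rbx,%rcx,2) computed into %rax can be split into ALU
   instructions along the lines of

     mov   %rcx, %rax     ; target = index
     add   %rax, %rax     ; scale (emitted as MULT, later a shift or add)
     add   %rbx, %rax     ; + base
     add   $8, %rax       ; + displacement

   with the exact sequence depending on which of base/index alias the
   destination register.  */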
1839 /* Post-reload splitter for converting an SF or DFmode value in an
1840 SSE register into an unsigned SImode. */
1842 void
1843 ix86_split_convert_uns_si_sse (rtx operands[])
1845 machine_mode vecmode;
1846 rtx value, large, zero_or_two31, input, two31, x;
1848 large = operands[1];
1849 zero_or_two31 = operands[2];
1850 input = operands[3];
1851 two31 = operands[4];
1852 vecmode = GET_MODE (large);
1853 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1855 /* Load up the value into the low element. We must ensure that the other
1856 elements are valid floats -- zero is the easiest such value. */
1857 if (MEM_P (input))
1859 if (vecmode == V4SFmode)
1860 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1861 else
1862 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1864 else
1866 input = gen_rtx_REG (vecmode, REGNO (input));
1867 emit_move_insn (value, CONST0_RTX (vecmode));
1868 if (vecmode == V4SFmode)
1869 emit_insn (gen_sse_movss_v4sf (value, value, input));
1870 else
1871 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1874 emit_move_insn (large, two31);
1875 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1877 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1878 emit_insn (gen_rtx_SET (large, x));
1880 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1881 emit_insn (gen_rtx_SET (zero_or_two31, x));
1883 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1884 emit_insn (gen_rtx_SET (value, x));
1886 large = gen_rtx_REG (V4SImode, REGNO (large));
1887 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1889 x = gen_rtx_REG (V4SImode, REGNO (value));
1890 if (vecmode == V4SFmode)
1891 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1892 else
1893 emit_insn (gen_sse2_cvttpd2dq (x, value));
1894 value = x;
1896 emit_insn (gen_xorv4si3 (value, value, large));
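/* Illustrative summary (not from the original source), in scalar terms:

     if (x < 0x1p31)
       result = (int) x;                            (mask is all-zeros)
     else
       result = (int) (x - 0x1p31) ^ 0x80000000;    (mask is all-ones)

   The maskcmp produces "large", the AND selects 0 or 2^31 to subtract,
   and the final XOR with the mask shifted left by 31 restores the top
   bit of the unsigned result.  */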
1899 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1900 machine_mode mode, rtx target,
1901 rtx var, int one_var);
1903 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1904 Expects the 64-bit DImode to be supplied in a pair of integral
1905 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1906 -mfpmath=sse, !optimize_size only. */
1908 void
1909 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1911 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1912 rtx int_xmm, fp_xmm;
1913 rtx biases, exponents;
1914 rtx x;
1916 int_xmm = gen_reg_rtx (V4SImode);
1917 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1918 emit_insn (gen_movdi_to_sse (int_xmm, input));
1919 else if (TARGET_SSE_SPLIT_REGS)
1921 emit_clobber (int_xmm);
1922 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1924 else
1926 x = gen_reg_rtx (V2DImode);
1927 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1928 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1931 x = gen_rtx_CONST_VECTOR (V4SImode,
1932 gen_rtvec (4, GEN_INT (0x43300000UL),
1933 GEN_INT (0x45300000UL),
1934 const0_rtx, const0_rtx));
1935 exponents = validize_mem (force_const_mem (V4SImode, x));
1937 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1938 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1940 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
1941 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1942 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1943 (0x1.0p84 + double(fp_value_hi_xmm)).
1944 Note these exponents differ by 32. */
1946 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1948 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1949 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1950 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1951 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1952 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1953 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1954 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1955 biases = validize_mem (force_const_mem (V2DFmode, biases));
1956 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1958 /* Add the upper and lower DFmode values together. */
1959 if (TARGET_SSE3)
1960 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1961 else
1963 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1964 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1965 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1968 ix86_expand_vector_extract (false, target, fp_xmm, 0);
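/* Worked example (illustrative, not part of the original source): for
   the input 0x0000000500000007 the interleave builds the doubles
   0x4330000000000007 (= 0x1.0p52 + 7, from the low word) and
   0x4530000000000005 (= 0x1.0p84 + 5 * 2^32, from the high word);
   subtracting the 0x1.0p52/0x1.0p84 biases leaves exactly 7.0 and
   5.0 * 2^32, and their sum 21474836487.0 is the unsigned value.  */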
1971 /* Not used, but eases macroization of patterns. */
1972 void
1973 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1975 gcc_unreachable ();
1978 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1980 /* Convert an unsigned SImode value into a DFmode. Only currently used
1981 for SSE, but applicable anywhere. */
1983 void
1984 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1986 REAL_VALUE_TYPE TWO31r;
1987 rtx x, fp;
1989 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1990 NULL, 1, OPTAB_DIRECT);
1992 fp = gen_reg_rtx (DFmode);
1993 emit_insn (gen_floatsidf2 (fp, x));
1995 real_ldexp (&TWO31r, &dconst1, 31);
1996 x = const_double_from_real_value (TWO31r, DFmode);
1998 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
2000 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
2001 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
2002 x = ix86_expand_sse_fabs (x, NULL);
2004 if (x != target)
2005 emit_move_insn (target, x);
2008 /* Convert a signed DImode value into a DFmode. Only used for SSE in
2009 32-bit mode; otherwise we have a direct convert instruction. */
2011 void
2012 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
2014 REAL_VALUE_TYPE TWO32r;
2015 rtx fp_lo, fp_hi, x;
2017 fp_lo = gen_reg_rtx (DFmode);
2018 fp_hi = gen_reg_rtx (DFmode);
2020 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
2022 real_ldexp (&TWO32r, &dconst1, 32);
2023 x = const_double_from_real_value (TWO32r, DFmode);
2024 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
2026 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
2028 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
2029 0, OPTAB_DIRECT);
2030 if (x != target)
2031 emit_move_insn (target, x);
2034 /* Convert an unsigned SImode value into a SFmode, using only SSE.
2035 For x86_32, -mfpmath=sse, !optimize_size only. */
2036 void
2037 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
2039 REAL_VALUE_TYPE ONE16r;
2040 rtx fp_hi, fp_lo, int_hi, int_lo, x;
2042 real_ldexp (&ONE16r, &dconst1, 16);
2043 x = const_double_from_real_value (ONE16r, SFmode);
2044 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
2045 NULL, 0, OPTAB_DIRECT);
2046 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
2047 NULL, 0, OPTAB_DIRECT);
2048 fp_hi = gen_reg_rtx (SFmode);
2049 fp_lo = gen_reg_rtx (SFmode);
2050 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
2051 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
2052 if (TARGET_FMA)
2054 x = validize_mem (force_const_mem (SFmode, x));
2055 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
2056 emit_move_insn (target, fp_hi);
2058 else
2060 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
2061 0, OPTAB_DIRECT);
2062 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
2063 0, OPTAB_DIRECT);
2064 if (!rtx_equal_p (target, fp_hi))
2065 emit_move_insn (target, fp_hi);
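/* Added note (not in the original source): SFmode has only a 24-bit
   significand, so a 32-bit integer cannot in general be converted exactly.
   Splitting the input into 16-bit halves makes each conversion exact, and
   the result is recombined as hi * 65536 + lo (fused into a single FMA
   when available).  E.g. 0xdeadbeef = 3735928559: hi = 0xdead = 57005,
   lo = 0xbeef = 48879, and 57005.0f * 65536.0f + 48879.0f rounds once to
   the SFmode value nearest 3735928559.  */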
2069 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2070 a vector of unsigned ints VAL to vector of floats TARGET. */
2072 void
2073 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2075 rtx tmp[8];
2076 REAL_VALUE_TYPE TWO16r;
2077 machine_mode intmode = GET_MODE (val);
2078 machine_mode fltmode = GET_MODE (target);
2079 rtx (*cvt) (rtx, rtx);
2081 if (intmode == V4SImode)
2082 cvt = gen_floatv4siv4sf2;
2083 else
2084 cvt = gen_floatv8siv8sf2;
2085 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2086 tmp[0] = force_reg (intmode, tmp[0]);
2087 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2088 OPTAB_DIRECT);
2089 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2090 NULL_RTX, 1, OPTAB_DIRECT);
2091 tmp[3] = gen_reg_rtx (fltmode);
2092 emit_insn (cvt (tmp[3], tmp[1]));
2093 tmp[4] = gen_reg_rtx (fltmode);
2094 emit_insn (cvt (tmp[4], tmp[2]));
2095 real_ldexp (&TWO16r, &dconst1, 16);
2096 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2097 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2098 if (TARGET_FMA)
2100 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2101 emit_move_insn (target, tmp[6]);
2103 else
2105 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2106 NULL_RTX, 1, OPTAB_DIRECT);
2107 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2108 target, 1, OPTAB_DIRECT);
2109 if (tmp[7] != target)
2110 emit_move_insn (target, tmp[7]);
2114 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2115 pattern can be used on it instead of fixuns_trunc*.
2116 This is done by doing just signed conversion if < 0x1p31, and otherwise by
2117 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2119 rtx
2120 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)

2122 REAL_VALUE_TYPE TWO31r;
2123 rtx two31r, tmp[4];
2124 machine_mode mode = GET_MODE (val);
2125 machine_mode scalarmode = GET_MODE_INNER (mode);
2126 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2127 rtx (*cmp) (rtx, rtx, rtx, rtx);
2128 int i;
2130 for (i = 0; i < 3; i++)
2131 tmp[i] = gen_reg_rtx (mode);
2132 real_ldexp (&TWO31r, &dconst1, 31);
2133 two31r = const_double_from_real_value (TWO31r, scalarmode);
2134 two31r = ix86_build_const_vector (mode, 1, two31r);
2135 two31r = force_reg (mode, two31r);
2136 switch (mode)
2138 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2139 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2140 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2141 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2142 default: gcc_unreachable ();
2144 tmp[3] = gen_rtx_LE (mode, two31r, val);
2145 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2146 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2147 0, OPTAB_DIRECT);
2148 if (intmode == V4SImode || TARGET_AVX2)
2149 *xorp = expand_simple_binop (intmode, ASHIFT,
2150 gen_lowpart (intmode, tmp[0]),
2151 GEN_INT (31), NULL_RTX, 0,
2152 OPTAB_DIRECT);
2153 else
2155 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2156 two31 = ix86_build_const_vector (intmode, 1, two31);
2157 *xorp = expand_simple_binop (intmode, AND,
2158 gen_lowpart (intmode, tmp[0]),
2159 two31, NULL_RTX, 0,
2160 OPTAB_DIRECT);
2162 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2163 0, OPTAB_DIRECT);
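/* Added note (not in the original source): the helper above arranges for a
   signed fix_trunc to implement an unsigned one.  Lanes below 0x1.0p31 are
   left alone; lanes >= 0x1.0p31 have 0x1.0p31 subtracted before the
   conversion, and *XORP receives 0x80000000 in exactly those lanes so the
   caller can xor the bit back into the integer result.  E.g. for 3e9:
   3000000000.0 - 2147483648.0 = 852516352.0, the signed truncation gives
   852516352, and xoring in 0x80000000 restores 3000000000.  */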
2166 /* Generate code for floating point ABS or NEG. */
2168 void
2169 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2170 rtx operands[])
2172 rtx set, dst, src;
2173 bool use_sse = false;
2174 bool vector_mode = VECTOR_MODE_P (mode);
2175 machine_mode vmode = mode;
2176 rtvec par;
2178 if (vector_mode || mode == TFmode || mode == HFmode)
2180 use_sse = true;
2181 if (mode == HFmode)
2182 vmode = V8HFmode;
2184 else if (TARGET_SSE_MATH)
2186 use_sse = SSE_FLOAT_MODE_P (mode);
2187 if (mode == SFmode)
2188 vmode = V4SFmode;
2189 else if (mode == DFmode)
2190 vmode = V2DFmode;
2193 dst = operands[0];
2194 src = operands[1];
2196 set = gen_rtx_fmt_e (code, mode, src);
2197 set = gen_rtx_SET (dst, set);
2199 if (use_sse)
2201 rtx mask, use, clob;
2203 /* NEG and ABS performed with SSE use bitwise mask operations.
2204 Create the appropriate mask now. */
2205 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2206 use = gen_rtx_USE (VOIDmode, mask);
2207 if (vector_mode || mode == TFmode)
2208 par = gen_rtvec (2, set, use);
2209 else
2211 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2212 par = gen_rtvec (3, set, use, clob);
2215 else
2217 rtx clob;
2219 /* Changing of sign for FP values is doable using integer unit too. */
2220 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2221 par = gen_rtvec (2, set, clob);
2224 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
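/* Added note (not in the original source): with SSE, ABS and NEG reduce to
   bitwise operations on the sign bit: ABS is an AND with the mask that
   clears the sign bit (e.g. 0x7fffffff per SFmode element) and NEG is an
   XOR with the mask that flips it (0x80000000), as the integer-register
   split below makes explicit.  */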
2227 /* Deconstruct a floating point ABS or NEG operation
2228 with integer registers into integer operations. */
2230 void
2231 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2232 rtx operands[])
2234 enum rtx_code absneg_op;
2235 rtx dst, set;
2237 gcc_assert (operands_match_p (operands[0], operands[1]));
2239 switch (mode)
2241 case E_SFmode:
2242 dst = gen_lowpart (SImode, operands[0]);
2244 if (code == ABS)
2246 set = gen_int_mode (0x7fffffff, SImode);
2247 absneg_op = AND;
2249 else
2251 set = gen_int_mode (0x80000000, SImode);
2252 absneg_op = XOR;
2254 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2255 break;
2257 case E_DFmode:
2258 if (TARGET_64BIT)
2260 dst = gen_lowpart (DImode, operands[0]);
2261 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2263 if (code == ABS)
2264 set = const0_rtx;
2265 else
2266 set = gen_rtx_NOT (DImode, dst);
2268 else
2270 dst = gen_highpart (SImode, operands[0]);
2272 if (code == ABS)
2274 set = gen_int_mode (0x7fffffff, SImode);
2275 absneg_op = AND;
2277 else
2279 set = gen_int_mode (0x80000000, SImode);
2280 absneg_op = XOR;
2282 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2284 break;
2286 case E_XFmode:
2287 dst = gen_rtx_REG (SImode,
2288 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2289 if (code == ABS)
2291 set = GEN_INT (0x7fff);
2292 absneg_op = AND;
2294 else
2296 set = GEN_INT (0x8000);
2297 absneg_op = XOR;
2299 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2300 break;
2302 default:
2303 gcc_unreachable ();
2306 set = gen_rtx_SET (dst, set);
2308 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2309 rtvec par = gen_rtvec (2, set, clob);
2311 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2314 /* Expand a copysign operation. Special case operand 0 being a constant. */
2316 void
2317 ix86_expand_copysign (rtx operands[])
2319 machine_mode mode, vmode;
2320 rtx dest, vdest, op0, op1, mask, op2, op3;
2322 mode = GET_MODE (operands[0]);
2324 if (mode == HFmode)
2325 vmode = V8HFmode;
2326 else if (mode == SFmode)
2327 vmode = V4SFmode;
2328 else if (mode == DFmode)
2329 vmode = V2DFmode;
2330 else if (mode == TFmode)
2331 vmode = mode;
2332 else
2333 gcc_unreachable ();
2335 if (rtx_equal_p (operands[1], operands[2]))
2337 emit_move_insn (operands[0], operands[1]);
2338 return;
2341 dest = operands[0];
2342 vdest = lowpart_subreg (vmode, dest, mode);
2343 if (vdest == NULL_RTX)
2344 vdest = gen_reg_rtx (vmode);
2345 else
2346 dest = NULL_RTX;
2347 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2348 mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2350 if (CONST_DOUBLE_P (operands[1]))
2352 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2353 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2354 if (op0 == CONST0_RTX (mode))
2356 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2357 if (dest)
2358 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2359 return;
2362 if (GET_MODE_SIZE (mode) < 16)
2363 op0 = ix86_build_const_vector (vmode, false, op0);
2364 op0 = force_reg (vmode, op0);
2366 else
2367 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2369 op2 = gen_reg_rtx (vmode);
2370 op3 = gen_reg_rtx (vmode);
2371 emit_move_insn (op2, gen_rtx_AND (vmode,
2372 gen_rtx_NOT (vmode, mask),
2373 op0));
2374 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2375 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2376 if (dest)
2377 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
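/* Added note (not in the original source): the expansion above computes
   copysign as (operands[1] & ~signmask) | (operands[2] & signmask), i.e.
   the magnitude bits of the first operand combined with the sign bit of
   the second.  For SFmode, copysign (-3.0f, 2.0f) is
   (0xc0400000 & 0x7fffffff) | (0x40000000 & 0x80000000)
   = 0x40400000 = 3.0f.  */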
2380 /* Expand an xorsign operation. */
2382 void
2383 ix86_expand_xorsign (rtx operands[])
2385 machine_mode mode, vmode;
2386 rtx dest, vdest, op0, op1, mask, x, temp;
2388 dest = operands[0];
2389 op0 = operands[1];
2390 op1 = operands[2];
2392 mode = GET_MODE (dest);
2394 if (mode == HFmode)
2395 vmode = V8HFmode;
2396 else if (mode == SFmode)
2397 vmode = V4SFmode;
2398 else if (mode == DFmode)
2399 vmode = V2DFmode;
2400 else
2401 gcc_unreachable ();
2403 temp = gen_reg_rtx (vmode);
2404 mask = ix86_build_signbit_mask (vmode, 0, 0);
2406 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2407 x = gen_rtx_AND (vmode, op1, mask);
2408 emit_insn (gen_rtx_SET (temp, x));
2410 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2411 x = gen_rtx_XOR (vmode, temp, op0);
2413 vdest = lowpart_subreg (vmode, dest, mode);
2414 if (vdest == NULL_RTX)
2415 vdest = gen_reg_rtx (vmode);
2416 else
2417 dest = NULL_RTX;
2418 emit_insn (gen_rtx_SET (vdest, x));
2420 if (dest)
2421 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
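/* Added note (not in the original source): xorsign (a, b) = a ^ (b & signmask)
   flips the sign bit of A whenever the sign bit of B is set; unlike
   copysign it keeps A's own sign bit in play, so the result is A with its
   sign conditionally negated rather than |A| carrying B's sign.  */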
2424 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2426 void
2427 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2429 machine_mode mode = GET_MODE (op0);
2430 rtx tmp;
2432 /* Handle special case - vector comparison with boolean result; transform
2433 it using the ptest instruction or vpcmpeq + kortest. */
2434 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2435 || (mode == TImode && !TARGET_64BIT)
2436 || mode == OImode
2437 || GET_MODE_SIZE (mode) == 64)
2439 unsigned msize = GET_MODE_SIZE (mode);
2440 machine_mode p_mode
2441 = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
2442 /* kortest sets CF when the result is 0xFFFF (op0 == op1). */
2443 rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
2445 gcc_assert (code == EQ || code == NE);
2447 /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
2448 if (msize == 64)
2450 if (mode != V16SImode)
2452 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2453 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2456 tmp = gen_reg_rtx (HImode);
2457 emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
2458 emit_insn (gen_kortesthi_ccc (tmp, tmp));
2460 /* Using ptest for 128/256-bit vectors. */
2461 else
2463 if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
2465 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2466 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2467 mode = p_mode;
2470 /* Generate XOR since we can't check that one operand is zero
2471 vector. */
2472 tmp = gen_reg_rtx (mode);
2473 rtx ops[3] = { tmp, op0, op1 };
2474 ix86_expand_vector_logical_operator (XOR, mode, ops);
2475 tmp = gen_lowpart (p_mode, tmp);
2476 emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2477 gen_rtx_UNSPEC (CCZmode,
2478 gen_rtvec (2, tmp, tmp),
2479 UNSPEC_PTEST)));
2481 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2482 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2483 gen_rtx_LABEL_REF (VOIDmode, label),
2484 pc_rtx);
2485 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2486 return;
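/* Added note (not in the original source): for the vector == / != branch
   above, equality is tested as "op0 ^ op1 is all-zero".  With 128/256-bit
   operands the XOR result is fed to PTEST, whose ZF is set iff the value
   is zero; with 512-bit operands VPCMPEQD produces a 16-bit mask and
   KORTEST sets CF iff that mask is all ones, i.e. every element compared
   equal.  */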
2489 switch (mode)
2491 case E_HFmode:
2492 case E_SFmode:
2493 case E_DFmode:
2494 case E_XFmode:
2495 case E_QImode:
2496 case E_HImode:
2497 case E_SImode:
2498 simple:
2499 tmp = ix86_expand_compare (code, op0, op1);
2500 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2501 gen_rtx_LABEL_REF (VOIDmode, label),
2502 pc_rtx);
2503 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2504 return;
2506 case E_DImode:
2507 if (TARGET_64BIT)
2508 goto simple;
2509 /* FALLTHRU */
2510 case E_TImode:
2511 /* DI and TI mode equality/inequality comparisons may be performed
2512 on SSE registers. Avoid splitting them, except when optimizing
2513 for size. */
2514 if ((code == EQ || code == NE)
2515 && !optimize_insn_for_size_p ())
2516 goto simple;
2518 /* Expand DImode branch into multiple compare+branch. */
2520 rtx lo[2], hi[2];
2521 rtx_code_label *label2;
2522 enum rtx_code code1, code2, code3;
2523 machine_mode submode;
2525 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2527 std::swap (op0, op1);
2528 code = swap_condition (code);
2531 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2532 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2534 submode = mode == DImode ? SImode : DImode;
2536 /* If we are doing less-than or greater-or-equal-than,
2537 op1 is a constant and the low word is zero, then we can just
2538 examine the high word. Similarly for low word -1 and
2539 less-or-equal-than or greater-than. */
2541 if (CONST_INT_P (hi[1]))
2542 switch (code)
2544 case LT: case LTU: case GE: case GEU:
2545 if (lo[1] == const0_rtx)
2547 ix86_expand_branch (code, hi[0], hi[1], label);
2548 return;
2550 break;
2551 case LE: case LEU: case GT: case GTU:
2552 if (lo[1] == constm1_rtx)
2554 ix86_expand_branch (code, hi[0], hi[1], label);
2555 return;
2557 break;
2558 default:
2559 break;
2562 /* Emulate comparisons that do not depend on Zero flag with
2563 double-word subtraction. Note that only Overflow, Sign
2564 and Carry flags are valid, so swap arguments and condition
2565 of comparisons that would otherwise test Zero flag. */
2567 switch (code)
2569 case LE: case LEU: case GT: case GTU:
2570 std::swap (lo[0], lo[1]);
2571 std::swap (hi[0], hi[1]);
2572 code = swap_condition (code);
2573 /* FALLTHRU */
2575 case LT: case LTU: case GE: case GEU:
2577 bool uns = (code == LTU || code == GEU);
2578 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2579 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2581 if (!nonimmediate_operand (lo[0], submode))
2582 lo[0] = force_reg (submode, lo[0]);
2583 if (!x86_64_general_operand (lo[1], submode))
2584 lo[1] = force_reg (submode, lo[1]);
2586 if (!register_operand (hi[0], submode))
2587 hi[0] = force_reg (submode, hi[0]);
2588 if ((uns && !nonimmediate_operand (hi[1], submode))
2589 || (!uns && !x86_64_general_operand (hi[1], submode)))
2590 hi[1] = force_reg (submode, hi[1]);
2592 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2594 tmp = gen_rtx_SCRATCH (submode);
2595 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2597 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2598 ix86_expand_branch (code, tmp, const0_rtx, label);
2599 return;
2602 default:
2603 break;
2606 /* Otherwise, we need two or three jumps. */
2608 label2 = gen_label_rtx ();
2610 code1 = code;
2611 code2 = swap_condition (code);
2612 code3 = unsigned_condition (code);
2614 switch (code)
2616 case LT: case GT: case LTU: case GTU:
2617 break;
2619 case LE: code1 = LT; code2 = GT; break;
2620 case GE: code1 = GT; code2 = LT; break;
2621 case LEU: code1 = LTU; code2 = GTU; break;
2622 case GEU: code1 = GTU; code2 = LTU; break;
2624 case EQ: code1 = UNKNOWN; code2 = NE; break;
2625 case NE: code2 = UNKNOWN; break;
2627 default:
2628 gcc_unreachable ();
2632 * a < b =>
2633 * if (hi(a) < hi(b)) goto true;
2634 * if (hi(a) > hi(b)) goto false;
2635 * if (lo(a) < lo(b)) goto true;
2636 * false:
2639 if (code1 != UNKNOWN)
2640 ix86_expand_branch (code1, hi[0], hi[1], label);
2641 if (code2 != UNKNOWN)
2642 ix86_expand_branch (code2, hi[0], hi[1], label2);
2644 ix86_expand_branch (code3, lo[0], lo[1], label);
2646 if (code2 != UNKNOWN)
2647 emit_label (label2);
2648 return;
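/* Added note (not in the original source): the double-word path above works
   because "cmp lo0, lo1" leaves CF = (lo0 <u lo1), and the following sbb
   computes hi0 - hi1 - CF, which is exactly the borrow chain of the full
   two-word subtraction.  The resulting carry (unsigned) or sign/overflow
   (signed) flags therefore describe the whole DImode/TImode comparison;
   only ZF is meaningless, which is why LE/GT-style codes are first swapped
   into LT/GE form.  */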
2651 default:
2652 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2653 goto simple;
2657 /* Figure out whether to use unordered fp comparisons. */
2659 static bool
2660 ix86_unordered_fp_compare (enum rtx_code code)
2662 if (!TARGET_IEEE_FP)
2663 return false;
2665 switch (code)
2667 case LT:
2668 case LE:
2669 case GT:
2670 case GE:
2671 case LTGT:
2672 return false;
2674 case EQ:
2675 case NE:
2677 case UNORDERED:
2678 case ORDERED:
2679 case UNLT:
2680 case UNLE:
2681 case UNGT:
2682 case UNGE:
2683 case UNEQ:
2684 return true;
2686 default:
2687 gcc_unreachable ();
2691 /* Return a comparison we can do that is equivalent to
2692 swap_condition (code), apart possibly from orderedness.
2693 But never change orderedness if TARGET_IEEE_FP; return
2694 UNKNOWN in that case if necessary. */
2696 static enum rtx_code
2697 ix86_fp_swap_condition (enum rtx_code code)
2699 switch (code)
2701 case GT: /* GTU - CF=0 & ZF=0 */
2702 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2703 case GE: /* GEU - CF=0 */
2704 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2705 case UNLT: /* LTU - CF=1 */
2706 return TARGET_IEEE_FP ? UNKNOWN : GT;
2707 case UNLE: /* LEU - CF=1 | ZF=1 */
2708 return TARGET_IEEE_FP ? UNKNOWN : GE;
2709 default:
2710 return swap_condition (code);
2714 /* Return cost of comparison CODE using the best strategy for performance.
2715 All of the following functions use the number of instructions as the cost metric.
2716 In the future this should be tweaked to compute bytes for optimize_size and
2717 to take into account the performance of various instructions on various CPUs. */
2719 static int
2720 ix86_fp_comparison_cost (enum rtx_code code)
2722 int arith_cost;
2724 /* The cost of code using bit-twiddling on %ah. */
2725 switch (code)
2727 case UNLE:
2728 case UNLT:
2729 case LTGT:
2730 case GT:
2731 case GE:
2732 case UNORDERED:
2733 case ORDERED:
2734 case UNEQ:
2735 arith_cost = 4;
2736 break;
2737 case LT:
2738 case NE:
2739 case EQ:
2740 case UNGE:
2741 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2742 break;
2743 case LE:
2744 case UNGT:
2745 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2746 break;
2747 default:
2748 gcc_unreachable ();
2751 switch (ix86_fp_comparison_strategy (code))
2753 case IX86_FPCMP_COMI:
2754 return arith_cost > 4 ? 3 : 2;
2755 case IX86_FPCMP_SAHF:
2756 return arith_cost > 4 ? 4 : 3;
2757 default:
2758 return arith_cost;
2762 /* Swap, force into registers, or otherwise massage the two operands
2763 to a fp comparison. The operands are updated in place; the new
2764 comparison code is returned. */
2766 static enum rtx_code
2767 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2769 bool unordered_compare = ix86_unordered_fp_compare (code);
2770 rtx op0 = *pop0, op1 = *pop1;
2771 machine_mode op_mode = GET_MODE (op0);
2772 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2774 if (op_mode == BFmode)
2776 rtx op = gen_lowpart (HImode, op0);
2777 if (CONST_INT_P (op))
2778 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2779 op0, BFmode);
2780 else
2782 rtx t1 = gen_reg_rtx (SImode);
2783 emit_insn (gen_zero_extendhisi2 (t1, op));
2784 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2785 op = gen_lowpart (SFmode, t1);
2787 *pop0 = op;
2788 op = gen_lowpart (HImode, op1);
2789 if (CONST_INT_P (op))
2790 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2791 op1, BFmode);
2792 else
2794 rtx t1 = gen_reg_rtx (SImode);
2795 emit_insn (gen_zero_extendhisi2 (t1, op));
2796 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2797 op = gen_lowpart (SFmode, t1);
2799 *pop1 = op;
2800 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2803 /* All of the unordered compare instructions only work on registers.
2804 The same is true of the fcomi compare instructions. The XFmode
2805 compare instructions require registers except when comparing
2806 against zero or when converting operand 1 from fixed point to
2807 floating point. */
2809 if (!is_sse
2810 && (unordered_compare
2811 || (op_mode == XFmode
2812 && ! (standard_80387_constant_p (op0) == 1
2813 || standard_80387_constant_p (op1) == 1)
2814 && GET_CODE (op1) != FLOAT)
2815 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2817 op0 = force_reg (op_mode, op0);
2818 op1 = force_reg (op_mode, op1);
2820 else
2822 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2823 things around if they appear profitable, otherwise force op0
2824 into a register. */
2826 if (standard_80387_constant_p (op0) == 0
2827 || (MEM_P (op0)
2828 && ! (standard_80387_constant_p (op1) == 0
2829 || MEM_P (op1))))
2831 enum rtx_code new_code = ix86_fp_swap_condition (code);
2832 if (new_code != UNKNOWN)
2834 std::swap (op0, op1);
2835 code = new_code;
2839 if (!REG_P (op0))
2840 op0 = force_reg (op_mode, op0);
2842 if (CONSTANT_P (op1))
2844 int tmp = standard_80387_constant_p (op1);
2845 if (tmp == 0)
2846 op1 = validize_mem (force_const_mem (op_mode, op1));
2847 else if (tmp == 1)
2849 if (TARGET_CMOVE)
2850 op1 = force_reg (op_mode, op1);
2852 else
2853 op1 = force_reg (op_mode, op1);
2857 /* Try to rearrange the comparison to make it cheaper. */
2858 if (ix86_fp_comparison_cost (code)
2859 > ix86_fp_comparison_cost (swap_condition (code))
2860 && (REG_P (op1) || can_create_pseudo_p ()))
2862 std::swap (op0, op1);
2863 code = swap_condition (code);
2864 if (!REG_P (op0))
2865 op0 = force_reg (op_mode, op0);
2868 *pop0 = op0;
2869 *pop1 = op1;
2870 return code;
2873 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2875 static rtx
2876 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2878 bool unordered_compare = ix86_unordered_fp_compare (code);
2879 machine_mode cmp_mode;
2880 rtx tmp, scratch;
2882 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2884 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2885 if (unordered_compare)
2886 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2888 /* Do fcomi/sahf based test when profitable. */
2889 switch (ix86_fp_comparison_strategy (code))
2891 case IX86_FPCMP_COMI:
2892 cmp_mode = CCFPmode;
2893 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2894 break;
2896 case IX86_FPCMP_SAHF:
2897 cmp_mode = CCFPmode;
2898 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2899 scratch = gen_reg_rtx (HImode);
2900 emit_insn (gen_rtx_SET (scratch, tmp));
2901 emit_insn (gen_x86_sahf_1 (scratch));
2902 break;
2904 case IX86_FPCMP_ARITH:
2905 cmp_mode = CCNOmode;
2906 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2907 scratch = gen_reg_rtx (HImode);
2908 emit_insn (gen_rtx_SET (scratch, tmp));
2910 /* In the unordered case, we have to check C2 for NaN's, which
2911 doesn't happen to work out to anything nice combination-wise.
2912 So do some bit twiddling on the value we've got in AH to come
2913 up with an appropriate set of condition codes. */
2915 switch (code)
2917 case GT:
2918 case UNGT:
2919 if (code == GT || !TARGET_IEEE_FP)
2921 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2922 code = EQ;
2924 else
2926 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2927 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2928 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2929 cmp_mode = CCmode;
2930 code = GEU;
2932 break;
2933 case LT:
2934 case UNLT:
2935 if (code == LT && TARGET_IEEE_FP)
2937 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2938 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2939 cmp_mode = CCmode;
2940 code = EQ;
2942 else
2944 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2945 code = NE;
2947 break;
2948 case GE:
2949 case UNGE:
2950 if (code == GE || !TARGET_IEEE_FP)
2952 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2953 code = EQ;
2955 else
2957 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2958 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2959 code = NE;
2961 break;
2962 case LE:
2963 case UNLE:
2964 if (code == LE && TARGET_IEEE_FP)
2966 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2967 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2968 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2969 cmp_mode = CCmode;
2970 code = LTU;
2972 else
2974 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2975 code = NE;
2977 break;
2978 case EQ:
2979 case UNEQ:
2980 if (code == EQ && TARGET_IEEE_FP)
2982 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2983 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2984 cmp_mode = CCmode;
2985 code = EQ;
2987 else
2989 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2990 code = NE;
2992 break;
2993 case NE:
2994 case LTGT:
2995 if (code == NE && TARGET_IEEE_FP)
2997 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2998 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2999 GEN_INT (0x40)));
3000 code = NE;
3002 else
3004 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
3005 code = EQ;
3007 break;
3009 case UNORDERED:
3010 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
3011 code = NE;
3012 break;
3013 case ORDERED:
3014 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
3015 code = EQ;
3016 break;
3018 default:
3019 gcc_unreachable ();
3021 break;
3023 default:
3024 gcc_unreachable ();
3027 /* Return the test that should be put into the flags user, i.e.
3028 the bcc, scc, or cmov instruction. */
3029 return gen_rtx_fmt_ee (code, VOIDmode,
3030 gen_rtx_REG (cmp_mode, FLAGS_REG),
3031 const0_rtx);
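/* Added note (not in the original source): the IX86_FPCMP_ARITH case above
   inspects the x87 status word copied into AH by fnstsw.  In AH, C0 is bit
   0x01, C2 is 0x04 and C3 is 0x40 (so 0x45 = C0|C2|C3), and an fcom-style
   compare yields 000 for >, 001 for <, 100 for == and 111 for unordered.
   E.g. the plain GT test "test $0x45, %ah; je" accepts only the 000
   pattern, while the TARGET_IEEE_FP variants mask with 0x45 first so the
   unordered pattern 0x45 can be distinguished from the ordered outcomes.  */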
3034 /* Generate insn patterns to do an integer compare of OPERANDS. */
3036 static rtx
3037 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
3039 machine_mode cmpmode;
3040 rtx tmp, flags;
3042 /* Swap operands to emit carry flag comparison. */
3043 if ((code == GTU || code == LEU)
3044 && nonimmediate_operand (op1, VOIDmode))
3046 std::swap (op0, op1);
3047 code = swap_condition (code);
3050 cmpmode = SELECT_CC_MODE (code, op0, op1);
3051 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3053 /* Attempt to use PTEST, if available, when testing vector modes for
3054 equality/inequality against zero. */
3055 if (op1 == const0_rtx
3056 && SUBREG_P (op0)
3057 && cmpmode == CCZmode
3058 && SUBREG_BYTE (op0) == 0
3059 && REG_P (SUBREG_REG (op0))
3060 && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3061 && TARGET_SSE4_1
3062 && GET_MODE (op0) == TImode
3063 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3065 tmp = SUBREG_REG (op0);
3066 tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3068 else
3069 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3071 /* This is very simple, but making the interface the same as in the
3072 FP case makes the rest of the code easier. */
3073 emit_insn (gen_rtx_SET (flags, tmp));
3075 /* Return the test that should be put into the flags user, i.e.
3076 the bcc, scc, or cmov instruction. */
3077 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3080 static rtx
3081 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3083 rtx ret;
3085 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3086 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3088 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3090 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3091 ret = ix86_expand_fp_compare (code, op0, op1);
3093 else
3094 ret = ix86_expand_int_compare (code, op0, op1);
3096 return ret;
3099 void
3100 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3102 rtx ret;
3104 gcc_assert (GET_MODE (dest) == QImode);
3106 ret = ix86_expand_compare (code, op0, op1);
3107 PUT_MODE (ret, QImode);
3108 emit_insn (gen_rtx_SET (dest, ret));
3111 /* Expand floating point op0 <=> op1, i.e.
3112 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
3114 void
3115 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
3117 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
3118 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3119 rtx l0 = gen_label_rtx ();
3120 rtx l1 = gen_label_rtx ();
3121 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3122 rtx lend = gen_label_rtx ();
3123 rtx tmp;
3124 rtx_insn *jmp;
3125 if (l2)
3127 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3128 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3129 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3130 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3131 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3132 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());

3134 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3135 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3136 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3137 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3138 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3139 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3140 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3141 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3142 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3143 add_reg_br_prob_note (jmp, profile_probability::even ());
3144 emit_move_insn (dest, constm1_rtx);
3145 emit_jump (lend);
3146 emit_label (l0);
3147 emit_move_insn (dest, const0_rtx);
3148 emit_jump (lend);
3149 emit_label (l1);
3150 emit_move_insn (dest, const1_rtx);
3151 emit_jump (lend);
3152 if (l2)
3154 emit_label (l2);
3155 emit_move_insn (dest, const2_rtx);
3157 emit_label (lend);
3160 /* Expand comparison setting or clearing carry flag. Return true when
3161 successful and set pop for the operation. */
3162 static bool
3163 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3165 machine_mode mode
3166 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3168 /* Do not handle double-mode compares that go through special path. */
3169 if (mode == (TARGET_64BIT ? TImode : DImode))
3170 return false;
3172 if (SCALAR_FLOAT_MODE_P (mode))
3174 rtx compare_op;
3175 rtx_insn *compare_seq;
3177 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3179 /* Shortcut: following common codes never translate
3180 into carry flag compares. */
3181 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3182 || code == ORDERED || code == UNORDERED)
3183 return false;
3185 /* These comparisons require zero flag; swap operands so they won't. */
3186 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3187 && !TARGET_IEEE_FP)
3189 std::swap (op0, op1);
3190 code = swap_condition (code);
3193 /* Try to expand the comparison and verify that we end up with
3194 a carry flag based comparison. This fails to be true only when
3195 we decide to expand the comparison using arithmetic, which is not
3196 a common scenario. */
3197 start_sequence ();
3198 compare_op = ix86_expand_fp_compare (code, op0, op1);
3199 compare_seq = get_insns ();
3200 end_sequence ();
3202 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3203 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3204 else
3205 code = GET_CODE (compare_op);
3207 if (code != LTU && code != GEU)
3208 return false;
3210 emit_insn (compare_seq);
3211 *pop = compare_op;
3212 return true;
3215 if (!INTEGRAL_MODE_P (mode))
3216 return false;
3218 switch (code)
3220 case LTU:
3221 case GEU:
3222 break;
3224 /* Convert a==0 into (unsigned)a<1. */
3225 case EQ:
3226 case NE:
3227 if (op1 != const0_rtx)
3228 return false;
3229 op1 = const1_rtx;
3230 code = (code == EQ ? LTU : GEU);
3231 break;
3233 /* Convert a>b into b<a or a>=b+1. */
3234 case GTU:
3235 case LEU:
3236 if (CONST_INT_P (op1))
3238 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3239 /* Bail out on overflow. We still can swap operands but that
3240 would force loading of the constant into register. */
3241 if (op1 == const0_rtx
3242 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3243 return false;
3244 code = (code == GTU ? GEU : LTU);
3246 else
3248 std::swap (op0, op1);
3249 code = (code == GTU ? LTU : GEU);
3251 break;
3253 /* Convert a>=0 into (unsigned)a<0x80000000. */
3254 case LT:
3255 case GE:
3256 if (mode == DImode || op1 != const0_rtx)
3257 return false;
3258 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3259 code = (code == LT ? GEU : LTU);
3260 break;
3261 case LE:
3262 case GT:
3263 if (mode == DImode || op1 != constm1_rtx)
3264 return false;
3265 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3266 code = (code == LE ? GEU : LTU);
3267 break;
3269 default:
3270 return false;
3272 /* Swapping operands may cause constant to appear as first operand. */
3273 if (!nonimmediate_operand (op0, VOIDmode))
3275 if (!can_create_pseudo_p ())
3276 return false;
3277 op0 = force_reg (mode, op0);
3279 *pop = ix86_expand_compare (code, op0, op1);
3280 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3281 return true;
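/* Added note (not in the original source): the rewrites above turn a scalar
   comparison into one whose result lives entirely in the carry flag, e.g.
   "a == 0" becomes "(unsigned) a < 1", for a constant b "a >u b" becomes
   "a >=u b + 1", and "a >= 0" becomes "(unsigned) a < 0x80000000" for
   SImode.  The callers below then consume CF directly with adc/sbb or a
   flag-based move.  */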
3284 /* Expand conditional increment or decrement using adc/sbb instructions.
3285 The default case using setcc followed by the conditional move can be
3286 done by generic code. */
3287 bool
3288 ix86_expand_int_addcc (rtx operands[])
3290 enum rtx_code code = GET_CODE (operands[1]);
3291 rtx flags;
3292 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3293 rtx compare_op;
3294 rtx val = const0_rtx;
3295 bool fpcmp = false;
3296 machine_mode mode;
3297 rtx op0 = XEXP (operands[1], 0);
3298 rtx op1 = XEXP (operands[1], 1);
3300 if (operands[3] != const1_rtx
3301 && operands[3] != constm1_rtx)
3302 return false;
3303 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3304 return false;
3305 code = GET_CODE (compare_op);
3307 flags = XEXP (compare_op, 0);
3309 if (GET_MODE (flags) == CCFPmode)
3311 fpcmp = true;
3312 code = ix86_fp_compare_code_to_integer (code);
3315 if (code != LTU)
3317 val = constm1_rtx;
3318 if (fpcmp)
3319 PUT_CODE (compare_op,
3320 reverse_condition_maybe_unordered
3321 (GET_CODE (compare_op)));
3322 else
3323 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3326 mode = GET_MODE (operands[0]);
3328 /* Construct either adc or sbb insn. */
3329 if ((code == LTU) == (operands[3] == constm1_rtx))
3330 insn = gen_sub3_carry;
3331 else
3332 insn = gen_add3_carry;
3334 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3336 return true;
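/* Added note (not in the original source): a minimal example of the adc/sbb
   expansion above, assuming 32-bit operands with A in %eax, B in %ebx and
   the running value already in %ecx: "x = y + (a <u b)" becomes roughly

       cmpl  %ebx, %eax      # CF = (a <u b)
       adcl  $0, %ecx        # x = y + CF

   and with operands[3] == -1 the matching sbb form subtracts the carry
   instead.  */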
3339 bool
3340 ix86_expand_int_movcc (rtx operands[])
3342 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3343 rtx_insn *compare_seq;
3344 rtx compare_op;
3345 machine_mode mode = GET_MODE (operands[0]);
3346 bool sign_bit_compare_p = false;
3347 bool negate_cc_compare_p = false;
3348 rtx op0 = XEXP (operands[1], 0);
3349 rtx op1 = XEXP (operands[1], 1);
3350 rtx op2 = operands[2];
3351 rtx op3 = operands[3];
3353 if (GET_MODE (op0) == TImode
3354 || (GET_MODE (op0) == DImode
3355 && !TARGET_64BIT))
3356 return false;
3358 if (GET_MODE (op0) == BFmode
3359 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3360 return false;
3362 start_sequence ();
3363 compare_op = ix86_expand_compare (code, op0, op1);
3364 compare_seq = get_insns ();
3365 end_sequence ();
3367 compare_code = GET_CODE (compare_op);
3369 if ((op1 == const0_rtx && (code == GE || code == LT))
3370 || (op1 == constm1_rtx && (code == GT || code == LE)))
3371 sign_bit_compare_p = true;
3373 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3374 but if op1 is a constant, the latter form allows more optimizations,
3375 either through the last 2 ops being constant handling, or the one
3376 constant and one variable cases. On the other side, for cmov the
3377 former might be better as we don't need to load the constant into
3378 another register. */
3379 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3380 op2 = op1;
3381 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3382 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3383 op3 = op1;
3385 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3386 HImode insns, we'd be swallowed in word prefix ops. */
3388 if ((mode != HImode || TARGET_FAST_PREFIX)
3389 && (mode != (TARGET_64BIT ? TImode : DImode))
3390 && CONST_INT_P (op2)
3391 && CONST_INT_P (op3))
3393 rtx out = operands[0];
3394 HOST_WIDE_INT ct = INTVAL (op2);
3395 HOST_WIDE_INT cf = INTVAL (op3);
3396 HOST_WIDE_INT diff;
3398 if ((mode == SImode
3399 || (TARGET_64BIT && mode == DImode))
3400 && (GET_MODE (op0) == SImode
3401 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3403 /* Special case x != 0 ? -1 : y. */
3404 if (code == NE && op1 == const0_rtx && ct == -1)
3406 negate_cc_compare_p = true;
3407 std::swap (ct, cf);
3408 code = EQ;
3410 else if (code == EQ && op1 == const0_rtx && cf == -1)
3411 negate_cc_compare_p = true;
3414 diff = ct - cf;
3415 /* Sign bit compares are better done using shifts than by using
3416 sbb. */
3417 if (sign_bit_compare_p
3418 || negate_cc_compare_p
3419 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3421 /* Detect overlap between destination and compare sources. */
3422 rtx tmp = out;
3424 if (negate_cc_compare_p)
3426 if (GET_MODE (op0) == DImode)
3427 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3428 else
3429 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3430 gen_lowpart (SImode, op0)));
3432 tmp = gen_reg_rtx (mode);
3433 if (mode == DImode)
3434 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3435 else
3436 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3437 tmp)));
3439 else if (!sign_bit_compare_p)
3441 rtx flags;
3442 bool fpcmp = false;
3444 compare_code = GET_CODE (compare_op);
3446 flags = XEXP (compare_op, 0);
3448 if (GET_MODE (flags) == CCFPmode)
3450 fpcmp = true;
3451 compare_code
3452 = ix86_fp_compare_code_to_integer (compare_code);
3455 /* To simplify rest of code, restrict to the GEU case. */
3456 if (compare_code == LTU)
3458 std::swap (ct, cf);
3459 compare_code = reverse_condition (compare_code);
3460 code = reverse_condition (code);
3462 else
3464 if (fpcmp)
3465 PUT_CODE (compare_op,
3466 reverse_condition_maybe_unordered
3467 (GET_CODE (compare_op)));
3468 else
3469 PUT_CODE (compare_op,
3470 reverse_condition (GET_CODE (compare_op)));
3472 diff = ct - cf;
3474 if (reg_overlap_mentioned_p (out, compare_op))
3475 tmp = gen_reg_rtx (mode);
3477 if (mode == DImode)
3478 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3479 else
3480 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3481 flags, compare_op));
3483 else
3485 if (code == GT || code == GE)
3486 code = reverse_condition (code);
3487 else
3489 std::swap (ct, cf);
3490 diff = ct - cf;
3492 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3495 if (diff == 1)
3498 * cmpl op0,op1
3499 * sbbl dest,dest
3500 * [addl dest, ct]
3502 * Size 5 - 8.
3504 if (ct)
3505 tmp = expand_simple_binop (mode, PLUS,
3506 tmp, GEN_INT (ct),
3507 copy_rtx (tmp), 1, OPTAB_DIRECT);
3509 else if (cf == -1)
3512 * cmpl op0,op1
3513 * sbbl dest,dest
3514 * orl $ct, dest
3516 * Size 8.
3518 tmp = expand_simple_binop (mode, IOR,
3519 tmp, GEN_INT (ct),
3520 copy_rtx (tmp), 1, OPTAB_DIRECT);
3522 else if (diff == -1 && ct)
3525 * cmpl op0,op1
3526 * sbbl dest,dest
3527 * notl dest
3528 * [addl dest, cf]
3530 * Size 8 - 11.
3532 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3533 if (cf)
3534 tmp = expand_simple_binop (mode, PLUS,
3535 copy_rtx (tmp), GEN_INT (cf),
3536 copy_rtx (tmp), 1, OPTAB_DIRECT);
3538 else
3541 * cmpl op0,op1
3542 * sbbl dest,dest
3543 * [notl dest]
3544 * andl cf - ct, dest
3545 * [addl dest, ct]
3547 * Size 8 - 11.
3550 if (cf == 0)
3552 cf = ct;
3553 ct = 0;
3554 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3557 tmp = expand_simple_binop (mode, AND,
3558 copy_rtx (tmp),
3559 gen_int_mode (cf - ct, mode),
3560 copy_rtx (tmp), 1, OPTAB_DIRECT);
3561 if (ct)
3562 tmp = expand_simple_binop (mode, PLUS,
3563 copy_rtx (tmp), GEN_INT (ct),
3564 copy_rtx (tmp), 1, OPTAB_DIRECT);
3567 if (!rtx_equal_p (tmp, out))
3568 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3570 return true;
3573 if (diff < 0)
3575 machine_mode cmp_mode = GET_MODE (op0);
3576 enum rtx_code new_code;
3578 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3580 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3582 /* We may be reversing a non-trapping
3583 comparison to a trapping comparison. */
3584 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3585 && code != EQ && code != NE
3586 && code != ORDERED && code != UNORDERED)
3587 new_code = UNKNOWN;
3588 else
3589 new_code = reverse_condition_maybe_unordered (code);
3591 else
3592 new_code = ix86_reverse_condition (code, cmp_mode);
3593 if (new_code != UNKNOWN)
3595 std::swap (ct, cf);
3596 diff = -diff;
3597 code = new_code;
3601 compare_code = UNKNOWN;
3602 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3603 && CONST_INT_P (op1))
3605 if (op1 == const0_rtx
3606 && (code == LT || code == GE))
3607 compare_code = code;
3608 else if (op1 == constm1_rtx)
3610 if (code == LE)
3611 compare_code = LT;
3612 else if (code == GT)
3613 compare_code = GE;
3617 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3618 if (compare_code != UNKNOWN
3619 && GET_MODE (op0) == GET_MODE (out)
3620 && (cf == -1 || ct == -1))
3622 /* If lea code below could be used, only optimize
3623 if it results in a 2 insn sequence. */
3625 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3626 || diff == 3 || diff == 5 || diff == 9)
3627 || (compare_code == LT && ct == -1)
3628 || (compare_code == GE && cf == -1))
3631 * notl op1 (if necessary)
3632 * sarl $31, op1
3633 * orl cf, op1
3635 if (ct != -1)
3637 cf = ct;
3638 ct = -1;
3639 code = reverse_condition (code);
3642 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3644 out = expand_simple_binop (mode, IOR,
3645 out, GEN_INT (cf),
3646 out, 1, OPTAB_DIRECT);
3647 if (out != operands[0])
3648 emit_move_insn (operands[0], out);
3650 return true;
3655 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3656 || diff == 3 || diff == 5 || diff == 9)
3657 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3658 && (mode != DImode
3659 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3662 * xorl dest,dest
3663 * cmpl op1,op2
3664 * setcc dest
3665 * lea cf(dest*(ct-cf)),dest
3667 * Size 14.
3669 * This also catches the degenerate setcc-only case.
3672 rtx tmp;
3673 int nops;
3675 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3677 nops = 0;
3678 /* On x86_64 the lea instruction operates on Pmode, so we need
3679 to get arithmetics done in proper mode to match. */
3680 if (diff == 1)
3681 tmp = copy_rtx (out);
3682 else
3684 rtx out1;
3685 out1 = copy_rtx (out);
3686 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3687 nops++;
3688 if (diff & 1)
3690 tmp = gen_rtx_PLUS (mode, tmp, out1);
3691 nops++;
3694 if (cf != 0)
3696 tmp = plus_constant (mode, tmp, cf);
3697 nops++;
3699 if (!rtx_equal_p (tmp, out))
3701 if (nops == 1)
3702 out = force_operand (tmp, copy_rtx (out));
3703 else
3704 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3706 if (!rtx_equal_p (out, operands[0]))
3707 emit_move_insn (operands[0], copy_rtx (out));
3709 return true;
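/* Added note (not in the original source): a worked instance of the lea
   form above, for "dest = cond ? 5 : 2" (ct = 5, cf = 2, diff = 3) with
   dest assumed in %eax: setcc leaves 0 or 1 in %eax, and
   "leal 2(%eax,%eax,2), %eax" computes dest * 3 + 2, i.e. 2 when the
   condition is false and 5 when it is true, so any diff of 1, 2, 3, 4, 5,
   8 or 9 needs no branch at all.  */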
3713 * General case: Jumpful:
3714 * xorl dest,dest cmpl op1, op2
3715 * cmpl op1, op2 movl ct, dest
3716 * setcc dest jcc 1f
3717 * decl dest movl cf, dest
3718 * andl (cf-ct),dest 1:
3719 * addl ct,dest
3721 * Size 20. Size 14.
3723 * This is reasonably steep, but branch mispredict costs are
3724 * high on modern cpus, so consider failing only if optimizing
3725 * for space.
3728 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3729 && BRANCH_COST (optimize_insn_for_speed_p (),
3730 false) >= 2)
3732 if (cf == 0)
3734 machine_mode cmp_mode = GET_MODE (op0);
3735 enum rtx_code new_code;
3737 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3739 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3741 /* We may be reversing a non-trapping
3742 comparison to a trapping comparison. */
3743 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3744 && code != EQ && code != NE
3745 && code != ORDERED && code != UNORDERED)
3746 new_code = UNKNOWN;
3747 else
3748 new_code = reverse_condition_maybe_unordered (code);
3751 else
3753 new_code = ix86_reverse_condition (code, cmp_mode);
3754 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3755 compare_code = reverse_condition (compare_code);
3758 if (new_code != UNKNOWN)
3760 cf = ct;
3761 ct = 0;
3762 code = new_code;
3766 if (compare_code != UNKNOWN)
3768 /* notl op1 (if needed)
3769 sarl $31, op1
3770 andl (cf-ct), op1
3771 addl ct, op1
3773 For x < 0 (resp. x <= -1) there will be no notl,
3774 so if possible swap the constants to get rid of the
3775 complement.
3776 True/false will be -1/0 while code below (store flag
3777 followed by decrement) is 0/-1, so the constants need
3778 to be exchanged once more. */
3780 if (compare_code == GE || !cf)
3782 code = reverse_condition (code);
3783 compare_code = LT;
3785 else
3786 std::swap (ct, cf);
3788 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3790 else
3792 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3794 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3795 constm1_rtx,
3796 copy_rtx (out), 1, OPTAB_DIRECT);
3799 out = expand_simple_binop (mode, AND, copy_rtx (out),
3800 gen_int_mode (cf - ct, mode),
3801 copy_rtx (out), 1, OPTAB_DIRECT);
3802 if (ct)
3803 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3804 copy_rtx (out), 1, OPTAB_DIRECT);
3805 if (!rtx_equal_p (out, operands[0]))
3806 emit_move_insn (operands[0], copy_rtx (out));
3808 return true;
3812 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3814 /* Try a few things more with specific constants and a variable. */
3816 optab op;
3817 rtx var, orig_out, out, tmp;
3819 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3820 return false;
3822 operands[2] = op2;
3823 operands[3] = op3;
3825 /* If one of the two operands is an interesting constant, load a
3826 constant with the above and mask it in with a logical operation. */
3828 if (CONST_INT_P (operands[2]))
3830 var = operands[3];
3831 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3832 operands[3] = constm1_rtx, op = and_optab;
3833 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3834 operands[3] = const0_rtx, op = ior_optab;
3835 else
3836 return false;
3838 else if (CONST_INT_P (operands[3]))
3840 var = operands[2];
3841 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3843 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3844 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3845 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3846 operands[1] = simplify_gen_relational (LT, VOIDmode,
3847 GET_MODE (op0),
3848 op0, const0_rtx);
3850 operands[2] = constm1_rtx;
3851 op = and_optab;
3853 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3854 operands[2] = const0_rtx, op = ior_optab;
3855 else
3856 return false;
3858 else
3859 return false;
3861 orig_out = operands[0];
3862 tmp = gen_reg_rtx (mode);
3863 operands[0] = tmp;
3865 /* Recurse to get the constant loaded. */
3866 if (!ix86_expand_int_movcc (operands))
3867 return false;
3869 /* Mask in the interesting variable. */
3870 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3871 OPTAB_WIDEN);
3872 if (!rtx_equal_p (out, orig_out))
3873 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3875 return true;
3879 * For comparison with above,
3881 * movl cf,dest
3882 * movl ct,tmp
3883 * cmpl op1,op2
3884 * cmovcc tmp,dest
3886 * Size 15.
3889 if (! nonimmediate_operand (operands[2], mode))
3890 operands[2] = force_reg (mode, operands[2]);
3891 if (! nonimmediate_operand (operands[3], mode))
3892 operands[3] = force_reg (mode, operands[3]);
3894 if (! register_operand (operands[2], VOIDmode)
3895 && (mode == QImode
3896 || ! register_operand (operands[3], VOIDmode)))
3897 operands[2] = force_reg (mode, operands[2]);
3899 if (mode == QImode
3900 && ! register_operand (operands[3], VOIDmode))
3901 operands[3] = force_reg (mode, operands[3]);
3903 emit_insn (compare_seq);
3904 emit_insn (gen_rtx_SET (operands[0],
3905 gen_rtx_IF_THEN_ELSE (mode,
3906 compare_op, operands[2],
3907 operands[3])));
3908 return true;
3911 /* Detect conditional moves that exactly match min/max operational
3912 semantics. Note that this is IEEE safe, as long as we don't
3913 interchange the operands.
3915 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3916 and TRUE if the operation is successful and instructions are emitted. */
3918 static bool
3919 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3920 rtx cmp_op1, rtx if_true, rtx if_false)
3922 machine_mode mode;
3923 bool is_min;
3924 rtx tmp;
3926 if (code == LT)
3928 else if (code == UNGE)
3929 std::swap (if_true, if_false);
3930 else
3931 return false;
3933 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3934 is_min = true;
3935 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3936 is_min = false;
3937 else
3938 return false;
3940 mode = GET_MODE (dest);
3942 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3943 but MODE may be a vector mode and thus not appropriate. */
3944 if (!flag_finite_math_only || flag_signed_zeros)
3946 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3947 rtvec v;
3949 if_true = force_reg (mode, if_true);
3950 v = gen_rtvec (2, if_true, if_false);
3951 tmp = gen_rtx_UNSPEC (mode, v, u);
3953 else
3955 code = is_min ? SMIN : SMAX;
3956 if (MEM_P (if_true) && MEM_P (if_false))
3957 if_true = force_reg (mode, if_true);
3958 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3961 emit_insn (gen_rtx_SET (dest, tmp));
3962 return true;
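/* Added note (not in the original source): the reason operand order matters
   above is that minss/maxss (and their packed forms) return the second
   source operand whenever either operand is a NaN or both operands are
   zeros of either sign, so "a < b ? a : b" maps onto the instruction only
   with A and B kept in their original positions; when NaNs or signed
   zeros must be honored, the UNSPEC_IEEE_MIN/MAX path preserves that exact
   operand order.  */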
3965 /* Return true if MODE is valid for vector compare to mask register.
3966 The same holds for a conditional vector move with a mask register. */
3967 static bool
3968 ix86_valid_mask_cmp_mode (machine_mode mode)
3970 /* XOP has its own vector conditional movement. */
3971 if (TARGET_XOP && !TARGET_AVX512F)
3972 return false;
3974 /* HFmode only supports vcmpsh whose dest is mask register. */
3975 if (TARGET_AVX512FP16 && mode == HFmode)
3976 return true;
3978 /* AVX512F is needed for mask operation. */
3979 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3980 return false;
3982 /* AVX512BW is needed for vector QI/HImode,
3983 AVX512VL is needed for 128/256-bit vector. */
3984 machine_mode inner_mode = GET_MODE_INNER (mode);
3985 int vector_size = GET_MODE_SIZE (mode);
3986 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3987 return false;
3989 return (vector_size == 64 && TARGET_EVEX512) || TARGET_AVX512VL;
3992 /* Return true if integer mask comparison should be used. */
3993 static bool
3994 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3995 rtx op_true, rtx op_false)
3997 int vector_size = GET_MODE_SIZE (mode);
3999 if (cmp_mode == HFmode)
4000 return true;
4001 else if (vector_size < 16)
4002 return false;
4003 else if (vector_size == 64)
4004 return true;
4005 else if (GET_MODE_INNER (cmp_mode) == HFmode)
4006 return true;
4008 /* When op_true is NULL, op_false must be NULL, or vice versa. */
4009 gcc_assert (!op_true == !op_false);
4011 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
4012 vector dest is required. */
4013 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
4014 return false;
4016 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
4017 if (op_false == CONST0_RTX (mode)
4018 || op_true == CONST0_RTX (mode)
4019 || (INTEGRAL_MODE_P (mode)
4020 && (op_true == CONSTM1_RTX (mode)
4021 || op_false == CONSTM1_RTX (mode))))
4022 return false;
4024 return true;
4027 /* Expand an SSE comparison. Return the register with the result. */
4029 static rtx
4030 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
4031 rtx op_true, rtx op_false)
4033 machine_mode mode = GET_MODE (dest);
4034 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
4036 /* In general case result of comparison can differ from operands' type. */
4037 machine_mode cmp_mode;
4039 /* In AVX512F the result of comparison is an integer mask. */
4040 bool maskcmp = false;
4041 rtx x;
4043 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
4045 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
4046 maskcmp = true;
4047 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
4049 else
4050 cmp_mode = cmp_ops_mode;
4052 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
4054 bool (*op1_predicate)(rtx, machine_mode)
4055 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
4057 if (!op1_predicate (cmp_op1, cmp_ops_mode))
4058 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
4060 if (optimize
4061 || (maskcmp && cmp_mode != mode)
4062 || (op_true && reg_overlap_mentioned_p (dest, op_true))
4063 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
4064 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
4066 if (maskcmp)
4068 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
4069 gcc_assert (ok);
4070 return dest;
4073 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
4075 if (cmp_mode != mode)
4077 x = force_reg (cmp_ops_mode, x);
4078 convert_move (dest, x, false);
4080 else
4081 emit_insn (gen_rtx_SET (dest, x));
4083 return dest;
4086 /* Emit x86 binary operand CODE in mode MODE for SSE vector
4087 instructions that can be performed using GP registers. */
4089 static void
4090 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4091 rtx dst, rtx src1, rtx src2)
4093 rtx tmp;
4095 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4097 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4098 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4100 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4101 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4104 emit_insn (tmp);
4107 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4108 operations. This is used for both scalar and vector conditional moves. */
4110 void
4111 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4113 machine_mode mode = GET_MODE (dest);
4114 machine_mode cmpmode = GET_MODE (cmp);
4115 rtx x;
4117 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4118 if (rtx_equal_p (op_true, op_false))
4120 emit_move_insn (dest, op_true);
4121 return;
4124 /* If we have an integer mask and FP value then we need
4125 to cast mask to FP mode. */
4126 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4128 cmp = force_reg (cmpmode, cmp);
4129 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4132 /* In AVX512F the result of comparison is an integer mask. */
4133 if (mode != cmpmode
4134 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4136 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4137 /* Using scalar/vector move with mask register. */
4138 cmp = force_reg (cmpmode, cmp);
4139 /* Optimize for mask zero. */
4140 op_true = (op_true != CONST0_RTX (mode)
4141 ? force_reg (mode, op_true) : op_true);
4142 op_false = (op_false != CONST0_RTX (mode)
4143 ? force_reg (mode, op_false) : op_false);
4144 if (op_true == CONST0_RTX (mode))
4146 if (cmpmode == E_DImode && !TARGET_64BIT)
4148 x = gen_reg_rtx (cmpmode);
4149 emit_insn (gen_knotdi (x, cmp));
4151 else
4152 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4153 cmp = x;
4154 /* Reverse op_true and op_false. */
4155 std::swap (op_true, op_false);
4158 if (mode == HFmode)
4159 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4160 else
4161 emit_insn (gen_rtx_SET (dest,
4162 gen_rtx_VEC_MERGE (mode,
4163 op_true, op_false, cmp)));
4164 return;
4167 if (vector_all_ones_operand (op_true, mode)
4168 && op_false == CONST0_RTX (mode))
4170 emit_move_insn (dest, cmp);
4171 return;
4173 else if (op_false == CONST0_RTX (mode))
4175 x = expand_simple_binop (mode, AND, cmp, op_true,
4176 dest, 1, OPTAB_DIRECT);
4177 if (x != dest)
4178 emit_move_insn (dest, x);
4179 return;
4181 else if (op_true == CONST0_RTX (mode))
4183 op_false = force_reg (mode, op_false);
4184 x = gen_rtx_NOT (mode, cmp);
4185 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4186 return;
4188 else if (vector_all_ones_operand (op_true, mode))
4190 x = expand_simple_binop (mode, IOR, cmp, op_false,
4191 dest, 1, OPTAB_DIRECT);
4192 if (x != dest)
4193 emit_move_insn (dest, x);
4194 return;
4197 if (TARGET_XOP)
4199 op_true = force_reg (mode, op_true);
4201 if (GET_MODE_SIZE (mode) < 16
4202 || !nonimmediate_operand (op_false, mode))
4203 op_false = force_reg (mode, op_false);
4205 emit_insn (gen_rtx_SET (dest,
4206 gen_rtx_IF_THEN_ELSE (mode, cmp,
4207 op_true, op_false)));
4208 return;
4211 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4212 machine_mode blend_mode = mode;
4214 if (GET_MODE_SIZE (mode) < 16
4215 || !vector_operand (op_true, mode))
4216 op_true = force_reg (mode, op_true);
4218 op_false = force_reg (mode, op_false);
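/* The SSE4.1/AVX variable blend instructions chosen below select on the
   most significant bit of each element (each byte for pblendvb), so a
   full -1/0 comparison mask can be used directly as the selector.  */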
4220 switch (mode)
4222 case E_V2SFmode:
4223 if (TARGET_SSE4_1)
4224 gen = gen_mmx_blendvps;
4225 break;
4226 case E_V4SFmode:
4227 if (TARGET_SSE4_1)
4228 gen = gen_sse4_1_blendvps;
4229 break;
4230 case E_V2DFmode:
4231 if (TARGET_SSE4_1)
4232 gen = gen_sse4_1_blendvpd;
4233 break;
4234 case E_SFmode:
4235 if (TARGET_SSE4_1)
4236 gen = gen_sse4_1_blendvss;
4237 break;
4238 case E_DFmode:
4239 if (TARGET_SSE4_1)
4240 gen = gen_sse4_1_blendvsd;
4241 break;
4242 case E_V8QImode:
4243 case E_V4HImode:
4244 case E_V4HFmode:
4245 case E_V4BFmode:
4246 case E_V2SImode:
4247 if (TARGET_SSE4_1)
4249 gen = gen_mmx_pblendvb_v8qi;
4250 blend_mode = V8QImode;
4252 break;
4253 case E_V4QImode:
4254 case E_V2HImode:
4255 case E_V2HFmode:
4256 case E_V2BFmode:
4257 if (TARGET_SSE4_1)
4259 gen = gen_mmx_pblendvb_v4qi;
4260 blend_mode = V4QImode;
4262 break;
4263 case E_V2QImode:
4264 if (TARGET_SSE4_1)
4265 gen = gen_mmx_pblendvb_v2qi;
4266 break;
4267 case E_V16QImode:
4268 case E_V8HImode:
4269 case E_V8HFmode:
4270 case E_V8BFmode:
4271 case E_V4SImode:
4272 case E_V2DImode:
4273 case E_V1TImode:
4274 if (TARGET_SSE4_1)
4276 gen = gen_sse4_1_pblendvb;
4277 blend_mode = V16QImode;
4279 break;
4280 case E_V8SFmode:
4281 if (TARGET_AVX)
4282 gen = gen_avx_blendvps256;
4283 break;
4284 case E_V4DFmode:
4285 if (TARGET_AVX)
4286 gen = gen_avx_blendvpd256;
4287 break;
4288 case E_V32QImode:
4289 case E_V16HImode:
4290 case E_V16HFmode:
4291 case E_V16BFmode:
4292 case E_V8SImode:
4293 case E_V4DImode:
4294 if (TARGET_AVX2)
4296 gen = gen_avx2_pblendvb;
4297 blend_mode = V32QImode;
4299 break;
4301 case E_V64QImode:
4302 gen = gen_avx512bw_blendmv64qi;
4303 break;
4304 case E_V32HImode:
4305 gen = gen_avx512bw_blendmv32hi;
4306 break;
4307 case E_V32HFmode:
4308 gen = gen_avx512bw_blendmv32hf;
4309 break;
4310 case E_V32BFmode:
4311 gen = gen_avx512bw_blendmv32bf;
4312 break;
4313 case E_V16SImode:
4314 gen = gen_avx512f_blendmv16si;
4315 break;
4316 case E_V8DImode:
4317 gen = gen_avx512f_blendmv8di;
4318 break;
4319 case E_V8DFmode:
4320 gen = gen_avx512f_blendmv8df;
4321 break;
4322 case E_V16SFmode:
4323 gen = gen_avx512f_blendmv16sf;
4324 break;
4326 default:
4327 break;
4330 if (gen != NULL)
4332 if (blend_mode == mode)
4333 x = dest;
4334 else
4336 x = gen_reg_rtx (blend_mode);
4337 op_false = gen_lowpart (blend_mode, op_false);
4338 op_true = gen_lowpart (blend_mode, op_true);
4339 cmp = gen_lowpart (blend_mode, cmp);
4342 emit_insn (gen (x, op_false, op_true, cmp));
4344 if (x != dest)
4345 emit_move_insn (dest, gen_lowpart (mode, x));
4347 else
4349 rtx t2, t3;
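/* No blend instruction is available; open-code the select as
   dest = (cmp & op_true) | (~cmp & op_false).  */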
4351 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4352 NULL, 1, OPTAB_DIRECT);
4354 t3 = gen_reg_rtx (mode);
4355 x = gen_rtx_NOT (mode, cmp);
4356 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4358 x = expand_simple_binop (mode, IOR, t3, t2,
4359 dest, 1, OPTAB_DIRECT);
4360 if (x != dest)
4361 emit_move_insn (dest, x);
4365 /* Swap, force into registers, or otherwise massage the two operands
4366 to an sse comparison with a mask result. Thus we differ a bit from
4367 ix86_prepare_fp_compare_args which expects to produce a flags result.
4369 The DEST operand exists to help determine whether to commute commutative
4370 operators. The POP0/POP1 operands are updated in place. The new
4371 comparison code is returned, or UNKNOWN if not implementable. */
4373 static enum rtx_code
4374 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4375 rtx *pop0, rtx *pop1)
4377 switch (code)
4379 case LTGT:
4380 case UNEQ:
4381 /* AVX supports all the needed comparisons. */
4382 if (TARGET_AVX)
4383 break;
4384 /* We have no LTGT as an operator. We could implement it with
4385 NE & ORDERED, but this requires an extra temporary. It's
4386 not clear that it's worth it. */
4387 return UNKNOWN;
4389 case LT:
4390 case LE:
4391 case UNGT:
4392 case UNGE:
4393 /* These are supported directly. */
4394 break;
4396 case EQ:
4397 case NE:
4398 case UNORDERED:
4399 case ORDERED:
4400 /* AVX has 3 operand comparisons, no need to swap anything. */
4401 if (TARGET_AVX)
4402 break;
4403 /* For commutative operators, try to canonicalize the destination
4404 operand to be first in the comparison - this helps reload to
4405 avoid extra moves. */
4406 if (!dest || !rtx_equal_p (dest, *pop1))
4407 break;
4408 /* FALLTHRU */
4410 case GE:
4411 case GT:
4412 case UNLE:
4413 case UNLT:
4414 /* These are not supported directly before AVX, and furthermore
4415 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4416 comparison operands to transform into something that is
4417 supported. */
4418 std::swap (*pop0, *pop1);
4419 code = swap_condition (code);
4420 break;
4422 default:
4423 gcc_unreachable ();
4426 return code;
4429 /* Expand a floating-point conditional move. Return true if successful. */
4431 bool
4432 ix86_expand_fp_movcc (rtx operands[])
4434 machine_mode mode = GET_MODE (operands[0]);
4435 enum rtx_code code = GET_CODE (operands[1]);
4436 rtx tmp, compare_op;
4437 rtx op0 = XEXP (operands[1], 0);
4438 rtx op1 = XEXP (operands[1], 1);
4440 if (GET_MODE (op0) == BFmode
4441 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4442 return false;
4444 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4446 machine_mode cmode;
4448 /* Since we've no cmove for sse registers, don't force bad register
4449 allocation just to gain access to it. Deny movcc when the
4450 comparison mode doesn't match the move mode. */
4451 cmode = GET_MODE (op0);
4452 if (cmode == VOIDmode)
4453 cmode = GET_MODE (op1);
4454 if (cmode != mode)
4455 return false;
4457 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4458 if (code == UNKNOWN)
4459 return false;
4461 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4462 operands[2], operands[3]))
4463 return true;
4465 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4466 operands[2], operands[3]);
4467 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4468 return true;
4471 if (GET_MODE (op0) == TImode
4472 || (GET_MODE (op0) == DImode
4473 && !TARGET_64BIT))
4474 return false;
4476 /* The floating point conditional move instructions don't directly
4477 support conditions resulting from a signed integer comparison. */
4479 compare_op = ix86_expand_compare (code, op0, op1);
4480 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4482 tmp = gen_reg_rtx (QImode);
4483 ix86_expand_setcc (tmp, code, op0, op1);
4485 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4488 emit_insn (gen_rtx_SET (operands[0],
4489 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4490 operands[2], operands[3])));
4492 return true;
4495 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4497 static int
4498 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4500 switch (code)
4502 case EQ:
4503 return 0;
4504 case LT:
4505 case LTU:
4506 return 1;
4507 case LE:
4508 case LEU:
4509 return 2;
4510 case NE:
4511 return 4;
4512 case GE:
4513 case GEU:
4514 return 5;
4515 case GT:
4516 case GTU:
4517 return 6;
4518 default:
4519 gcc_unreachable ();
4523 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4525 static int
4526 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4528 switch (code)
4530 case EQ:
4531 return 0x00;
4532 case NE:
4533 return 0x04;
4534 case GT:
4535 return 0x0e;
4536 case LE:
4537 return 0x02;
4538 case GE:
4539 return 0x0d;
4540 case LT:
4541 return 0x01;
4542 case UNLE:
4543 return 0x0a;
4544 case UNLT:
4545 return 0x09;
4546 case UNGE:
4547 return 0x05;
4548 case UNGT:
4549 return 0x06;
4550 case UNEQ:
4551 return 0x18;
4552 case LTGT:
4553 return 0x0c;
4554 case ORDERED:
4555 return 0x07;
4556 case UNORDERED:
4557 return 0x03;
4558 default:
4559 gcc_unreachable ();
4563 /* Return immediate value to be used in UNSPEC_PCMP
4564 for comparison CODE in MODE. */
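/* These values appear to match the comparison predicate immediates of the
   AVX-512 vcmp/vpcmp instructions generated from the unspec.  */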
4566 static int
4567 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4569 if (FLOAT_MODE_P (mode))
4570 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4571 return ix86_int_cmp_code_to_pcmp_immediate (code);
4574 /* Expand AVX-512 vector comparison. */
4576 bool
4577 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4579 machine_mode mask_mode = GET_MODE (dest);
4580 machine_mode cmp_mode = GET_MODE (cmp_op0);
4581 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4582 int unspec_code;
4583 rtx unspec;
4585 switch (code)
4587 case LEU:
4588 case GTU:
4589 case GEU:
4590 case LTU:
4591 unspec_code = UNSPEC_UNSIGNED_PCMP;
4592 break;
4594 default:
4595 unspec_code = UNSPEC_PCMP;
4598 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4599 unspec_code);
4600 emit_insn (gen_rtx_SET (dest, unspec));
4602 return true;
4605 /* Expand fp vector comparison. */
4607 bool
4608 ix86_expand_fp_vec_cmp (rtx operands[])
4610 enum rtx_code code = GET_CODE (operands[1]);
4611 rtx cmp;
4613 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4614 &operands[2], &operands[3]);
4615 if (code == UNKNOWN)
4617 rtx temp;
4618 switch (GET_CODE (operands[1]))
4620 case LTGT:
4621 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4622 operands[3], NULL, NULL);
4623 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4624 operands[3], NULL, NULL);
4625 code = AND;
4626 break;
4627 case UNEQ:
4628 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4629 operands[3], NULL, NULL);
4630 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4631 operands[3], NULL, NULL);
4632 code = IOR;
4633 break;
4634 default:
4635 gcc_unreachable ();
4637 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4638 OPTAB_DIRECT);
4640 else
4641 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4642 NULL, NULL);
4644 if (operands[0] != cmp)
4645 emit_move_insn (operands[0], cmp);
4647 return true;
4650 static rtx
4651 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4652 rtx op_true, rtx op_false, bool *negate)
4654 machine_mode data_mode = GET_MODE (dest);
4655 machine_mode mode = GET_MODE (cop0);
4656 rtx x;
4658 *negate = false;
4660 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4661 if (TARGET_XOP
4662 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4663 && GET_MODE_SIZE (mode) <= 16)
4665 /* AVX512F supports all of the comparisons
4666 on all 128/256/512-bit vector int types. */
4667 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4669 else
4671 /* Canonicalize the comparison to EQ, GT, GTU. */
4672 switch (code)
4674 case EQ:
4675 case GT:
4676 case GTU:
4677 break;
4679 case LE:
4680 case LEU:
4681 /* x <= cst can be handled as x < cst + 1 unless there is
4682 wrap around in cst + 1. */
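/* E.g. x <= 5 can be rewritten as 6 > x: when the constants allow it,
   the code below adds 1 to each element, swaps the operands and uses GT
   (or GTU for LEU); otherwise it falls through and negates the reversed
   comparison.  */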
4683 if (GET_CODE (cop1) == CONST_VECTOR
4684 && GET_MODE_INNER (mode) != TImode)
4686 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4687 machine_mode eltmode = GET_MODE_INNER (mode);
4688 for (i = 0; i < n_elts; ++i)
4690 rtx elt = CONST_VECTOR_ELT (cop1, i);
4691 if (!CONST_INT_P (elt))
4692 break;
4693 if (code == LE)
4695 /* For LE punt if some element is signed maximum. */
4696 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4697 == (GET_MODE_MASK (eltmode) >> 1))
4698 break;
4700 /* For LEU punt if some element is unsigned maximum. */
4701 else if (elt == constm1_rtx)
4702 break;
4704 if (i == n_elts)
4706 rtvec v = rtvec_alloc (n_elts);
4707 for (i = 0; i < n_elts; ++i)
4708 RTVEC_ELT (v, i)
4709 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4710 eltmode);
4711 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4712 std::swap (cop0, cop1);
4713 code = code == LE ? GT : GTU;
4714 break;
4717 /* FALLTHRU */
4718 case NE:
4719 code = reverse_condition (code);
4720 *negate = true;
4721 break;
4723 case GE:
4724 case GEU:
4725 /* x >= cst can be handled as x > cst - 1 unless there is
4726 wrap around in cst - 1. */
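/* E.g. x >= 5 can be rewritten as x > 4: when the constants allow it,
   the code below subtracts 1 from each element and uses GT (or GTU for
   GEU); otherwise the comparison is reversed, negated and handled like
   LT/LTU.  */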
4727 if (GET_CODE (cop1) == CONST_VECTOR
4728 && GET_MODE_INNER (mode) != TImode)
4730 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4731 machine_mode eltmode = GET_MODE_INNER (mode);
4732 for (i = 0; i < n_elts; ++i)
4734 rtx elt = CONST_VECTOR_ELT (cop1, i);
4735 if (!CONST_INT_P (elt))
4736 break;
4737 if (code == GE)
4739 /* For GE punt if some element is signed minimum. */
4740 if (INTVAL (elt) < 0
4741 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4742 == 0))
4743 break;
4745 /* For GEU punt if some element is zero. */
4746 else if (elt == const0_rtx)
4747 break;
4749 if (i == n_elts)
4751 rtvec v = rtvec_alloc (n_elts);
4752 for (i = 0; i < n_elts; ++i)
4753 RTVEC_ELT (v, i)
4754 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4755 eltmode);
4756 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4757 code = code == GE ? GT : GTU;
4758 break;
4761 code = reverse_condition (code);
4762 *negate = true;
4763 /* FALLTHRU */
4765 case LT:
4766 case LTU:
4767 std::swap (cop0, cop1);
4768 code = swap_condition (code);
4769 break;
4771 default:
4772 gcc_unreachable ();
4775 /* Only SSE4.1/SSE4.2 support V2DImode. */
4776 if (mode == V2DImode)
4778 switch (code)
4780 case EQ:
4781 /* SSE4.1 supports EQ. */
4782 if (!TARGET_SSE4_1)
4783 return NULL;
4784 break;
4786 case GT:
4787 case GTU:
4788 /* SSE4.2 supports GT/GTU. */
4789 if (!TARGET_SSE4_2)
4790 return NULL;
4791 break;
4793 default:
4794 gcc_unreachable ();
4798 if (GET_CODE (cop0) == CONST_VECTOR)
4799 cop0 = force_reg (mode, cop0);
4800 else if (GET_CODE (cop1) == CONST_VECTOR)
4801 cop1 = force_reg (mode, cop1);
4803 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4804 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4805 if (*negate)
4806 std::swap (optrue, opfalse);
4808 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4809 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4810 min (x, y) == x). While we add one instruction (the minimum),
4811 we remove the two instructions that the negation would otherwise
4812 need, since the result already has the desired form.
4813 When using masks, do it for SI/DImode element types, as it is shorter
4814 than the two subtractions. */
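/* In other words, x >u y is the negation of (umin (x, y) == x), and
   likewise x >s y is the negation of (smin (x, y) == x), so the code
   below emits the min, compares it against x for equality and toggles
   *negate.  */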
4815 if ((code != EQ
4816 && GET_MODE_SIZE (mode) != 64
4817 && vector_all_ones_operand (opfalse, data_mode)
4818 && optrue == CONST0_RTX (data_mode))
4819 || (code == GTU
4820 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4821 /* Don't do it if not using integer masks and we'd end up with
4822 the right values in the registers though. */
4823 && ((GET_MODE_SIZE (mode) == 64 && TARGET_EVEX512)
4824 || !vector_all_ones_operand (optrue, data_mode)
4825 || opfalse != CONST0_RTX (data_mode))))
4827 rtx (*gen) (rtx, rtx, rtx) = NULL;
4829 switch (mode)
4831 case E_V16SImode:
4832 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4833 break;
4834 case E_V8DImode:
4835 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4836 cop0 = force_reg (mode, cop0);
4837 cop1 = force_reg (mode, cop1);
4838 break;
4839 case E_V32QImode:
4840 if (TARGET_AVX2)
4841 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4842 break;
4843 case E_V16HImode:
4844 if (TARGET_AVX2)
4845 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4846 break;
4847 case E_V8SImode:
4848 if (TARGET_AVX2)
4849 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4850 break;
4851 case E_V4DImode:
4852 if (TARGET_AVX512VL)
4854 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4855 cop0 = force_reg (mode, cop0);
4856 cop1 = force_reg (mode, cop1);
4858 break;
4859 case E_V16QImode:
4860 if (code == GTU && TARGET_SSE2)
4861 gen = gen_uminv16qi3;
4862 else if (code == GT && TARGET_SSE4_1)
4863 gen = gen_sminv16qi3;
4864 break;
4865 case E_V8QImode:
4866 if (code == GTU && TARGET_SSE2)
4867 gen = gen_uminv8qi3;
4868 else if (code == GT && TARGET_SSE4_1)
4869 gen = gen_sminv8qi3;
4870 break;
4871 case E_V4QImode:
4872 if (code == GTU && TARGET_SSE2)
4873 gen = gen_uminv4qi3;
4874 else if (code == GT && TARGET_SSE4_1)
4875 gen = gen_sminv4qi3;
4876 break;
4877 case E_V2QImode:
4878 if (code == GTU && TARGET_SSE2)
4879 gen = gen_uminv2qi3;
4880 else if (code == GT && TARGET_SSE4_1)
4881 gen = gen_sminv2qi3;
4882 break;
4883 case E_V8HImode:
4884 if (code == GTU && TARGET_SSE4_1)
4885 gen = gen_uminv8hi3;
4886 else if (code == GT && TARGET_SSE2)
4887 gen = gen_sminv8hi3;
4888 break;
4889 case E_V4HImode:
4890 if (code == GTU && TARGET_SSE4_1)
4891 gen = gen_uminv4hi3;
4892 else if (code == GT && TARGET_SSE2)
4893 gen = gen_sminv4hi3;
4894 break;
4895 case E_V2HImode:
4896 if (code == GTU && TARGET_SSE4_1)
4897 gen = gen_uminv2hi3;
4898 else if (code == GT && TARGET_SSE2)
4899 gen = gen_sminv2hi3;
4900 break;
4901 case E_V4SImode:
4902 if (TARGET_SSE4_1)
4903 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4904 break;
4905 case E_V2SImode:
4906 if (TARGET_SSE4_1)
4907 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4908 break;
4909 case E_V2DImode:
4910 if (TARGET_AVX512VL)
4912 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4913 cop0 = force_reg (mode, cop0);
4914 cop1 = force_reg (mode, cop1);
4916 break;
4917 default:
4918 break;
4921 if (gen)
4923 rtx tem = gen_reg_rtx (mode);
4924 if (!vector_operand (cop0, mode))
4925 cop0 = force_reg (mode, cop0);
4926 if (!vector_operand (cop1, mode))
4927 cop1 = force_reg (mode, cop1);
4928 *negate = !*negate;
4929 emit_insn (gen (tem, cop0, cop1));
4930 cop1 = tem;
4931 code = EQ;
4935 /* Unsigned parallel compare is not supported by the hardware.
4936 Play some tricks to turn this into a signed comparison
4937 against 0. */
4938 if (code == GTU)
4940 cop0 = force_reg (mode, cop0);
4942 switch (mode)
4944 case E_V16SImode:
4945 case E_V8DImode:
4946 case E_V8SImode:
4947 case E_V4DImode:
4948 case E_V4SImode:
4949 case E_V2SImode:
4950 case E_V2DImode:
4952 rtx t1, t2, mask;
4954 /* Subtract (-(INT MAX) - 1) from both operands to make
4955 them signed. */
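/* Subtracting the sign-bit constant is the same as XORing it in, which
   flips the sign bit of every element; unsigned order on the original
   values then equals signed order on the biased values, so GTU can be
   replaced by GT.  */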
4956 mask = ix86_build_signbit_mask (mode, true, false);
4957 t1 = gen_reg_rtx (mode);
4958 emit_insn (gen_sub3_insn (t1, cop0, mask));
4960 t2 = gen_reg_rtx (mode);
4961 emit_insn (gen_sub3_insn (t2, cop1, mask));
4963 cop0 = t1;
4964 cop1 = t2;
4965 code = GT;
4967 break;
4969 case E_V64QImode:
4970 case E_V32HImode:
4971 case E_V32QImode:
4972 case E_V16HImode:
4973 case E_V16QImode:
4974 case E_V8QImode:
4975 case E_V4QImode:
4976 case E_V2QImode:
4977 case E_V8HImode:
4978 case E_V4HImode:
4979 case E_V2HImode:
4980 /* Perform a parallel unsigned saturating subtraction. */
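/* x >u y exactly when the saturating difference x -us y is nonzero, so
   compare that difference against zero with EQ and toggle *negate.  */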
4981 x = gen_reg_rtx (mode);
4982 emit_insn (gen_rtx_SET
4983 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4984 cop0 = x;
4985 cop1 = CONST0_RTX (mode);
4986 code = EQ;
4987 *negate = !*negate;
4988 break;
4990 default:
4991 gcc_unreachable ();
4996 if (*negate)
4997 std::swap (op_true, op_false);
4999 if (GET_CODE (cop1) == CONST_VECTOR)
5000 cop1 = force_reg (mode, cop1);
5002 /* Allow the comparison to be done in one mode, but the movcc to
5003 happen in another mode. */
5004 if (data_mode == mode)
5005 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
5006 else
5008 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
5009 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
5010 op_true, op_false);
5011 if (GET_MODE (x) == mode)
5012 x = gen_lowpart (data_mode, x);
5015 return x;
5018 /* Expand integer vector comparison. */
5020 bool
5021 ix86_expand_int_vec_cmp (rtx operands[])
5023 rtx_code code = GET_CODE (operands[1]);
5024 bool negate = false;
5025 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
5026 operands[3], NULL, NULL, &negate);
5028 if (!cmp)
5029 return false;
5031 if (negate)
5032 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
5033 CONST0_RTX (GET_MODE (cmp)),
5034 NULL, NULL, &negate);
5036 gcc_assert (!negate);
5038 if (operands[0] != cmp)
5039 emit_move_insn (operands[0], cmp);
5041 return true;
5044 /* Expand a floating-point vector conditional move; a vcond operation
5045 rather than a movcc operation. */
5047 bool
5048 ix86_expand_fp_vcond (rtx operands[])
5050 enum rtx_code code = GET_CODE (operands[3]);
5051 rtx cmp;
5053 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5054 &operands[4], &operands[5]);
5055 if (code == UNKNOWN)
5057 rtx temp;
5058 switch (GET_CODE (operands[3]))
5060 case LTGT:
5061 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5062 operands[5], operands[0], operands[0]);
5063 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5064 operands[5], operands[1], operands[2]);
5065 code = AND;
5066 break;
5067 case UNEQ:
5068 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5069 operands[5], operands[0], operands[0]);
5070 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5071 operands[5], operands[1], operands[2]);
5072 code = IOR;
5073 break;
5074 default:
5075 gcc_unreachable ();
5077 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5078 OPTAB_DIRECT);
5079 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5080 return true;
5083 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5084 operands[5], operands[1], operands[2]))
5085 return true;
5087 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5088 operands[1], operands[2]);
5089 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5090 return true;
5093 /* Expand a signed/unsigned integral vector conditional move. */
5095 bool
5096 ix86_expand_int_vcond (rtx operands[])
5098 machine_mode data_mode = GET_MODE (operands[0]);
5099 machine_mode mode = GET_MODE (operands[4]);
5100 enum rtx_code code = GET_CODE (operands[3]);
5101 bool negate = false;
5102 rtx x, cop0, cop1;
5104 cop0 = operands[4];
5105 cop1 = operands[5];
5107 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5108 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
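/* E.g. for 32-bit elements the shift count is 31: an arithmetic right
   shift by 31 yields -1 for negative elements and 0 otherwise, while a
   logical right shift yields 1 or 0.  */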
5109 if ((code == LT || code == GE)
5110 && data_mode == mode
5111 && cop1 == CONST0_RTX (mode)
5112 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5113 && GET_MODE_UNIT_SIZE (data_mode) > 1
5114 && GET_MODE_UNIT_SIZE (data_mode) <= 8
5115 && (GET_MODE_SIZE (data_mode) == 16
5116 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5118 rtx negop = operands[2 - (code == LT)];
5119 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5120 if (negop == CONST1_RTX (data_mode))
5122 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5123 operands[0], 1, OPTAB_DIRECT);
5124 if (res != operands[0])
5125 emit_move_insn (operands[0], res);
5126 return true;
5128 else if (GET_MODE_INNER (data_mode) != DImode
5129 && vector_all_ones_operand (negop, data_mode))
5131 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5132 operands[0], 0, OPTAB_DIRECT);
5133 if (res != operands[0])
5134 emit_move_insn (operands[0], res);
5135 return true;
5139 if (!nonimmediate_operand (cop1, mode))
5140 cop1 = force_reg (mode, cop1);
5141 if (!general_operand (operands[1], data_mode))
5142 operands[1] = force_reg (data_mode, operands[1]);
5143 if (!general_operand (operands[2], data_mode))
5144 operands[2] = force_reg (data_mode, operands[2]);
5146 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5147 operands[1], operands[2], &negate);
5149 if (!x)
5150 return false;
5152 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5153 operands[2-negate]);
5154 return true;
5157 static bool
5158 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5159 struct expand_vec_perm_d *d)
5161 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5162 expander, so args are either in d, or in op0, op1 etc. */
5163 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5164 machine_mode maskmode = mode;
5165 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5167 switch (mode)
5169 case E_V16QImode:
5170 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5171 gen = gen_avx512vl_vpermt2varv16qi3;
5172 break;
5173 case E_V32QImode:
5174 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5175 gen = gen_avx512vl_vpermt2varv32qi3;
5176 break;
5177 case E_V64QImode:
5178 if (TARGET_AVX512VBMI)
5179 gen = gen_avx512bw_vpermt2varv64qi3;
5180 break;
5181 case E_V8HImode:
5182 if (TARGET_AVX512VL && TARGET_AVX512BW)
5183 gen = gen_avx512vl_vpermt2varv8hi3;
5184 break;
5185 case E_V16HImode:
5186 if (TARGET_AVX512VL && TARGET_AVX512BW)
5187 gen = gen_avx512vl_vpermt2varv16hi3;
5188 break;
5189 case E_V32HImode:
5190 if (TARGET_AVX512BW)
5191 gen = gen_avx512bw_vpermt2varv32hi3;
5192 break;
5193 case E_V4SImode:
5194 if (TARGET_AVX512VL)
5195 gen = gen_avx512vl_vpermt2varv4si3;
5196 break;
5197 case E_V8SImode:
5198 if (TARGET_AVX512VL)
5199 gen = gen_avx512vl_vpermt2varv8si3;
5200 break;
5201 case E_V16SImode:
5202 if (TARGET_AVX512F)
5203 gen = gen_avx512f_vpermt2varv16si3;
5204 break;
5205 case E_V4SFmode:
5206 if (TARGET_AVX512VL)
5208 gen = gen_avx512vl_vpermt2varv4sf3;
5209 maskmode = V4SImode;
5211 break;
5212 case E_V8SFmode:
5213 if (TARGET_AVX512VL)
5215 gen = gen_avx512vl_vpermt2varv8sf3;
5216 maskmode = V8SImode;
5218 break;
5219 case E_V16SFmode:
5220 if (TARGET_AVX512F)
5222 gen = gen_avx512f_vpermt2varv16sf3;
5223 maskmode = V16SImode;
5225 break;
5226 case E_V2DImode:
5227 if (TARGET_AVX512VL)
5228 gen = gen_avx512vl_vpermt2varv2di3;
5229 break;
5230 case E_V4DImode:
5231 if (TARGET_AVX512VL)
5232 gen = gen_avx512vl_vpermt2varv4di3;
5233 break;
5234 case E_V8DImode:
5235 if (TARGET_AVX512F)
5236 gen = gen_avx512f_vpermt2varv8di3;
5237 break;
5238 case E_V2DFmode:
5239 if (TARGET_AVX512VL)
5241 gen = gen_avx512vl_vpermt2varv2df3;
5242 maskmode = V2DImode;
5244 break;
5245 case E_V4DFmode:
5246 if (TARGET_AVX512VL)
5248 gen = gen_avx512vl_vpermt2varv4df3;
5249 maskmode = V4DImode;
5251 break;
5252 case E_V8DFmode:
5253 if (TARGET_AVX512F)
5255 gen = gen_avx512f_vpermt2varv8df3;
5256 maskmode = V8DImode;
5258 break;
5259 default:
5260 break;
5263 if (gen == NULL)
5264 return false;
5266 if (d && d->testing_p)
5267 return true;
5269 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5270 expander, so args are either in d, or in op0, op1 etc. */
5271 if (d)
5273 rtx vec[64];
5274 target = d->target;
5275 op0 = d->op0;
5276 op1 = d->op1;
5277 for (int i = 0; i < d->nelt; ++i)
5278 vec[i] = GEN_INT (d->perm[i]);
5279 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5282 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5283 return true;
5286 /* Expand a variable vector permutation. */
5288 void
5289 ix86_expand_vec_perm (rtx operands[])
5291 rtx target = operands[0];
5292 rtx op0 = operands[1];
5293 rtx op1 = operands[2];
5294 rtx mask = operands[3];
5295 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5296 machine_mode mode = GET_MODE (op0);
5297 machine_mode maskmode = GET_MODE (mask);
5298 int w, e, i;
5299 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5301 /* Number of elements in the vector. */
5302 w = GET_MODE_NUNITS (mode);
5303 e = GET_MODE_UNIT_SIZE (mode);
5304 gcc_assert (w <= 64);
5306 /* For an HFmode vector, convert it to HImode using a subreg. */
5307 if (GET_MODE_INNER (mode) == HFmode)
5309 machine_mode orig_mode = mode;
5310 mode = mode_for_vector (HImode, w).require ();
5311 target = lowpart_subreg (mode, target, orig_mode);
5312 op0 = lowpart_subreg (mode, op0, orig_mode);
5313 op1 = lowpart_subreg (mode, op1, orig_mode);
5316 if (TARGET_AVX512F && one_operand_shuffle)
5318 rtx (*gen) (rtx, rtx, rtx) = NULL;
5319 switch (mode)
5321 case E_V16SImode:
5322 gen = gen_avx512f_permvarv16si;
5323 break;
5324 case E_V16SFmode:
5325 gen = gen_avx512f_permvarv16sf;
5326 break;
5327 case E_V8DImode:
5328 gen = gen_avx512f_permvarv8di;
5329 break;
5330 case E_V8DFmode:
5331 gen = gen_avx512f_permvarv8df;
5332 break;
5333 default:
5334 break;
5336 if (gen != NULL)
5338 emit_insn (gen (target, op0, mask));
5339 return;
5343 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5344 return;
5346 if (TARGET_AVX2)
5348 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5350 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5351 a constant shuffle operand. With a tiny bit of effort we can
5352 use VPERMD instead. A re-interpretation stall for V4DFmode is
5353 unfortunate but there's no avoiding it.
5354 Similarly for V16HImode we don't have instructions for variable
5355 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
5356 vpor after preparing suitable masks. */
5358 if (mode == V16HImode)
5360 maskmode = mode = V32QImode;
5361 w = 32;
5362 e = 1;
5364 else
5366 maskmode = mode = V8SImode;
5367 w = 8;
5368 e = 4;
5370 t1 = gen_reg_rtx (maskmode);
5372 /* Replicate the low bits of the V4DImode mask into V8SImode:
5373 mask = { A B C D }
5374 t1 = { A A B B C C D D }. */
5375 for (i = 0; i < w / 2; ++i)
5376 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5377 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5378 vt = force_reg (maskmode, vt);
5379 mask = gen_lowpart (maskmode, mask);
5380 if (maskmode == V8SImode)
5381 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5382 else
5383 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5385 /* Multiply the shuffle indices by two. */
5386 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5387 OPTAB_DIRECT);
5389 /* Add one to the odd shuffle indices:
5390 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5391 for (i = 0; i < w / 2; ++i)
5393 vec[i * 2] = const0_rtx;
5394 vec[i * 2 + 1] = const1_rtx;
5396 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5397 vt = validize_mem (force_const_mem (maskmode, vt));
5398 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5399 OPTAB_DIRECT);
5401 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5402 operands[3] = mask = t1;
5403 target = gen_reg_rtx (mode);
5404 op0 = gen_lowpart (mode, op0);
5405 op1 = gen_lowpart (mode, op1);
5408 switch (mode)
5410 case E_V8SImode:
5411 /* The VPERMD and VPERMPS instructions already properly ignore
5412 the high bits of the shuffle elements. No need for us to
5413 perform an AND ourselves. */
5414 if (one_operand_shuffle)
5416 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5417 if (target != operands[0])
5418 emit_move_insn (operands[0],
5419 gen_lowpart (GET_MODE (operands[0]), target));
5421 else
5423 t1 = gen_reg_rtx (V8SImode);
5424 t2 = gen_reg_rtx (V8SImode);
5425 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5426 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5427 goto merge_two;
5429 return;
5431 case E_V8SFmode:
5432 mask = gen_lowpart (V8SImode, mask);
5433 if (one_operand_shuffle)
5434 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5435 else
5437 t1 = gen_reg_rtx (V8SFmode);
5438 t2 = gen_reg_rtx (V8SFmode);
5439 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5440 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5441 goto merge_two;
5443 return;
5445 case E_V4SImode:
5446 /* By combining the two 128-bit input vectors into one 256-bit
5447 input vector, we can use VPERMD and VPERMPS for the full
5448 two-operand shuffle. */
5449 t1 = gen_reg_rtx (V8SImode);
5450 t2 = gen_reg_rtx (V8SImode);
5451 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5452 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5453 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5454 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5455 return;
5457 case E_V4SFmode:
5458 t1 = gen_reg_rtx (V8SFmode);
5459 t2 = gen_reg_rtx (V8SImode);
5460 mask = gen_lowpart (V4SImode, mask);
5461 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5462 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5463 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5464 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5465 return;
5467 case E_V32QImode:
5468 t1 = gen_reg_rtx (V32QImode);
5469 t2 = gen_reg_rtx (V32QImode);
5470 t3 = gen_reg_rtx (V32QImode);
5471 vt2 = GEN_INT (-128);
5472 vt = gen_const_vec_duplicate (V32QImode, vt2);
5473 vt = force_reg (V32QImode, vt);
5474 for (i = 0; i < 32; i++)
5475 vec[i] = i < 16 ? vt2 : const0_rtx;
5476 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5477 vt2 = force_reg (V32QImode, vt2);
5478 /* From mask create two adjusted masks, which contain the same
5479 bits as mask in the low 7 bits of each vector element.
5480 The first mask will have the most significant bit clear
5481 if it requests element from the same 128-bit lane
5482 and MSB set if it requests element from the other 128-bit lane.
5483 The second mask will have the opposite values of the MSB,
5484 and additionally will have its 128-bit lanes swapped.
5485 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5486 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5487 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5488 stands for the other 12 bytes. */
5489 /* The bit telling whether an element is from the same lane or the
5490 other lane is bit 4, so shift it up by 3 to the MSB position. */
5491 t5 = gen_reg_rtx (V4DImode);
5492 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5493 GEN_INT (3)));
5494 /* Clear MSB bits from the mask just in case it had them set. */
5495 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5496 /* After this t1 will have the MSB set for elements from the other lane. */
5497 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5498 /* Clear bits other than MSB. */
5499 emit_insn (gen_andv32qi3 (t1, t1, vt));
5500 /* Or in the lower bits from mask into t3. */
5501 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5502 /* And invert MSB bits in t1, so MSB is set for elements from the same
5503 lane. */
5504 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5505 /* Swap 128-bit lanes in t3. */
5506 t6 = gen_reg_rtx (V4DImode);
5507 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5508 const2_rtx, GEN_INT (3),
5509 const0_rtx, const1_rtx));
5510 /* And or in the lower bits from mask into t1. */
5511 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5512 if (one_operand_shuffle)
5514 /* Each of these shuffles will put 0s in places where an
5515 element from the other 128-bit lane is needed; otherwise
5516 it will shuffle in the requested value. */
5517 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5518 gen_lowpart (V32QImode, t6)));
5519 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5520 /* For t3 the 128-bit lanes are swapped again. */
5521 t7 = gen_reg_rtx (V4DImode);
5522 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5523 const2_rtx, GEN_INT (3),
5524 const0_rtx, const1_rtx));
5525 /* And ORing both together leads to the result. */
5526 emit_insn (gen_iorv32qi3 (target, t1,
5527 gen_lowpart (V32QImode, t7)));
5528 if (target != operands[0])
5529 emit_move_insn (operands[0],
5530 gen_lowpart (GET_MODE (operands[0]), target));
5531 return;
5534 t4 = gen_reg_rtx (V32QImode);
5535 /* Similar to the one_operand_shuffle code above, just repeated
5536 once for each operand; the merge_two: code below will merge
5537 the two results together. */
5538 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5539 gen_lowpart (V32QImode, t6)));
5540 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5541 gen_lowpart (V32QImode, t6)));
5542 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5543 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5544 t7 = gen_reg_rtx (V4DImode);
5545 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5546 const2_rtx, GEN_INT (3),
5547 const0_rtx, const1_rtx));
5548 t8 = gen_reg_rtx (V4DImode);
5549 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5550 const2_rtx, GEN_INT (3),
5551 const0_rtx, const1_rtx));
5552 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5553 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5554 t1 = t4;
5555 t2 = t3;
5556 goto merge_two;
5558 default:
5559 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5560 break;
5564 if (TARGET_XOP)
5566 /* The XOP VPPERM insn supports three inputs. By ignoring the
5567 one_operand_shuffle special case, we avoid creating another
5568 set of constant vectors in memory. */
5569 one_operand_shuffle = false;
5571 /* mask = mask & {2*w-1, ...} */
5572 vt = GEN_INT (2*w - 1);
5574 else
5576 /* mask = mask & {w-1, ...} */
5577 vt = GEN_INT (w - 1);
5580 vt = gen_const_vec_duplicate (maskmode, vt);
5581 mask = expand_simple_binop (maskmode, AND, mask, vt,
5582 NULL_RTX, 0, OPTAB_DIRECT);
5584 /* For non-QImode operations, convert the word permutation control
5585 into a byte permutation control. */
5586 if (mode != V16QImode)
5588 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5589 GEN_INT (exact_log2 (e)),
5590 NULL_RTX, 0, OPTAB_DIRECT);
5592 /* Convert mask to vector of chars. */
5593 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5595 /* Replicate each of the input bytes into byte positions:
5596 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5597 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5598 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5599 for (i = 0; i < 16; ++i)
5600 vec[i] = GEN_INT (i/e * e);
5601 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5602 vt = validize_mem (force_const_mem (V16QImode, vt));
5603 if (TARGET_XOP)
5604 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5605 else
5606 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5608 /* Convert it into the byte positions by doing
5609 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5610 for (i = 0; i < 16; ++i)
5611 vec[i] = GEN_INT (i % e);
5612 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5613 vt = validize_mem (force_const_mem (V16QImode, vt));
5614 emit_insn (gen_addv16qi3 (mask, mask, vt));
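/* At this point an element index k has been expanded into the byte
   indices { k*e, k*e+1, ..., k*e+e-1 }; e.g. for V4SImode, element
   index 2 becomes the bytes { 8, 9, 10, 11 }.  */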
5617 /* The actual shuffle operations all operate on V16QImode. */
5618 op0 = gen_lowpart (V16QImode, op0);
5619 op1 = gen_lowpart (V16QImode, op1);
5621 if (TARGET_XOP)
5623 if (GET_MODE (target) != V16QImode)
5624 target = gen_reg_rtx (V16QImode);
5625 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5626 if (target != operands[0])
5627 emit_move_insn (operands[0],
5628 gen_lowpart (GET_MODE (operands[0]), target));
5630 else if (one_operand_shuffle)
5632 if (GET_MODE (target) != V16QImode)
5633 target = gen_reg_rtx (V16QImode);
5634 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5635 if (target != operands[0])
5636 emit_move_insn (operands[0],
5637 gen_lowpart (GET_MODE (operands[0]), target));
5639 else
5641 rtx xops[6];
5642 bool ok;
5644 /* Shuffle the two input vectors independently. */
5645 t1 = gen_reg_rtx (V16QImode);
5646 t2 = gen_reg_rtx (V16QImode);
5647 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5648 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5650 merge_two:
5651 /* Then merge them together. The key is whether any given control
5652 element contained a bit set that indicates the second word. */
5653 mask = operands[3];
5654 vt = GEN_INT (w);
5655 if (maskmode == V2DImode && !TARGET_SSE4_1)
5657 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5658 more shuffle to convert the V2DI input mask into a V4SI
5659 input mask, at which point the masking done by
5660 ix86_expand_int_vcond will work as desired. */
5661 rtx t3 = gen_reg_rtx (V4SImode);
5662 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5663 const0_rtx, const0_rtx,
5664 const2_rtx, const2_rtx));
5665 mask = t3;
5666 maskmode = V4SImode;
5667 e = w = 4;
5670 vt = gen_const_vec_duplicate (maskmode, vt);
5671 vt = force_reg (maskmode, vt);
5672 mask = expand_simple_binop (maskmode, AND, mask, vt,
5673 NULL_RTX, 0, OPTAB_DIRECT);
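/* mask & w is w exactly for those control elements that selected the
   second input, so the EQ comparison handed to ix86_expand_int_vcond
   below picks t2 (the shuffle of op1) for them and t1 otherwise.  */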
5675 if (GET_MODE (target) != mode)
5676 target = gen_reg_rtx (mode);
5677 xops[0] = target;
5678 xops[1] = gen_lowpart (mode, t2);
5679 xops[2] = gen_lowpart (mode, t1);
5680 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5681 xops[4] = mask;
5682 xops[5] = vt;
5683 ok = ix86_expand_int_vcond (xops);
5684 gcc_assert (ok);
5685 if (target != operands[0])
5686 emit_move_insn (operands[0],
5687 gen_lowpart (GET_MODE (operands[0]), target));
5691 /* Extend SRC into the next wider integer vector type. UNSIGNED_P is
5692 true if we should do zero extension, else sign extension. */
5694 void
5695 ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5697 machine_mode imode = GET_MODE (src);
5698 rtx ops[3];
5700 switch (imode)
5702 case E_V8QImode:
5703 case E_V4QImode:
5704 case E_V2QImode:
5705 case E_V4HImode:
5706 case E_V2HImode:
5707 case E_V2SImode:
5708 break;
5709 default:
5710 gcc_unreachable ();
5713 ops[0] = dest;
5715 ops[1] = force_reg (imode, src);
5717 if (unsigned_p)
5718 ops[2] = force_reg (imode, CONST0_RTX (imode));
5719 else
5720 ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5721 ops[1], pc_rtx, pc_rtx);
5723 ix86_split_mmx_punpck (ops, false);
5726 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5727 true if we should do zero extension, else sign extension. HIGH_P is
5728 true if we want the N/2 high elements, else the low elements. */
5730 void
5731 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5733 machine_mode imode = GET_MODE (src);
5734 rtx tmp;
5736 if (TARGET_SSE4_1)
5738 rtx (*unpack)(rtx, rtx);
5739 rtx (*extract)(rtx, rtx) = NULL;
5740 machine_mode halfmode = BLKmode;
5742 switch (imode)
5744 case E_V64QImode:
5745 if (unsigned_p)
5746 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5747 else
5748 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5749 halfmode = V32QImode;
5750 extract
5751 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5752 break;
5753 case E_V32QImode:
5754 if (unsigned_p)
5755 unpack = gen_avx2_zero_extendv16qiv16hi2;
5756 else
5757 unpack = gen_avx2_sign_extendv16qiv16hi2;
5758 halfmode = V16QImode;
5759 extract
5760 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5761 break;
5762 case E_V32HImode:
5763 if (unsigned_p)
5764 unpack = gen_avx512f_zero_extendv16hiv16si2;
5765 else
5766 unpack = gen_avx512f_sign_extendv16hiv16si2;
5767 halfmode = V16HImode;
5768 extract
5769 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5770 break;
5771 case E_V16HImode:
5772 if (unsigned_p)
5773 unpack = gen_avx2_zero_extendv8hiv8si2;
5774 else
5775 unpack = gen_avx2_sign_extendv8hiv8si2;
5776 halfmode = V8HImode;
5777 extract
5778 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5779 break;
5780 case E_V16SImode:
5781 if (unsigned_p)
5782 unpack = gen_avx512f_zero_extendv8siv8di2;
5783 else
5784 unpack = gen_avx512f_sign_extendv8siv8di2;
5785 halfmode = V8SImode;
5786 extract
5787 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5788 break;
5789 case E_V8SImode:
5790 if (unsigned_p)
5791 unpack = gen_avx2_zero_extendv4siv4di2;
5792 else
5793 unpack = gen_avx2_sign_extendv4siv4di2;
5794 halfmode = V4SImode;
5795 extract
5796 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5797 break;
5798 case E_V16QImode:
5799 if (unsigned_p)
5800 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5801 else
5802 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5803 break;
5804 case E_V8HImode:
5805 if (unsigned_p)
5806 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5807 else
5808 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5809 break;
5810 case E_V4SImode:
5811 if (unsigned_p)
5812 unpack = gen_sse4_1_zero_extendv2siv2di2;
5813 else
5814 unpack = gen_sse4_1_sign_extendv2siv2di2;
5815 break;
5816 case E_V8QImode:
5817 if (unsigned_p)
5818 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5819 else
5820 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5821 break;
5822 case E_V4HImode:
5823 if (unsigned_p)
5824 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5825 else
5826 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5827 break;
5828 case E_V4QImode:
5829 if (unsigned_p)
5830 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5831 else
5832 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5833 break;
5834 default:
5835 gcc_unreachable ();
5838 if (GET_MODE_SIZE (imode) >= 32)
5840 tmp = gen_reg_rtx (halfmode);
5841 emit_insn (extract (tmp, src));
5843 else if (high_p)
5845 switch (GET_MODE_SIZE (imode))
5847 case 16:
5848 /* Shift higher 8 bytes to lower 8 bytes. */
5849 tmp = gen_reg_rtx (V1TImode);
5850 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5851 GEN_INT (64)));
5852 break;
5853 case 8:
5854 /* Shift higher 4 bytes to lower 4 bytes. */
5855 tmp = gen_reg_rtx (V1DImode);
5856 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5857 GEN_INT (32)));
5858 break;
5859 case 4:
5860 /* Shift higher 2 bytes to lower 2 bytes. */
5861 tmp = gen_reg_rtx (V1SImode);
5862 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5863 GEN_INT (16)));
5864 break;
5865 default:
5866 gcc_unreachable ();
5869 tmp = gen_lowpart (imode, tmp);
5871 else
5872 tmp = src;
5874 emit_insn (unpack (dest, tmp));
5876 else
5878 rtx (*unpack)(rtx, rtx, rtx);
5880 switch (imode)
5882 case E_V16QImode:
5883 if (high_p)
5884 unpack = gen_vec_interleave_highv16qi;
5885 else
5886 unpack = gen_vec_interleave_lowv16qi;
5887 break;
5888 case E_V8HImode:
5889 if (high_p)
5890 unpack = gen_vec_interleave_highv8hi;
5891 else
5892 unpack = gen_vec_interleave_lowv8hi;
5893 break;
5894 case E_V4SImode:
5895 if (high_p)
5896 unpack = gen_vec_interleave_highv4si;
5897 else
5898 unpack = gen_vec_interleave_lowv4si;
5899 break;
5900 case E_V8QImode:
5901 if (high_p)
5902 unpack = gen_mmx_punpckhbw;
5903 else
5904 unpack = gen_mmx_punpcklbw;
5905 break;
5906 case E_V4HImode:
5907 if (high_p)
5908 unpack = gen_mmx_punpckhwd;
5909 else
5910 unpack = gen_mmx_punpcklwd;
5911 break;
5912 case E_V4QImode:
5913 if (high_p)
5914 unpack = gen_mmx_punpckhbw_low;
5915 else
5916 unpack = gen_mmx_punpcklbw_low;
5917 break;
5918 default:
5919 gcc_unreachable ();
5922 if (unsigned_p)
5923 tmp = force_reg (imode, CONST0_RTX (imode));
5924 else
5925 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5926 src, pc_rtx, pc_rtx);
5928 rtx tmp2 = gen_reg_rtx (imode);
5929 emit_insn (unpack (tmp2, src, tmp));
5930 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5934 /* Return true if MEM is a constant-pool constant containing a
5935 const_vector permutation index; if so, assign the index to PERM. */
5936 bool
5937 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5939 machine_mode mode = GET_MODE (mem);
5940 int nelt = GET_MODE_NUNITS (mode);
5942 if (!INTEGRAL_MODE_P (mode))
5943 return false;
5945 /* Needs to be constant pool. */
5946 if (!(MEM_P (mem))
5947 || !SYMBOL_REF_P (XEXP (mem, 0))
5948 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5949 return false;
5951 rtx constant = get_pool_constant (XEXP (mem, 0));
5953 if (GET_CODE (constant) != CONST_VECTOR)
5954 return false;
5956 /* There could be some rtx like
5957 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5958 but with "*.LC1" referring to a V2DI constant vector. */
5959 if (GET_MODE (constant) != mode)
5961 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5963 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5964 return false;
5967 for (int i = 0; i != nelt; i++)
5968 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5970 return true;
5973 /* Split OPERAND into word-sized parts. Similar to split_double_mode,
5974 but also works for floating-point parameters and non-offsettable
5975 memories. For pushes, it returns just stack offsets; the values will
5976 be saved in the right order. At most four parts are generated. */
5978 static int
5979 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5981 int size;
5983 if (!TARGET_64BIT)
5984 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5985 else
5986 size = (GET_MODE_SIZE (mode) + 4) / 8;
5988 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5989 gcc_assert (size >= 2 && size <= 4);
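/* For example, on a 32-bit target DImode and DFmode split into 2 SImode
   parts, XFmode into 3 and TFmode into 4; on a 64-bit target the modes
   that reach here split into 2 parts.  */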
5991 /* Optimize constant pool references to immediates. This is used by fp
5992 moves, which force all constants to memory to allow combining. */
5993 if (MEM_P (operand) && MEM_READONLY_P (operand))
5994 operand = avoid_constant_pool_reference (operand);
5996 if (MEM_P (operand) && !offsettable_memref_p (operand))
5998 /* The only non-offsettable memories we handle are pushes. */
5999 int ok = push_operand (operand, VOIDmode);
6001 gcc_assert (ok);
6003 operand = copy_rtx (operand);
6004 PUT_MODE (operand, word_mode);
6005 parts[0] = parts[1] = parts[2] = parts[3] = operand;
6006 return size;
6009 if (GET_CODE (operand) == CONST_VECTOR)
6011 scalar_int_mode imode = int_mode_for_mode (mode).require ();
6012 /* Caution: if we looked through a constant pool memory above,
6013 the operand may actually have a different mode now. That's
6014 ok, since we want to pun this all the way back to an integer. */
6015 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
6016 gcc_assert (operand != NULL);
6017 mode = imode;
6020 if (!TARGET_64BIT)
6022 if (mode == DImode)
6023 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6024 else
6026 int i;
6028 if (REG_P (operand))
6030 gcc_assert (reload_completed);
6031 for (i = 0; i < size; i++)
6032 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
6034 else if (offsettable_memref_p (operand))
6036 operand = adjust_address (operand, SImode, 0);
6037 parts[0] = operand;
6038 for (i = 1; i < size; i++)
6039 parts[i] = adjust_address (operand, SImode, 4 * i);
6041 else if (CONST_DOUBLE_P (operand))
6043 const REAL_VALUE_TYPE *r;
6044 long l[4];
6046 r = CONST_DOUBLE_REAL_VALUE (operand);
6047 switch (mode)
6049 case E_TFmode:
6050 real_to_target (l, r, mode);
6051 parts[3] = gen_int_mode (l[3], SImode);
6052 parts[2] = gen_int_mode (l[2], SImode);
6053 break;
6054 case E_XFmode:
6055 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6056 long double may not be 80-bit. */
6057 real_to_target (l, r, mode);
6058 parts[2] = gen_int_mode (l[2], SImode);
6059 break;
6060 case E_DFmode:
6061 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
6062 break;
6063 default:
6064 gcc_unreachable ();
6066 parts[1] = gen_int_mode (l[1], SImode);
6067 parts[0] = gen_int_mode (l[0], SImode);
6069 else
6070 gcc_unreachable ();
6073 else
6075 if (mode == TImode)
6076 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6077 if (mode == XFmode || mode == TFmode)
6079 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
6080 if (REG_P (operand))
6082 gcc_assert (reload_completed);
6083 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
6084 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
6086 else if (offsettable_memref_p (operand))
6088 operand = adjust_address (operand, DImode, 0);
6089 parts[0] = operand;
6090 parts[1] = adjust_address (operand, upper_mode, 8);
6092 else if (CONST_DOUBLE_P (operand))
6094 long l[4];
6096 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
6098 /* real_to_target puts 32-bit pieces in each long. */
6099 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
6100 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
6101 << 32), DImode);
6103 if (upper_mode == SImode)
6104 parts[1] = gen_int_mode (l[2], SImode);
6105 else
6106 parts[1]
6107 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
6108 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
6109 << 32), DImode);
6111 else
6112 gcc_unreachable ();
6116 return size;
6119 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
6120 The operands array is used as scratch: entries 2 onwards receive
6121 the destination parts and entries 6 onwards the source parts, in
6122 the correct order for the emitted moves. */
6124 void
6125 ix86_split_long_move (rtx operands[])
6127 rtx part[2][4];
6128 int nparts, i, j;
6129 int push = 0;
6130 int collisions = 0;
6131 machine_mode mode = GET_MODE (operands[0]);
6132 bool collisionparts[4];
6134 /* The DFmode expanders may ask us to move a double.
6135 For a 64-bit target this is a single move. By hiding the fact
6136 here we simplify the i386.md splitters. */
6137 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
6139 /* Optimize constant pool references to immediates. This is used by
6140 fp moves, which force all constants to memory to allow combining. */
6142 if (MEM_P (operands[1])
6143 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
6144 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
6145 operands[1] = get_pool_constant (XEXP (operands[1], 0));
6146 if (push_operand (operands[0], VOIDmode))
6148 operands[0] = copy_rtx (operands[0]);
6149 PUT_MODE (operands[0], word_mode);
6151 else
6152 operands[0] = gen_lowpart (DImode, operands[0]);
6153 operands[1] = gen_lowpart (DImode, operands[1]);
6154 emit_move_insn (operands[0], operands[1]);
6155 return;
6158 /* The only non-offsettable memory we handle is push. */
6159 if (push_operand (operands[0], VOIDmode))
6160 push = 1;
6161 else
6162 gcc_assert (!MEM_P (operands[0])
6163 || offsettable_memref_p (operands[0]));
6165 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6166 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6168 /* When emitting a push, take care of source operands on the stack. */
6169 if (push && MEM_P (operands[1])
6170 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6172 rtx src_base = XEXP (part[1][nparts - 1], 0);
6174 /* Compensate for the stack decrement by 4. */
6175 if (!TARGET_64BIT && nparts == 3
6176 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6177 src_base = plus_constant (Pmode, src_base, 4);
6179 /* src_base refers to the stack pointer and is
6180 automatically decreased by the emitted pushes. */
6181 for (i = 0; i < nparts; i++)
6182 part[1][i] = change_address (part[1][i],
6183 GET_MODE (part[1][i]), src_base);
6186 /* We need to do the copy in the right order in case an address register
6187 of the source overlaps the destination. */
6188 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6190 rtx tmp;
6192 for (i = 0; i < nparts; i++)
6194 collisionparts[i]
6195 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6196 if (collisionparts[i])
6197 collisions++;
6200 /* Collision in the middle part can be handled by reordering. */
6201 if (collisions == 1 && nparts == 3 && collisionparts [1])
6203 std::swap (part[0][1], part[0][2]);
6204 std::swap (part[1][1], part[1][2]);
6206 else if (collisions == 1
6207 && nparts == 4
6208 && (collisionparts [1] || collisionparts [2]))
6210 if (collisionparts [1])
6212 std::swap (part[0][1], part[0][2]);
6213 std::swap (part[1][1], part[1][2]);
6215 else
6217 std::swap (part[0][2], part[0][3]);
6218 std::swap (part[1][2], part[1][3]);
6222 /* If there are more collisions, we can't handle them by reordering.
6223 Do an lea to the last part and use only one colliding move. */
6224 else if (collisions > 1)
6226 rtx base, addr;
6228 collisions = 1;
6230 base = part[0][nparts - 1];
6232 /* Handle the case when the last part isn't valid for lea.
6233 Happens in 64-bit mode storing the 12-byte XFmode. */
6234 if (GET_MODE (base) != Pmode)
6235 base = gen_rtx_REG (Pmode, REGNO (base));
6237 addr = XEXP (part[1][0], 0);
6238 if (TARGET_TLS_DIRECT_SEG_REFS)
6240 struct ix86_address parts;
6241 int ok = ix86_decompose_address (addr, &parts);
6242 gcc_assert (ok);
6243 /* It is not valid to use %gs: or %fs: in lea. */
6244 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6246 emit_insn (gen_rtx_SET (base, addr));
6247 part[1][0] = replace_equiv_address (part[1][0], base);
6248 for (i = 1; i < nparts; i++)
6250 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6251 part[1][i] = replace_equiv_address (part[1][i], tmp);
6256 if (push)
6258 if (!TARGET_64BIT)
6260 if (nparts == 3)
6262 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6263 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6264 emit_move_insn (part[0][2], part[1][2]);
6266 else if (nparts == 4)
6268 emit_move_insn (part[0][3], part[1][3]);
6269 emit_move_insn (part[0][2], part[1][2]);
6272 else
6274 /* In 64-bit mode we don't have a 32-bit push available. If this is
6275 a register, that is OK - we will just use the larger counterpart.
6276 We also retype memory - these come from an attempt to avoid a REX
6277 prefix when moving the second half of a TFmode value. */
6278 if (GET_MODE (part[1][1]) == SImode)
6280 switch (GET_CODE (part[1][1]))
6282 case MEM:
6283 part[1][1] = adjust_address (part[1][1], DImode, 0);
6284 break;
6286 case REG:
6287 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6288 break;
6290 default:
6291 gcc_unreachable ();
6294 if (GET_MODE (part[1][0]) == SImode)
6295 part[1][0] = part[1][1];
6298 emit_move_insn (part[0][1], part[1][1]);
6299 emit_move_insn (part[0][0], part[1][0]);
6300 return;
6303 /* Choose the correct order so as not to overwrite the source before it is copied. */
6304 if ((REG_P (part[0][0])
6305 && REG_P (part[1][1])
6306 && (REGNO (part[0][0]) == REGNO (part[1][1])
6307 || (nparts == 3
6308 && REGNO (part[0][0]) == REGNO (part[1][2]))
6309 || (nparts == 4
6310 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6311 || (collisions > 0
6312 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6314 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6316 operands[2 + i] = part[0][j];
6317 operands[6 + i] = part[1][j];
6320 else
6322 for (i = 0; i < nparts; i++)
6324 operands[2 + i] = part[0][i];
6325 operands[6 + i] = part[1][i];
6329 /* Attempt to locally unCSE nonzero constants. */
6330 for (j = 0; j < nparts - 1; j++)
6331 if (CONST_INT_P (operands[6 + j])
6332 && operands[6 + j] != const0_rtx
6333 && REG_P (operands[2 + j]))
6334 for (i = j; i < nparts - 1; i++)
6335 if (CONST_INT_P (operands[7 + i])
6336 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6337 operands[7 + i] = operands[2 + j];
6339 for (i = 0; i < nparts; i++)
6340 emit_move_insn (operands[2 + i], operands[6 + i]);
6342 return;
6345 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6346 left shift by a constant, either using a single shift or
6347 a sequence of add instructions. */
6349 static void
6350 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6352 if (count == 1
6353 || (count * ix86_cost->add <= ix86_cost->shift_const
6354 && !optimize_insn_for_size_p ()))
6356 while (count-- > 0)
6357 emit_insn (gen_add2_insn (operand, operand));
6359 else
6361 rtx (*insn)(rtx, rtx, rtx);
6363 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6364 emit_insn (insn (operand, operand, GEN_INT (count)));
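/* Editorial illustration, not part of GCC: a minimal C sketch of the
   add-vs-shift choice made above.  A left shift by a small constant
   COUNT can be expanded as COUNT additions of the operand to itself,
   which is what the first branch of ix86_expand_ashl_const emits.
   The helper name is hypothetical.  */

static unsigned int
shl_via_adds (unsigned int x, unsigned int count)
{
  /* Each addition doubles X, i.e. shifts it left by one bit.  */
  while (count-- > 0)
    x += x;
  return x;
}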
6368 void
6369 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6371 rtx (*gen_ashl3)(rtx, rtx, rtx);
6372 rtx (*gen_shld)(rtx, rtx, rtx);
6373 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6374 machine_mode half_mode;
6376 rtx low[2], high[2];
6377 int count;
6379 if (CONST_INT_P (operands[2]))
6381 split_double_mode (mode, operands, 2, low, high);
6382 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6384 if (count >= half_width)
6386 emit_move_insn (high[0], low[1]);
6387 ix86_expand_clear (low[0]);
6389 if (count > half_width)
6390 ix86_expand_ashl_const (high[0], count - half_width, mode);
6392 else if (count == 1)
6394 if (!rtx_equal_p (operands[0], operands[1]))
6395 emit_move_insn (operands[0], operands[1]);
6396 rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
6397 rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
6398 half_mode = mode == DImode ? SImode : DImode;
6399 emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
6400 low[0], low[0]));
6401 emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
6402 x3, x4));
6404 else
6406 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6408 if (!rtx_equal_p (operands[0], operands[1]))
6409 emit_move_insn (operands[0], operands[1]);
6411 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6412 ix86_expand_ashl_const (low[0], count, mode);
6414 return;
6417 split_double_mode (mode, operands, 1, low, high);
6418 half_mode = mode == DImode ? SImode : DImode;
6420 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6422 if (operands[1] == const1_rtx)
6424 /* Assuming we've chosen QImode-capable registers, then 1 << N
6425 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6426 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6428 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6430 ix86_expand_clear (low[0]);
6431 ix86_expand_clear (high[0]);
6432 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6434 d = gen_lowpart (QImode, low[0]);
6435 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6436 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6437 emit_insn (gen_rtx_SET (d, s));
6439 d = gen_lowpart (QImode, high[0]);
6440 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6441 s = gen_rtx_NE (QImode, flags, const0_rtx);
6442 emit_insn (gen_rtx_SET (d, s));
6445 /* Otherwise, we can get the same results by manually performing
6446 a bit extract operation on bit 5/6, and then performing the two
6447 shifts. The two methods of getting 0/1 into low/high are exactly
6448 the same size. Avoiding the shift in the bit extract case helps
6449 pentium4 a bit; no one else seems to care much either way. */
6450 else
6452 rtx (*gen_lshr3)(rtx, rtx, rtx);
6453 rtx (*gen_and3)(rtx, rtx, rtx);
6454 rtx (*gen_xor3)(rtx, rtx, rtx);
6455 HOST_WIDE_INT bits;
6456 rtx x;
6458 if (mode == DImode)
6460 gen_lshr3 = gen_lshrsi3;
6461 gen_and3 = gen_andsi3;
6462 gen_xor3 = gen_xorsi3;
6463 bits = 5;
6465 else
6467 gen_lshr3 = gen_lshrdi3;
6468 gen_and3 = gen_anddi3;
6469 gen_xor3 = gen_xordi3;
6470 bits = 6;
6473 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6474 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6475 else
6476 x = gen_lowpart (half_mode, operands[2]);
6477 emit_insn (gen_rtx_SET (high[0], x));
6479 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6480 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6481 emit_move_insn (low[0], high[0]);
6482 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6485 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6486 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6487 return;
6490 if (operands[1] == constm1_rtx)
6492 /* For -1 << N, we can avoid the shld instruction, because we
6493 know that we're shifting 0...31/63 ones into a -1. */
6494 emit_move_insn (low[0], constm1_rtx);
6495 if (optimize_insn_for_size_p ())
6496 emit_move_insn (high[0], low[0]);
6497 else
6498 emit_move_insn (high[0], constm1_rtx);
6500 else
6502 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6504 if (!rtx_equal_p (operands[0], operands[1]))
6505 emit_move_insn (operands[0], operands[1]);
6507 split_double_mode (mode, operands, 1, low, high);
6508 emit_insn (gen_shld (high[0], low[0], operands[2]));
6511 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6513 if (TARGET_CMOVE && scratch)
6515 ix86_expand_clear (scratch);
6516 emit_insn (gen_x86_shift_adj_1
6517 (half_mode, high[0], low[0], operands[2], scratch));
6519 else
6520 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
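/* Editorial illustration, not part of GCC: a C sketch of the double-word
   left shift that ix86_split_ashl open-codes above, modelled on the two
   32-bit halves of a 64-bit value.  The "shld" step corresponds to
   gen_x86_shld and the count >= half_width case to the move/clear path;
   the helper name is hypothetical and the variable-count fix-up
   (gen_x86_shift_adj_1/2) is not modelled.  */

#include <stdint.h>

static void
shl64_via_halves (uint32_t *lo, uint32_t *hi, unsigned int count)
{
  count &= 63;
  if (count == 0)
    return;
  if (count >= 32)
    {
      /* The high half receives the (possibly shifted) low half and the
	 low half becomes zero.  */
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else
    {
      /* shld: high = high << count | low >> (32 - count).  */
      *hi = (*hi << count) | (*lo >> (32 - count));
      *lo <<= count;
    }
}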
6523 void
6524 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6526 rtx (*gen_ashr3)(rtx, rtx, rtx)
6527 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6528 rtx (*gen_shrd)(rtx, rtx, rtx);
6529 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6531 rtx low[2], high[2];
6532 int count;
6534 if (CONST_INT_P (operands[2]))
6536 split_double_mode (mode, operands, 2, low, high);
6537 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6539 if (count == GET_MODE_BITSIZE (mode) - 1)
6541 emit_move_insn (high[0], high[1]);
6542 emit_insn (gen_ashr3 (high[0], high[0],
6543 GEN_INT (half_width - 1)));
6544 emit_move_insn (low[0], high[0]);
6547 else if (count >= half_width)
6549 emit_move_insn (low[0], high[1]);
6550 emit_move_insn (high[0], low[0]);
6551 emit_insn (gen_ashr3 (high[0], high[0],
6552 GEN_INT (half_width - 1)));
6554 if (count > half_width)
6555 emit_insn (gen_ashr3 (low[0], low[0],
6556 GEN_INT (count - half_width)));
6558 else if (count == 1
6559 && (TARGET_USE_RCR || optimize_size > 1))
6561 if (!rtx_equal_p (operands[0], operands[1]))
6562 emit_move_insn (operands[0], operands[1]);
6563 if (mode == DImode)
6565 emit_insn (gen_ashrsi3_carry (high[0], high[0]));
6566 emit_insn (gen_rcrsi2 (low[0], low[0]));
6568 else
6570 emit_insn (gen_ashrdi3_carry (high[0], high[0]));
6571 emit_insn (gen_rcrdi2 (low[0], low[0]));
6574 else
6576 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6578 if (!rtx_equal_p (operands[0], operands[1]))
6579 emit_move_insn (operands[0], operands[1]);
6581 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6582 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6585 else
6587 machine_mode half_mode;
6589 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6591 if (!rtx_equal_p (operands[0], operands[1]))
6592 emit_move_insn (operands[0], operands[1]);
6594 split_double_mode (mode, operands, 1, low, high);
6595 half_mode = mode == DImode ? SImode : DImode;
6597 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6598 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6600 if (TARGET_CMOVE && scratch)
6602 emit_move_insn (scratch, high[0]);
6603 emit_insn (gen_ashr3 (scratch, scratch,
6604 GEN_INT (half_width - 1)));
6605 emit_insn (gen_x86_shift_adj_1
6606 (half_mode, low[0], high[0], operands[2], scratch));
6608 else
6609 emit_insn (gen_x86_shift_adj_3
6610 (half_mode, low[0], high[0], operands[2]));
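/* Editorial illustration, not part of GCC: the corresponding sketch for
   the arithmetic right shift split above.  The "shrd" step corresponds
   to gen_x86_shrd and the count >= half_width case to the sign-fill
   path.  The helper name is hypothetical; signed >> is assumed to be
   arithmetic, as GCC guarantees.  */

#include <stdint.h>

static void
sar64_via_halves (uint32_t *lo, int32_t *hi, unsigned int count)
{
  count &= 63;
  if (count == 0)
    return;
  if (count >= 32)
    {
      /* The low half receives the shifted high half; the high half is
	 filled with copies of the sign bit.  */
      *lo = (uint32_t) (*hi >> (count - 32));
      *hi >>= 31;
    }
  else
    {
      /* shrd: low = low >> count | high << (32 - count).  */
      *lo = (*lo >> count) | ((uint32_t) *hi << (32 - count));
      *hi >>= count;
    }
}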
6614 void
6615 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6617 rtx (*gen_lshr3)(rtx, rtx, rtx)
6618 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6619 rtx (*gen_shrd)(rtx, rtx, rtx);
6620 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6622 rtx low[2], high[2];
6623 int count;
6625 if (CONST_INT_P (operands[2]))
6627 split_double_mode (mode, operands, 2, low, high);
6628 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6630 if (count >= half_width)
6632 emit_move_insn (low[0], high[1]);
6633 ix86_expand_clear (high[0]);
6635 if (count > half_width)
6636 emit_insn (gen_lshr3 (low[0], low[0],
6637 GEN_INT (count - half_width)));
6639 else if (count == 1
6640 && (TARGET_USE_RCR || optimize_size > 1))
6642 if (!rtx_equal_p (operands[0], operands[1]))
6643 emit_move_insn (operands[0], operands[1]);
6644 if (mode == DImode)
6646 emit_insn (gen_lshrsi3_carry (high[0], high[0]));
6647 emit_insn (gen_rcrsi2 (low[0], low[0]));
6649 else
6651 emit_insn (gen_lshrdi3_carry (high[0], high[0]));
6652 emit_insn (gen_rcrdi2 (low[0], low[0]));
6655 else
6657 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6659 if (!rtx_equal_p (operands[0], operands[1]))
6660 emit_move_insn (operands[0], operands[1]);
6662 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6663 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6666 else
6668 machine_mode half_mode;
6670 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6672 if (!rtx_equal_p (operands[0], operands[1]))
6673 emit_move_insn (operands[0], operands[1]);
6675 split_double_mode (mode, operands, 1, low, high);
6676 half_mode = mode == DImode ? SImode : DImode;
6678 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6679 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6681 if (TARGET_CMOVE && scratch)
6683 ix86_expand_clear (scratch);
6684 emit_insn (gen_x86_shift_adj_1
6685 (half_mode, low[0], high[0], operands[2], scratch));
6687 else
6688 emit_insn (gen_x86_shift_adj_2
6689 (half_mode, low[0], high[0], operands[2]));
6693 /* Helper function to split TImode ashl under NDD. */
6694 void
6695 ix86_split_ashl_ndd (rtx *operands, rtx scratch)
6697 gcc_assert (TARGET_APX_NDD);
6698 int half_width = GET_MODE_BITSIZE (TImode) >> 1;
6700 rtx low[2], high[2];
6701 int count;
6703 split_double_mode (TImode, operands, 2, low, high);
6704 if (CONST_INT_P (operands[2]))
6706 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
6708 if (count >= half_width)
6710 count = count - half_width;
6711 if (count == 0)
6713 if (!rtx_equal_p (high[0], low[1]))
6714 emit_move_insn (high[0], low[1]);
6716 else if (count == 1)
6717 emit_insn (gen_adddi3 (high[0], low[1], low[1]));
6718 else
6719 emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));
6721 ix86_expand_clear (low[0]);
6723 else if (count == 1)
6725 rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
6726 rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
6727 emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
6728 low[1], low[1]));
6729 emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
6730 x3, x4));
6732 else
6734 emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
6735 GEN_INT (count)));
6736 emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
6739 else
6741 emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
6742 operands[2]));
6743 emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
6744 if (TARGET_CMOVE && scratch)
6746 ix86_expand_clear (scratch);
6747 emit_insn (gen_x86_shift_adj_1
6748 (DImode, high[0], low[0], operands[2], scratch));
6750 else
6751 emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
6755 /* Helper function to split TImode l/ashr under NDD. */
6756 void
6757 ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
6759 gcc_assert (TARGET_APX_NDD);
6760 int half_width = GET_MODE_BITSIZE (TImode) >> 1;
6761 bool ashr_p = code == ASHIFTRT;
6762 rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
6763 : gen_lshrdi3;
6765 rtx low[2], high[2];
6766 int count;
6768 split_double_mode (TImode, operands, 2, low, high);
6769 if (CONST_INT_P (operands[2]))
6771 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
6773 if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
6775 emit_insn (gen_shr (high[0], high[1],
6776 GEN_INT (half_width - 1)));
6777 emit_move_insn (low[0], high[0]);
6779 else if (count >= half_width)
6781 if (ashr_p)
6782 emit_insn (gen_shr (high[0], high[1],
6783 GEN_INT (half_width - 1)));
6784 else
6785 ix86_expand_clear (high[0]);
6787 if (count > half_width)
6788 emit_insn (gen_shr (low[0], high[1],
6789 GEN_INT (count - half_width)));
6790 else
6791 emit_move_insn (low[0], high[1]);
6793 else
6795 emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
6796 GEN_INT (count)));
6797 emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
6800 else
6802 emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
6803 operands[2]));
6804 emit_insn (gen_shr (high[0], high[1], operands[2]));
6806 if (TARGET_CMOVE && scratch)
6808 if (ashr_p)
6810 emit_move_insn (scratch, high[0]);
6811 emit_insn (gen_shr (scratch, scratch,
6812 GEN_INT (half_width - 1)));
6814 else
6815 ix86_expand_clear (scratch);
6817 emit_insn (gen_x86_shift_adj_1
6818 (DImode, low[0], high[0], operands[2], scratch));
6820 else if (ashr_p)
6821 emit_insn (gen_x86_shift_adj_3
6822 (DImode, low[0], high[0], operands[2]));
6823 else
6824 emit_insn (gen_x86_shift_adj_2
6825 (DImode, low[0], high[0], operands[2]));
6829 /* Expand move of V1TI mode register X to a new TI mode register. */
6830 static rtx
6831 ix86_expand_v1ti_to_ti (rtx x)
6833 rtx result = gen_reg_rtx (TImode);
6834 if (TARGET_SSE2)
6836 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6837 rtx lo = gen_lowpart (DImode, result);
6838 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6839 rtx hi = gen_highpart (DImode, result);
6840 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6842 else
6843 emit_move_insn (result, gen_lowpart (TImode, x));
6844 return result;
6847 /* Expand move of TI mode register X to a new V1TI mode register. */
6848 static rtx
6849 ix86_expand_ti_to_v1ti (rtx x)
6851 if (TARGET_SSE2)
6853 rtx lo = gen_lowpart (DImode, x);
6854 rtx hi = gen_highpart (DImode, x);
6855 rtx tmp = gen_reg_rtx (V2DImode);
6856 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6857 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6860 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6863 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6864 void
6865 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6867 rtx op1 = force_reg (V1TImode, operands[1]);
6869 if (!CONST_INT_P (operands[2]))
6871 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6872 rtx tmp2 = gen_reg_rtx (TImode);
6873 rtx (*shift) (rtx, rtx, rtx)
6874 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6875 emit_insn (shift (tmp2, tmp1, operands[2]));
6876 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6877 emit_move_insn (operands[0], tmp3);
6878 return;
6881 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6883 if (bits == 0)
6885 emit_move_insn (operands[0], op1);
6886 return;
6889 if ((bits & 7) == 0)
6891 rtx tmp = gen_reg_rtx (V1TImode);
6892 if (code == ASHIFT)
6893 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6894 else
6895 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6896 emit_move_insn (operands[0], tmp);
6897 return;
6900 rtx tmp1 = gen_reg_rtx (V1TImode);
6901 if (code == ASHIFT)
6902 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6903 else
6904 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6906 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6907 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6909 /* tmp3 will be the V2DImode result. */
6910 rtx tmp3 = gen_reg_rtx (V2DImode);
6912 if (bits > 64)
6914 if (code == ASHIFT)
6915 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6916 else
6917 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6919 else
6921 /* tmp4 is operands[1], in V2DImode. */
6922 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6924 rtx tmp5 = gen_reg_rtx (V2DImode);
6925 if (code == ASHIFT)
6926 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6927 else
6928 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6930 rtx tmp6 = gen_reg_rtx (V2DImode);
6931 if (code == ASHIFT)
6932 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6933 else
6934 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6936 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6939 /* Convert the result back to V1TImode and store in operands[0]. */
6940 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6941 emit_move_insn (operands[0], tmp7);
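/* Editorial illustration, not part of GCC: the general case above (bits
   not a multiple of 8, bits <= 64), modelled on two 64-bit lanes.  Each
   lane is shifted by BITS and the bits crossing the lane boundary are
   OR-ed in from the value pre-shifted by 64, which is what the
   pslldq/psllq/psrlq/por sequence computes.  Hypothetical helper for a
   left shift; BITS is assumed to be in 1..63.  */

#include <stdint.h>

static void
shl128_via_lanes (uint64_t lane[2], unsigned int bits)
{
  /* lane[0] is the low 64 bits, lane[1] the high 64 bits.  */
  uint64_t shifted64[2] = { 0, lane[0] };	/* the value << 64  */
  lane[0] = (lane[0] << bits) | (shifted64[0] >> (64 - bits));
  lane[1] = (lane[1] << bits) | (shifted64[1] >> (64 - bits));
}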
6944 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6945 void
6946 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6948 rtx op1 = force_reg (V1TImode, operands[1]);
6950 if (!CONST_INT_P (operands[2]))
6952 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6953 rtx tmp2 = gen_reg_rtx (TImode);
6954 rtx (*rotate) (rtx, rtx, rtx)
6955 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6956 emit_insn (rotate (tmp2, tmp1, operands[2]));
6957 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6958 emit_move_insn (operands[0], tmp3);
6959 return;
6962 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6964 if (bits == 0)
6966 emit_move_insn (operands[0], op1);
6967 return;
6970 if (code == ROTATERT)
6971 bits = 128 - bits;
6973 if ((bits & 31) == 0)
6975 rtx tmp2 = gen_reg_rtx (V4SImode);
6976 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6977 if (bits == 32)
6978 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6979 else if (bits == 64)
6980 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6981 else
6982 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6983 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6984 return;
6987 if ((bits & 7) == 0)
6989 rtx tmp1 = gen_reg_rtx (V1TImode);
6990 rtx tmp2 = gen_reg_rtx (V1TImode);
6991 rtx tmp3 = gen_reg_rtx (V1TImode);
6993 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6994 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6995 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6996 emit_move_insn (operands[0], tmp3);
6997 return;
7000 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7002 rtx lobits;
7003 rtx hibits;
7005 switch (bits >> 5)
7007 case 0:
7008 lobits = op1_v4si;
7009 hibits = gen_reg_rtx (V4SImode);
7010 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
7011 break;
7013 case 1:
7014 lobits = gen_reg_rtx (V4SImode);
7015 hibits = gen_reg_rtx (V4SImode);
7016 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
7017 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
7018 break;
7020 case 2:
7021 lobits = gen_reg_rtx (V4SImode);
7022 hibits = gen_reg_rtx (V4SImode);
7023 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
7024 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
7025 break;
7027 default:
7028 lobits = gen_reg_rtx (V4SImode);
7029 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
7030 hibits = op1_v4si;
7031 break;
7034 rtx tmp1 = gen_reg_rtx (V4SImode);
7035 rtx tmp2 = gen_reg_rtx (V4SImode);
7036 rtx tmp3 = gen_reg_rtx (V4SImode);
7038 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
7039 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
7040 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
7042 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
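/* Editorial illustration, not part of GCC: a reference definition of the
   128-bit rotate expanded above, using the unsigned __int128 extension.
   When BITS is a multiple of 32 the rotate reduces to a permutation of
   the four 32-bit words, which is why a single pshufd (immediates 0x93,
   0x4e, 0x39) suffices in that case.  Hypothetical helper.  */

static unsigned __int128
rotl128 (unsigned __int128 x, unsigned int bits)
{
  bits &= 127;
  if (bits == 0)
    return x;
  return (x << bits) | (x >> (128 - bits));
}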
7045 /* Expand V1TI mode ashiftrt by constant. */
7046 void
7047 ix86_expand_v1ti_ashiftrt (rtx operands[])
7049 rtx op1 = force_reg (V1TImode, operands[1]);
7051 if (!CONST_INT_P (operands[2]))
7053 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
7054 rtx tmp2 = gen_reg_rtx (TImode);
7055 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
7056 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
7057 emit_move_insn (operands[0], tmp3);
7058 return;
7061 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
7063 if (bits == 0)
7065 emit_move_insn (operands[0], op1);
7066 return;
7069 if (bits == 127)
7071 /* Two operations. */
7072 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7073 rtx tmp2 = gen_reg_rtx (V4SImode);
7074 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7076 rtx tmp3 = gen_reg_rtx (V4SImode);
7077 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7079 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
7080 return;
7083 if (bits == 64)
7085 /* Three operations. */
7086 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7087 rtx tmp2 = gen_reg_rtx (V4SImode);
7088 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7090 rtx tmp3 = gen_reg_rtx (V4SImode);
7091 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7093 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7094 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7095 rtx tmp6 = gen_reg_rtx (V2DImode);
7096 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
7098 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7099 return;
7102 if (bits == 96)
7104 /* Three operations. */
7105 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7106 rtx tmp2 = gen_reg_rtx (V4SImode);
7107 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
7109 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7110 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
7111 rtx tmp5 = gen_reg_rtx (V2DImode);
7112 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
7114 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
7115 rtx tmp7 = gen_reg_rtx (V4SImode);
7116 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
7118 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7119 return;
7122 if (bits >= 111)
7124 /* Three operations. */
7125 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7126 rtx tmp2 = gen_reg_rtx (V4SImode);
7127 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
7129 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7130 rtx tmp4 = gen_reg_rtx (V8HImode);
7131 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
7133 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
7134 rtx tmp6 = gen_reg_rtx (V4SImode);
7135 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
7137 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7138 return;
7141 if (TARGET_AVX2 || TARGET_SSE4_1)
7143 /* Three operations. */
7144 if (bits == 32)
7146 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7147 rtx tmp2 = gen_reg_rtx (V4SImode);
7148 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
7150 rtx tmp3 = gen_reg_rtx (V1TImode);
7151 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
7153 if (TARGET_AVX2)
7155 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7156 rtx tmp5 = gen_reg_rtx (V4SImode);
7157 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7158 GEN_INT (7)));
7160 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7162 else
7164 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7165 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7166 rtx tmp6 = gen_reg_rtx (V8HImode);
7167 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7168 GEN_INT (0x3f)));
7170 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7172 return;
7175 /* Three operations. */
7176 if (bits == 8 || bits == 16 || bits == 24)
7178 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7179 rtx tmp2 = gen_reg_rtx (V4SImode);
7180 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7182 rtx tmp3 = gen_reg_rtx (V1TImode);
7183 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
7185 if (TARGET_AVX2)
7187 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7188 rtx tmp5 = gen_reg_rtx (V4SImode);
7189 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7190 GEN_INT (7)));
7192 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7194 else
7196 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7197 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7198 rtx tmp6 = gen_reg_rtx (V8HImode);
7199 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7200 GEN_INT (0x3f)));
7202 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7204 return;
7208 if (bits > 96)
7210 /* Four operations. */
7211 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7212 rtx tmp2 = gen_reg_rtx (V4SImode);
7213 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
7215 rtx tmp3 = gen_reg_rtx (V4SImode);
7216 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
7218 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
7219 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7220 rtx tmp6 = gen_reg_rtx (V2DImode);
7221 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
7223 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
7224 rtx tmp8 = gen_reg_rtx (V4SImode);
7225 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
7227 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
7228 return;
7231 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
7233 /* Four operations. */
7234 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7235 rtx tmp2 = gen_reg_rtx (V4SImode);
7236 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7238 rtx tmp3 = gen_reg_rtx (V4SImode);
7239 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7241 rtx tmp4 = gen_reg_rtx (V1TImode);
7242 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7244 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7245 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
7246 rtx tmp7 = gen_reg_rtx (V8HImode);
7247 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
7248 GEN_INT (bits == 48 ? 0x1f : 0x07)));
7250 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7251 return;
7254 if ((bits & 7) == 0)
7256 /* Five operations. */
7257 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7258 rtx tmp2 = gen_reg_rtx (V4SImode);
7259 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7261 rtx tmp3 = gen_reg_rtx (V4SImode);
7262 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7264 rtx tmp4 = gen_reg_rtx (V1TImode);
7265 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7267 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7268 rtx tmp6 = gen_reg_rtx (V1TImode);
7269 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
7271 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7272 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
7273 rtx tmp9 = gen_reg_rtx (V2DImode);
7274 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
7276 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
7277 return;
7280 if (TARGET_AVX2 && bits < 32)
7282 /* Six operations. */
7283 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7284 rtx tmp2 = gen_reg_rtx (V4SImode);
7285 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7287 rtx tmp3 = gen_reg_rtx (V1TImode);
7288 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7290 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7291 rtx tmp5 = gen_reg_rtx (V2DImode);
7292 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7294 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7295 rtx tmp7 = gen_reg_rtx (V2DImode);
7296 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7298 rtx tmp8 = gen_reg_rtx (V2DImode);
7299 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7301 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
7302 rtx tmp10 = gen_reg_rtx (V4SImode);
7303 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
7305 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
7306 return;
7309 if (TARGET_SSE4_1 && bits < 15)
7311 /* Six operations. */
7312 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7313 rtx tmp2 = gen_reg_rtx (V4SImode);
7314 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7316 rtx tmp3 = gen_reg_rtx (V1TImode);
7317 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7319 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7320 rtx tmp5 = gen_reg_rtx (V2DImode);
7321 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7323 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7324 rtx tmp7 = gen_reg_rtx (V2DImode);
7325 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7327 rtx tmp8 = gen_reg_rtx (V2DImode);
7328 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7330 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7331 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
7332 rtx tmp11 = gen_reg_rtx (V8HImode);
7333 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7335 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
7336 return;
7339 if (bits == 1)
7341 /* Eight operations. */
7342 rtx tmp1 = gen_reg_rtx (V1TImode);
7343 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7345 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7346 rtx tmp3 = gen_reg_rtx (V2DImode);
7347 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7349 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7350 rtx tmp5 = gen_reg_rtx (V2DImode);
7351 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7353 rtx tmp6 = gen_reg_rtx (V2DImode);
7354 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7356 rtx tmp7 = gen_reg_rtx (V2DImode);
7357 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7359 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7360 rtx tmp9 = gen_reg_rtx (V4SImode);
7361 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7363 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7364 rtx tmp11 = gen_reg_rtx (V2DImode);
7365 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7367 rtx tmp12 = gen_reg_rtx (V2DImode);
7368 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7370 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7371 return;
7374 if (bits > 64)
7376 /* Eight operations. */
7377 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7378 rtx tmp2 = gen_reg_rtx (V4SImode);
7379 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7381 rtx tmp3 = gen_reg_rtx (V4SImode);
7382 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7384 rtx tmp4 = gen_reg_rtx (V1TImode);
7385 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7387 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7388 rtx tmp6 = gen_reg_rtx (V2DImode);
7389 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7391 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7392 rtx tmp8 = gen_reg_rtx (V1TImode);
7393 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7395 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7396 rtx tmp10 = gen_reg_rtx (V2DImode);
7397 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7399 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7400 rtx tmp12 = gen_reg_rtx (V2DImode);
7401 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7403 rtx tmp13 = gen_reg_rtx (V2DImode);
7404 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7406 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7408 else
7410 /* Nine operations. */
7411 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7412 rtx tmp2 = gen_reg_rtx (V4SImode);
7413 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7415 rtx tmp3 = gen_reg_rtx (V4SImode);
7416 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7418 rtx tmp4 = gen_reg_rtx (V1TImode);
7419 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7421 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7422 rtx tmp6 = gen_reg_rtx (V2DImode);
7423 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7425 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7426 rtx tmp8 = gen_reg_rtx (V2DImode);
7427 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7429 rtx tmp9 = gen_reg_rtx (V2DImode);
7430 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7432 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7433 rtx tmp11 = gen_reg_rtx (V1TImode);
7434 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7436 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7437 rtx tmp13 = gen_reg_rtx (V2DImode);
7438 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7440 rtx tmp14 = gen_reg_rtx (V2DImode);
7441 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7443 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7447 /* Replace all occurrences of REG FROM with REG TO in X, including
7448 occurrences with different modes. */
7451 ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7453 gcc_checking_assert (REG_P (from)
7454 && REG_P (to)
7455 && GET_MODE (from) == GET_MODE (to));
7456 if (!reg_overlap_mentioned_p (from, x))
7457 return x;
7458 rtx ret = copy_rtx (x);
7459 subrtx_ptr_iterator::array_type array;
7460 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7462 rtx *loc = *iter;
7463 x = *loc;
7464 if (REG_P (x) && REGNO (x) == REGNO (from))
7466 if (x == from)
7467 *loc = to;
7468 else
7470 gcc_checking_assert (REG_NREGS (x) == 1);
7471 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7475 return ret;
7478 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7479 DImode for constant loop counts. */
7481 static machine_mode
7482 counter_mode (rtx count_exp)
7484 if (GET_MODE (count_exp) != VOIDmode)
7485 return GET_MODE (count_exp);
7486 if (!CONST_INT_P (count_exp))
7487 return Pmode;
7488 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7489 return DImode;
7490 return SImode;
7493 /* When ISSETMEM is FALSE, output a simple loop to move memory from pointer
7494 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size
7495 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
7496 loop to set memory to VALUE (supposed to be in MODE).
7498 The size is rounded down to a whole number of chunks moved at once.
7499 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
7502 static void
7503 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7504 rtx destptr, rtx srcptr, rtx value,
7505 rtx count, machine_mode mode, int unroll,
7506 int expected_size, bool issetmem)
7508 rtx_code_label *out_label, *top_label;
7509 rtx iter, tmp;
7510 machine_mode iter_mode = counter_mode (count);
7511 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7512 rtx piece_size = GEN_INT (piece_size_n);
7513 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7514 rtx size;
7515 int i;
7517 top_label = gen_label_rtx ();
7518 out_label = gen_label_rtx ();
7519 iter = gen_reg_rtx (iter_mode);
7521 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7522 NULL, 1, OPTAB_DIRECT);
7523 /* Those two should combine. */
7524 if (piece_size == const1_rtx)
7526 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7527 true, out_label);
7528 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7530 emit_move_insn (iter, const0_rtx);
7532 emit_label (top_label);
7534 tmp = convert_modes (Pmode, iter_mode, iter, true);
7536 /* This assert could be relaxed - in that case we'd need to compute the
7537 smallest power of two containing PIECE_SIZE_N and pass it to
7538 offset_address. */
7539 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7540 destmem = offset_address (destmem, tmp, piece_size_n);
7541 destmem = adjust_address (destmem, mode, 0);
7543 if (!issetmem)
7545 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7546 srcmem = adjust_address (srcmem, mode, 0);
7548 /* When unrolling for chips that reorder memory reads and writes,
7549 we can save registers by using a single temporary.
7550 Also, using 4 temporaries is overkill in 32-bit mode. */
7551 if (!TARGET_64BIT && 0)
7553 for (i = 0; i < unroll; i++)
7555 if (i)
7557 destmem = adjust_address (copy_rtx (destmem), mode,
7558 GET_MODE_SIZE (mode));
7559 srcmem = adjust_address (copy_rtx (srcmem), mode,
7560 GET_MODE_SIZE (mode));
7562 emit_move_insn (destmem, srcmem);
7565 else
7567 rtx tmpreg[4];
7568 gcc_assert (unroll <= 4);
7569 for (i = 0; i < unroll; i++)
7571 tmpreg[i] = gen_reg_rtx (mode);
7572 if (i)
7573 srcmem = adjust_address (copy_rtx (srcmem), mode,
7574 GET_MODE_SIZE (mode));
7575 emit_move_insn (tmpreg[i], srcmem);
7577 for (i = 0; i < unroll; i++)
7579 if (i)
7580 destmem = adjust_address (copy_rtx (destmem), mode,
7581 GET_MODE_SIZE (mode));
7582 emit_move_insn (destmem, tmpreg[i]);
7586 else
7587 for (i = 0; i < unroll; i++)
7589 if (i)
7590 destmem = adjust_address (copy_rtx (destmem), mode,
7591 GET_MODE_SIZE (mode));
7592 emit_move_insn (destmem, value);
7595 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7596 true, OPTAB_LIB_WIDEN);
7597 if (tmp != iter)
7598 emit_move_insn (iter, tmp);
7600 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7601 true, top_label);
7602 if (expected_size != -1)
7604 expected_size /= GET_MODE_SIZE (mode) * unroll;
7605 if (expected_size == 0)
7606 predict_jump (0);
7607 else if (expected_size > REG_BR_PROB_BASE)
7608 predict_jump (REG_BR_PROB_BASE - 1);
7609 else
7610 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7611 / expected_size);
7613 else
7614 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7615 iter = ix86_zero_extend_to_Pmode (iter);
7616 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7617 true, OPTAB_LIB_WIDEN);
7618 if (tmp != destptr)
7619 emit_move_insn (destptr, tmp);
7620 if (!issetmem)
7622 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7623 true, OPTAB_LIB_WIDEN);
7624 if (tmp != srcptr)
7625 emit_move_insn (srcptr, tmp);
7627 emit_label (out_label);
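/* Editorial illustration, not part of GCC: the shape of the copy loop
   the expander above emits, written as plain C.  PIECE stands for
   GET_MODE_SIZE (mode) * unroll and is assumed to be a power of two;
   the helper name is hypothetical and the setmem variant is analogous.  */

#include <stddef.h>
#include <string.h>

static void
copy_loop_shape (char *dest, const char *src, size_t count, size_t piece)
{
  /* Round COUNT down to a whole number of chunks; if that is zero the
     loop is skipped (the guarded jump to out_label).  */
  size_t size = count & ~(piece - 1);
  for (size_t iter = 0; iter < size; iter += piece)
    memcpy (dest + iter, src + iter, piece);	/* the unrolled body  */
  /* DESTPTR/SRCPTR are then advanced by SIZE; the remaining
     COUNT & (PIECE - 1) bytes are left to the epilogue.  */
}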
7630 /* Divide COUNTREG by SCALE. */
7631 static rtx
7632 scale_counter (rtx countreg, int scale)
7634 rtx sc;
7636 if (scale == 1)
7637 return countreg;
7638 if (CONST_INT_P (countreg))
7639 return GEN_INT (INTVAL (countreg) / scale);
7640 gcc_assert (REG_P (countreg));
7642 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7643 GEN_INT (exact_log2 (scale)),
7644 NULL, 1, OPTAB_DIRECT);
7645 return sc;
7648 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7649 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7650 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7651 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7652 ORIG_VALUE is the original value passed to memset to fill the memory with.
7653 Other arguments have the same meaning as for the previous function. */
7655 static void
7656 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7657 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7658 rtx count,
7659 machine_mode mode, bool issetmem)
7661 rtx destexp;
7662 rtx srcexp;
7663 rtx countreg;
7664 HOST_WIDE_INT rounded_count;
7666 /* If possible, it is shorter to use rep movs.
7667 TODO: Maybe it is better to move this logic to decide_alg. */
7668 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7669 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7670 && (!issetmem || orig_value == const0_rtx))
7671 mode = SImode;
7673 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7674 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7676 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7677 GET_MODE_SIZE (mode)));
7678 if (mode != QImode)
7680 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7681 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7682 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7684 else
7685 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7686 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7688 rounded_count
7689 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7690 destmem = shallow_copy_rtx (destmem);
7691 set_mem_size (destmem, rounded_count);
7693 else if (MEM_SIZE_KNOWN_P (destmem))
7694 clear_mem_size (destmem);
7696 if (issetmem)
7698 value = force_reg (mode, gen_lowpart (mode, value));
7699 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7701 else
7703 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7704 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7705 if (mode != QImode)
7707 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7708 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7709 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7711 else
7712 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7713 if (CONST_INT_P (count))
7715 rounded_count
7716 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7717 srcmem = shallow_copy_rtx (srcmem);
7718 set_mem_size (srcmem, rounded_count);
7720 else
7722 if (MEM_SIZE_KNOWN_P (srcmem))
7723 clear_mem_size (srcmem);
7725 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7726 destexp, srcexp));
7730 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7731 DESTMEM.
7732 SRCMEM is passed by pointer so that it can be updated on return.
7733 The return value is the updated DESTMEM. */
7734 static rtx
7735 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7736 HOST_WIDE_INT size_to_move)
7738 rtx dst = destmem, src = *srcmem, tempreg;
7739 enum insn_code code;
7740 machine_mode move_mode;
7741 int piece_size, i;
7743 /* Find the widest mode in which we could perform moves.
7744 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
7745 it until a move of that size is supported. */
7746 piece_size = 1 << floor_log2 (size_to_move);
7747 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7748 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7750 gcc_assert (piece_size > 1);
7751 piece_size >>= 1;
7754 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7755 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7756 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7758 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7759 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7760 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7762 move_mode = word_mode;
7763 piece_size = GET_MODE_SIZE (move_mode);
7764 code = optab_handler (mov_optab, move_mode);
7767 gcc_assert (code != CODE_FOR_nothing);
7769 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7770 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7772 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7773 gcc_assert (size_to_move % piece_size == 0);
7775 for (i = 0; i < size_to_move; i += piece_size)
7777 /* We move from memory to memory, so we'll need to do it via
7778 a temporary register. */
7779 tempreg = gen_reg_rtx (move_mode);
7780 emit_insn (GEN_FCN (code) (tempreg, src));
7781 emit_insn (GEN_FCN (code) (dst, tempreg));
7783 emit_move_insn (destptr,
7784 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7785 emit_move_insn (srcptr,
7786 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7788 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7789 piece_size);
7790 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7791 piece_size);
7794 /* Update DST and SRC rtx. */
7795 *srcmem = src;
7796 return dst;
7799 /* Helper function for the string operations below. Test whether VARIABLE
7800 is aligned to VALUE bytes. If so, jump to the returned label. */
7802 static rtx_code_label *
7803 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7805 rtx_code_label *label = gen_label_rtx ();
7806 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7807 if (GET_MODE (variable) == DImode)
7808 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7809 else
7810 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7811 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7812 1, label);
7813 if (epilogue)
7814 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7815 else
7816 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7817 return label;
7821 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7823 static void
7824 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7825 rtx destptr, rtx srcptr, rtx count, int max_size)
7827 rtx src, dest;
7828 if (CONST_INT_P (count))
7830 HOST_WIDE_INT countval = INTVAL (count);
7831 HOST_WIDE_INT epilogue_size = countval % max_size;
7832 int i;
7834 /* For now MAX_SIZE should be a power of 2. This assert could be
7835 relaxed, but it'll require a bit more complicated epilogue
7836 expanding. */
7837 gcc_assert ((max_size & (max_size - 1)) == 0);
7838 for (i = max_size; i >= 1; i >>= 1)
7840 if (epilogue_size & i)
7841 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7843 return;
7845 if (max_size > 8)
7847 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7848 count, 1, OPTAB_DIRECT);
7849 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7850 count, QImode, 1, 4, false);
7851 return;
7854 /* When there are stringops, we can cheaply increase dest and src pointers.
7855 Otherwise we save code size by maintaining offset (zero is readily
7856 available from the preceding rep operation) and using x86 addressing modes. */
7858 if (TARGET_SINGLE_STRINGOP)
7860 if (max_size > 4)
7862 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7863 src = change_address (srcmem, SImode, srcptr);
7864 dest = change_address (destmem, SImode, destptr);
7865 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7866 emit_label (label);
7867 LABEL_NUSES (label) = 1;
7869 if (max_size > 2)
7871 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7872 src = change_address (srcmem, HImode, srcptr);
7873 dest = change_address (destmem, HImode, destptr);
7874 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7875 emit_label (label);
7876 LABEL_NUSES (label) = 1;
7878 if (max_size > 1)
7880 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7881 src = change_address (srcmem, QImode, srcptr);
7882 dest = change_address (destmem, QImode, destptr);
7883 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7884 emit_label (label);
7885 LABEL_NUSES (label) = 1;
7888 else
7890 rtx offset = force_reg (Pmode, const0_rtx);
7891 rtx tmp;
7893 if (max_size > 4)
7895 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7896 src = change_address (srcmem, SImode, srcptr);
7897 dest = change_address (destmem, SImode, destptr);
7898 emit_move_insn (dest, src);
7899 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7900 true, OPTAB_LIB_WIDEN);
7901 if (tmp != offset)
7902 emit_move_insn (offset, tmp);
7903 emit_label (label);
7904 LABEL_NUSES (label) = 1;
7906 if (max_size > 2)
7908 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7909 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7910 src = change_address (srcmem, HImode, tmp);
7911 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7912 dest = change_address (destmem, HImode, tmp);
7913 emit_move_insn (dest, src);
7914 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7915 true, OPTAB_LIB_WIDEN);
7916 if (tmp != offset)
7917 emit_move_insn (offset, tmp);
7918 emit_label (label);
7919 LABEL_NUSES (label) = 1;
7921 if (max_size > 1)
7923 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7924 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7925 src = change_address (srcmem, QImode, tmp);
7926 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7927 dest = change_address (destmem, QImode, tmp);
7928 emit_move_insn (dest, src);
7929 emit_label (label);
7930 LABEL_NUSES (label) = 1;
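/* Editorial illustration, not part of GCC: for a constant COUNT the
   epilogue above copies the residue by testing each power-of-two bit of
   COUNT % MAX_SIZE, largest first, as sketched here.  Hypothetical
   helper; MAX_SIZE is assumed to be a power of two.  */

#include <stddef.h>
#include <string.h>

static void
copy_epilogue_shape (char *dest, const char *src, size_t count,
		     size_t max_size)
{
  size_t rem = count % max_size;
  for (size_t i = max_size; i >= 1; i >>= 1)
    if (rem & i)
      {
	memcpy (dest, src, i);
	dest += i;
	src += i;
      }
}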
7935 /* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
7936 with the value PROMOTED_VAL.
7937 Unlike emit_memmov above, there is no source operand to update.
7938 The return value is the updated DESTMEM. */
7939 static rtx
7940 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7941 HOST_WIDE_INT size_to_move)
7943 rtx dst = destmem;
7944 enum insn_code code;
7945 machine_mode move_mode;
7946 int piece_size, i;
7948 /* Find the widest mode in which we can perform the stores: start with
7949 the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE is smaller
7950 than that mode's size. */
7951 move_mode = GET_MODE (promoted_val);
7952 if (move_mode == VOIDmode)
7953 move_mode = QImode;
7954 if (size_to_move < GET_MODE_SIZE (move_mode))
7956 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7957 move_mode = int_mode_for_size (move_bits, 0).require ();
7958 promoted_val = gen_lowpart (move_mode, promoted_val);
7960 piece_size = GET_MODE_SIZE (move_mode);
7961 code = optab_handler (mov_optab, move_mode);
7962 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7964 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7966 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7967 gcc_assert (size_to_move % piece_size == 0);
7969 for (i = 0; i < size_to_move; i += piece_size)
7971 if (piece_size <= GET_MODE_SIZE (word_mode))
7973 emit_insn (gen_strset (destptr, dst, promoted_val));
7974 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7975 piece_size);
7976 continue;
7979 emit_insn (GEN_FCN (code) (dst, promoted_val));
7981 emit_move_insn (destptr,
7982 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7984 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7985 piece_size);
7988 /* Update DST rtx. */
7989 return dst;
7991 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7992 static void
7993 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7994 rtx count, int max_size)
7996 count = expand_simple_binop (counter_mode (count), AND, count,
7997 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7998 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7999 gen_lowpart (QImode, value), count, QImode,
8000 1, max_size / 2, true);
8003 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
8004 static void
8005 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
8006 rtx count, int max_size)
8008 rtx dest;
8010 if (CONST_INT_P (count))
8012 HOST_WIDE_INT countval = INTVAL (count);
8013 HOST_WIDE_INT epilogue_size = countval % max_size;
8014 int i;
8016 /* For now MAX_SIZE should be a power of 2. This assert could be
8017 relaxed, but it'll require a bit more complicated epilogue
8018 expanding. */
8019 gcc_assert ((max_size & (max_size - 1)) == 0);
8020 for (i = max_size; i >= 1; i >>= 1)
8022 if (epilogue_size & i)
8024 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
8025 destmem = emit_memset (destmem, destptr, vec_value, i);
8026 else
8027 destmem = emit_memset (destmem, destptr, value, i);
8030 return;
8032 if (max_size > 32)
8034 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
8035 return;
8037 if (max_size > 16)
8039 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
8040 if (TARGET_64BIT)
8042 dest = change_address (destmem, DImode, destptr);
8043 emit_insn (gen_strset (destptr, dest, value));
8044 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
8045 emit_insn (gen_strset (destptr, dest, value));
8047 else
8049 dest = change_address (destmem, SImode, destptr);
8050 emit_insn (gen_strset (destptr, dest, value));
8051 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8052 emit_insn (gen_strset (destptr, dest, value));
8053 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
8054 emit_insn (gen_strset (destptr, dest, value));
8055 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
8056 emit_insn (gen_strset (destptr, dest, value));
8058 emit_label (label);
8059 LABEL_NUSES (label) = 1;
8061 if (max_size > 8)
8063 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
8064 if (TARGET_64BIT)
8066 dest = change_address (destmem, DImode, destptr);
8067 emit_insn (gen_strset (destptr, dest, value));
8069 else
8071 dest = change_address (destmem, SImode, destptr);
8072 emit_insn (gen_strset (destptr, dest, value));
8073 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8074 emit_insn (gen_strset (destptr, dest, value));
8076 emit_label (label);
8077 LABEL_NUSES (label) = 1;
8079 if (max_size > 4)
8081 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8082 dest = change_address (destmem, SImode, destptr);
8083 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
8084 emit_label (label);
8085 LABEL_NUSES (label) = 1;
8087 if (max_size > 2)
8089 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8090 dest = change_address (destmem, HImode, destptr);
8091 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
8092 emit_label (label);
8093 LABEL_NUSES (label) = 1;
8095 if (max_size > 1)
8097 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8098 dest = change_address (destmem, QImode, destptr);
8099 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
8100 emit_label (label);
8101 LABEL_NUSES (label) = 1;
8105 /* Decrease COUNTREG by VALUE. */
8106 static void
8107 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
8109 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
8112 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
8113 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
8114 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
8115 ignored.
8116 Return value is updated DESTMEM. */
8118 static rtx
8119 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
8120 rtx destptr, rtx srcptr, rtx value,
8121 rtx vec_value, rtx count, int align,
8122 int desired_alignment, bool issetmem)
8124 int i;
8125 for (i = 1; i < desired_alignment; i <<= 1)
8127 if (align <= i)
8129 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
8130 if (issetmem)
8132 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
8133 destmem = emit_memset (destmem, destptr, vec_value, i);
8134 else
8135 destmem = emit_memset (destmem, destptr, value, i);
8137 else
8138 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
8139 ix86_adjust_counter (count, i);
8140 emit_label (label);
8141 LABEL_NUSES (label) = 1;
8142 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
8145 return destmem;
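/* Editorial illustration, not part of GCC: the prologue above emits one
   conditional block per alignment bit, copying (or setting) 1, 2, 4, ...
   bytes until DESTPTR reaches DESIRED_ALIGNMENT, as sketched here for
   the copy case.  Hypothetical helper; ALIGN and DESIRED are assumed to
   be powers of two.  */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
align_prologue_shape (char **dest, const char **src, size_t *count,
		      size_t align, size_t desired)
{
  for (size_t i = 1; i < desired; i <<= 1)
    if (align <= i && ((uintptr_t) *dest & i))
      {
	/* The runtime test corresponds to ix86_expand_aligntest, the
	   copy to emit_memmov, and the COUNT update to
	   ix86_adjust_counter.  */
	memcpy (*dest, *src, i);
	*dest += i;
	*src += i;
	*count -= i;
      }
}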
8148 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
8149 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
8150 and jump to DONE_LABEL. */
8151 static void
8152 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
8153 rtx destptr, rtx srcptr,
8154 rtx value, rtx vec_value,
8155 rtx count, int size,
8156 rtx done_label, bool issetmem)
8158 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
8159 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
8160 rtx modesize;
8161 int n;
8163 /* If we do not have vector value to copy, we must reduce size. */
8164 if (issetmem)
8166 if (!vec_value)
8168 if (GET_MODE (value) == VOIDmode && size > 8)
8169 mode = Pmode;
8170 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
8171 mode = GET_MODE (value);
8173 else
8174 mode = GET_MODE (vec_value), value = vec_value;
8176 else
8178 /* Choose appropriate vector mode. */
8179 if (size >= 32)
8180 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
8181 else if (size >= 16)
8182 mode = TARGET_SSE ? V16QImode : DImode;
8183 srcmem = change_address (srcmem, mode, srcptr);
8185 destmem = change_address (destmem, mode, destptr);
8186 modesize = GEN_INT (GET_MODE_SIZE (mode));
8187 gcc_assert (GET_MODE_SIZE (mode) <= size);
8188 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8190 if (issetmem)
8191 emit_move_insn (destmem, gen_lowpart (mode, value));
8192 else
8194 emit_move_insn (destmem, srcmem);
8195 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8197 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8200 destmem = offset_address (destmem, count, 1);
8201 destmem = offset_address (destmem, GEN_INT (-2 * size),
8202 GET_MODE_SIZE (mode));
8203 if (!issetmem)
8205 srcmem = offset_address (srcmem, count, 1);
8206 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
8207 GET_MODE_SIZE (mode));
8209 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8211 if (issetmem)
8212 emit_move_insn (destmem, gen_lowpart (mode, value));
8213 else
8215 emit_move_insn (destmem, srcmem);
8216 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8218 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8220 emit_jump_insn (gen_jump (done_label));
8221 emit_barrier ();
8223 emit_label (label);
8224 LABEL_NUSES (label) = 1;
8227 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
8228 and get ready for the main copy loop by copying the initial DESIRED_ALIGN-ALIGN
8229 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
8230 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8231 DONE_LABEL is a label after the whole copying sequence. The label is created
8232 on demand if *DONE_LABEL is NULL.
8233 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for
8234 the new bounds after the initial copies.
8236 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8237 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8238 we will dispatch to a library call for large blocks.
8240 In pseudocode we do:
8242 if (COUNT < SIZE)
8244 Assume that SIZE is 4. Bigger sizes are handled analogously
8245 if (COUNT & 4)
8247 copy 4 bytes from SRCPTR to DESTPTR
8248 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8249 goto done_label
8251 if (!COUNT)
8252 goto done_label;
8253 copy 1 byte from SRCPTR to DESTPTR
8254 if (COUNT & 2)
8256 copy 2 bytes from SRCPTR to DESTPTR
8257 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8260 else
8262 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8263 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
8265 OLD_DESTPTR = DESTPTR;
8266 Align DESTPTR up to DESIRED_ALIGN
8267 SRCPTR += DESTPTR - OLD_DESTPTR
8268 COUNT -= DESTPTR - OLD_DESTPTR
8269 if (DYNAMIC_CHECK)
8270 Round COUNT down to multiple of SIZE
8271 << optional caller supplied zero size guard is here >>
8272 << optional caller supplied dynamic check is here >>
8273 << caller supplied main copy loop is here >>
8275 done_label:
8277 static void
8278 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8279 rtx *destptr, rtx *srcptr,
8280 machine_mode mode,
8281 rtx value, rtx vec_value,
8282 rtx *count,
8283 rtx_code_label **done_label,
8284 int size,
8285 int desired_align,
8286 int align,
8287 unsigned HOST_WIDE_INT *min_size,
8288 bool dynamic_check,
8289 bool issetmem)
8291 rtx_code_label *loop_label = NULL, *label;
8292 int n;
8293 rtx modesize;
8294 int prolog_size = 0;
8295 rtx mode_value;
8297 /* Choose the proper value to copy.  */
8298 if (issetmem && VECTOR_MODE_P (mode))
8299 mode_value = vec_value;
8300 else
8301 mode_value = value;
8302 gcc_assert (GET_MODE_SIZE (mode) <= size);
8304 /* See if block is big or small, handle small blocks. */
8305 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8307 int size2 = size;
8308 loop_label = gen_label_rtx ();
8310 if (!*done_label)
8311 *done_label = gen_label_rtx ();
8313 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8314 1, loop_label);
8315 size2 >>= 1;
8317 /* Handle sizes > 3. */
8318 for (;size2 > 2; size2 >>= 1)
8319 expand_small_cpymem_or_setmem (destmem, srcmem,
8320 *destptr, *srcptr,
8321 value, vec_value,
8322 *count,
8323 size2, *done_label, issetmem);
8324 /* Nothing to copy? Jump to DONE_LABEL if so */
8325 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8326 1, *done_label);
8328 /* Do a byte copy. */
8329 destmem = change_address (destmem, QImode, *destptr);
8330 if (issetmem)
8331 emit_move_insn (destmem, gen_lowpart (QImode, value));
8332 else
8334 srcmem = change_address (srcmem, QImode, *srcptr);
8335 emit_move_insn (destmem, srcmem);
8338 /* Handle sizes 2 and 3. */
8339 label = ix86_expand_aligntest (*count, 2, false);
8340 destmem = change_address (destmem, HImode, *destptr);
8341 destmem = offset_address (destmem, *count, 1);
8342 destmem = offset_address (destmem, GEN_INT (-2), 2);
8343 if (issetmem)
8344 emit_move_insn (destmem, gen_lowpart (HImode, value));
8345 else
8347 srcmem = change_address (srcmem, HImode, *srcptr);
8348 srcmem = offset_address (srcmem, *count, 1);
8349 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8350 emit_move_insn (destmem, srcmem);
8353 emit_label (label);
8354 LABEL_NUSES (label) = 1;
8355 emit_jump_insn (gen_jump (*done_label));
8356 emit_barrier ();
8358 else
8359 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8360 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8362 /* Start memcpy for COUNT >= SIZE. */
8363 if (loop_label)
8365 emit_label (loop_label);
8366 LABEL_NUSES (loop_label) = 1;
8369 /* Copy first desired_align bytes. */
8370 if (!issetmem)
8371 srcmem = change_address (srcmem, mode, *srcptr);
8372 destmem = change_address (destmem, mode, *destptr);
8373 modesize = GEN_INT (GET_MODE_SIZE (mode));
8374 for (n = 0; prolog_size < desired_align - align; n++)
8376 if (issetmem)
8377 emit_move_insn (destmem, mode_value);
8378 else
8380 emit_move_insn (destmem, srcmem);
8381 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8383 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8384 prolog_size += GET_MODE_SIZE (mode);
8388 /* Copy last SIZE bytes. */
8389 destmem = offset_address (destmem, *count, 1);
8390 destmem = offset_address (destmem,
8391 GEN_INT (-size - prolog_size),
8393 if (issetmem)
8394 emit_move_insn (destmem, mode_value);
8395 else
8397 srcmem = offset_address (srcmem, *count, 1);
8398 srcmem = offset_address (srcmem,
8399 GEN_INT (-size - prolog_size),
8401 emit_move_insn (destmem, srcmem);
8403 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8405 destmem = offset_address (destmem, modesize, 1);
8406 if (issetmem)
8407 emit_move_insn (destmem, mode_value);
8408 else
8410 srcmem = offset_address (srcmem, modesize, 1);
8411 emit_move_insn (destmem, srcmem);
8415 /* Align destination. */
8416 if (desired_align > 1 && desired_align > align)
8418 rtx saveddest = *destptr;
8420 gcc_assert (desired_align <= size);
8421 /* Align destptr up, placing the result in a new register.  */
8422 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8423 GEN_INT (prolog_size),
8424 NULL_RTX, 1, OPTAB_DIRECT);
8425 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8426 REG_POINTER (*destptr) = 1;
8427 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8428 GEN_INT (-desired_align),
8429 *destptr, 1, OPTAB_DIRECT);
8430 /* See how many bytes we skipped. */
8431 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8432 *destptr,
8433 NULL_RTX, 1, OPTAB_DIRECT);
8434 /* Adjust srcptr and count. */
8435 if (!issetmem)
8436 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8437 saveddest, *srcptr, 1, OPTAB_DIRECT);
8438 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8439 saveddest, *count, 1, OPTAB_DIRECT);
8440 /* We copied at most size + prolog_size. */
8441 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8442 *min_size
8443 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8444 else
8445 *min_size = 0;
8447 /* Our loops always round down the block size, but for dispatch to the
8448 library call we need the precise value.  */
8449 if (dynamic_check)
8450 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8451 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8453 else
8455 gcc_assert (prolog_size == 0);
8456 /* Decrease count, so we won't end up copying last word twice. */
8457 if (!CONST_INT_P (*count))
8458 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8459 constm1_rtx, *count, 1, OPTAB_DIRECT);
8460 else
8461 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8462 (unsigned HOST_WIDE_INT)size));
8463 if (*min_size)
8464 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8469 /* This function is like the previous one, except here we know how many bytes
8470 need to be copied. That allows us to update alignment not only of DST, which
8471 is returned, but also of SRC, which is passed as a pointer for that
8472 reason. */
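/* A concrete illustration (hypothetical operand values): with
   DESIRED_ALIGN == 16 and ALIGN_BYTES == 11 (binary 1011), the loop below
   emits a 1-byte, a 2-byte and an 8-byte copy/fill, i.e. exactly the 11
   bytes needed before DST becomes 16-byte aligned.  */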
8473 static rtx
8474 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8475 rtx srcreg, rtx value, rtx vec_value,
8476 int desired_align, int align_bytes,
8477 bool issetmem)
8479 rtx src = NULL;
8480 rtx orig_dst = dst;
8481 rtx orig_src = NULL;
8482 int piece_size = 1;
8483 int copied_bytes = 0;
8485 if (!issetmem)
8487 gcc_assert (srcp != NULL);
8488 src = *srcp;
8489 orig_src = src;
8492 for (piece_size = 1;
8493 piece_size <= desired_align && copied_bytes < align_bytes;
8494 piece_size <<= 1)
8496 if (align_bytes & piece_size)
8498 if (issetmem)
8500 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8501 dst = emit_memset (dst, destreg, vec_value, piece_size);
8502 else
8503 dst = emit_memset (dst, destreg, value, piece_size);
8505 else
8506 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8507 copied_bytes += piece_size;
8510 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8511 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8512 if (MEM_SIZE_KNOWN_P (orig_dst))
8513 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8515 if (!issetmem)
8517 int src_align_bytes = get_mem_align_offset (src, desired_align
8518 * BITS_PER_UNIT);
8519 if (src_align_bytes >= 0)
8520 src_align_bytes = desired_align - src_align_bytes;
8521 if (src_align_bytes >= 0)
8523 unsigned int src_align;
8524 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8526 if ((src_align_bytes & (src_align - 1))
8527 == (align_bytes & (src_align - 1)))
8528 break;
8530 if (src_align > (unsigned int) desired_align)
8531 src_align = desired_align;
8532 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8533 set_mem_align (src, src_align * BITS_PER_UNIT);
8535 if (MEM_SIZE_KNOWN_P (orig_src))
8536 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8537 *srcp = src;
8540 return dst;
8543 /* Return true if ALG can be used in current context.
8544 Assume we expand memset if MEMSET is true. */
8545 static bool
8546 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8548 if (alg == no_stringop)
8549 return false;
8550 /* It is not possible to use a library call if we have non-default
8551 address space. We can do better than the generic byte-at-a-time
8552 loop, used as a fallback. */
8553 if (alg == libcall && have_as)
8554 return false;
8555 if (alg == vector_loop)
8556 return TARGET_SSE || TARGET_AVX;
8557 /* Algorithms using the rep prefix want at least edi and ecx;
8558 additionally, memset wants eax and memcpy wants esi. Don't
8559 consider such algorithms if the user has appropriated those
8560 registers for their own purposes, or if we have a non-default
8561 address space, since some string insns cannot override the segment. */
8562 if (alg == rep_prefix_1_byte
8563 || alg == rep_prefix_4_byte
8564 || alg == rep_prefix_8_byte)
8566 if (have_as)
8567 return false;
8568 if (fixed_regs[CX_REG]
8569 || fixed_regs[DI_REG]
8570 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8571 return false;
8573 return true;
8576 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
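/* In outline (a reading of the code below, not a separate specification):
   honor -mstringop-strategy when the requested algorithm is usable; when
   optimizing for size prefer the rep-prefixed variants; use a simple byte
   loop for very small expected sizes; otherwise consult the per-CPU
   stringop cost tables, falling back to a heuristically chosen inline
   algorithm when inlining is forced.  */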
8577 static enum stringop_alg
8578 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8579 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8580 bool memset, bool zero_memset, bool have_as,
8581 int *dynamic_check, bool *noalign, bool recur)
8583 const struct stringop_algs *algs;
8584 bool optimize_for_speed;
8585 int max = 0;
8586 const struct processor_costs *cost;
8587 int i;
8588 bool any_alg_usable_p = false;
8590 *noalign = false;
8591 *dynamic_check = -1;
8593 /* Even if the string operation call is cold, we still might spend a lot
8594 of time processing large blocks. */
8595 if (optimize_function_for_size_p (cfun)
8596 || (optimize_insn_for_size_p ()
8597 && (max_size < 256
8598 || (expected_size != -1 && expected_size < 256))))
8599 optimize_for_speed = false;
8600 else
8601 optimize_for_speed = true;
8603 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8604 if (memset)
8605 algs = &cost->memset[TARGET_64BIT != 0];
8606 else
8607 algs = &cost->memcpy[TARGET_64BIT != 0];
8609 /* Find the maximal size covered by a user-defined algorithm.  */
8610 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8612 enum stringop_alg candidate = algs->size[i].alg;
8613 bool usable = alg_usable_p (candidate, memset, have_as);
8614 any_alg_usable_p |= usable;
8616 if (candidate != libcall && candidate && usable)
8617 max = algs->size[i].max;
8620 /* If the expected size is not known but the max size is small enough
8621 that the inline version is a win, set the expected size into
8622 the range.  */
8623 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8624 && expected_size == -1)
8625 expected_size = min_size / 2 + max_size / 2;
8627 /* If user specified the algorithm, honor it if possible. */
8628 if (ix86_stringop_alg != no_stringop
8629 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8630 return ix86_stringop_alg;
8631 /* rep; movq or rep; movl is the smallest variant. */
8632 else if (!optimize_for_speed)
8634 *noalign = true;
8635 if (!count || (count & 3) || (memset && !zero_memset))
8636 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8637 ? rep_prefix_1_byte : loop_1_byte;
8638 else
8639 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8640 ? rep_prefix_4_byte : loop;
8642 /* Very tiny blocks are best handled via the loop; REP is expensive to
8643 set up.  */
8644 else if (expected_size != -1 && expected_size < 4)
8645 return loop_1_byte;
8646 else if (expected_size != -1)
8648 enum stringop_alg alg = libcall;
8649 bool alg_noalign = false;
8650 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8652 /* We get here if the algorithms that were not libcall-based
8653 were rep-prefix based and we are unable to use rep prefixes
8654 based on global register usage. Break out of the loop and
8655 use the heuristic below. */
8656 if (algs->size[i].max == 0)
8657 break;
8658 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8660 enum stringop_alg candidate = algs->size[i].alg;
8662 if (candidate != libcall
8663 && alg_usable_p (candidate, memset, have_as))
8665 alg = candidate;
8666 alg_noalign = algs->size[i].noalign;
8668 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8669 last non-libcall inline algorithm. */
8670 if (TARGET_INLINE_ALL_STRINGOPS)
8672 /* When the current size is best copied by a libcall but we are
8673 still forced to inline, run the heuristic below that will pick
8674 code for medium-sized blocks.  */
8675 if (alg != libcall)
8677 *noalign = alg_noalign;
8678 return alg;
8680 else if (!any_alg_usable_p)
8681 break;
8683 else if (alg_usable_p (candidate, memset, have_as)
8684 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8685 && candidate == rep_prefix_1_byte
8686 /* NB: If min_size != max_size, size is
8687 unknown. */
8688 && min_size != max_size))
8690 *noalign = algs->size[i].noalign;
8691 return candidate;
8696 /* When asked to inline the call anyway, try to pick a meaningful choice.
8697 We look for the maximal size of a block that is faster to copy by hand
8698 and take blocks of at most that size, guessing that the average size
8699 will be roughly half of the maximum.
8701 If this turns out to be bad, we might simply specify the preferred
8702 choice in ix86_costs.  */
8703 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8704 && (algs->unknown_size == libcall
8705 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8707 enum stringop_alg alg;
8708 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8710 /* If there aren't any usable algorithms or if recursing already,
8711 then recursing on smaller sizes or same size isn't going to
8712 find anything. Just return the simple byte-at-a-time copy loop. */
8713 if (!any_alg_usable_p || recur)
8715 /* Pick something reasonable. */
8716 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8717 *dynamic_check = 128;
8718 return loop_1_byte;
8720 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8721 zero_memset, have_as, dynamic_check, noalign, true);
8722 gcc_assert (*dynamic_check == -1);
8723 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8724 *dynamic_check = max;
8725 else
8726 gcc_assert (alg != libcall);
8727 return alg;
8730 /* Try to use some reasonable fallback algorithm. Note that for
8731 non-default address spaces we default to a loop instead of
8732 a libcall. */
8733 return (alg_usable_p (algs->unknown_size, memset, have_as)
8734 ? algs->unknown_size : have_as ? loop : libcall);
8737 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8738 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
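/* For example (assuming the unrolled_loop algorithm on x86-64, where
   MOVE_MODE is DImode): the desired alignment is normally 8 bytes, and it
   is reduced back to ALIGN when optimizing for size or when the expected
   block size is smaller than 4 bytes.  */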
8739 static int
8740 decide_alignment (int align,
8741 enum stringop_alg alg,
8742 int expected_size,
8743 machine_mode move_mode)
8745 int desired_align = 0;
8747 gcc_assert (alg != no_stringop);
8749 if (alg == libcall)
8750 return 0;
8751 if (move_mode == VOIDmode)
8752 return 0;
8754 desired_align = GET_MODE_SIZE (move_mode);
8755 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
8756 copying a whole cache line at once.  */
8757 if (TARGET_CPU_P (PENTIUMPRO)
8758 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8759 desired_align = 8;
8761 if (optimize_size)
8762 desired_align = 1;
8763 if (desired_align < align)
8764 desired_align = align;
8765 if (expected_size != -1 && expected_size < 4)
8766 desired_align = align;
8768 return desired_align;
8772 /* Helper function for memset. For QImode value 0xXY produce
8773 0xXYXYXYXY of the width specified by MODE. This is essentially
8774 a * 0x01010101, but we can do slightly better than
8775 synth_mult by unwinding the sequence by hand on CPUs with
8776 slow multiply.  */
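/* A worked example (values only; the actual expansion depends on the cost
   checks below): for VAL == 0xAB and MODE == SImode the multiply path
   computes 0xAB * 0x01010101 == 0xABABABAB, while the shift/IOR fallback
   builds the same value step by step:

       reg  = 0x000000AB
       reg |= reg << 8;    now 0x0000ABAB
       reg |= reg << 16;   now 0xABABABAB

   DImode adds one more shift by 32 and IOR.  */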
8777 static rtx
8778 promote_duplicated_reg (machine_mode mode, rtx val)
8780 machine_mode valmode = GET_MODE (val);
8781 rtx tmp;
8782 int nops = mode == DImode ? 3 : 2;
8784 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8785 if (val == const0_rtx)
8786 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8787 if (CONST_INT_P (val))
8789 HOST_WIDE_INT v = INTVAL (val) & 255;
8791 v |= v << 8;
8792 v |= v << 16;
8793 if (mode == DImode)
8794 v |= (v << 16) << 16;
8795 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8798 if (valmode == VOIDmode)
8799 valmode = QImode;
8800 if (valmode != QImode)
8801 val = gen_lowpart (QImode, val);
8802 if (mode == QImode)
8803 return val;
8804 if (!TARGET_PARTIAL_REG_STALL)
8805 nops--;
8806 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8807 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8808 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8809 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8811 rtx reg = convert_modes (mode, QImode, val, true);
8812 tmp = promote_duplicated_reg (mode, const1_rtx);
8813 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8814 OPTAB_DIRECT);
8816 else
8818 rtx reg = convert_modes (mode, QImode, val, true);
8820 if (!TARGET_PARTIAL_REG_STALL)
8821 emit_insn (gen_insv_1 (mode, reg, reg));
8822 else
8824 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8825 NULL, 1, OPTAB_DIRECT);
8826 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8827 OPTAB_DIRECT);
8829 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8830 NULL, 1, OPTAB_DIRECT);
8831 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8832 if (mode == SImode)
8833 return reg;
8834 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8835 NULL, 1, OPTAB_DIRECT);
8836 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8837 return reg;
8841 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
8842 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
8843 getting alignment from ALIGN to DESIRED_ALIGN.  */
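/* For instance (illustrative parameter values): a 64-bit memset whose main
   loop stores 8-byte chunks (SIZE_NEEDED == 8) gets VAL replicated into
   DImode, while a main loop that needs only 4-byte chunks and no wider
   alignment prologue gets an SImode replica.  */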
8844 static rtx
8845 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8846 int align)
8848 rtx promoted_val;
8850 if (TARGET_64BIT
8851 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8852 promoted_val = promote_duplicated_reg (DImode, val);
8853 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8854 promoted_val = promote_duplicated_reg (SImode, val);
8855 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8856 promoted_val = promote_duplicated_reg (HImode, val);
8857 else
8858 promoted_val = val;
8860 return promoted_val;
8863 /* Copy the address to a Pmode register. This is used for x32 to
8864 truncate DImode TLS address to a SImode register. */
8866 static rtx
8867 ix86_copy_addr_to_reg (rtx addr)
8869 rtx reg;
8870 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8872 reg = copy_addr_to_reg (addr);
8873 REG_POINTER (reg) = 1;
8874 return reg;
8876 else
8878 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8879 reg = copy_to_mode_reg (DImode, addr);
8880 REG_POINTER (reg) = 1;
8881 return gen_rtx_SUBREG (SImode, reg, 0);
8885 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
8886 operations when profitable. The code depends upon architecture, block size
8887 and alignment, but always has one of the following overall structures:
8889 Aligned move sequence:
8891 1) Prologue guard: Conditional that jumps up to epilogues for small
8892 blocks that can be handled by the epilogue alone. This is faster
8893 but also needed for correctness, since the prologue assumes the block
8894 is larger than the desired alignment.
8896 Optional dynamic check for size and libcall for large
8897 blocks is emitted here too, with -minline-stringops-dynamically.
8899 2) Prologue: copy first few bytes in order to get destination
8900 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8901 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8902 copied. We emit either a jump tree on power of two sized
8903 blocks, or a byte loop.
8905 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8906 with specified algorithm.
8908 4) Epilogue: code copying tail of the block that is too small to be
8909 handled by main body (or up to size guarded by prologue guard).
8911 Misaligned move sequence
8913 1) misaligned move prologue/epilogue containing:
8914 a) Prologue handling small memory blocks and jumping to done_label
8915 (skipped if blocks are known to be large enough)
8916 b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
8917 bytes when alignment is needed
8918 (skipped if alignment is not needed)
8919 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8921 2) Zero size guard dispatching to done_label, if needed
8923 3) Dispatch to a library call, if needed,
8925 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8926 with specified algorithm. */
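/* As a rough illustration of the aligned sequence above (hypothetical
   numbers; the actual shape depends on the cost tables, and the tuning may
   select the misaligned variant instead): a memset of N bytes, N unknown
   at compile time, expanded with the unrolled_loop algorithm on x86-64
   uses DImode stores with an unroll factor of 4, so SIZE_NEEDED is 32.
   The prologue guard branches to the epilogue for N < 32, the prologue
   stores up to 7 bytes to reach 8-byte alignment, the main loop stores
   32 bytes per iteration, and the epilogue handles the remaining tail of
   fewer than 32 bytes.  */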
8927 bool
8928 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8929 rtx align_exp, rtx expected_align_exp,
8930 rtx expected_size_exp, rtx min_size_exp,
8931 rtx max_size_exp, rtx probable_max_size_exp,
8932 bool issetmem)
8934 rtx destreg;
8935 rtx srcreg = NULL;
8936 rtx_code_label *label = NULL;
8937 rtx tmp;
8938 rtx_code_label *jump_around_label = NULL;
8939 HOST_WIDE_INT align = 1;
8940 unsigned HOST_WIDE_INT count = 0;
8941 HOST_WIDE_INT expected_size = -1;
8942 int size_needed = 0, epilogue_size_needed;
8943 int desired_align = 0, align_bytes = 0;
8944 enum stringop_alg alg;
8945 rtx promoted_val = NULL;
8946 rtx vec_promoted_val = NULL;
8947 bool force_loopy_epilogue = false;
8948 int dynamic_check;
8949 bool need_zero_guard = false;
8950 bool noalign;
8951 machine_mode move_mode = VOIDmode;
8952 machine_mode wider_mode;
8953 int unroll_factor = 1;
8954 /* TODO: Once value ranges are available, fill in proper data. */
8955 unsigned HOST_WIDE_INT min_size = 0;
8956 unsigned HOST_WIDE_INT max_size = -1;
8957 unsigned HOST_WIDE_INT probable_max_size = -1;
8958 bool misaligned_prologue_used = false;
8959 bool have_as;
8961 if (CONST_INT_P (align_exp))
8962 align = INTVAL (align_exp);
8963 /* i386 can do misaligned access at a reasonably increased cost.  */
8964 if (CONST_INT_P (expected_align_exp)
8965 && INTVAL (expected_align_exp) > align)
8966 align = INTVAL (expected_align_exp);
8967 /* ALIGN is the minimum of destination and source alignment, but we care here
8968 just about destination alignment. */
8969 else if (!issetmem
8970 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8971 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8973 if (CONST_INT_P (count_exp))
8975 min_size = max_size = probable_max_size = count = expected_size
8976 = INTVAL (count_exp);
8977 /* When COUNT is 0, there is nothing to do. */
8978 if (!count)
8979 return true;
8981 else
8983 if (min_size_exp)
8984 min_size = INTVAL (min_size_exp);
8985 if (max_size_exp)
8986 max_size = INTVAL (max_size_exp);
8987 if (probable_max_size_exp)
8988 probable_max_size = INTVAL (probable_max_size_exp);
8989 if (CONST_INT_P (expected_size_exp))
8990 expected_size = INTVAL (expected_size_exp);
8993 /* Make sure we don't need to care about overflow later on. */
8994 if (count > (HOST_WIDE_INT_1U << 30))
8995 return false;
8997 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8998 if (!issetmem)
8999 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
9001 /* Step 0: Decide on preferred algorithm, desired alignment and
9002 size of chunks to be copied by main loop. */
9003 alg = decide_alg (count, expected_size, min_size, probable_max_size,
9004 issetmem,
9005 issetmem && val_exp == const0_rtx, have_as,
9006 &dynamic_check, &noalign, false);
9008 if (dump_file)
9009 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
9010 stringop_alg_names[alg]);
9012 if (alg == libcall)
9013 return false;
9014 gcc_assert (alg != no_stringop);
9016 /* For now the vector version of memset is generated only for memory zeroing, as
9017 creating the promoted vector value is very cheap in this case.  */
9018 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
9019 alg = unrolled_loop;
9021 if (!count)
9022 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
9023 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
9024 if (!issetmem)
9025 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
9027 unroll_factor = 1;
9028 move_mode = word_mode;
9029 switch (alg)
9031 case libcall:
9032 case no_stringop:
9033 case last_alg:
9034 gcc_unreachable ();
9035 case loop_1_byte:
9036 need_zero_guard = true;
9037 move_mode = QImode;
9038 break;
9039 case loop:
9040 need_zero_guard = true;
9041 break;
9042 case unrolled_loop:
9043 need_zero_guard = true;
9044 unroll_factor = (TARGET_64BIT ? 4 : 2);
9045 break;
9046 case vector_loop:
9047 need_zero_guard = true;
9048 unroll_factor = 4;
9049 /* Find the widest supported mode. */
9050 move_mode = word_mode;
9051 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
9052 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
9053 move_mode = wider_mode;
9055 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
9056 move_mode = TImode;
9057 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
9058 move_mode = OImode;
9060 /* Find the corresponding vector mode with the same size as MOVE_MODE.
9061 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
9062 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
9064 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
9065 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
9066 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
9067 move_mode = word_mode;
9069 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
9070 break;
9071 case rep_prefix_8_byte:
9072 move_mode = DImode;
9073 break;
9074 case rep_prefix_4_byte:
9075 move_mode = SImode;
9076 break;
9077 case rep_prefix_1_byte:
9078 move_mode = QImode;
9079 break;
9081 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
9082 epilogue_size_needed = size_needed;
9084 /* If we are going to emit any library calls conditionally, make sure any
9085 pending stack adjustment happens before the first conditional branch;
9086 otherwise it would be emitted only before the library call and would not
9087 happen on the other branches.  */
9088 if (dynamic_check != -1)
9089 do_pending_stack_adjust ();
9091 desired_align = decide_alignment (align, alg, expected_size, move_mode);
9092 if (!TARGET_ALIGN_STRINGOPS || noalign)
9093 align = desired_align;
9095 /* Step 1: Prologue guard. */
9097 /* Alignment code needs count to be in register. */
9098 if (CONST_INT_P (count_exp) && desired_align > align)
9100 if (INTVAL (count_exp) > desired_align
9101 && INTVAL (count_exp) > size_needed)
9103 align_bytes
9104 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
9105 if (align_bytes <= 0)
9106 align_bytes = 0;
9107 else
9108 align_bytes = desired_align - align_bytes;
9110 if (align_bytes == 0)
9111 count_exp = force_reg (counter_mode (count_exp), count_exp);
9113 gcc_assert (desired_align >= 1 && align >= 1);
9115 /* Misaligned move sequences handle both the prologue and the epilogue at once.
9116 Default code generation results in smaller code for large alignments
9117 and also avoids redundant work when sizes are known precisely.  */
9118 misaligned_prologue_used
9119 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
9120 && MAX (desired_align, epilogue_size_needed) <= 32
9121 && desired_align <= epilogue_size_needed
9122 && ((desired_align > align && !align_bytes)
9123 || (!count && epilogue_size_needed > 1)));
9125 /* Do the cheap promotion to allow better CSE across the
9126 main loop and epilogue (i.e. one load of the big constant in
9127 front of all the code).
9128 For now the misaligned move sequences do not have a fast path
9129 without broadcasting.  */
9130 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
9132 if (alg == vector_loop)
9134 gcc_assert (val_exp == const0_rtx);
9135 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
9136 promoted_val = promote_duplicated_reg_to_size (val_exp,
9137 GET_MODE_SIZE (word_mode),
9138 desired_align, align);
9140 else
9142 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9143 desired_align, align);
9146 /* Misaligned move sequences handle both prologues and epilogues at once.
9147 Default code generation results in smaller code for large alignments and
9148 also avoids redundant work when sizes are known precisely.  */
9149 if (misaligned_prologue_used)
9151 /* The misaligned move prologue handles small blocks by itself.  */
9152 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
9153 (dst, src, &destreg, &srcreg,
9154 move_mode, promoted_val, vec_promoted_val,
9155 &count_exp,
9156 &jump_around_label,
9157 desired_align < align
9158 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
9159 desired_align, align, &min_size, dynamic_check, issetmem);
9160 if (!issetmem)
9161 src = change_address (src, BLKmode, srcreg);
9162 dst = change_address (dst, BLKmode, destreg);
9163 set_mem_align (dst, desired_align * BITS_PER_UNIT);
9164 epilogue_size_needed = 0;
9165 if (need_zero_guard
9166 && min_size < (unsigned HOST_WIDE_INT) size_needed)
9168 /* It is possible that we copied enough so the main loop will not
9169 execute. */
9170 gcc_assert (size_needed > 1);
9171 if (jump_around_label == NULL_RTX)
9172 jump_around_label = gen_label_rtx ();
9173 emit_cmp_and_jump_insns (count_exp,
9174 GEN_INT (size_needed),
9175 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
9176 if (expected_size == -1
9177 || expected_size < (desired_align - align) / 2 + size_needed)
9178 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9179 else
9180 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9183 /* Ensure that alignment prologue won't copy past end of block. */
9184 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
9186 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
9187 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
9188 Make sure it is power of 2. */
9189 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
9191 /* To improve performance of small blocks, we jump around the VAL
9192 promotion. This means that if the promoted VAL is not constant,
9193 we might not use it in the epilogue and have to use the byte
9194 loop variant.  */
9195 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
9196 force_loopy_epilogue = true;
9197 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9198 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9200 /* If main algorithm works on QImode, no epilogue is needed.
9201 For small sizes just don't align anything. */
9202 if (size_needed == 1)
9203 desired_align = align;
9204 else
9205 goto epilogue;
9207 else if (!count
9208 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9210 label = gen_label_rtx ();
9211 emit_cmp_and_jump_insns (count_exp,
9212 GEN_INT (epilogue_size_needed),
9213 LTU, 0, counter_mode (count_exp), 1, label);
9214 if (expected_size == -1 || expected_size < epilogue_size_needed)
9215 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9216 else
9217 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9221 /* Emit code to decide at runtime whether a library call or inline code
9222 should be used.  */
9223 if (dynamic_check != -1)
9225 if (!issetmem && CONST_INT_P (count_exp))
9227 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
9229 emit_block_copy_via_libcall (dst, src, count_exp);
9230 count_exp = const0_rtx;
9231 goto epilogue;
9234 else
9236 rtx_code_label *hot_label = gen_label_rtx ();
9237 if (jump_around_label == NULL_RTX)
9238 jump_around_label = gen_label_rtx ();
9239 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9240 LEU, 0, counter_mode (count_exp),
9241 1, hot_label);
9242 predict_jump (REG_BR_PROB_BASE * 90 / 100);
9243 if (issetmem)
9244 set_storage_via_libcall (dst, count_exp, val_exp);
9245 else
9246 emit_block_copy_via_libcall (dst, src, count_exp);
9247 emit_jump (jump_around_label);
9248 emit_label (hot_label);
9252 /* Step 2: Alignment prologue. */
9253 /* Do the expensive promotion once we branched off the small blocks. */
9254 if (issetmem && !promoted_val)
9255 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9256 desired_align, align);
9258 if (desired_align > align && !misaligned_prologue_used)
9260 if (align_bytes == 0)
9262 /* Except for the first move in the prologue, we no longer know the
9263 constant offset in the aliasing info. It does not seem worth
9264 the pain to maintain it for the first move, so throw away
9265 the info early.  */
9266 dst = change_address (dst, BLKmode, destreg);
9267 if (!issetmem)
9268 src = change_address (src, BLKmode, srcreg);
9269 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
9270 promoted_val, vec_promoted_val,
9271 count_exp, align, desired_align,
9272 issetmem);
9273 /* At most desired_align - align bytes are copied. */
9274 if (min_size < (unsigned)(desired_align - align))
9275 min_size = 0;
9276 else
9277 min_size -= desired_align - align;
9279 else
9281 /* If we know how many bytes need to be stored before dst is
9282 sufficiently aligned, maintain aliasing info accurately. */
9283 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
9284 srcreg,
9285 promoted_val,
9286 vec_promoted_val,
9287 desired_align,
9288 align_bytes,
9289 issetmem);
9291 count_exp = plus_constant (counter_mode (count_exp),
9292 count_exp, -align_bytes);
9293 count -= align_bytes;
9294 min_size -= align_bytes;
9295 max_size -= align_bytes;
9297 if (need_zero_guard
9298 && min_size < (unsigned HOST_WIDE_INT) size_needed
9299 && (count < (unsigned HOST_WIDE_INT) size_needed
9300 || (align_bytes == 0
9301 && count < ((unsigned HOST_WIDE_INT) size_needed
9302 + desired_align - align))))
9304 /* It is possible that we copied enough so the main loop will not
9305 execute. */
9306 gcc_assert (size_needed > 1);
9307 if (label == NULL_RTX)
9308 label = gen_label_rtx ();
9309 emit_cmp_and_jump_insns (count_exp,
9310 GEN_INT (size_needed),
9311 LTU, 0, counter_mode (count_exp), 1, label);
9312 if (expected_size == -1
9313 || expected_size < (desired_align - align) / 2 + size_needed)
9314 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9315 else
9316 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9319 if (label && size_needed == 1)
9321 emit_label (label);
9322 LABEL_NUSES (label) = 1;
9323 label = NULL;
9324 epilogue_size_needed = 1;
9325 if (issetmem)
9326 promoted_val = val_exp;
9328 else if (label == NULL_RTX && !misaligned_prologue_used)
9329 epilogue_size_needed = size_needed;
9331 /* Step 3: Main loop. */
9333 switch (alg)
9335 case libcall:
9336 case no_stringop:
9337 case last_alg:
9338 gcc_unreachable ();
9339 case loop_1_byte:
9340 case loop:
9341 case unrolled_loop:
9342 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9343 count_exp, move_mode, unroll_factor,
9344 expected_size, issetmem);
9345 break;
9346 case vector_loop:
9347 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9348 vec_promoted_val, count_exp, move_mode,
9349 unroll_factor, expected_size, issetmem);
9350 break;
9351 case rep_prefix_8_byte:
9352 case rep_prefix_4_byte:
9353 case rep_prefix_1_byte:
9354 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9355 val_exp, count_exp, move_mode, issetmem);
9356 break;
9358 /* Properly adjust the offsets of the src and dest memory for aliasing.  */
9359 if (CONST_INT_P (count_exp))
9361 if (!issetmem)
9362 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9363 (count / size_needed) * size_needed);
9364 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9365 (count / size_needed) * size_needed);
9367 else
9369 if (!issetmem)
9370 src = change_address (src, BLKmode, srcreg);
9371 dst = change_address (dst, BLKmode, destreg);
9374 /* Step 4: Epilogue to copy the remaining bytes. */
9375 epilogue:
9376 if (label)
9378 /* When the main loop is done, COUNT_EXP might hold original count,
9379 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9380 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9381 bytes. Compensate if needed. */
9383 if (size_needed < epilogue_size_needed)
9385 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9386 GEN_INT (size_needed - 1), count_exp, 1,
9387 OPTAB_DIRECT);
9388 if (tmp != count_exp)
9389 emit_move_insn (count_exp, tmp);
9391 emit_label (label);
9392 LABEL_NUSES (label) = 1;
9395 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9397 if (force_loopy_epilogue)
9398 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9399 epilogue_size_needed);
9400 else
9402 if (issetmem)
9403 expand_setmem_epilogue (dst, destreg, promoted_val,
9404 vec_promoted_val, count_exp,
9405 epilogue_size_needed);
9406 else
9407 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9408 epilogue_size_needed);
9411 if (jump_around_label)
9412 emit_label (jump_around_label);
9413 return true;
9416 /* Expand cmpstrn or memcmp. */
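/* In outline (a summary of the expansion below): copy both addresses and
   the length into registers, emit cmpstrnqi_nz_1 when the length is a
   nonzero constant, or test the length and emit cmpstrnqi_1 otherwise,
   and finally convert the resulting flags into a sign-extended -1/0/1
   value via cmpintqi.  */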
9418 bool
9419 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9420 rtx length, rtx align, bool is_cmpstrn)
9422 /* Expand strncmp and memcmp only with -minline-all-stringops since
9423 "repz cmpsb" can be much slower than strncmp and memcmp functions
9424 implemented with vector instructions, see
9426 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9428 if (!TARGET_INLINE_ALL_STRINGOPS)
9429 return false;
9431 /* Can't use this if the user has appropriated ecx, esi or edi. */
9432 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9433 return false;
9435 if (is_cmpstrn)
9437 /* For strncmp, length is the maximum length, which can be larger
9438 than actual string lengths. We can expand the cmpstrn pattern
9439 to "repz cmpsb" only if one of the strings is a constant so
9440 that expand_builtin_strncmp() can write the length argument to
9441 be the minimum of the const string length and the actual length
9442 argument. Otherwise, "repz cmpsb" may read past the terminating 0 byte.  */
9443 tree t1 = MEM_EXPR (src1);
9444 tree t2 = MEM_EXPR (src2);
9445 if (!((t1 && TREE_CODE (t1) == MEM_REF
9446 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9447 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9448 == STRING_CST))
9449 || (t2 && TREE_CODE (t2) == MEM_REF
9450 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9451 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9452 == STRING_CST))))
9453 return false;
9456 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9457 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9458 if (addr1 != XEXP (src1, 0))
9459 src1 = replace_equiv_address_nv (src1, addr1);
9460 if (addr2 != XEXP (src2, 0))
9461 src2 = replace_equiv_address_nv (src2, addr2);
9463 /* NB: Make a copy of the data length to avoid changing the original
9464 data length by cmpstrnqi patterns. */
9465 length = ix86_zero_extend_to_Pmode (length);
9466 rtx lengthreg = gen_reg_rtx (Pmode);
9467 emit_move_insn (lengthreg, length);
9469 /* If we are testing strict equality, we can use known alignment to
9470 good advantage. This may be possible with combine, particularly
9471 once cc0 is dead. */
9472 if (CONST_INT_P (length))
9474 if (length == const0_rtx)
9476 emit_move_insn (result, const0_rtx);
9477 return true;
9479 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9480 src1, src2));
9482 else
9484 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9485 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9486 src1, src2));
9489 rtx out = gen_lowpart (QImode, result);
9490 emit_insn (gen_cmpintqi (out));
9491 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9493 return true;
9496 /* Expand the appropriate insns for doing strlen if not just doing
9497 repnz; scasb
9499 out = result, initialized with the start address
9500 align_rtx = alignment of the address.
9501 scratch = scratch register, initialized with the start address when
9502 not aligned, otherwise undefined
9504 This is just the body. It needs the initializations mentioned above and
9505 some address computing at the end. These things are done in i386.md. */
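/* In outline: compare up to three leading bytes against zero one at a
   time until OUT is 4-byte aligned, then scan the string four bytes per
   iteration using the zero-byte test below, and finally step OUT back so
   that it points exactly at the terminating zero byte.  */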
9507 static void
9508 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9510 int align;
9511 rtx tmp;
9512 rtx_code_label *align_2_label = NULL;
9513 rtx_code_label *align_3_label = NULL;
9514 rtx_code_label *align_4_label = gen_label_rtx ();
9515 rtx_code_label *end_0_label = gen_label_rtx ();
9516 rtx mem;
9517 rtx tmpreg = gen_reg_rtx (SImode);
9518 rtx scratch = gen_reg_rtx (SImode);
9519 rtx cmp;
9521 align = 0;
9522 if (CONST_INT_P (align_rtx))
9523 align = INTVAL (align_rtx);
9525 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9527 /* Is there a known alignment and is it less than 4? */
9528 if (align < 4)
9530 rtx scratch1 = gen_reg_rtx (Pmode);
9531 emit_move_insn (scratch1, out);
9532 /* Is there a known alignment and is it not 2? */
9533 if (align != 2)
9535 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9536 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9538 /* Leave just the 3 lower bits. */
9539 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9540 NULL_RTX, 0, OPTAB_WIDEN);
9542 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9543 Pmode, 1, align_4_label);
9544 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9545 Pmode, 1, align_2_label);
9546 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9547 Pmode, 1, align_3_label);
9549 else
9551 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9552 check whether it is aligned to a 4-byte boundary.  */
9554 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9555 NULL_RTX, 0, OPTAB_WIDEN);
9557 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9558 Pmode, 1, align_4_label);
9561 mem = change_address (src, QImode, out);
9563 /* Now compare the bytes. */
9565 /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
9566 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9567 QImode, 1, end_0_label);
9569 /* Increment the address. */
9570 emit_insn (gen_add2_insn (out, const1_rtx));
9572 /* Not needed with an alignment of 2 */
9573 if (align != 2)
9575 emit_label (align_2_label);
9577 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9578 end_0_label);
9580 emit_insn (gen_add2_insn (out, const1_rtx));
9582 emit_label (align_3_label);
9585 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9586 end_0_label);
9588 emit_insn (gen_add2_insn (out, const1_rtx));
9591 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9592 align this loop; it only makes the program larger and does not help
9593 to speed it up.  */
9594 emit_label (align_4_label);
9596 mem = change_address (src, SImode, out);
9597 emit_move_insn (scratch, mem);
9598 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9600 /* This formula yields a nonzero result iff one of the bytes is zero.
9601 This saves three branches inside the loop and many cycles.  */
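/* The test below is the classic "word contains a zero byte" trick:
   (X - 0x01010101) & ~X & 0x80808080 is nonzero exactly when some byte of
   X is zero.  E.g. (illustrative value) for X == 0x1200AB34:

       X - 0x01010101  == 0x10FFAA33
       ~X              == 0xEDFF54CB
       AND of the two  == 0x00FF0003
       & 0x80808080    == 0x00800000   nonzero, so a zero byte exists.  */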
9603 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9604 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9605 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9606 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9607 gen_int_mode (0x80808080, SImode)));
9608 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9609 align_4_label);
9611 if (TARGET_CMOVE)
9613 rtx reg = gen_reg_rtx (SImode);
9614 rtx reg2 = gen_reg_rtx (Pmode);
9615 emit_move_insn (reg, tmpreg);
9616 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9618 /* If zero is not in the first two bytes, move two bytes forward. */
9619 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9620 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9621 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9622 emit_insn (gen_rtx_SET (tmpreg,
9623 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9624 reg,
9625 tmpreg)));
9626 /* Emit lea manually to avoid clobbering of flags. */
9627 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9629 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9630 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9631 emit_insn (gen_rtx_SET (out,
9632 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9633 reg2,
9634 out)));
9636 else
9638 rtx_code_label *end_2_label = gen_label_rtx ();
9639 /* Is zero in the first two bytes? */
9641 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9642 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9643 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9644 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9645 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9646 pc_rtx);
9647 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9648 JUMP_LABEL (tmp) = end_2_label;
9650 /* Not in the first two. Move two bytes forward. */
9651 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9652 emit_insn (gen_add2_insn (out, const2_rtx));
9654 emit_label (end_2_label);
9658 /* Avoid branch in fixing the byte. */
9659 tmpreg = gen_lowpart (QImode, tmpreg);
9660 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9661 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9662 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9663 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9665 emit_label (end_0_label);
9668 /* Expand strlen. */
9670 bool
9671 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9673 if (TARGET_UNROLL_STRLEN
9674 && TARGET_INLINE_ALL_STRINGOPS
9675 && eoschar == const0_rtx
9676 && optimize > 1)
9678 /* The generic case of the strlen expander is long. Avoid expanding it
9679 unless TARGET_INLINE_ALL_STRINGOPS.  */
9680 rtx addr = force_reg (Pmode, XEXP (src, 0));
9681 /* It seems that some optimizers do not combine a call like
9682 foo (strlen (bar), strlen (bar));
9683 when the move and the subtraction are done here. They do compute
9684 the length just once when these instructions are done inside
9685 output_strlen_unroll (). But since &bar[strlen (bar)] is
9686 often used and this needs one fewer register for the lifetime of
9687 output_strlen_unroll (), this way is better.  */
9689 emit_move_insn (out, addr);
9691 ix86_expand_strlensi_unroll_1 (out, src, align);
9693 /* strlensi_unroll_1 returns the address of the zero at the end of
9694 the string, like memchr(), so compute the length by subtracting
9695 the start address. */
9696 emit_insn (gen_sub2_insn (out, addr));
9697 return true;
9699 else
9700 return false;
9703 /* For the given symbol (function), construct code to compute the address of
9704 its PLT entry in the large x86-64 PIC model.  */
9706 static rtx
9707 construct_plt_address (rtx symbol)
9709 rtx tmp, unspec;
9711 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9712 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9713 gcc_assert (Pmode == DImode);
9715 tmp = gen_reg_rtx (Pmode);
9716 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9718 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9719 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9720 return tmp;
9723 /* Additional registers that are clobbered by SysV calls (call-saved in the MS ABI but call-clobbered in the SysV ABI).  */
9725 static int const x86_64_ms_sysv_extra_clobbered_registers
9726 [NUM_X86_64_MS_CLOBBERED_REGS] =
9728 SI_REG, DI_REG,
9729 XMM6_REG, XMM7_REG,
9730 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9731 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9734 rtx_insn *
9735 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9736 rtx callarg2,
9737 rtx pop, bool sibcall)
9739 rtx vec[3];
9740 rtx use = NULL, call;
9741 unsigned int vec_len = 0;
9742 tree fndecl;
9743 bool call_no_callee_saved_registers = false;
9745 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9747 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9748 if (fndecl)
9750 if (lookup_attribute ("interrupt",
9751 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
9752 error ("interrupt service routine cannot be called directly");
9753 else if (lookup_attribute ("no_callee_saved_registers",
9754 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
9755 call_no_callee_saved_registers = true;
9758 else
9760 if (MEM_P (fnaddr))
9762 tree mem_expr = MEM_EXPR (fnaddr);
9763 if (mem_expr != nullptr
9764 && TREE_CODE (mem_expr) == MEM_REF
9765 && lookup_attribute ("no_callee_saved_registers",
9766 TYPE_ATTRIBUTES (TREE_TYPE (mem_expr))))
9767 call_no_callee_saved_registers = true;
9770 fndecl = NULL_TREE;
9773 if (pop == const0_rtx)
9774 pop = NULL;
9775 gcc_assert (!TARGET_64BIT || !pop);
9777 rtx addr = XEXP (fnaddr, 0);
9778 if (TARGET_MACHO && !TARGET_64BIT)
9780 #if TARGET_MACHO
9781 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9782 fnaddr = machopic_indirect_call_target (fnaddr);
9783 #endif
9785 else
9787 /* Static functions and indirect calls don't need the pic register. Also,
9788 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9789 it an indirect call. */
9790 if (flag_pic
9791 && GET_CODE (addr) == SYMBOL_REF
9792 && ix86_call_use_plt_p (addr))
9794 if (flag_plt
9795 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9796 || !lookup_attribute ("noplt",
9797 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9799 if (!TARGET_64BIT
9800 || (ix86_cmodel == CM_LARGE_PIC
9801 && DEFAULT_ABI != MS_ABI))
9803 use_reg (&use, gen_rtx_REG (Pmode,
9804 REAL_PIC_OFFSET_TABLE_REGNUM));
9805 if (ix86_use_pseudo_pic_reg ())
9806 emit_move_insn (gen_rtx_REG (Pmode,
9807 REAL_PIC_OFFSET_TABLE_REGNUM),
9808 pic_offset_table_rtx);
9811 else if (!TARGET_PECOFF && !TARGET_MACHO)
9813 if (TARGET_64BIT
9814 && ix86_cmodel == CM_LARGE_PIC
9815 && DEFAULT_ABI != MS_ABI)
9817 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9818 UNSPEC_GOT);
9819 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9820 fnaddr = force_reg (Pmode, fnaddr);
9821 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9823 else if (TARGET_64BIT)
9825 fnaddr = gen_rtx_UNSPEC (Pmode,
9826 gen_rtvec (1, addr),
9827 UNSPEC_GOTPCREL);
9828 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9830 else
9832 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9833 UNSPEC_GOT);
9834 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9835 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9836 fnaddr);
9838 fnaddr = gen_const_mem (Pmode, fnaddr);
9839 /* Pmode may not be the same as word_mode for x32, which
9840 doesn't support indirect branch via 32-bit memory slot.
9841 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9842 indirect branch via x32 GOT slot is OK. */
9843 if (GET_MODE (fnaddr) != word_mode)
9844 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9845 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9850 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9851 parameters passed in vector registers. */
9852 if (TARGET_64BIT
9853 && (INTVAL (callarg2) > 0
9854 || (INTVAL (callarg2) == 0
9855 && (TARGET_SSE || !flag_skip_rax_setup))))
9857 rtx al = gen_rtx_REG (QImode, AX_REG);
9858 emit_move_insn (al, callarg2);
9859 use_reg (&use, al);
9862 if (ix86_cmodel == CM_LARGE_PIC
9863 && !TARGET_PECOFF
9864 && MEM_P (fnaddr)
9865 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9866 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9867 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9868 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9869 branch via x32 GOT slot is OK. */
9870 else if (!(TARGET_X32
9871 && MEM_P (fnaddr)
9872 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9873 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9874 && (sibcall
9875 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9876 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9878 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9879 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9882 /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
9883 mask off code pointers here.
9884 TODO: also need to handle indirect jump. */
9885 if (ix86_memtag_can_tag_addresses () && !fndecl
9886 && sanitize_flags_p (SANITIZE_HWADDRESS))
9888 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9889 NULL_RTX);
9890 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9893 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9895 if (retval)
9896 call = gen_rtx_SET (retval, call);
9897 vec[vec_len++] = call;
9899 if (pop)
9901 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9902 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9903 vec[vec_len++] = pop;
9906 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9908 if ((cfun->machine->call_saved_registers
9909 == TYPE_NO_CALLER_SAVED_REGISTERS)
9910 && (!fndecl
9911 || (!TREE_THIS_VOLATILE (fndecl)
9912 && !lookup_attribute ("no_caller_saved_registers",
9913 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9915 bool is_64bit_ms_abi = (TARGET_64BIT
9916 && ix86_function_abi (fndecl) == MS_ABI);
9917 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9919 /* If there are no caller-saved registers, add all registers
9920 that are clobbered by the call which returns. */
9921 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9922 if (!fixed_regs[i]
9923 && (ix86_call_used_regs[i] == 1
9924 || (ix86_call_used_regs[i] & c_mask))
9925 && !STACK_REGNO_P (i)
9926 && !MMX_REGNO_P (i))
9927 clobber_reg (&use,
9928 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9930 else if (TARGET_64BIT_MS_ABI
9931 && (!callarg2 || INTVAL (callarg2) != -2))
9933 unsigned i;
9935 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9937 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9938 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9940 clobber_reg (&use, gen_rtx_REG (mode, regno));
9943 /* Set here, but it may get cleared later. */
9944 if (TARGET_CALL_MS2SYSV_XLOGUES)
9946 if (!TARGET_SSE)
9949 /* Don't break hot-patched functions. */
9950 else if (ix86_function_ms_hook_prologue (current_function_decl))
9953 /* TODO: Cases not yet examined. */
9954 else if (flag_split_stack)
9955 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9957 else
9959 gcc_assert (!reload_completed);
9960 cfun->machine->call_ms2sysv = true;
9965 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9966 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9967 || !fndecl || TREE_PUBLIC (fndecl)))
9969 /* We allow public functions defined in a TU to bind locally for PIC
9970 code (the default) on 64bit Mach-O.
9971 If such functions are not inlined, we cannot tell at compile-time if
9972 they will be called via the lazy symbol resolver (this can depend on
9973 options given at link-time). Therefore, we must assume that the lazy
9974 resolver could be used, which clobbers R11 and R10. */
9975 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9976 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9979 if (call_no_callee_saved_registers)
9981 /* After calling a no_callee_saved_registers function, all registers
9982 may be clobbered, so explicitly clobber the registers that are
9983 normally call-preserved (the call-used ones are clobbered implicitly). */
9984 bool is_64bit_ms_abi = (TARGET_64BIT
9985 && ix86_function_abi (fndecl) == MS_ABI);
9986 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9987 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9988 if (!fixed_regs[i]
9989 && !(ix86_call_used_regs[i] == 1
9990 || (ix86_call_used_regs[i] & c_mask))
9991 && !STACK_REGNO_P (i)
9992 && !MMX_REGNO_P (i))
9993 clobber_reg (&use,
9994 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9997 if (vec_len > 1)
9998 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9999 rtx_insn *call_insn = emit_call_insn (call);
10000 if (use)
10001 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
10003 return call_insn;
10006 /* Split a simple return that pops POPC bytes from the stack into an
10007 indirect branch with an explicit stack adjustment. */
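/* A rough sketch of the split (the exact insns come from the patterns
   used below): a "ret $N" becomes approximately

	pop	%ecx		# return address -> %ecx
	add	$N, %esp	# drop the N bytes of stack arguments
	jmp	*%ecx		# return via an indirect branch  */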
10009 void
10010 ix86_split_simple_return_pop_internal (rtx popc)
10012 struct machine_function *m = cfun->machine;
10013 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10014 rtx_insn *insn;
10016 /* There is no "pascal" calling convention in any 64bit ABI. */
10017 gcc_assert (!TARGET_64BIT);
10019 insn = emit_insn (gen_pop (ecx));
10020 m->fs.cfa_offset -= UNITS_PER_WORD;
10021 m->fs.sp_offset -= UNITS_PER_WORD;
10023 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10024 x = gen_rtx_SET (stack_pointer_rtx, x);
10025 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10026 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
10027 RTX_FRAME_RELATED_P (insn) = 1;
10029 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
10030 x = gen_rtx_SET (stack_pointer_rtx, x);
10031 insn = emit_insn (x);
10032 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10033 RTX_FRAME_RELATED_P (insn) = 1;
10035 /* Now return address is in ECX. */
10036 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10039 /* Errors in the source file can cause expand_expr to return const0_rtx
10040 where we expect a vector. To avoid crashing, use one of the vector
10041 clear instructions. */
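/* For instance, if a V4SFmode operand comes back as (const_int 0), the
   helper below substitutes CONST0_RTX (V4SFmode), an all-zeros vector
   constant, so the caller still sees an operand of the expected mode.  */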
10043 static rtx
10044 safe_vector_operand (rtx x, machine_mode mode)
10046 if (x == const0_rtx)
10047 x = CONST0_RTX (mode);
10048 return x;
10051 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
10053 static rtx
10054 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
10056 rtx pat;
10057 tree arg0 = CALL_EXPR_ARG (exp, 0);
10058 tree arg1 = CALL_EXPR_ARG (exp, 1);
10059 rtx op0 = expand_normal (arg0);
10060 rtx op1 = expand_normal (arg1);
10061 machine_mode tmode = insn_data[icode].operand[0].mode;
10062 machine_mode mode0 = insn_data[icode].operand[1].mode;
10063 machine_mode mode1 = insn_data[icode].operand[2].mode;
10065 if (VECTOR_MODE_P (mode0))
10066 op0 = safe_vector_operand (op0, mode0);
10067 if (VECTOR_MODE_P (mode1))
10068 op1 = safe_vector_operand (op1, mode1);
10070 if (optimize || !target
10071 || GET_MODE (target) != tmode
10072 || !insn_data[icode].operand[0].predicate (target, tmode))
10073 target = gen_reg_rtx (tmode);
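/* A note on the special case below: some binop builtins declare their
   second argument as a 32-bit int while the insn pattern wants a full
   128-bit (TImode) operand.  In that case the SImode value is loaded
   into the low element of a fresh V4SImode register (the remaining
   elements cleared) and the register is reinterpreted as TImode via
   gen_lowpart.  */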
10075 if (GET_MODE (op1) == SImode && mode1 == TImode)
10077 rtx x = gen_reg_rtx (V4SImode);
10078 emit_insn (gen_sse2_loadd (x, op1));
10079 op1 = gen_lowpart (TImode, x);
10082 if (!insn_data[icode].operand[1].predicate (op0, mode0))
10083 op0 = copy_to_mode_reg (mode0, op0);
10084 if (!insn_data[icode].operand[2].predicate (op1, mode1))
10085 op1 = copy_to_mode_reg (mode1, op1);
10087 pat = GEN_FCN (icode) (target, op0, op1);
10088 if (! pat)
10089 return 0;
10091 emit_insn (pat);
10093 return target;
10096 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
10098 static rtx
10099 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
10100 enum ix86_builtin_func_type m_type,
10101 enum rtx_code sub_code)
10103 rtx pat;
10104 unsigned int i, nargs;
10105 bool comparison_p = false;
10106 bool tf_p = false;
10107 bool last_arg_constant = false;
10108 int num_memory = 0;
10109 rtx xops[4];
10111 machine_mode tmode = insn_data[icode].operand[0].mode;
10113 switch (m_type)
10115 case MULTI_ARG_4_DF2_DI_I:
10116 case MULTI_ARG_4_DF2_DI_I1:
10117 case MULTI_ARG_4_SF2_SI_I:
10118 case MULTI_ARG_4_SF2_SI_I1:
10119 nargs = 4;
10120 last_arg_constant = true;
10121 break;
10123 case MULTI_ARG_3_SF:
10124 case MULTI_ARG_3_DF:
10125 case MULTI_ARG_3_SF2:
10126 case MULTI_ARG_3_DF2:
10127 case MULTI_ARG_3_DI:
10128 case MULTI_ARG_3_SI:
10129 case MULTI_ARG_3_SI_DI:
10130 case MULTI_ARG_3_HI:
10131 case MULTI_ARG_3_HI_SI:
10132 case MULTI_ARG_3_QI:
10133 case MULTI_ARG_3_DI2:
10134 case MULTI_ARG_3_SI2:
10135 case MULTI_ARG_3_HI2:
10136 case MULTI_ARG_3_QI2:
10137 nargs = 3;
10138 break;
10140 case MULTI_ARG_2_SF:
10141 case MULTI_ARG_2_DF:
10142 case MULTI_ARG_2_DI:
10143 case MULTI_ARG_2_SI:
10144 case MULTI_ARG_2_HI:
10145 case MULTI_ARG_2_QI:
10146 nargs = 2;
10147 break;
10149 case MULTI_ARG_2_DI_IMM:
10150 case MULTI_ARG_2_SI_IMM:
10151 case MULTI_ARG_2_HI_IMM:
10152 case MULTI_ARG_2_QI_IMM:
10153 nargs = 2;
10154 last_arg_constant = true;
10155 break;
10157 case MULTI_ARG_1_SF:
10158 case MULTI_ARG_1_DF:
10159 case MULTI_ARG_1_SF2:
10160 case MULTI_ARG_1_DF2:
10161 case MULTI_ARG_1_DI:
10162 case MULTI_ARG_1_SI:
10163 case MULTI_ARG_1_HI:
10164 case MULTI_ARG_1_QI:
10165 case MULTI_ARG_1_SI_DI:
10166 case MULTI_ARG_1_HI_DI:
10167 case MULTI_ARG_1_HI_SI:
10168 case MULTI_ARG_1_QI_DI:
10169 case MULTI_ARG_1_QI_SI:
10170 case MULTI_ARG_1_QI_HI:
10171 nargs = 1;
10172 break;
10174 case MULTI_ARG_2_DI_CMP:
10175 case MULTI_ARG_2_SI_CMP:
10176 case MULTI_ARG_2_HI_CMP:
10177 case MULTI_ARG_2_QI_CMP:
10178 nargs = 2;
10179 comparison_p = true;
10180 break;
10182 case MULTI_ARG_2_SF_TF:
10183 case MULTI_ARG_2_DF_TF:
10184 case MULTI_ARG_2_DI_TF:
10185 case MULTI_ARG_2_SI_TF:
10186 case MULTI_ARG_2_HI_TF:
10187 case MULTI_ARG_2_QI_TF:
10188 nargs = 2;
10189 tf_p = true;
10190 break;
10192 default:
10193 gcc_unreachable ();
10196 if (optimize || !target
10197 || GET_MODE (target) != tmode
10198 || !insn_data[icode].operand[0].predicate (target, tmode))
10199 target = gen_reg_rtx (tmode);
10200 else if (memory_operand (target, tmode))
10201 num_memory++;
10203 gcc_assert (nargs <= ARRAY_SIZE (xops));
10205 for (i = 0; i < nargs; i++)
10207 tree arg = CALL_EXPR_ARG (exp, i);
10208 rtx op = expand_normal (arg);
10209 int adjust = (comparison_p) ? 1 : 0;
10210 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
10212 if (last_arg_constant && i == nargs - 1)
10214 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
10216 enum insn_code new_icode = icode;
10217 switch (icode)
10219 case CODE_FOR_xop_vpermil2v2df3:
10220 case CODE_FOR_xop_vpermil2v4sf3:
10221 case CODE_FOR_xop_vpermil2v4df3:
10222 case CODE_FOR_xop_vpermil2v8sf3:
10223 error ("the last argument must be a 2-bit immediate");
10224 return gen_reg_rtx (tmode);
10225 case CODE_FOR_xop_rotlv2di3:
10226 new_icode = CODE_FOR_rotlv2di3;
10227 goto xop_rotl;
10228 case CODE_FOR_xop_rotlv4si3:
10229 new_icode = CODE_FOR_rotlv4si3;
10230 goto xop_rotl;
10231 case CODE_FOR_xop_rotlv8hi3:
10232 new_icode = CODE_FOR_rotlv8hi3;
10233 goto xop_rotl;
10234 case CODE_FOR_xop_rotlv16qi3:
10235 new_icode = CODE_FOR_rotlv16qi3;
10236 xop_rotl:
10237 if (CONST_INT_P (op))
10239 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
10240 op = GEN_INT (INTVAL (op) & mask);
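/* E.g. for a V4SImode rotate (32-bit elements) the mask is 31, so a
   constant count of 37 is reduced to 37 & 31 == 5, which the pattern's
   predicate then accepts.  */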
10241 gcc_checking_assert
10242 (insn_data[icode].operand[i + 1].predicate (op, mode));
10244 else
10246 gcc_checking_assert
10247 (nargs == 2
10248 && insn_data[new_icode].operand[0].mode == tmode
10249 && insn_data[new_icode].operand[1].mode == tmode
10250 && insn_data[new_icode].operand[2].mode == mode
10251 && insn_data[new_icode].operand[0].predicate
10252 == insn_data[icode].operand[0].predicate
10253 && insn_data[new_icode].operand[1].predicate
10254 == insn_data[icode].operand[1].predicate);
10255 icode = new_icode;
10256 goto non_constant;
10258 break;
10259 default:
10260 gcc_unreachable ();
10264 else
10266 non_constant:
10267 if (VECTOR_MODE_P (mode))
10268 op = safe_vector_operand (op, mode);
10270 /* If we aren't optimizing, only allow one memory operand to be
10271 generated. */
10272 if (memory_operand (op, mode))
10273 num_memory++;
10275 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
10277 if (optimize
10278 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
10279 || num_memory > 1)
10280 op = force_reg (mode, op);
10283 xops[i] = op;
10286 switch (nargs)
10288 case 1:
10289 pat = GEN_FCN (icode) (target, xops[0]);
10290 break;
10292 case 2:
10293 if (tf_p)
10294 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10295 GEN_INT ((int)sub_code));
10296 else if (! comparison_p)
10297 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10298 else
10300 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
10301 xops[0], xops[1]);
10303 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
10305 break;
10307 case 3:
10308 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10309 break;
10311 case 4:
10312 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
10313 break;
10315 default:
10316 gcc_unreachable ();
10319 if (! pat)
10320 return 0;
10322 emit_insn (pat);
10323 return target;
10326 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
10327 insns with vec_merge. */
10329 static rtx
10330 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
10331 rtx target)
10333 rtx pat;
10334 tree arg0 = CALL_EXPR_ARG (exp, 0);
10335 rtx op1, op0 = expand_normal (arg0);
10336 machine_mode tmode = insn_data[icode].operand[0].mode;
10337 machine_mode mode0 = insn_data[icode].operand[1].mode;
10339 if (optimize || !target
10340 || GET_MODE (target) != tmode
10341 || !insn_data[icode].operand[0].predicate (target, tmode))
10342 target = gen_reg_rtx (tmode);
10344 if (VECTOR_MODE_P (mode0))
10345 op0 = safe_vector_operand (op0, mode0);
10347 if ((optimize && !register_operand (op0, mode0))
10348 || !insn_data[icode].operand[1].predicate (op0, mode0))
10349 op0 = copy_to_mode_reg (mode0, op0);
10351 op1 = op0;
10352 if (!insn_data[icode].operand[2].predicate (op1, mode0))
10353 op1 = copy_to_mode_reg (mode0, op1);
10355 pat = GEN_FCN (icode) (target, op0, op1);
10356 if (! pat)
10357 return 0;
10358 emit_insn (pat);
10359 return target;
10362 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
10364 static rtx
10365 ix86_expand_sse_compare (const struct builtin_description *d,
10366 tree exp, rtx target, bool swap)
10368 rtx pat;
10369 tree arg0 = CALL_EXPR_ARG (exp, 0);
10370 tree arg1 = CALL_EXPR_ARG (exp, 1);
10371 rtx op0 = expand_normal (arg0);
10372 rtx op1 = expand_normal (arg1);
10373 rtx op2;
10374 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10375 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10376 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10377 enum rtx_code comparison = d->comparison;
10379 if (VECTOR_MODE_P (mode0))
10380 op0 = safe_vector_operand (op0, mode0);
10381 if (VECTOR_MODE_P (mode1))
10382 op1 = safe_vector_operand (op1, mode1);
10384 /* Swap operands if we have a comparison that isn't available in
10385 hardware. */
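/* For example, the legacy CMPPS/CMPPD immediates provide LT and LE but
   no GT or GE, so a cmpgt-style builtin is described as LT plus SWAP
   and has its operands exchanged here before the hardware LT encoding
   is used.  */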
10386 if (swap)
10387 std::swap (op0, op1);
10389 if (optimize || !target
10390 || GET_MODE (target) != tmode
10391 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10392 target = gen_reg_rtx (tmode);
10394 if ((optimize && !register_operand (op0, mode0))
10395 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10396 op0 = copy_to_mode_reg (mode0, op0);
10397 if ((optimize && !register_operand (op1, mode1))
10398 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10399 op1 = copy_to_mode_reg (mode1, op1);
10401 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10402 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10403 if (! pat)
10404 return 0;
10405 emit_insn (pat);
10406 return target;
10409 /* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round to
10410 take care of ordered EQ or unordered NE: generate a PF jump. */
10412 static rtx
10413 ix86_ssecom_setcc (const enum rtx_code comparison,
10414 bool check_unordered, machine_mode mode,
10415 rtx set_dst, rtx target)
10418 rtx_code_label *label = NULL;
10420 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10421 sufficient with NaN operands. */
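/* For reference: [U]COMISS/[U]COMISD report an unordered result as
   ZF = PF = CF = 1, so ZF alone would claim "equal" for NaN inputs; the
   PF branch below skips the flag test and leaves TARGET at the default
   value preloaded by the caller.  */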
10422 if (check_unordered)
10424 gcc_assert (comparison == EQ || comparison == NE);
10426 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10427 label = gen_label_rtx ();
10428 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10429 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10430 gen_rtx_LABEL_REF (VOIDmode, label),
10431 pc_rtx);
10432 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10435 /* NB: The comparison sets CCFPmode; check a different CCmode that
10436 is a subset of CCFPmode. */
10437 if (GET_MODE (set_dst) != mode)
10439 gcc_assert (mode == CCAmode || mode == CCCmode
10440 || mode == CCOmode || mode == CCPmode
10441 || mode == CCSmode || mode == CCZmode);
10442 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10445 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10446 gen_rtx_fmt_ee (comparison, QImode,
10447 set_dst,
10448 const0_rtx)));
10450 if (label)
10451 emit_label (label);
10453 return SUBREG_REG (target);
10456 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10458 static rtx
10459 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10460 rtx target)
10462 rtx pat, set_dst;
10463 tree arg0 = CALL_EXPR_ARG (exp, 0);
10464 tree arg1 = CALL_EXPR_ARG (exp, 1);
10465 rtx op0 = expand_normal (arg0);
10466 rtx op1 = expand_normal (arg1);
10467 enum insn_code icode = d->icode;
10468 const struct insn_data_d *insn_p = &insn_data[icode];
10469 machine_mode mode0 = insn_p->operand[0].mode;
10470 machine_mode mode1 = insn_p->operand[1].mode;
10472 if (VECTOR_MODE_P (mode0))
10473 op0 = safe_vector_operand (op0, mode0);
10474 if (VECTOR_MODE_P (mode1))
10475 op1 = safe_vector_operand (op1, mode1);
10477 enum rtx_code comparison = d->comparison;
10478 rtx const_val = const0_rtx;
10480 bool check_unordered = false;
10481 machine_mode mode = CCFPmode;
10482 switch (comparison)
10484 case LE: /* -> GE */
10485 case LT: /* -> GT */
10486 std::swap (op0, op1);
10487 comparison = swap_condition (comparison);
10488 /* FALLTHRU */
10489 case GT:
10490 case GE:
10491 break;
10492 case EQ:
10493 check_unordered = true;
10494 mode = CCZmode;
10495 break;
10496 case NE:
10497 check_unordered = true;
10498 mode = CCZmode;
10499 const_val = const1_rtx;
10500 break;
10501 default:
10502 gcc_unreachable ();
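/* To illustrate the mapping above: an LT comparison is handled by
   swapping the operands and testing GT, so only GT/GE ever reach the
   plain flag test, while EQ/NE switch to CCZmode and additionally guard
   against unordered operands in ix86_ssecom_setcc.  */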
10505 target = gen_reg_rtx (SImode);
10506 emit_move_insn (target, const_val);
10507 target = gen_rtx_SUBREG (QImode, target, 0);
10509 if ((optimize && !register_operand (op0, mode0))
10510 || !insn_p->operand[0].predicate (op0, mode0))
10511 op0 = copy_to_mode_reg (mode0, op0);
10512 if ((optimize && !register_operand (op1, mode1))
10513 || !insn_p->operand[1].predicate (op1, mode1))
10514 op1 = copy_to_mode_reg (mode1, op1);
10516 pat = GEN_FCN (icode) (op0, op1);
10517 if (! pat)
10518 return 0;
10520 set_dst = SET_DEST (pat);
10521 emit_insn (pat);
10522 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10523 set_dst, target);
10526 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10528 static rtx
10529 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10530 rtx target)
10532 rtx pat;
10533 tree arg0 = CALL_EXPR_ARG (exp, 0);
10534 rtx op1, op0 = expand_normal (arg0);
10535 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10536 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10538 if (optimize || target == 0
10539 || GET_MODE (target) != tmode
10540 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10541 target = gen_reg_rtx (tmode);
10543 if (VECTOR_MODE_P (mode0))
10544 op0 = safe_vector_operand (op0, mode0);
10546 if ((optimize && !register_operand (op0, mode0))
10547 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10548 op0 = copy_to_mode_reg (mode0, op0);
10550 op1 = GEN_INT (d->comparison);
10552 pat = GEN_FCN (d->icode) (target, op0, op1);
10553 if (! pat)
10554 return 0;
10555 emit_insn (pat);
10556 return target;
10559 static rtx
10560 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10561 tree exp, rtx target)
10563 rtx pat;
10564 tree arg0 = CALL_EXPR_ARG (exp, 0);
10565 tree arg1 = CALL_EXPR_ARG (exp, 1);
10566 rtx op0 = expand_normal (arg0);
10567 rtx op1 = expand_normal (arg1);
10568 rtx op2;
10569 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10570 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10571 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10573 if (optimize || target == 0
10574 || GET_MODE (target) != tmode
10575 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10576 target = gen_reg_rtx (tmode);
10578 op0 = safe_vector_operand (op0, mode0);
10579 op1 = safe_vector_operand (op1, mode1);
10581 if ((optimize && !register_operand (op0, mode0))
10582 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10583 op0 = copy_to_mode_reg (mode0, op0);
10584 if ((optimize && !register_operand (op1, mode1))
10585 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10586 op1 = copy_to_mode_reg (mode1, op1);
10588 op2 = GEN_INT (d->comparison);
10590 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10591 if (! pat)
10592 return 0;
10593 emit_insn (pat);
10594 return target;
10597 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10599 static rtx
10600 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10601 rtx target)
10603 rtx pat;
10604 tree arg0 = CALL_EXPR_ARG (exp, 0);
10605 tree arg1 = CALL_EXPR_ARG (exp, 1);
10606 rtx op0 = expand_normal (arg0);
10607 rtx op1 = expand_normal (arg1);
10608 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10609 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10610 enum rtx_code comparison = d->comparison;
10612 /* ptest reg, reg sets the carry flag. */
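/* (PTEST sets CF when (~DST & SRC) is all zeros, which trivially holds
   when both operands are the same value, so the testc builtin can be
   folded to 1 without emitting the instruction.)  */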
10613 if (comparison == LTU
10614 && (d->code == IX86_BUILTIN_PTESTC
10615 || d->code == IX86_BUILTIN_PTESTC256)
10616 && rtx_equal_p (op0, op1))
10618 if (!target)
10619 target = gen_reg_rtx (SImode);
10620 emit_move_insn (target, const1_rtx);
10621 return target;
10624 if (VECTOR_MODE_P (mode0))
10625 op0 = safe_vector_operand (op0, mode0);
10626 if (VECTOR_MODE_P (mode1))
10627 op1 = safe_vector_operand (op1, mode1);
10629 target = gen_reg_rtx (SImode);
10630 emit_move_insn (target, const0_rtx);
10631 target = gen_rtx_SUBREG (QImode, target, 0);
10633 if ((optimize && !register_operand (op0, mode0))
10634 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10635 op0 = copy_to_mode_reg (mode0, op0);
10636 if ((optimize && !register_operand (op1, mode1))
10637 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10638 op1 = copy_to_mode_reg (mode1, op1);
10640 pat = GEN_FCN (d->icode) (op0, op1);
10641 if (! pat)
10642 return 0;
10643 emit_insn (pat);
10644 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10645 gen_rtx_fmt_ee (comparison, QImode,
10646 SET_DEST (pat),
10647 const0_rtx)));
10649 return SUBREG_REG (target);
10652 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10654 static rtx
10655 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10656 tree exp, rtx target)
10658 rtx pat;
10659 tree arg0 = CALL_EXPR_ARG (exp, 0);
10660 tree arg1 = CALL_EXPR_ARG (exp, 1);
10661 tree arg2 = CALL_EXPR_ARG (exp, 2);
10662 tree arg3 = CALL_EXPR_ARG (exp, 3);
10663 tree arg4 = CALL_EXPR_ARG (exp, 4);
10664 rtx scratch0, scratch1;
10665 rtx op0 = expand_normal (arg0);
10666 rtx op1 = expand_normal (arg1);
10667 rtx op2 = expand_normal (arg2);
10668 rtx op3 = expand_normal (arg3);
10669 rtx op4 = expand_normal (arg4);
10670 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10672 tmode0 = insn_data[d->icode].operand[0].mode;
10673 tmode1 = insn_data[d->icode].operand[1].mode;
10674 modev2 = insn_data[d->icode].operand[2].mode;
10675 modei3 = insn_data[d->icode].operand[3].mode;
10676 modev4 = insn_data[d->icode].operand[4].mode;
10677 modei5 = insn_data[d->icode].operand[5].mode;
10678 modeimm = insn_data[d->icode].operand[6].mode;
10680 if (VECTOR_MODE_P (modev2))
10681 op0 = safe_vector_operand (op0, modev2);
10682 if (VECTOR_MODE_P (modev4))
10683 op2 = safe_vector_operand (op2, modev4);
10685 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10686 op0 = copy_to_mode_reg (modev2, op0);
10687 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10688 op1 = copy_to_mode_reg (modei3, op1);
10689 if ((optimize && !register_operand (op2, modev4))
10690 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10691 op2 = copy_to_mode_reg (modev4, op2);
10692 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10693 op3 = copy_to_mode_reg (modei5, op3);
10695 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10697 error ("the fifth argument must be an 8-bit immediate");
10698 return const0_rtx;
10701 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10703 if (optimize || !target
10704 || GET_MODE (target) != tmode0
10705 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10706 target = gen_reg_rtx (tmode0);
10708 scratch1 = gen_reg_rtx (tmode1);
10710 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10712 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10714 if (optimize || !target
10715 || GET_MODE (target) != tmode1
10716 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10717 target = gen_reg_rtx (tmode1);
10719 scratch0 = gen_reg_rtx (tmode0);
10721 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10723 else
10725 gcc_assert (d->flag);
10727 scratch0 = gen_reg_rtx (tmode0);
10728 scratch1 = gen_reg_rtx (tmode1);
10730 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10733 if (! pat)
10734 return 0;
10736 emit_insn (pat);
10738 if (d->flag)
10740 target = gen_reg_rtx (SImode);
10741 emit_move_insn (target, const0_rtx);
10742 target = gen_rtx_SUBREG (QImode, target, 0);
10744 emit_insn
10745 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10746 gen_rtx_fmt_ee (EQ, QImode,
10747 gen_rtx_REG ((machine_mode) d->flag,
10748 FLAGS_REG),
10749 const0_rtx)));
10750 return SUBREG_REG (target);
10752 else
10753 return target;
10757 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10759 static rtx
10760 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10761 tree exp, rtx target)
10763 rtx pat;
10764 tree arg0 = CALL_EXPR_ARG (exp, 0);
10765 tree arg1 = CALL_EXPR_ARG (exp, 1);
10766 tree arg2 = CALL_EXPR_ARG (exp, 2);
10767 rtx scratch0, scratch1;
10768 rtx op0 = expand_normal (arg0);
10769 rtx op1 = expand_normal (arg1);
10770 rtx op2 = expand_normal (arg2);
10771 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10773 tmode0 = insn_data[d->icode].operand[0].mode;
10774 tmode1 = insn_data[d->icode].operand[1].mode;
10775 modev2 = insn_data[d->icode].operand[2].mode;
10776 modev3 = insn_data[d->icode].operand[3].mode;
10777 modeimm = insn_data[d->icode].operand[4].mode;
10779 if (VECTOR_MODE_P (modev2))
10780 op0 = safe_vector_operand (op0, modev2);
10781 if (VECTOR_MODE_P (modev3))
10782 op1 = safe_vector_operand (op1, modev3);
10784 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10785 op0 = copy_to_mode_reg (modev2, op0);
10786 if ((optimize && !register_operand (op1, modev3))
10787 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10788 op1 = copy_to_mode_reg (modev3, op1);
10790 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10792 error ("the third argument must be an 8-bit immediate");
10793 return const0_rtx;
10796 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10798 if (optimize || !target
10799 || GET_MODE (target) != tmode0
10800 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10801 target = gen_reg_rtx (tmode0);
10803 scratch1 = gen_reg_rtx (tmode1);
10805 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10807 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10809 if (optimize || !target
10810 || GET_MODE (target) != tmode1
10811 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10812 target = gen_reg_rtx (tmode1);
10814 scratch0 = gen_reg_rtx (tmode0);
10816 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10818 else
10820 gcc_assert (d->flag);
10822 scratch0 = gen_reg_rtx (tmode0);
10823 scratch1 = gen_reg_rtx (tmode1);
10825 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10828 if (! pat)
10829 return 0;
10831 emit_insn (pat);
10833 if (d->flag)
10835 target = gen_reg_rtx (SImode);
10836 emit_move_insn (target, const0_rtx);
10837 target = gen_rtx_SUBREG (QImode, target, 0);
10839 emit_insn
10840 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10841 gen_rtx_fmt_ee (EQ, QImode,
10842 gen_rtx_REG ((machine_mode) d->flag,
10843 FLAGS_REG),
10844 const0_rtx)));
10845 return SUBREG_REG (target);
10847 else
10848 return target;
10851 /* Fix up modeless constants to fit the required mode. */
10853 static rtx
10854 fixup_modeless_constant (rtx x, machine_mode mode)
10856 if (GET_MODE (x) == VOIDmode)
10857 x = convert_to_mode (mode, x, 1);
10858 return x;
10861 /* Subroutine of ix86_expand_builtin to take care of insns with
10862 variable number of operands. */
10864 static rtx
10865 ix86_expand_args_builtin (const struct builtin_description *d,
10866 tree exp, rtx target)
10868 rtx pat, real_target;
10869 unsigned int i, nargs;
10870 unsigned int nargs_constant = 0;
10871 unsigned int mask_pos = 0;
10872 int num_memory = 0;
10873 rtx xops[6];
10874 bool second_arg_count = false;
10875 enum insn_code icode = d->icode;
10876 const struct insn_data_d *insn_p = &insn_data[icode];
10877 machine_mode tmode = insn_p->operand[0].mode;
10878 machine_mode rmode = VOIDmode;
10879 bool swap = false;
10880 enum rtx_code comparison = d->comparison;
10882 switch ((enum ix86_builtin_func_type) d->flag)
10884 case V2DF_FTYPE_V2DF_ROUND:
10885 case V4DF_FTYPE_V4DF_ROUND:
10886 case V8DF_FTYPE_V8DF_ROUND:
10887 case V4SF_FTYPE_V4SF_ROUND:
10888 case V8SF_FTYPE_V8SF_ROUND:
10889 case V16SF_FTYPE_V16SF_ROUND:
10890 case V8HF_FTYPE_V8HF_ROUND:
10891 case V16HF_FTYPE_V16HF_ROUND:
10892 case V32HF_FTYPE_V32HF_ROUND:
10893 case V4SI_FTYPE_V4SF_ROUND:
10894 case V8SI_FTYPE_V8SF_ROUND:
10895 case V16SI_FTYPE_V16SF_ROUND:
10896 return ix86_expand_sse_round (d, exp, target);
10897 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10898 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10899 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10900 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10901 case INT_FTYPE_V8SF_V8SF_PTEST:
10902 case INT_FTYPE_V4DI_V4DI_PTEST:
10903 case INT_FTYPE_V4DF_V4DF_PTEST:
10904 case INT_FTYPE_V4SF_V4SF_PTEST:
10905 case INT_FTYPE_V2DI_V2DI_PTEST:
10906 case INT_FTYPE_V2DF_V2DF_PTEST:
10907 return ix86_expand_sse_ptest (d, exp, target);
10908 case FLOAT128_FTYPE_FLOAT128:
10909 case FLOAT_FTYPE_FLOAT:
10910 case FLOAT_FTYPE_BFLOAT16:
10911 case INT_FTYPE_INT:
10912 case UINT_FTYPE_UINT:
10913 case UINT16_FTYPE_UINT16:
10914 case UINT64_FTYPE_INT:
10915 case UINT64_FTYPE_UINT64:
10916 case INT64_FTYPE_INT64:
10917 case INT64_FTYPE_V4SF:
10918 case INT64_FTYPE_V2DF:
10919 case INT_FTYPE_V16QI:
10920 case INT_FTYPE_V8QI:
10921 case INT_FTYPE_V8SF:
10922 case INT_FTYPE_V4DF:
10923 case INT_FTYPE_V4SF:
10924 case INT_FTYPE_V2DF:
10925 case INT_FTYPE_V32QI:
10926 case V16QI_FTYPE_V16QI:
10927 case V8SI_FTYPE_V8SF:
10928 case V8SI_FTYPE_V4SI:
10929 case V8HI_FTYPE_V8HI:
10930 case V8HI_FTYPE_V16QI:
10931 case V8QI_FTYPE_V8QI:
10932 case V8SF_FTYPE_V8SF:
10933 case V8SF_FTYPE_V8SI:
10934 case V8SF_FTYPE_V4SF:
10935 case V8SF_FTYPE_V8HI:
10936 case V4SI_FTYPE_V4SI:
10937 case V4SI_FTYPE_V16QI:
10938 case V4SI_FTYPE_V4SF:
10939 case V4SI_FTYPE_V8SI:
10940 case V4SI_FTYPE_V8HI:
10941 case V4SI_FTYPE_V4DF:
10942 case V4SI_FTYPE_V2DF:
10943 case V4HI_FTYPE_V4HI:
10944 case V4DF_FTYPE_V4DF:
10945 case V4DF_FTYPE_V4SI:
10946 case V4DF_FTYPE_V4SF:
10947 case V4DF_FTYPE_V2DF:
10948 case V4SF_FTYPE_V4SF:
10949 case V4SF_FTYPE_V4SI:
10950 case V4SF_FTYPE_V8SF:
10951 case V4SF_FTYPE_V4DF:
10952 case V4SF_FTYPE_V8HI:
10953 case V4SF_FTYPE_V2DF:
10954 case V2DI_FTYPE_V2DI:
10955 case V2DI_FTYPE_V16QI:
10956 case V2DI_FTYPE_V8HI:
10957 case V2DI_FTYPE_V4SI:
10958 case V2DF_FTYPE_V2DF:
10959 case V2DF_FTYPE_V4SI:
10960 case V2DF_FTYPE_V4DF:
10961 case V2DF_FTYPE_V4SF:
10962 case V2DF_FTYPE_V2SI:
10963 case V2SI_FTYPE_V2SI:
10964 case V2SI_FTYPE_V4SF:
10965 case V2SI_FTYPE_V2SF:
10966 case V2SI_FTYPE_V2DF:
10967 case V2SF_FTYPE_V2SF:
10968 case V2SF_FTYPE_V2SI:
10969 case V32QI_FTYPE_V32QI:
10970 case V32QI_FTYPE_V16QI:
10971 case V16HI_FTYPE_V16HI:
10972 case V16HI_FTYPE_V8HI:
10973 case V8SI_FTYPE_V8SI:
10974 case V16HI_FTYPE_V16QI:
10975 case V8SI_FTYPE_V16QI:
10976 case V4DI_FTYPE_V16QI:
10977 case V8SI_FTYPE_V8HI:
10978 case V4DI_FTYPE_V8HI:
10979 case V4DI_FTYPE_V4SI:
10980 case V4DI_FTYPE_V2DI:
10981 case UQI_FTYPE_UQI:
10982 case UHI_FTYPE_UHI:
10983 case USI_FTYPE_USI:
10984 case USI_FTYPE_UQI:
10985 case USI_FTYPE_UHI:
10986 case UDI_FTYPE_UDI:
10987 case UHI_FTYPE_V16QI:
10988 case USI_FTYPE_V32QI:
10989 case UDI_FTYPE_V64QI:
10990 case V16QI_FTYPE_UHI:
10991 case V32QI_FTYPE_USI:
10992 case V64QI_FTYPE_UDI:
10993 case V8HI_FTYPE_UQI:
10994 case V16HI_FTYPE_UHI:
10995 case V32HI_FTYPE_USI:
10996 case V4SI_FTYPE_UQI:
10997 case V8SI_FTYPE_UQI:
10998 case V4SI_FTYPE_UHI:
10999 case V8SI_FTYPE_UHI:
11000 case UQI_FTYPE_V8HI:
11001 case UHI_FTYPE_V16HI:
11002 case USI_FTYPE_V32HI:
11003 case UQI_FTYPE_V4SI:
11004 case UQI_FTYPE_V8SI:
11005 case UHI_FTYPE_V16SI:
11006 case UQI_FTYPE_V2DI:
11007 case UQI_FTYPE_V4DI:
11008 case UQI_FTYPE_V8DI:
11009 case V16SI_FTYPE_UHI:
11010 case V2DI_FTYPE_UQI:
11011 case V4DI_FTYPE_UQI:
11012 case V16SI_FTYPE_INT:
11013 case V16SF_FTYPE_V8SF:
11014 case V16SI_FTYPE_V8SI:
11015 case V16SF_FTYPE_V4SF:
11016 case V16SI_FTYPE_V4SI:
11017 case V16SI_FTYPE_V16SF:
11018 case V16SI_FTYPE_V16SI:
11019 case V64QI_FTYPE_V64QI:
11020 case V32HI_FTYPE_V32HI:
11021 case V16SF_FTYPE_V16SF:
11022 case V8DI_FTYPE_UQI:
11023 case V8DI_FTYPE_V8DI:
11024 case V8DF_FTYPE_V4DF:
11025 case V8DF_FTYPE_V2DF:
11026 case V8DF_FTYPE_V8DF:
11027 case V4DI_FTYPE_V4DI:
11028 case V16BF_FTYPE_V16SF:
11029 case V8BF_FTYPE_V8SF:
11030 case V8BF_FTYPE_V4SF:
11031 nargs = 1;
11032 break;
11033 case V4SF_FTYPE_V4SF_VEC_MERGE:
11034 case V2DF_FTYPE_V2DF_VEC_MERGE:
11035 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
11036 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
11037 case V16QI_FTYPE_V16QI_V16QI:
11038 case V16QI_FTYPE_V8HI_V8HI:
11039 case V16HF_FTYPE_V16HF_V16HF:
11040 case V16SF_FTYPE_V16SF_V16SF:
11041 case V8QI_FTYPE_V8QI_V8QI:
11042 case V8QI_FTYPE_V4HI_V4HI:
11043 case V8HI_FTYPE_V8HI_V8HI:
11044 case V8HI_FTYPE_V16QI_V16QI:
11045 case V8HI_FTYPE_V4SI_V4SI:
11046 case V8HF_FTYPE_V8HF_V8HF:
11047 case V8SF_FTYPE_V8SF_V8SF:
11048 case V8SF_FTYPE_V8SF_V8SI:
11049 case V8DF_FTYPE_V8DF_V8DF:
11050 case V4SI_FTYPE_V4SI_V4SI:
11051 case V4SI_FTYPE_V8HI_V8HI:
11052 case V4SI_FTYPE_V2DF_V2DF:
11053 case V4HI_FTYPE_V4HI_V4HI:
11054 case V4HI_FTYPE_V8QI_V8QI:
11055 case V4HI_FTYPE_V2SI_V2SI:
11056 case V4DF_FTYPE_V4DF_V4DF:
11057 case V4DF_FTYPE_V4DF_V4DI:
11058 case V4SF_FTYPE_V4SF_V4SF:
11059 case V4SF_FTYPE_V4SF_V4SI:
11060 case V4SF_FTYPE_V4SF_V2SI:
11061 case V4SF_FTYPE_V4SF_V2DF:
11062 case V4SF_FTYPE_V4SF_UINT:
11063 case V4SF_FTYPE_V4SF_DI:
11064 case V4SF_FTYPE_V4SF_SI:
11065 case V4DI_FTYPE_V4DI_V2DI:
11066 case V2DI_FTYPE_V2DI_V2DI:
11067 case V2DI_FTYPE_V16QI_V16QI:
11068 case V2DI_FTYPE_V4SI_V4SI:
11069 case V2DI_FTYPE_V2DI_V16QI:
11070 case V2SI_FTYPE_V2SI_V2SI:
11071 case V2SI_FTYPE_V4HI_V4HI:
11072 case V2SI_FTYPE_V2SF_V2SF:
11073 case V2DF_FTYPE_V2DF_V2DF:
11074 case V2DF_FTYPE_V2DF_V4SF:
11075 case V2DF_FTYPE_V2DF_V2DI:
11076 case V2DF_FTYPE_V2DF_DI:
11077 case V2DF_FTYPE_V2DF_SI:
11078 case V2DF_FTYPE_V2DF_UINT:
11079 case V2SF_FTYPE_V2SF_V2SF:
11080 case V1DI_FTYPE_V1DI_V1DI:
11081 case V1DI_FTYPE_V8QI_V8QI:
11082 case V1DI_FTYPE_V2SI_V2SI:
11083 case V32QI_FTYPE_V16HI_V16HI:
11084 case V16HI_FTYPE_V8SI_V8SI:
11085 case V64QI_FTYPE_V64QI_V64QI:
11086 case V32QI_FTYPE_V32QI_V32QI:
11087 case V16HI_FTYPE_V32QI_V32QI:
11088 case V16HI_FTYPE_V16HI_V16HI:
11089 case V8SI_FTYPE_V4DF_V4DF:
11090 case V8SI_FTYPE_V8SI_V8SI:
11091 case V8SI_FTYPE_V16HI_V16HI:
11092 case V4DI_FTYPE_V4DI_V4DI:
11093 case V4DI_FTYPE_V8SI_V8SI:
11094 case V4DI_FTYPE_V32QI_V32QI:
11095 case V8DI_FTYPE_V64QI_V64QI:
11096 if (comparison == UNKNOWN)
11097 return ix86_expand_binop_builtin (icode, exp, target);
11098 nargs = 2;
11099 break;
11100 case V4SF_FTYPE_V4SF_V4SF_SWAP:
11101 case V2DF_FTYPE_V2DF_V2DF_SWAP:
11102 gcc_assert (comparison != UNKNOWN);
11103 nargs = 2;
11104 swap = true;
11105 break;
11106 case V16HI_FTYPE_V16HI_V8HI_COUNT:
11107 case V16HI_FTYPE_V16HI_SI_COUNT:
11108 case V8SI_FTYPE_V8SI_V4SI_COUNT:
11109 case V8SI_FTYPE_V8SI_SI_COUNT:
11110 case V4DI_FTYPE_V4DI_V2DI_COUNT:
11111 case V4DI_FTYPE_V4DI_INT_COUNT:
11112 case V8HI_FTYPE_V8HI_V8HI_COUNT:
11113 case V8HI_FTYPE_V8HI_SI_COUNT:
11114 case V4SI_FTYPE_V4SI_V4SI_COUNT:
11115 case V4SI_FTYPE_V4SI_SI_COUNT:
11116 case V4HI_FTYPE_V4HI_V4HI_COUNT:
11117 case V4HI_FTYPE_V4HI_SI_COUNT:
11118 case V2DI_FTYPE_V2DI_V2DI_COUNT:
11119 case V2DI_FTYPE_V2DI_SI_COUNT:
11120 case V2SI_FTYPE_V2SI_V2SI_COUNT:
11121 case V2SI_FTYPE_V2SI_SI_COUNT:
11122 case V1DI_FTYPE_V1DI_V1DI_COUNT:
11123 case V1DI_FTYPE_V1DI_SI_COUNT:
11124 nargs = 2;
11125 second_arg_count = true;
11126 break;
11127 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
11128 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
11129 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
11130 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
11131 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
11132 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
11133 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
11134 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
11135 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
11136 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
11137 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
11138 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
11139 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
11140 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
11141 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
11142 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
11143 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
11144 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
11145 nargs = 4;
11146 second_arg_count = true;
11147 break;
11148 case UINT64_FTYPE_UINT64_UINT64:
11149 case UINT_FTYPE_UINT_UINT:
11150 case UINT_FTYPE_UINT_USHORT:
11151 case UINT_FTYPE_UINT_UCHAR:
11152 case UINT16_FTYPE_UINT16_INT:
11153 case UINT8_FTYPE_UINT8_INT:
11154 case UQI_FTYPE_UQI_UQI:
11155 case UHI_FTYPE_UHI_UHI:
11156 case USI_FTYPE_USI_USI:
11157 case UDI_FTYPE_UDI_UDI:
11158 case V16SI_FTYPE_V8DF_V8DF:
11159 case V32BF_FTYPE_V16SF_V16SF:
11160 case V16BF_FTYPE_V8SF_V8SF:
11161 case V8BF_FTYPE_V4SF_V4SF:
11162 case V16BF_FTYPE_V16SF_UHI:
11163 case V8BF_FTYPE_V8SF_UQI:
11164 case V8BF_FTYPE_V4SF_UQI:
11165 nargs = 2;
11166 break;
11167 case V2DI_FTYPE_V2DI_INT_CONVERT:
11168 nargs = 2;
11169 rmode = V1TImode;
11170 nargs_constant = 1;
11171 break;
11172 case V4DI_FTYPE_V4DI_INT_CONVERT:
11173 nargs = 2;
11174 rmode = V2TImode;
11175 nargs_constant = 1;
11176 break;
11177 case V8DI_FTYPE_V8DI_INT_CONVERT:
11178 nargs = 2;
11179 rmode = V4TImode;
11180 nargs_constant = 1;
11181 break;
11182 case V8HI_FTYPE_V8HI_INT:
11183 case V8HI_FTYPE_V8SF_INT:
11184 case V16HI_FTYPE_V16SF_INT:
11185 case V8HI_FTYPE_V4SF_INT:
11186 case V8SF_FTYPE_V8SF_INT:
11187 case V4SF_FTYPE_V16SF_INT:
11188 case V16SF_FTYPE_V16SF_INT:
11189 case V4SI_FTYPE_V4SI_INT:
11190 case V4SI_FTYPE_V8SI_INT:
11191 case V4HI_FTYPE_V4HI_INT:
11192 case V4DF_FTYPE_V4DF_INT:
11193 case V4DF_FTYPE_V8DF_INT:
11194 case V4SF_FTYPE_V4SF_INT:
11195 case V4SF_FTYPE_V8SF_INT:
11196 case V2DI_FTYPE_V2DI_INT:
11197 case V2DF_FTYPE_V2DF_INT:
11198 case V2DF_FTYPE_V4DF_INT:
11199 case V16HI_FTYPE_V16HI_INT:
11200 case V8SI_FTYPE_V8SI_INT:
11201 case V16SI_FTYPE_V16SI_INT:
11202 case V4SI_FTYPE_V16SI_INT:
11203 case V4DI_FTYPE_V4DI_INT:
11204 case V2DI_FTYPE_V4DI_INT:
11205 case V4DI_FTYPE_V8DI_INT:
11206 case UQI_FTYPE_UQI_UQI_CONST:
11207 case UHI_FTYPE_UHI_UQI:
11208 case USI_FTYPE_USI_UQI:
11209 case UDI_FTYPE_UDI_UQI:
11210 nargs = 2;
11211 nargs_constant = 1;
11212 break;
11213 case V16QI_FTYPE_V16QI_V16QI_V16QI:
11214 case V8SF_FTYPE_V8SF_V8SF_V8SF:
11215 case V4DF_FTYPE_V4DF_V4DF_V4DF:
11216 case V4SF_FTYPE_V4SF_V4SF_V4SF:
11217 case V2DF_FTYPE_V2DF_V2DF_V2DF:
11218 case V32QI_FTYPE_V32QI_V32QI_V32QI:
11219 case UHI_FTYPE_V16SI_V16SI_UHI:
11220 case UQI_FTYPE_V8DI_V8DI_UQI:
11221 case V16HI_FTYPE_V16SI_V16HI_UHI:
11222 case V16QI_FTYPE_V16SI_V16QI_UHI:
11223 case V16QI_FTYPE_V8DI_V16QI_UQI:
11224 case V32HF_FTYPE_V32HF_V32HF_USI:
11225 case V16SF_FTYPE_V16SF_V16SF_UHI:
11226 case V16SF_FTYPE_V4SF_V16SF_UHI:
11227 case V16SI_FTYPE_SI_V16SI_UHI:
11228 case V16SI_FTYPE_V16HI_V16SI_UHI:
11229 case V16SI_FTYPE_V16QI_V16SI_UHI:
11230 case V8SF_FTYPE_V4SF_V8SF_UQI:
11231 case V4DF_FTYPE_V2DF_V4DF_UQI:
11232 case V8SI_FTYPE_V4SI_V8SI_UQI:
11233 case V8SI_FTYPE_SI_V8SI_UQI:
11234 case V4SI_FTYPE_V4SI_V4SI_UQI:
11235 case V4SI_FTYPE_SI_V4SI_UQI:
11236 case V4DI_FTYPE_V2DI_V4DI_UQI:
11237 case V4DI_FTYPE_DI_V4DI_UQI:
11238 case V2DI_FTYPE_V2DI_V2DI_UQI:
11239 case V2DI_FTYPE_DI_V2DI_UQI:
11240 case V64QI_FTYPE_V64QI_V64QI_UDI:
11241 case V64QI_FTYPE_V16QI_V64QI_UDI:
11242 case V64QI_FTYPE_QI_V64QI_UDI:
11243 case V32QI_FTYPE_V32QI_V32QI_USI:
11244 case V32QI_FTYPE_V16QI_V32QI_USI:
11245 case V32QI_FTYPE_QI_V32QI_USI:
11246 case V16QI_FTYPE_V16QI_V16QI_UHI:
11247 case V16QI_FTYPE_QI_V16QI_UHI:
11248 case V32HI_FTYPE_V8HI_V32HI_USI:
11249 case V32HI_FTYPE_HI_V32HI_USI:
11250 case V16HI_FTYPE_V8HI_V16HI_UHI:
11251 case V16HI_FTYPE_HI_V16HI_UHI:
11252 case V8HI_FTYPE_V8HI_V8HI_UQI:
11253 case V8HI_FTYPE_HI_V8HI_UQI:
11254 case V16HF_FTYPE_V16HF_V16HF_UHI:
11255 case V8SF_FTYPE_V8HI_V8SF_UQI:
11256 case V4SF_FTYPE_V8HI_V4SF_UQI:
11257 case V8SI_FTYPE_V8HF_V8SI_UQI:
11258 case V8SF_FTYPE_V8HF_V8SF_UQI:
11259 case V8SI_FTYPE_V8SF_V8SI_UQI:
11260 case V4SI_FTYPE_V4SF_V4SI_UQI:
11261 case V4SI_FTYPE_V8HF_V4SI_UQI:
11262 case V4SF_FTYPE_V8HF_V4SF_UQI:
11263 case V4DI_FTYPE_V8HF_V4DI_UQI:
11264 case V4DI_FTYPE_V4SF_V4DI_UQI:
11265 case V2DI_FTYPE_V8HF_V2DI_UQI:
11266 case V2DI_FTYPE_V4SF_V2DI_UQI:
11267 case V8HF_FTYPE_V8HF_V8HF_UQI:
11268 case V8HF_FTYPE_V8HF_V8HF_V8HF:
11269 case V8HF_FTYPE_V8HI_V8HF_UQI:
11270 case V8HF_FTYPE_V8SI_V8HF_UQI:
11271 case V8HF_FTYPE_V8SF_V8HF_UQI:
11272 case V8HF_FTYPE_V4SI_V8HF_UQI:
11273 case V8HF_FTYPE_V4SF_V8HF_UQI:
11274 case V8HF_FTYPE_V4DI_V8HF_UQI:
11275 case V8HF_FTYPE_V4DF_V8HF_UQI:
11276 case V8HF_FTYPE_V2DI_V8HF_UQI:
11277 case V8HF_FTYPE_V2DF_V8HF_UQI:
11278 case V4SF_FTYPE_V4DI_V4SF_UQI:
11279 case V4SF_FTYPE_V2DI_V4SF_UQI:
11280 case V4DF_FTYPE_V4DI_V4DF_UQI:
11281 case V4DF_FTYPE_V8HF_V4DF_UQI:
11282 case V2DF_FTYPE_V8HF_V2DF_UQI:
11283 case V2DF_FTYPE_V2DI_V2DF_UQI:
11284 case V16QI_FTYPE_V8HI_V16QI_UQI:
11285 case V16QI_FTYPE_V16HI_V16QI_UHI:
11286 case V16QI_FTYPE_V4SI_V16QI_UQI:
11287 case V16QI_FTYPE_V8SI_V16QI_UQI:
11288 case V8HI_FTYPE_V8HF_V8HI_UQI:
11289 case V8HI_FTYPE_V4SI_V8HI_UQI:
11290 case V8HI_FTYPE_V8SI_V8HI_UQI:
11291 case V16QI_FTYPE_V2DI_V16QI_UQI:
11292 case V16QI_FTYPE_V4DI_V16QI_UQI:
11293 case V8HI_FTYPE_V2DI_V8HI_UQI:
11294 case V8HI_FTYPE_V4DI_V8HI_UQI:
11295 case V4SI_FTYPE_V2DI_V4SI_UQI:
11296 case V4SI_FTYPE_V4DI_V4SI_UQI:
11297 case V32QI_FTYPE_V32HI_V32QI_USI:
11298 case UHI_FTYPE_V16QI_V16QI_UHI:
11299 case USI_FTYPE_V32QI_V32QI_USI:
11300 case UDI_FTYPE_V64QI_V64QI_UDI:
11301 case UQI_FTYPE_V8HI_V8HI_UQI:
11302 case UHI_FTYPE_V16HI_V16HI_UHI:
11303 case USI_FTYPE_V32HI_V32HI_USI:
11304 case UQI_FTYPE_V4SI_V4SI_UQI:
11305 case UQI_FTYPE_V8SI_V8SI_UQI:
11306 case UQI_FTYPE_V2DI_V2DI_UQI:
11307 case UQI_FTYPE_V4DI_V4DI_UQI:
11308 case V4SF_FTYPE_V2DF_V4SF_UQI:
11309 case V4SF_FTYPE_V4DF_V4SF_UQI:
11310 case V16SI_FTYPE_V16SI_V16SI_UHI:
11311 case V16SI_FTYPE_V4SI_V16SI_UHI:
11312 case V2DI_FTYPE_V4SI_V2DI_UQI:
11313 case V2DI_FTYPE_V8HI_V2DI_UQI:
11314 case V2DI_FTYPE_V16QI_V2DI_UQI:
11315 case V4DI_FTYPE_V4DI_V4DI_UQI:
11316 case V4DI_FTYPE_V4SI_V4DI_UQI:
11317 case V4DI_FTYPE_V8HI_V4DI_UQI:
11318 case V4DI_FTYPE_V16QI_V4DI_UQI:
11319 case V4DI_FTYPE_V4DF_V4DI_UQI:
11320 case V2DI_FTYPE_V2DF_V2DI_UQI:
11321 case V4SI_FTYPE_V4DF_V4SI_UQI:
11322 case V4SI_FTYPE_V2DF_V4SI_UQI:
11323 case V4SI_FTYPE_V8HI_V4SI_UQI:
11324 case V4SI_FTYPE_V16QI_V4SI_UQI:
11325 case V4DI_FTYPE_V4DI_V4DI_V4DI:
11326 case V8DF_FTYPE_V2DF_V8DF_UQI:
11327 case V8DF_FTYPE_V4DF_V8DF_UQI:
11328 case V8DF_FTYPE_V8DF_V8DF_UQI:
11329 case V8SF_FTYPE_V8SF_V8SF_UQI:
11330 case V8SF_FTYPE_V8SI_V8SF_UQI:
11331 case V4DF_FTYPE_V4DF_V4DF_UQI:
11332 case V4SF_FTYPE_V4SF_V4SF_UQI:
11333 case V2DF_FTYPE_V2DF_V2DF_UQI:
11334 case V2DF_FTYPE_V4SF_V2DF_UQI:
11335 case V2DF_FTYPE_V4SI_V2DF_UQI:
11336 case V4SF_FTYPE_V4SI_V4SF_UQI:
11337 case V4DF_FTYPE_V4SF_V4DF_UQI:
11338 case V4DF_FTYPE_V4SI_V4DF_UQI:
11339 case V8SI_FTYPE_V8SI_V8SI_UQI:
11340 case V8SI_FTYPE_V8HI_V8SI_UQI:
11341 case V8SI_FTYPE_V16QI_V8SI_UQI:
11342 case V8DF_FTYPE_V8SI_V8DF_UQI:
11343 case V8DI_FTYPE_DI_V8DI_UQI:
11344 case V16SF_FTYPE_V8SF_V16SF_UHI:
11345 case V16SI_FTYPE_V8SI_V16SI_UHI:
11346 case V16HF_FTYPE_V16HI_V16HF_UHI:
11347 case V16HF_FTYPE_V16HF_V16HF_V16HF:
11348 case V16HI_FTYPE_V16HF_V16HI_UHI:
11349 case V16HI_FTYPE_V16HI_V16HI_UHI:
11350 case V8HI_FTYPE_V16QI_V8HI_UQI:
11351 case V16HI_FTYPE_V16QI_V16HI_UHI:
11352 case V32HI_FTYPE_V32HI_V32HI_USI:
11353 case V32HI_FTYPE_V32QI_V32HI_USI:
11354 case V8DI_FTYPE_V16QI_V8DI_UQI:
11355 case V8DI_FTYPE_V2DI_V8DI_UQI:
11356 case V8DI_FTYPE_V4DI_V8DI_UQI:
11357 case V8DI_FTYPE_V8DI_V8DI_UQI:
11358 case V8DI_FTYPE_V8HI_V8DI_UQI:
11359 case V8DI_FTYPE_V8SI_V8DI_UQI:
11360 case V8HI_FTYPE_V8DI_V8HI_UQI:
11361 case V8SI_FTYPE_V8DI_V8SI_UQI:
11362 case V4SI_FTYPE_V4SI_V4SI_V4SI:
11363 case V4DI_FTYPE_V4DI_V4DI_V2DI:
11364 case V16SI_FTYPE_V16SI_V16SI_V16SI:
11365 case V8DI_FTYPE_V8DI_V8DI_V8DI:
11366 case V32HI_FTYPE_V32HI_V32HI_V32HI:
11367 case V2DI_FTYPE_V2DI_V2DI_V2DI:
11368 case V16HI_FTYPE_V16HI_V16HI_V16HI:
11369 case V8SI_FTYPE_V8SI_V8SI_V8SI:
11370 case V8HI_FTYPE_V8HI_V8HI_V8HI:
11371 case V32BF_FTYPE_V16SF_V16SF_USI:
11372 case V16BF_FTYPE_V8SF_V8SF_UHI:
11373 case V8BF_FTYPE_V4SF_V4SF_UQI:
11374 case V16BF_FTYPE_V16SF_V16BF_UHI:
11375 case V8BF_FTYPE_V8SF_V8BF_UQI:
11376 case V8BF_FTYPE_V4SF_V8BF_UQI:
11377 case V16SF_FTYPE_V16SF_V32BF_V32BF:
11378 case V8SF_FTYPE_V8SF_V16BF_V16BF:
11379 case V4SF_FTYPE_V4SF_V8BF_V8BF:
11380 nargs = 3;
11381 break;
11382 case V32QI_FTYPE_V32QI_V32QI_INT:
11383 case V16HI_FTYPE_V16HI_V16HI_INT:
11384 case V16QI_FTYPE_V16QI_V16QI_INT:
11385 case V4DI_FTYPE_V4DI_V4DI_INT:
11386 case V8HI_FTYPE_V8HI_V8HI_INT:
11387 case V8SI_FTYPE_V8SI_V8SI_INT:
11388 case V8SI_FTYPE_V8SI_V4SI_INT:
11389 case V8SF_FTYPE_V8SF_V8SF_INT:
11390 case V8SF_FTYPE_V8SF_V4SF_INT:
11391 case V4SI_FTYPE_V4SI_V4SI_INT:
11392 case V4DF_FTYPE_V4DF_V4DF_INT:
11393 case V16SF_FTYPE_V16SF_V16SF_INT:
11394 case V16SF_FTYPE_V16SF_V4SF_INT:
11395 case V16SI_FTYPE_V16SI_V4SI_INT:
11396 case V4DF_FTYPE_V4DF_V2DF_INT:
11397 case V4SF_FTYPE_V4SF_V4SF_INT:
11398 case V2DI_FTYPE_V2DI_V2DI_INT:
11399 case V4DI_FTYPE_V4DI_V2DI_INT:
11400 case V2DF_FTYPE_V2DF_V2DF_INT:
11401 case UQI_FTYPE_V8DI_V8UDI_INT:
11402 case UQI_FTYPE_V8DF_V8DF_INT:
11403 case UQI_FTYPE_V2DF_V2DF_INT:
11404 case UQI_FTYPE_V4SF_V4SF_INT:
11405 case UHI_FTYPE_V16SI_V16SI_INT:
11406 case UHI_FTYPE_V16SF_V16SF_INT:
11407 case V64QI_FTYPE_V64QI_V64QI_INT:
11408 case V32HI_FTYPE_V32HI_V32HI_INT:
11409 case V16SI_FTYPE_V16SI_V16SI_INT:
11410 case V8DI_FTYPE_V8DI_V8DI_INT:
11411 nargs = 3;
11412 nargs_constant = 1;
11413 break;
11414 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11415 nargs = 3;
11416 rmode = V4DImode;
11417 nargs_constant = 1;
11418 break;
11419 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11420 nargs = 3;
11421 rmode = V2DImode;
11422 nargs_constant = 1;
11423 break;
11424 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11425 nargs = 3;
11426 rmode = DImode;
11427 nargs_constant = 1;
11428 break;
11429 case V2DI_FTYPE_V2DI_UINT_UINT:
11430 nargs = 3;
11431 nargs_constant = 2;
11432 break;
11433 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11434 nargs = 3;
11435 rmode = V8DImode;
11436 nargs_constant = 1;
11437 break;
11438 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11439 nargs = 5;
11440 rmode = V8DImode;
11441 mask_pos = 2;
11442 nargs_constant = 1;
11443 break;
11444 case QI_FTYPE_V8DF_INT_UQI:
11445 case QI_FTYPE_V4DF_INT_UQI:
11446 case QI_FTYPE_V2DF_INT_UQI:
11447 case HI_FTYPE_V16SF_INT_UHI:
11448 case QI_FTYPE_V8SF_INT_UQI:
11449 case QI_FTYPE_V4SF_INT_UQI:
11450 case QI_FTYPE_V8HF_INT_UQI:
11451 case HI_FTYPE_V16HF_INT_UHI:
11452 case SI_FTYPE_V32HF_INT_USI:
11453 case V4SI_FTYPE_V4SI_V4SI_UHI:
11454 case V8SI_FTYPE_V8SI_V8SI_UHI:
11455 nargs = 3;
11456 mask_pos = 1;
11457 nargs_constant = 1;
11458 break;
11459 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11460 nargs = 5;
11461 rmode = V4DImode;
11462 mask_pos = 2;
11463 nargs_constant = 1;
11464 break;
11465 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11466 nargs = 5;
11467 rmode = V2DImode;
11468 mask_pos = 2;
11469 nargs_constant = 1;
11470 break;
11471 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11472 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11473 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11474 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11475 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11476 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11477 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11478 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11479 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11480 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11481 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11482 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11483 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11484 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11485 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11486 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11487 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11488 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11489 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11490 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11491 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11492 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11493 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11494 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11495 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11496 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11497 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11498 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11499 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11500 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11501 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11502 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11503 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11504 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11505 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11506 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11507 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11508 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11509 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11510 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11511 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11512 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11513 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11514 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11515 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11516 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11517 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11518 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11519 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11520 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11521 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11522 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11523 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11524 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11525 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11526 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11527 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11528 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11529 nargs = 4;
11530 break;
11531 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11532 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11533 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11534 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11535 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11536 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
11537 nargs = 4;
11538 nargs_constant = 1;
11539 break;
11540 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11541 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11542 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11543 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11544 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11545 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11546 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11547 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11548 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11549 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11550 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11551 case USI_FTYPE_V32QI_V32QI_INT_USI:
11552 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11553 case USI_FTYPE_V32HI_V32HI_INT_USI:
11554 case USI_FTYPE_V32HF_V32HF_INT_USI:
11555 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11556 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11557 nargs = 4;
11558 mask_pos = 1;
11559 nargs_constant = 1;
11560 break;
11561 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11562 nargs = 4;
11563 nargs_constant = 2;
11564 break;
11565 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11566 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11567 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11568 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11569 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11570 nargs = 4;
11571 break;
11572 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11573 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11574 mask_pos = 1;
11575 nargs = 4;
11576 nargs_constant = 1;
11577 break;
11578 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11579 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11580 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11581 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11582 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11583 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11584 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11585 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11586 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11587 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11588 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11589 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11590 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11591 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11592 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11593 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11594 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11595 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11596 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11597 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11598 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11599 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11600 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11601 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11602 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11603 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11604 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11605 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11606 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11607 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11608 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11609 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11610 nargs = 4;
11611 mask_pos = 2;
11612 nargs_constant = 1;
11613 break;
11614 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11615 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11616 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11617 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11618 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11619 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11620 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11621 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11622 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11623 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11624 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11625 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11626 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11627 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11628 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11629 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11630 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11631 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11632 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11633 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11634 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11635 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11636 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11637 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11638 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11639 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11640 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11641 nargs = 5;
11642 mask_pos = 2;
11643 nargs_constant = 1;
11644 break;
11645 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11646 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11647 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11648 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11649 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11650 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11651 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11652 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11653 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11654 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11655 nargs = 5;
11656 mask_pos = 1;
11657 nargs_constant = 1;
11658 break;
11659 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11660 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11661 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11662 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11663 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11664 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11665 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11666 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11667 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11668 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11669 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11670 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11671 nargs = 5;
11672 mask_pos = 1;
11673 nargs_constant = 2;
11674 break;
11676 default:
11677 gcc_unreachable ();
11680 gcc_assert (nargs <= ARRAY_SIZE (xops));
11682 if (comparison != UNKNOWN)
11684 gcc_assert (nargs == 2);
11685 return ix86_expand_sse_compare (d, exp, target, swap);
11688 if (rmode == VOIDmode || rmode == tmode)
11690 if (optimize
11691 || target == 0
11692 || GET_MODE (target) != tmode
11693 || !insn_p->operand[0].predicate (target, tmode))
11694 target = gen_reg_rtx (tmode);
11695 else if (memory_operand (target, tmode))
11696 num_memory++;
11697 real_target = target;
11699 else
11701 real_target = gen_reg_rtx (tmode);
11702 target = lowpart_subreg (rmode, real_target, tmode);
11705 for (i = 0; i < nargs; i++)
11707 tree arg = CALL_EXPR_ARG (exp, i);
11708 rtx op = expand_normal (arg);
11709 machine_mode mode = insn_p->operand[i + 1].mode;
11710 bool match = insn_p->operand[i + 1].predicate (op, mode);
11712 if (second_arg_count && i == 1)
11714 /* SIMD shift insns take either an 8-bit immediate or a
11715 register as the count, but the builtin functions take an int.
11716 If the count doesn't match the predicate, put it in a register.
11717 The instructions use a 64-bit count; if op is only
11718 32-bit, zero-extend it, since negative shift counts
11719 are undefined behavior and zero-extension is more
11720 efficient. */
11721 if (!match)
11723 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11724 op = convert_modes (mode, GET_MODE (op), op, 1);
11725 else
11726 op = lowpart_subreg (mode, op, GET_MODE (op));
11727 if (!insn_p->operand[i + 1].predicate (op, mode))
11728 op = copy_to_reg (op);
11731 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11732 || (!mask_pos && (nargs - i) <= nargs_constant))
11734 if (!match)
11735 switch (icode)
11737 case CODE_FOR_avx_vinsertf128v4di:
11738 case CODE_FOR_avx_vextractf128v4di:
11739 error ("the last argument must be a 1-bit immediate");
11740 return const0_rtx;
11742 case CODE_FOR_avx512f_cmpv8di3_mask:
11743 case CODE_FOR_avx512f_cmpv16si3_mask:
11744 case CODE_FOR_avx512f_ucmpv8di3_mask:
11745 case CODE_FOR_avx512f_ucmpv16si3_mask:
11746 case CODE_FOR_avx512vl_cmpv4di3_mask:
11747 case CODE_FOR_avx512vl_cmpv8si3_mask:
11748 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11749 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11750 case CODE_FOR_avx512vl_cmpv2di3_mask:
11751 case CODE_FOR_avx512vl_cmpv4si3_mask:
11752 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11753 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11754 error ("the last argument must be a 3-bit immediate");
11755 return const0_rtx;
11757 case CODE_FOR_sse4_1_roundsd:
11758 case CODE_FOR_sse4_1_roundss:
11760 case CODE_FOR_sse4_1_roundpd:
11761 case CODE_FOR_sse4_1_roundps:
11762 case CODE_FOR_avx_roundpd256:
11763 case CODE_FOR_avx_roundps256:
11765 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11766 case CODE_FOR_sse4_1_roundps_sfix:
11767 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11768 case CODE_FOR_avx_roundps_sfix256:
11770 case CODE_FOR_sse4_1_blendps:
11771 case CODE_FOR_avx_blendpd256:
11772 case CODE_FOR_avx_vpermilv4df:
11773 case CODE_FOR_avx_vpermilv4df_mask:
11774 case CODE_FOR_avx512f_getmantv8df_mask:
11775 case CODE_FOR_avx512f_getmantv16sf_mask:
11776 case CODE_FOR_avx512vl_getmantv16hf_mask:
11777 case CODE_FOR_avx512vl_getmantv8sf_mask:
11778 case CODE_FOR_avx512vl_getmantv4df_mask:
11779 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11780 case CODE_FOR_avx512vl_getmantv4sf_mask:
11781 case CODE_FOR_avx512vl_getmantv2df_mask:
11782 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11783 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11784 case CODE_FOR_avx512dq_rangepv4df_mask:
11785 case CODE_FOR_avx512dq_rangepv8sf_mask:
11786 case CODE_FOR_avx512dq_rangepv2df_mask:
11787 case CODE_FOR_avx512dq_rangepv4sf_mask:
11788 case CODE_FOR_avx_shufpd256_mask:
11789 error ("the last argument must be a 4-bit immediate");
11790 return const0_rtx;
11792 case CODE_FOR_sha1rnds4:
11793 case CODE_FOR_sse4_1_blendpd:
11794 case CODE_FOR_avx_vpermilv2df:
11795 case CODE_FOR_avx_vpermilv2df_mask:
11796 case CODE_FOR_xop_vpermil2v2df3:
11797 case CODE_FOR_xop_vpermil2v4sf3:
11798 case CODE_FOR_xop_vpermil2v4df3:
11799 case CODE_FOR_xop_vpermil2v8sf3:
11800 case CODE_FOR_avx512f_vinsertf32x4_mask:
11801 case CODE_FOR_avx512f_vinserti32x4_mask:
11802 case CODE_FOR_avx512f_vextractf32x4_mask:
11803 case CODE_FOR_avx512f_vextracti32x4_mask:
11804 case CODE_FOR_sse2_shufpd:
11805 case CODE_FOR_sse2_shufpd_mask:
11806 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11807 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11808 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11809 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11810 error ("the last argument must be a 2-bit immediate");
11811 return const0_rtx;
11813 case CODE_FOR_avx_vextractf128v4df:
11814 case CODE_FOR_avx_vextractf128v8sf:
11815 case CODE_FOR_avx_vextractf128v8si:
11816 case CODE_FOR_avx_vinsertf128v4df:
11817 case CODE_FOR_avx_vinsertf128v8sf:
11818 case CODE_FOR_avx_vinsertf128v8si:
11819 case CODE_FOR_avx512f_vinsertf64x4_mask:
11820 case CODE_FOR_avx512f_vinserti64x4_mask:
11821 case CODE_FOR_avx512f_vextractf64x4_mask:
11822 case CODE_FOR_avx512f_vextracti64x4_mask:
11823 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11824 case CODE_FOR_avx512dq_vinserti32x8_mask:
11825 case CODE_FOR_avx512vl_vinsertv4df:
11826 case CODE_FOR_avx512vl_vinsertv4di:
11827 case CODE_FOR_avx512vl_vinsertv8sf:
11828 case CODE_FOR_avx512vl_vinsertv8si:
11829 error ("the last argument must be a 1-bit immediate");
11830 return const0_rtx;
11832 case CODE_FOR_avx_vmcmpv2df3:
11833 case CODE_FOR_avx_vmcmpv4sf3:
11834 case CODE_FOR_avx_cmpv2df3:
11835 case CODE_FOR_avx_cmpv4sf3:
11836 case CODE_FOR_avx_cmpv4df3:
11837 case CODE_FOR_avx_cmpv8sf3:
11838 case CODE_FOR_avx512f_cmpv8df3_mask:
11839 case CODE_FOR_avx512f_cmpv16sf3_mask:
11840 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11841 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11842 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11843 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11844 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11845 error ("the last argument must be a 5-bit immediate");
11846 return const0_rtx;
11848 default:
11849 switch (nargs_constant)
11851 case 2:
11852 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11853 || (!mask_pos && (nargs - i) == nargs_constant))
11855 error ("the next to last argument must be an 8-bit immediate");
11856 break;
11858 /* FALLTHRU */
11859 case 1:
11860 error ("the last argument must be an 8-bit immediate");
11861 break;
11862 default:
11863 gcc_unreachable ();
11865 return const0_rtx;
11868 else
11870 if (VECTOR_MODE_P (mode))
11871 op = safe_vector_operand (op, mode);
11873 /* If we aren't optimizing, only allow one memory operand to
11874 be generated. */
11875 if (memory_operand (op, mode))
11876 num_memory++;
11878 op = fixup_modeless_constant (op, mode);
11880 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11882 if (optimize || !match || num_memory > 1)
11883 op = copy_to_mode_reg (mode, op);
11885 else
11887 op = copy_to_reg (op);
11888 op = lowpart_subreg (mode, op, GET_MODE (op));
11892 xops[i] = op;
11895 switch (nargs)
11897 case 1:
11898 pat = GEN_FCN (icode) (real_target, xops[0]);
11899 break;
11900 case 2:
11901 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11902 break;
11903 case 3:
11904 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11905 break;
11906 case 4:
11907 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11908 xops[2], xops[3]);
11909 break;
11910 case 5:
11911 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11912 xops[2], xops[3], xops[4]);
11913 break;
11914 case 6:
11915 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11916 xops[2], xops[3], xops[4], xops[5]);
11917 break;
11918 default:
11919 gcc_unreachable ();
11922 if (! pat)
11923 return 0;
11925 emit_insn (pat);
11926 return target;
11929 /* Transform a pattern of the following layout:
11930 (set A
11931 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11933 into:
11934 (set A B)  */
11936 static rtx
11937 ix86_erase_embedded_rounding (rtx pat)
11939 if (GET_CODE (pat) == INSN)
11940 pat = PATTERN (pat);
11942 gcc_assert (GET_CODE (pat) == SET);
11943 rtx src = SET_SRC (pat);
11944 gcc_assert (XVECLEN (src, 0) == 2);
11945 rtx p0 = XVECEXP (src, 0, 0);
11946 gcc_assert (GET_CODE (src) == UNSPEC
11947 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11948 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11949 return res;
11952 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11953 with rounding. */
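/* For example, assuming the usual avx512fintrin.h wrappers, a call such as
     int k = _mm_comi_round_sd (a, b, _CMP_GE_OS, _MM_FROUND_NO_EXC);
   reaches this function through the INT_FTYPE_V2DF_V2DF_INT_INT case of
   ix86_expand_round_builtin below, with the comparison predicate in ARG2
   and the SAE control in ARG3.  */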
11954 static rtx
11955 ix86_expand_sse_comi_round (const struct builtin_description *d,
11956 tree exp, rtx target)
11958 rtx pat, set_dst;
11959 tree arg0 = CALL_EXPR_ARG (exp, 0);
11960 tree arg1 = CALL_EXPR_ARG (exp, 1);
11961 tree arg2 = CALL_EXPR_ARG (exp, 2);
11962 tree arg3 = CALL_EXPR_ARG (exp, 3);
11963 rtx op0 = expand_normal (arg0);
11964 rtx op1 = expand_normal (arg1);
11965 rtx op2 = expand_normal (arg2);
11966 rtx op3 = expand_normal (arg3);
11967 enum insn_code icode = d->icode;
11968 const struct insn_data_d *insn_p = &insn_data[icode];
11969 machine_mode mode0 = insn_p->operand[0].mode;
11970 machine_mode mode1 = insn_p->operand[1].mode;
11972 /* See avxintrin.h for values. */
11973 static const enum rtx_code comparisons[32] =
11975 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11976 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11977 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11978 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11980 static const bool ordereds[32] =
11982 true, true, true, false, false, false, false, true,
11983 false, false, false, true, true, true, true, false,
11984 true, true, true, false, false, false, false, true,
11985 false, false, false, true, true, true, true, false
11987 static const bool non_signalings[32] =
11989 true, false, false, true, true, false, false, true,
11990 true, false, false, true, true, false, false, true,
11991 false, true, true, false, false, true, true, false,
11992 false, true, true, false, false, true, true, false
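/* Worked example: predicate 13 (_CMP_GE_OS in avxintrin.h) gives
   comparisons[13] == GE, ordereds[13] == true and
   non_signalings[13] == false, i.e. an ordered, signaling GE that is
   handled by the plain CCFPmode path below using COMI.  */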
11995 if (!CONST_INT_P (op2))
11997 error ("the third argument must be a comparison constant");
11998 return const0_rtx;
12000 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
12002 error ("incorrect comparison mode");
12003 return const0_rtx;
12006 if (!insn_p->operand[2].predicate (op3, SImode))
12008 error ("incorrect rounding operand");
12009 return const0_rtx;
12012 if (VECTOR_MODE_P (mode0))
12013 op0 = safe_vector_operand (op0, mode0);
12014 if (VECTOR_MODE_P (mode1))
12015 op1 = safe_vector_operand (op1, mode1);
12017 enum rtx_code comparison = comparisons[INTVAL (op2)];
12018 bool ordered = ordereds[INTVAL (op2)];
12019 bool non_signaling = non_signalings[INTVAL (op2)];
12020 rtx const_val = const0_rtx;
12022 bool check_unordered = false;
12023 machine_mode mode = CCFPmode;
12024 switch (comparison)
12026 case ORDERED:
12027 if (!ordered)
12029 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
12030 if (!non_signaling)
12031 ordered = true;
12032 mode = CCSmode;
12034 else
12036 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
12037 if (non_signaling)
12038 ordered = false;
12039 mode = CCPmode;
12041 comparison = NE;
12042 break;
12043 case UNORDERED:
12044 if (ordered)
12046 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
12047 if (non_signaling)
12048 ordered = false;
12049 mode = CCSmode;
12051 else
12053 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
12054 if (!non_signaling)
12055 ordered = true;
12056 mode = CCPmode;
12058 comparison = EQ;
12059 break;
12061 case LE: /* -> GE */
12062 case LT: /* -> GT */
12063 case UNGE: /* -> UNLE */
12064 case UNGT: /* -> UNLT */
12065 std::swap (op0, op1);
12066 comparison = swap_condition (comparison);
12067 /* FALLTHRU */
12068 case GT:
12069 case GE:
12070 case UNEQ:
12071 case UNLT:
12072 case UNLE:
12073 case LTGT:
12074 /* These are supported by CCFPmode. NB: Use ordered/signaling
12075 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
12076 with NAN operands. */
12077 if (ordered == non_signaling)
12078 ordered = !ordered;
12079 break;
12080 case EQ:
12081 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
12082 _CMP_EQ_OQ/_CMP_EQ_OS. */
12083 check_unordered = true;
12084 mode = CCZmode;
12085 break;
12086 case NE:
12087 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
12088 _CMP_NEQ_UQ/_CMP_NEQ_US. */
12089 gcc_assert (!ordered);
12090 check_unordered = true;
12091 mode = CCZmode;
12092 const_val = const1_rtx;
12093 break;
12094 default:
12095 gcc_unreachable ();
12098 target = gen_reg_rtx (SImode);
12099 emit_move_insn (target, const_val);
12100 target = gen_rtx_SUBREG (QImode, target, 0);
12102 if ((optimize && !register_operand (op0, mode0))
12103 || !insn_p->operand[0].predicate (op0, mode0))
12104 op0 = copy_to_mode_reg (mode0, op0);
12105 if ((optimize && !register_operand (op1, mode1))
12106 || !insn_p->operand[1].predicate (op1, mode1))
12107 op1 = copy_to_mode_reg (mode1, op1);
12110 /* 1. COMI: ordered and signaling.
12111 2. UCOMI: unordered and non-signaling.  */
12113 if (non_signaling)
12114 icode = (icode == CODE_FOR_sse_comi_round
12115 ? CODE_FOR_sse_ucomi_round
12116 : CODE_FOR_sse2_ucomi_round);
12118 pat = GEN_FCN (icode) (op0, op1, op3);
12119 if (! pat)
12120 return 0;
12122 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
12123 if (INTVAL (op3) == NO_ROUND)
12125 pat = ix86_erase_embedded_rounding (pat);
12126 if (! pat)
12127 return 0;
12129 set_dst = SET_DEST (pat);
12131 else
12133 gcc_assert (GET_CODE (pat) == SET);
12134 set_dst = SET_DEST (pat);
12137 emit_insn (pat);
12139 return ix86_ssecom_setcc (comparison, check_unordered, mode,
12140 set_dst, target);
12143 static rtx
12144 ix86_expand_round_builtin (const struct builtin_description *d,
12145 tree exp, rtx target)
12147 rtx pat;
12148 unsigned int i, nargs;
12149 rtx xops[6];
12150 enum insn_code icode = d->icode;
12151 const struct insn_data_d *insn_p = &insn_data[icode];
12152 machine_mode tmode = insn_p->operand[0].mode;
12153 unsigned int nargs_constant = 0;
12154 unsigned int redundant_embed_rnd = 0;
12156 switch ((enum ix86_builtin_func_type) d->flag)
12158 case UINT64_FTYPE_V2DF_INT:
12159 case UINT64_FTYPE_V4SF_INT:
12160 case UINT64_FTYPE_V8HF_INT:
12161 case UINT_FTYPE_V2DF_INT:
12162 case UINT_FTYPE_V4SF_INT:
12163 case UINT_FTYPE_V8HF_INT:
12164 case INT64_FTYPE_V2DF_INT:
12165 case INT64_FTYPE_V4SF_INT:
12166 case INT64_FTYPE_V8HF_INT:
12167 case INT_FTYPE_V2DF_INT:
12168 case INT_FTYPE_V4SF_INT:
12169 case INT_FTYPE_V8HF_INT:
12170 nargs = 2;
12171 break;
12172 case V32HF_FTYPE_V32HF_V32HF_INT:
12173 case V8HF_FTYPE_V8HF_V8HF_INT:
12174 case V8HF_FTYPE_V8HF_INT_INT:
12175 case V8HF_FTYPE_V8HF_UINT_INT:
12176 case V8HF_FTYPE_V8HF_INT64_INT:
12177 case V8HF_FTYPE_V8HF_UINT64_INT:
12178 case V4SF_FTYPE_V4SF_UINT_INT:
12179 case V4SF_FTYPE_V4SF_UINT64_INT:
12180 case V2DF_FTYPE_V2DF_UINT64_INT:
12181 case V4SF_FTYPE_V4SF_INT_INT:
12182 case V4SF_FTYPE_V4SF_INT64_INT:
12183 case V2DF_FTYPE_V2DF_INT64_INT:
12184 case V4SF_FTYPE_V4SF_V4SF_INT:
12185 case V2DF_FTYPE_V2DF_V2DF_INT:
12186 case V4SF_FTYPE_V4SF_V2DF_INT:
12187 case V2DF_FTYPE_V2DF_V4SF_INT:
12188 nargs = 3;
12189 break;
12190 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
12191 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
12192 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
12193 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
12194 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
12195 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
12196 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
12197 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
12198 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
12199 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
12200 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
12201 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
12202 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
12203 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
12204 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
12205 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
12206 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
12207 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
12208 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
12209 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
12210 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
12211 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
12212 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
12213 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
12214 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
12215 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
12216 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
12217 nargs = 4;
12218 break;
12219 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
12220 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
12221 nargs_constant = 2;
12222 nargs = 4;
12223 break;
12224 case INT_FTYPE_V4SF_V4SF_INT_INT:
12225 case INT_FTYPE_V2DF_V2DF_INT_INT:
12226 return ix86_expand_sse_comi_round (d, exp, target);
12227 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
12228 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
12229 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
12230 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
12231 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
12232 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
12233 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
12234 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
12235 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
12236 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
12237 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
12238 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
12239 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
12240 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
12241 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
12242 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
12243 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
12244 nargs = 5;
12245 break;
12246 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
12247 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
12248 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
12249 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
12250 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
12251 nargs_constant = 4;
12252 nargs = 5;
12253 break;
12254 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
12255 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
12256 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
12257 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
12258 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
12259 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
12260 nargs_constant = 3;
12261 nargs = 5;
12262 break;
12263 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
12264 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
12265 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
12266 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
12267 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
12268 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
12269 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
12270 nargs = 6;
12271 nargs_constant = 4;
12272 break;
12273 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
12274 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
12275 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
12276 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
12277 nargs = 6;
12278 nargs_constant = 3;
12279 break;
12280 default:
12281 gcc_unreachable ();
12283 gcc_assert (nargs <= ARRAY_SIZE (xops));
12285 if (optimize
12286 || target == 0
12287 || GET_MODE (target) != tmode
12288 || !insn_p->operand[0].predicate (target, tmode))
12289 target = gen_reg_rtx (tmode);
12291 for (i = 0; i < nargs; i++)
12293 tree arg = CALL_EXPR_ARG (exp, i);
12294 rtx op = expand_normal (arg);
12295 machine_mode mode = insn_p->operand[i + 1].mode;
12296 bool match = insn_p->operand[i + 1].predicate (op, mode);
12298 if (i == nargs - nargs_constant)
12300 if (!match)
12302 switch (icode)
12304 case CODE_FOR_avx512f_getmantv8df_mask_round:
12305 case CODE_FOR_avx512f_getmantv16sf_mask_round:
12306 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
12307 case CODE_FOR_avx512f_vgetmantv2df_round:
12308 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
12309 case CODE_FOR_avx512f_vgetmantv4sf_round:
12310 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
12311 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
12312 error ("the immediate argument must be a 4-bit immediate");
12313 return const0_rtx;
12314 case CODE_FOR_avx512f_cmpv8df3_mask_round:
12315 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
12316 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
12317 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
12318 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
12319 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
12320 error ("the immediate argument must be a 5-bit immediate");
12321 return const0_rtx;
12322 default:
12323 error ("the immediate argument must be an 8-bit immediate");
12324 return const0_rtx;
12328 else if (i == nargs-1)
12330 if (!insn_p->operand[nargs].predicate (op, SImode))
12332 error ("incorrect rounding operand");
12333 return const0_rtx;
12336 /* If there is no rounding, use the normal version of the pattern. */
12337 if (INTVAL (op) == NO_ROUND)
12339 /* Skip erasing the embedded rounding for the expanders below,
12340 which generate multiple insns.  In ix86_erase_embedded_rounding
12341 the pattern would be transformed into a single set, and emit_insn
12342 appends that set instead of inserting it into the chain, so the
12343 insns emitted inside the define_expand would be ignored. */
12344 switch (icode)
12346 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
12347 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
12348 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
12349 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
12350 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
12351 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
12352 redundant_embed_rnd = 0;
12353 break;
12354 default:
12355 redundant_embed_rnd = 1;
12356 break;
12360 else
12362 if (VECTOR_MODE_P (mode))
12363 op = safe_vector_operand (op, mode);
12365 op = fixup_modeless_constant (op, mode);
12367 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12369 if (optimize || !match)
12370 op = copy_to_mode_reg (mode, op);
12372 else
12374 op = copy_to_reg (op);
12375 op = lowpart_subreg (mode, op, GET_MODE (op));
12379 xops[i] = op;
12382 switch (nargs)
12384 case 1:
12385 pat = GEN_FCN (icode) (target, xops[0]);
12386 break;
12387 case 2:
12388 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12389 break;
12390 case 3:
12391 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12392 break;
12393 case 4:
12394 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12395 xops[2], xops[3]);
12396 break;
12397 case 5:
12398 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12399 xops[2], xops[3], xops[4]);
12400 break;
12401 case 6:
12402 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12403 xops[2], xops[3], xops[4], xops[5]);
12404 break;
12405 default:
12406 gcc_unreachable ();
12409 if (!pat)
12410 return 0;
12412 if (redundant_embed_rnd)
12413 pat = ix86_erase_embedded_rounding (pat);
12415 emit_insn (pat);
12416 return target;
12419 /* Subroutine of ix86_expand_builtin to take care of special insns
12420 with variable number of operands. */
12422 static rtx
12423 ix86_expand_special_args_builtin (const struct builtin_description *d,
12424 tree exp, rtx target)
12426 tree arg;
12427 rtx pat, op;
12428 unsigned int i, nargs, arg_adjust, memory;
12429 unsigned int constant = 100; /* Index of a constant operand, if any; 100 means none. */
12430 bool aligned_mem = false;
12431 rtx xops[4];
12432 enum insn_code icode = d->icode;
12433 const struct insn_data_d *insn_p = &insn_data[icode];
12434 machine_mode tmode = insn_p->operand[0].mode;
12435 enum { load, store } klass;
12437 switch ((enum ix86_builtin_func_type) d->flag)
12439 case VOID_FTYPE_VOID:
12440 emit_insn (GEN_FCN (icode) (target));
12441 return 0;
12442 case VOID_FTYPE_UINT64:
12443 case VOID_FTYPE_UNSIGNED:
12444 nargs = 0;
12445 klass = store;
12446 memory = 0;
12447 break;
12449 case INT_FTYPE_VOID:
12450 case USHORT_FTYPE_VOID:
12451 case UINT64_FTYPE_VOID:
12452 case UINT_FTYPE_VOID:
12453 case UINT8_FTYPE_VOID:
12454 case UNSIGNED_FTYPE_VOID:
12455 nargs = 0;
12456 klass = load;
12457 memory = 0;
12458 break;
12459 case UINT64_FTYPE_PUNSIGNED:
12460 case V2DI_FTYPE_PV2DI:
12461 case V4DI_FTYPE_PV4DI:
12462 case V32QI_FTYPE_PCCHAR:
12463 case V16QI_FTYPE_PCCHAR:
12464 case V8SF_FTYPE_PCV4SF:
12465 case V8SF_FTYPE_PCFLOAT:
12466 case V4SF_FTYPE_PCFLOAT:
12467 case V4SF_FTYPE_PCFLOAT16:
12468 case V4SF_FTYPE_PCBFLOAT16:
12469 case V4SF_FTYPE_PCV8BF:
12470 case V4SF_FTYPE_PCV8HF:
12471 case V8SF_FTYPE_PCFLOAT16:
12472 case V8SF_FTYPE_PCBFLOAT16:
12473 case V8SF_FTYPE_PCV16HF:
12474 case V8SF_FTYPE_PCV16BF:
12475 case V4DF_FTYPE_PCV2DF:
12476 case V4DF_FTYPE_PCDOUBLE:
12477 case V2DF_FTYPE_PCDOUBLE:
12478 case VOID_FTYPE_PVOID:
12479 case V8DI_FTYPE_PV8DI:
12480 nargs = 1;
12481 klass = load;
12482 memory = 0;
12483 switch (icode)
12485 case CODE_FOR_sse4_1_movntdqa:
12486 case CODE_FOR_avx2_movntdqa:
12487 case CODE_FOR_avx512f_movntdqa:
12488 aligned_mem = true;
12489 break;
12490 default:
12491 break;
12493 break;
12494 case VOID_FTYPE_PV2SF_V4SF:
12495 case VOID_FTYPE_PV8DI_V8DI:
12496 case VOID_FTYPE_PV4DI_V4DI:
12497 case VOID_FTYPE_PV2DI_V2DI:
12498 case VOID_FTYPE_PCHAR_V32QI:
12499 case VOID_FTYPE_PCHAR_V16QI:
12500 case VOID_FTYPE_PFLOAT_V16SF:
12501 case VOID_FTYPE_PFLOAT_V8SF:
12502 case VOID_FTYPE_PFLOAT_V4SF:
12503 case VOID_FTYPE_PDOUBLE_V8DF:
12504 case VOID_FTYPE_PDOUBLE_V4DF:
12505 case VOID_FTYPE_PDOUBLE_V2DF:
12506 case VOID_FTYPE_PLONGLONG_LONGLONG:
12507 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12508 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12509 case VOID_FTYPE_PINT_INT:
12510 nargs = 1;
12511 klass = store;
12512 /* Reserve memory operand for target. */
12513 memory = ARRAY_SIZE (xops);
12514 switch (icode)
12516 /* These builtins and instructions require the memory
12517 to be properly aligned. */
12518 case CODE_FOR_avx_movntv4di:
12519 case CODE_FOR_sse2_movntv2di:
12520 case CODE_FOR_avx_movntv8sf:
12521 case CODE_FOR_sse_movntv4sf:
12522 case CODE_FOR_sse4a_vmmovntv4sf:
12523 case CODE_FOR_avx_movntv4df:
12524 case CODE_FOR_sse2_movntv2df:
12525 case CODE_FOR_sse4a_vmmovntv2df:
12526 case CODE_FOR_sse2_movntidi:
12527 case CODE_FOR_sse_movntq:
12528 case CODE_FOR_sse2_movntisi:
12529 case CODE_FOR_avx512f_movntv16sf:
12530 case CODE_FOR_avx512f_movntv8df:
12531 case CODE_FOR_avx512f_movntv8di:
12532 aligned_mem = true;
12533 break;
12534 default:
12535 break;
12537 break;
12538 case VOID_FTYPE_PVOID_PCVOID:
12539 nargs = 1;
12540 klass = store;
12541 memory = 0;
12543 break;
12544 case V4SF_FTYPE_V4SF_PCV2SF:
12545 case V2DF_FTYPE_V2DF_PCDOUBLE:
12546 nargs = 2;
12547 klass = load;
12548 memory = 1;
12549 break;
12550 case V8SF_FTYPE_PCV8SF_V8SI:
12551 case V4DF_FTYPE_PCV4DF_V4DI:
12552 case V4SF_FTYPE_PCV4SF_V4SI:
12553 case V2DF_FTYPE_PCV2DF_V2DI:
12554 case V8SI_FTYPE_PCV8SI_V8SI:
12555 case V4DI_FTYPE_PCV4DI_V4DI:
12556 case V4SI_FTYPE_PCV4SI_V4SI:
12557 case V2DI_FTYPE_PCV2DI_V2DI:
12558 case VOID_FTYPE_INT_INT64:
12559 nargs = 2;
12560 klass = load;
12561 memory = 0;
12562 break;
12563 case VOID_FTYPE_PV8DF_V8DF_UQI:
12564 case VOID_FTYPE_PV4DF_V4DF_UQI:
12565 case VOID_FTYPE_PV2DF_V2DF_UQI:
12566 case VOID_FTYPE_PV16SF_V16SF_UHI:
12567 case VOID_FTYPE_PV8SF_V8SF_UQI:
12568 case VOID_FTYPE_PV4SF_V4SF_UQI:
12569 case VOID_FTYPE_PV8DI_V8DI_UQI:
12570 case VOID_FTYPE_PV4DI_V4DI_UQI:
12571 case VOID_FTYPE_PV2DI_V2DI_UQI:
12572 case VOID_FTYPE_PV16SI_V16SI_UHI:
12573 case VOID_FTYPE_PV8SI_V8SI_UQI:
12574 case VOID_FTYPE_PV4SI_V4SI_UQI:
12575 case VOID_FTYPE_PV64QI_V64QI_UDI:
12576 case VOID_FTYPE_PV32HI_V32HI_USI:
12577 case VOID_FTYPE_PV32QI_V32QI_USI:
12578 case VOID_FTYPE_PV16QI_V16QI_UHI:
12579 case VOID_FTYPE_PV16HI_V16HI_UHI:
12580 case VOID_FTYPE_PV8HI_V8HI_UQI:
12581 switch (icode)
12583 /* These builtins and instructions require the memory
12584 to be properly aligned. */
12585 case CODE_FOR_avx512f_storev16sf_mask:
12586 case CODE_FOR_avx512f_storev16si_mask:
12587 case CODE_FOR_avx512f_storev8df_mask:
12588 case CODE_FOR_avx512f_storev8di_mask:
12589 case CODE_FOR_avx512vl_storev8sf_mask:
12590 case CODE_FOR_avx512vl_storev8si_mask:
12591 case CODE_FOR_avx512vl_storev4df_mask:
12592 case CODE_FOR_avx512vl_storev4di_mask:
12593 case CODE_FOR_avx512vl_storev4sf_mask:
12594 case CODE_FOR_avx512vl_storev4si_mask:
12595 case CODE_FOR_avx512vl_storev2df_mask:
12596 case CODE_FOR_avx512vl_storev2di_mask:
12597 aligned_mem = true;
12598 break;
12599 default:
12600 break;
12602 /* FALLTHRU */
12603 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12604 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12605 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12606 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12607 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12608 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12609 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12610 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12611 case VOID_FTYPE_PV8SI_V8DI_UQI:
12612 case VOID_FTYPE_PV8HI_V8DI_UQI:
12613 case VOID_FTYPE_PV16HI_V16SI_UHI:
12614 case VOID_FTYPE_PUDI_V8DI_UQI:
12615 case VOID_FTYPE_PV16QI_V16SI_UHI:
12616 case VOID_FTYPE_PV4SI_V4DI_UQI:
12617 case VOID_FTYPE_PUDI_V2DI_UQI:
12618 case VOID_FTYPE_PUDI_V4DI_UQI:
12619 case VOID_FTYPE_PUSI_V2DI_UQI:
12620 case VOID_FTYPE_PV8HI_V8SI_UQI:
12621 case VOID_FTYPE_PUDI_V4SI_UQI:
12622 case VOID_FTYPE_PUSI_V4DI_UQI:
12623 case VOID_FTYPE_PUHI_V2DI_UQI:
12624 case VOID_FTYPE_PUDI_V8SI_UQI:
12625 case VOID_FTYPE_PUSI_V4SI_UQI:
12626 case VOID_FTYPE_PCHAR_V64QI_UDI:
12627 case VOID_FTYPE_PCHAR_V32QI_USI:
12628 case VOID_FTYPE_PCHAR_V16QI_UHI:
12629 case VOID_FTYPE_PSHORT_V32HI_USI:
12630 case VOID_FTYPE_PSHORT_V16HI_UHI:
12631 case VOID_FTYPE_PSHORT_V8HI_UQI:
12632 case VOID_FTYPE_PINT_V16SI_UHI:
12633 case VOID_FTYPE_PINT_V8SI_UQI:
12634 case VOID_FTYPE_PINT_V4SI_UQI:
12635 case VOID_FTYPE_PINT64_V8DI_UQI:
12636 case VOID_FTYPE_PINT64_V4DI_UQI:
12637 case VOID_FTYPE_PINT64_V2DI_UQI:
12638 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12639 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12640 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12641 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12642 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12643 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12644 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12645 case VOID_FTYPE_PV32QI_V32HI_USI:
12646 case VOID_FTYPE_PV16QI_V16HI_UHI:
12647 case VOID_FTYPE_PUDI_V8HI_UQI:
12648 nargs = 2;
12649 klass = store;
12650 /* Reserve memory operand for target. */
12651 memory = ARRAY_SIZE (xops);
12652 break;
12653 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12654 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12655 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12656 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12657 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12658 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12659 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12660 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12661 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12662 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12663 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12664 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12665 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12666 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12667 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12668 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12669 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12670 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12671 switch (icode)
12673 /* These builtins and instructions require the memory
12674 to be properly aligned. */
12675 case CODE_FOR_avx512f_loadv16sf_mask:
12676 case CODE_FOR_avx512f_loadv16si_mask:
12677 case CODE_FOR_avx512f_loadv8df_mask:
12678 case CODE_FOR_avx512f_loadv8di_mask:
12679 case CODE_FOR_avx512vl_loadv8sf_mask:
12680 case CODE_FOR_avx512vl_loadv8si_mask:
12681 case CODE_FOR_avx512vl_loadv4df_mask:
12682 case CODE_FOR_avx512vl_loadv4di_mask:
12683 case CODE_FOR_avx512vl_loadv4sf_mask:
12684 case CODE_FOR_avx512vl_loadv4si_mask:
12685 case CODE_FOR_avx512vl_loadv2df_mask:
12686 case CODE_FOR_avx512vl_loadv2di_mask:
12687 case CODE_FOR_avx512bw_loadv64qi_mask:
12688 case CODE_FOR_avx512vl_loadv32qi_mask:
12689 case CODE_FOR_avx512vl_loadv16qi_mask:
12690 case CODE_FOR_avx512bw_loadv32hi_mask:
12691 case CODE_FOR_avx512vl_loadv16hi_mask:
12692 case CODE_FOR_avx512vl_loadv8hi_mask:
12693 aligned_mem = true;
12694 break;
12695 default:
12696 break;
12698 /* FALLTHRU */
12699 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12700 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12701 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12702 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12703 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12704 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12705 case V16SI_FTYPE_PCINT_V16SI_UHI:
12706 case V8SI_FTYPE_PCINT_V8SI_UQI:
12707 case V4SI_FTYPE_PCINT_V4SI_UQI:
12708 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12709 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12710 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12711 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12712 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12713 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12714 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12715 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12716 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12717 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12718 nargs = 3;
12719 klass = load;
12720 memory = 0;
12721 break;
12722 case INT_FTYPE_PINT_INT_INT_INT:
12723 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12724 nargs = 4;
12725 klass = load;
12726 memory = 0;
12727 constant = 3;
12728 break;
12729 default:
12730 gcc_unreachable ();
12733 gcc_assert (nargs <= ARRAY_SIZE (xops));
12735 if (klass == store)
12737 arg = CALL_EXPR_ARG (exp, 0);
12738 op = expand_normal (arg);
12739 gcc_assert (target == 0);
12740 if (memory)
12742 op = ix86_zero_extend_to_Pmode (op);
12743 target = gen_rtx_MEM (tmode, op);
12744 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12745 on it. Try to improve it using get_pointer_alignment,
12746 and if the special builtin is one that requires strict
12747 mode alignment, also from its GET_MODE_ALIGNMENT.
12748 Failure to do so could lead to ix86_legitimate_combined_insn
12749 rejecting all changes to such insns. */
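/* For instance, for a non-temporal store such as _mm_stream_ps
   (__builtin_ia32_movntps, assuming the usual builtin name, expanded via
   CODE_FOR_sse_movntv4sf above), aligned_mem is set and MEM_ALIGN of the
   V4SFmode destination is raised to GET_MODE_ALIGNMENT, i.e. 128 bits.  */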
12750 unsigned int align = get_pointer_alignment (arg);
12751 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12752 align = GET_MODE_ALIGNMENT (tmode);
12753 if (MEM_ALIGN (target) < align)
12754 set_mem_align (target, align);
12756 else
12757 target = force_reg (tmode, op);
12758 arg_adjust = 1;
12760 else
12762 arg_adjust = 0;
12763 if (optimize
12764 || target == 0
12765 || !register_operand (target, tmode)
12766 || GET_MODE (target) != tmode)
12767 target = gen_reg_rtx (tmode);
12770 for (i = 0; i < nargs; i++)
12772 machine_mode mode = insn_p->operand[i + 1].mode;
12774 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12775 op = expand_normal (arg);
12777 if (i == memory)
12779 /* This must be the memory operand. */
12780 op = ix86_zero_extend_to_Pmode (op);
12781 op = gen_rtx_MEM (mode, op);
12782 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12783 on it. Try to improve it using get_pointer_alignment,
12784 and if the special builtin is one that requires strict
12785 mode alignment, also from its GET_MODE_ALIGNMENT.
12786 Failure to do so could lead to ix86_legitimate_combined_insn
12787 rejecting all changes to such insns. */
12788 unsigned int align = get_pointer_alignment (arg);
12789 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12790 align = GET_MODE_ALIGNMENT (mode);
12791 if (MEM_ALIGN (op) < align)
12792 set_mem_align (op, align);
12794 else if (i == constant)
12796 /* This must be the constant. */
12797 if (!insn_p->operand[nargs].predicate (op, SImode))
12799 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12800 return const0_rtx;
12803 else
12805 /* This must be a register. */
12806 if (VECTOR_MODE_P (mode))
12807 op = safe_vector_operand (op, mode);
12809 op = fixup_modeless_constant (op, mode);
12811 /* NB: A 3-operand load implies it is a mask load or v{p}expand*,
12812 and that the mask operand should be at the end.
12813 Keep an all-ones mask, which will be simplified by the expander. */
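/* E.g. one of the V16SI_FTYPE_PCINT_V16SI_UHI masked-load builtins above,
   when called with an all-ones (__mmask16) -1 mask, keeps the constant -1
   here so the expander can drop the masking entirely.  */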
12814 if (nargs == 3 && i == 2 && klass == load
12815 && constm1_operand (op, mode)
12816 && insn_p->operand[i].predicate (op, mode))
12818 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12819 op = copy_to_mode_reg (mode, op);
12820 else
12822 op = copy_to_reg (op);
12823 op = lowpart_subreg (mode, op, GET_MODE (op));
12827 xops[i] = op;
12830 switch (nargs)
12832 case 0:
12833 pat = GEN_FCN (icode) (target);
12834 break;
12835 case 1:
12836 pat = GEN_FCN (icode) (target, xops[0]);
12837 break;
12838 case 2:
12839 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12840 break;
12841 case 3:
12842 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12843 break;
12844 case 4:
12845 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12846 break;
12847 default:
12848 gcc_unreachable ();
12851 if (! pat)
12852 return 0;
12854 emit_insn (pat);
12855 return klass == store ? 0 : target;
12858 /* Return the integer constant in ARG. Constrain it to be in the range
12859 of the subparts of VEC_TYPE; issue an error if not. */
12861 static int
12862 get_element_number (tree vec_type, tree arg)
12864 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12866 if (!tree_fits_uhwi_p (arg)
12867 || (elt = tree_to_uhwi (arg), elt > max))
12869 error ("selector must be an integer constant in the range "
12870 "[0, %wi]", max);
12871 return 0;
12874 return elt;
12877 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12878 ix86_expand_vector_init. We DO have language-level syntax for this, in
12879 the form of (type){ init-list }. Except that since we can't place emms
12880 instructions from inside the compiler, we can't allow the use of MMX
12881 registers unless the user explicitly asks for it. So we do *not* define
12882 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12883 we have builtins invoked by mmintrin.h that give us license to emit
12884 these sorts of instructions. */
12886 static rtx
12887 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12889 machine_mode tmode = TYPE_MODE (type);
12890 machine_mode inner_mode = GET_MODE_INNER (tmode);
12891 int i, n_elt = GET_MODE_NUNITS (tmode);
12892 rtvec v = rtvec_alloc (n_elt);
12894 gcc_assert (VECTOR_MODE_P (tmode));
12895 gcc_assert (call_expr_nargs (exp) == n_elt);
12897 for (i = 0; i < n_elt; ++i)
12899 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12900 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12903 if (!target || !register_operand (target, tmode))
12904 target = gen_reg_rtx (tmode);
12906 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12907 return target;
12910 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12911 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12912 had a language-level syntax for referencing vector elements. */
12914 static rtx
12915 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12917 machine_mode tmode, mode0;
12918 tree arg0, arg1;
12919 int elt;
12920 rtx op0;
12922 arg0 = CALL_EXPR_ARG (exp, 0);
12923 arg1 = CALL_EXPR_ARG (exp, 1);
12925 op0 = expand_normal (arg0);
12926 elt = get_element_number (TREE_TYPE (arg0), arg1);
12928 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12929 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12930 gcc_assert (VECTOR_MODE_P (mode0));
12932 op0 = force_reg (mode0, op0);
12934 if (optimize || !target || !register_operand (target, tmode))
12935 target = gen_reg_rtx (tmode);
12937 ix86_expand_vector_extract (true, target, op0, elt);
12939 return target;
12942 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12943 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12944 a language-level syntax for referencing vector elements. */
12946 static rtx
12947 ix86_expand_vec_set_builtin (tree exp)
12949 machine_mode tmode, mode1;
12950 tree arg0, arg1, arg2;
12951 int elt;
12952 rtx op0, op1, target;
12954 arg0 = CALL_EXPR_ARG (exp, 0);
12955 arg1 = CALL_EXPR_ARG (exp, 1);
12956 arg2 = CALL_EXPR_ARG (exp, 2);
12958 tmode = TYPE_MODE (TREE_TYPE (arg0));
12959 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12960 gcc_assert (VECTOR_MODE_P (tmode));
12962 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12963 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12964 elt = get_element_number (TREE_TYPE (arg0), arg2);
12966 if (GET_MODE (op1) != mode1)
12967 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12969 op0 = force_reg (tmode, op0);
12970 op1 = force_reg (mode1, op1);
12972 /* OP0 is the source of these builtin functions and shouldn't be
12973 modified. Create a copy, use it and return it as target. */
12974 target = gen_reg_rtx (tmode);
12975 emit_move_insn (target, op0);
12976 ix86_expand_vector_set (true, target, op1, elt);
12978 return target;
12981 /* Return true if the necessary isa options for this builtin exist,
12982 else false.
12983 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12984 bool
12985 ix86_check_builtin_isa_match (unsigned int fcode,
12986 HOST_WIDE_INT* pbisa,
12987 HOST_WIDE_INT* pbisa2)
12989 HOST_WIDE_INT isa = ix86_isa_flags;
12990 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12991 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12992 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12993 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12994 /* The general case is we require all the ISAs specified in bisa{,2}
12995 to be enabled.
12996 The exceptions are:
12997 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12998 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12999 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
13000 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
13001 OPTION_MASK_ISA2_AVXVNNI
13002 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
13003 OPTION_MASK_ISA2_AVXIFMA
13004 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
13005 OPTION_MASK_ISA2_AVXNECONVERT
13006 OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
13007 where for each such pair it is sufficient if either of the ISAs is
13008 enabled (and, if the pair is ORed with other options, those others too).
13009 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
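/* For example, an AVX-VNNI builtin carries both alternatives in its masks:
   bisa contains OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL and
   bisa2 contains OPTION_MASK_ISA2_AVXVNNI.  If either alternative is
   enabled, SHARE_BUILTIN below ORs both alternatives into the effective
   isa/isa2 sets so that the final subset check at the end succeeds.  */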
13011 #define SHARE_BUILTIN(A1, A2, B1, B2) \
13012 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
13013 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
13014 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
13015 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
13017 tmp_isa |= (A1) | (B1); \
13018 tmp_isa2 |= (A2) | (B2); \
13021 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
13022 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
13023 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
13024 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
13025 OPTION_MASK_ISA2_AVXVNNI);
13026 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
13027 OPTION_MASK_ISA2_AVXIFMA);
13028 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
13029 OPTION_MASK_ISA2_AVXNECONVERT);
13030 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
13031 OPTION_MASK_ISA2_VAES);
13032 isa = tmp_isa;
13033 isa2 = tmp_isa2;
13035 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
13036 /* __builtin_ia32_maskmovq requires MMX registers. */
13037 && fcode != IX86_BUILTIN_MASKMOVQ)
13039 bisa &= ~OPTION_MASK_ISA_MMX;
13040 bisa |= OPTION_MASK_ISA_SSE2;
13043 if (pbisa)
13044 *pbisa = bisa;
13045 if (pbisa2)
13046 *pbisa2 = bisa2;
13048 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
13051 /* Emit instructions to set the carry flag from ARG. */
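/* If ARG is not a compile-time constant, or is the constant zero, CF is
   computed by the QImode addition ARG + 0xff, which carries exactly when
   ARG is nonzero; any other (nonzero) constant simply emits STC.  */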
13053 void
13054 ix86_expand_carry (rtx arg)
13056 if (!CONST_INT_P (arg) || arg == const0_rtx)
13058 arg = convert_to_mode (QImode, arg, 1);
13059 arg = copy_to_mode_reg (QImode, arg);
13060 emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
13062 else
13063 emit_insn (gen_x86_stc ());
13066 /* Expand an expression EXP that calls a built-in function,
13067 with result going to TARGET if that's convenient
13068 (and in mode MODE if that's convenient).
13069 SUBTARGET may be used as the target for computing one of EXP's operands.
13070 IGNORE is nonzero if the value is to be ignored. */
13073 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
13074 machine_mode mode, int ignore)
13076 size_t i;
13077 enum insn_code icode, icode2;
13078 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13079 tree arg0, arg1, arg2, arg3, arg4;
13080 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
13081 machine_mode mode0, mode1, mode2, mode3, mode4;
13082 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
13083 HOST_WIDE_INT bisa, bisa2;
13085 /* For CPU builtins that can be folded, fold first and expand the fold. */
13086 switch (fcode)
13088 case IX86_BUILTIN_CPU_INIT:
13090 /* Make it call __cpu_indicator_init in libgcc. */
13091 tree call_expr, fndecl, type;
13092 type = build_function_type_list (integer_type_node, NULL_TREE);
13093 fndecl = build_fn_decl ("__cpu_indicator_init", type);
13094 call_expr = build_call_expr (fndecl, 0);
13095 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
13097 case IX86_BUILTIN_CPU_IS:
13098 case IX86_BUILTIN_CPU_SUPPORTS:
13100 tree arg0 = CALL_EXPR_ARG (exp, 0);
13101 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
13102 gcc_assert (fold_expr != NULL_TREE);
13103 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
13107 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
13109 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
13110 if (TARGET_ABI_X32)
13111 bisa |= OPTION_MASK_ABI_X32;
13112 else
13113 bisa |= OPTION_MASK_ABI_64;
13114 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
13115 (enum fpmath_unit) 0,
13116 (enum prefer_vector_width) 0,
13117 PVW_NONE, PVW_NONE,
13118 false, add_abi_p);
13119 if (!opts)
13120 error ("%qE needs unknown isa option", fndecl);
13121 else
13123 gcc_assert (opts != NULL);
13124 error ("%qE needs isa option %s", fndecl, opts);
13125 free (opts);
13127 return expand_call (exp, target, ignore);
13130 switch (fcode)
13132 case IX86_BUILTIN_MASKMOVQ:
13133 case IX86_BUILTIN_MASKMOVDQU:
13134 icode = (fcode == IX86_BUILTIN_MASKMOVQ
13135 ? CODE_FOR_mmx_maskmovq
13136 : CODE_FOR_sse2_maskmovdqu);
13137 /* Note the arg order is different from the operand order. */
13138 arg1 = CALL_EXPR_ARG (exp, 0);
13139 arg2 = CALL_EXPR_ARG (exp, 1);
13140 arg0 = CALL_EXPR_ARG (exp, 2);
13141 op0 = expand_normal (arg0);
13142 op1 = expand_normal (arg1);
13143 op2 = expand_normal (arg2);
13144 mode0 = insn_data[icode].operand[0].mode;
13145 mode1 = insn_data[icode].operand[1].mode;
13146 mode2 = insn_data[icode].operand[2].mode;
13148 op0 = ix86_zero_extend_to_Pmode (op0);
13149 op0 = gen_rtx_MEM (mode1, op0);
13151 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13152 op0 = copy_to_mode_reg (mode0, op0);
13153 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13154 op1 = copy_to_mode_reg (mode1, op1);
13155 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13156 op2 = copy_to_mode_reg (mode2, op2);
13157 pat = GEN_FCN (icode) (op0, op1, op2);
13158 if (! pat)
13159 return 0;
13160 emit_insn (pat);
13161 return 0;
13163 case IX86_BUILTIN_LDMXCSR:
13164 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
13165 target = assign_386_stack_local (SImode, SLOT_TEMP);
13166 emit_move_insn (target, op0);
13167 emit_insn (gen_sse_ldmxcsr (target));
13168 return 0;
13170 case IX86_BUILTIN_STMXCSR:
13171 target = assign_386_stack_local (SImode, SLOT_TEMP);
13172 emit_insn (gen_sse_stmxcsr (target));
13173 return copy_to_mode_reg (SImode, target);
13175 case IX86_BUILTIN_CLFLUSH:
13176 arg0 = CALL_EXPR_ARG (exp, 0);
13177 op0 = expand_normal (arg0);
13178 icode = CODE_FOR_sse2_clflush;
13179 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13180 op0 = ix86_zero_extend_to_Pmode (op0);
13182 emit_insn (gen_sse2_clflush (op0));
13183 return 0;
13185 case IX86_BUILTIN_CLWB:
13186 arg0 = CALL_EXPR_ARG (exp, 0);
13187 op0 = expand_normal (arg0);
13188 icode = CODE_FOR_clwb;
13189 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13190 op0 = ix86_zero_extend_to_Pmode (op0);
13192 emit_insn (gen_clwb (op0));
13193 return 0;
13195 case IX86_BUILTIN_CLFLUSHOPT:
13196 arg0 = CALL_EXPR_ARG (exp, 0);
13197 op0 = expand_normal (arg0);
13198 icode = CODE_FOR_clflushopt;
13199 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13200 op0 = ix86_zero_extend_to_Pmode (op0);
13202 emit_insn (gen_clflushopt (op0));
13203 return 0;
13205 case IX86_BUILTIN_MONITOR:
13206 case IX86_BUILTIN_MONITORX:
13207 arg0 = CALL_EXPR_ARG (exp, 0);
13208 arg1 = CALL_EXPR_ARG (exp, 1);
13209 arg2 = CALL_EXPR_ARG (exp, 2);
13210 op0 = expand_normal (arg0);
13211 op1 = expand_normal (arg1);
13212 op2 = expand_normal (arg2);
13213 if (!REG_P (op0))
13214 op0 = ix86_zero_extend_to_Pmode (op0);
13215 if (!REG_P (op1))
13216 op1 = copy_to_mode_reg (SImode, op1);
13217 if (!REG_P (op2))
13218 op2 = copy_to_mode_reg (SImode, op2);
13220 emit_insn (fcode == IX86_BUILTIN_MONITOR
13221 ? gen_sse3_monitor (Pmode, op0, op1, op2)
13222 : gen_monitorx (Pmode, op0, op1, op2));
13223 return 0;
13225 case IX86_BUILTIN_MWAIT:
13226 arg0 = CALL_EXPR_ARG (exp, 0);
13227 arg1 = CALL_EXPR_ARG (exp, 1);
13228 op0 = expand_normal (arg0);
13229 op1 = expand_normal (arg1);
13230 if (!REG_P (op0))
13231 op0 = copy_to_mode_reg (SImode, op0);
13232 if (!REG_P (op1))
13233 op1 = copy_to_mode_reg (SImode, op1);
13234 emit_insn (gen_sse3_mwait (op0, op1));
13235 return 0;
13237 case IX86_BUILTIN_MWAITX:
13238 arg0 = CALL_EXPR_ARG (exp, 0);
13239 arg1 = CALL_EXPR_ARG (exp, 1);
13240 arg2 = CALL_EXPR_ARG (exp, 2);
13241 op0 = expand_normal (arg0);
13242 op1 = expand_normal (arg1);
13243 op2 = expand_normal (arg2);
13244 if (!REG_P (op0))
13245 op0 = copy_to_mode_reg (SImode, op0);
13246 if (!REG_P (op1))
13247 op1 = copy_to_mode_reg (SImode, op1);
13248 if (!REG_P (op2))
13249 op2 = copy_to_mode_reg (SImode, op2);
13250 emit_insn (gen_mwaitx (op0, op1, op2));
13251 return 0;
13253 case IX86_BUILTIN_UMONITOR:
13254 arg0 = CALL_EXPR_ARG (exp, 0);
13255 op0 = expand_normal (arg0);
13257 op0 = ix86_zero_extend_to_Pmode (op0);
13258 emit_insn (gen_umonitor (Pmode, op0));
13259 return 0;
13261 case IX86_BUILTIN_UMWAIT:
13262 case IX86_BUILTIN_TPAUSE:
13263 arg0 = CALL_EXPR_ARG (exp, 0);
13264 arg1 = CALL_EXPR_ARG (exp, 1);
13265 op0 = expand_normal (arg0);
13266 op1 = expand_normal (arg1);
13268 if (!REG_P (op0))
13269 op0 = copy_to_mode_reg (SImode, op0);
13271 op1 = force_reg (DImode, op1);
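/* Assuming the usual waitpkg wrappers (_umwait/_tpause), op1 holds the
   64-bit TSC deadline; on 64-bit targets it is split below into the two
   SImode halves that the instruction expects in EDX:EAX.  */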
13273 if (TARGET_64BIT)
13275 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13276 NULL, 1, OPTAB_DIRECT);
13277 switch (fcode)
13279 case IX86_BUILTIN_UMWAIT:
13280 icode = CODE_FOR_umwait_rex64;
13281 break;
13282 case IX86_BUILTIN_TPAUSE:
13283 icode = CODE_FOR_tpause_rex64;
13284 break;
13285 default:
13286 gcc_unreachable ();
13289 op2 = gen_lowpart (SImode, op2);
13290 op1 = gen_lowpart (SImode, op1);
13291 pat = GEN_FCN (icode) (op0, op1, op2);
13293 else
13295 switch (fcode)
13297 case IX86_BUILTIN_UMWAIT:
13298 icode = CODE_FOR_umwait;
13299 break;
13300 case IX86_BUILTIN_TPAUSE:
13301 icode = CODE_FOR_tpause;
13302 break;
13303 default:
13304 gcc_unreachable ();
13306 pat = GEN_FCN (icode) (op0, op1);
13309 if (!pat)
13310 return 0;
13312 emit_insn (pat);
13314 if (target == 0
13315 || !register_operand (target, QImode))
13316 target = gen_reg_rtx (QImode);
13318 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13319 const0_rtx);
13320 emit_insn (gen_rtx_SET (target, pat));
13322 return target;
13324 case IX86_BUILTIN_TESTUI:
13325 emit_insn (gen_testui ());
13327 if (target == 0
13328 || !register_operand (target, QImode))
13329 target = gen_reg_rtx (QImode);
13331 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13332 const0_rtx);
13333 emit_insn (gen_rtx_SET (target, pat));
13335 return target;
13337 case IX86_BUILTIN_CLZERO:
13338 arg0 = CALL_EXPR_ARG (exp, 0);
13339 op0 = expand_normal (arg0);
13340 if (!REG_P (op0))
13341 op0 = ix86_zero_extend_to_Pmode (op0);
13342 emit_insn (gen_clzero (Pmode, op0));
13343 return 0;
13345 case IX86_BUILTIN_CLDEMOTE:
13346 arg0 = CALL_EXPR_ARG (exp, 0);
13347 op0 = expand_normal (arg0);
13348 icode = CODE_FOR_cldemote;
13349 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13350 op0 = ix86_zero_extend_to_Pmode (op0);
13352 emit_insn (gen_cldemote (op0));
13353 return 0;
13355 case IX86_BUILTIN_LOADIWKEY:
13357 arg0 = CALL_EXPR_ARG (exp, 0);
13358 arg1 = CALL_EXPR_ARG (exp, 1);
13359 arg2 = CALL_EXPR_ARG (exp, 2);
13360 arg3 = CALL_EXPR_ARG (exp, 3);
13362 op0 = expand_normal (arg0);
13363 op1 = expand_normal (arg1);
13364 op2 = expand_normal (arg2);
13365 op3 = expand_normal (arg3);
13367 if (!REG_P (op0))
13368 op0 = copy_to_mode_reg (V2DImode, op0);
13369 if (!REG_P (op1))
13370 op1 = copy_to_mode_reg (V2DImode, op1);
13371 if (!REG_P (op2))
13372 op2 = copy_to_mode_reg (V2DImode, op2);
13373 if (!REG_P (op3))
13374 op3 = copy_to_mode_reg (SImode, op3);
13376 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
13378 return 0;
13381 case IX86_BUILTIN_AESDEC128KLU8:
13382 icode = CODE_FOR_aesdec128klu8;
13383 goto aesdecenc_expand;
13385 case IX86_BUILTIN_AESDEC256KLU8:
13386 icode = CODE_FOR_aesdec256klu8;
13387 goto aesdecenc_expand;
13389 case IX86_BUILTIN_AESENC128KLU8:
13390 icode = CODE_FOR_aesenc128klu8;
13391 goto aesdecenc_expand;
13393 case IX86_BUILTIN_AESENC256KLU8:
13394 icode = CODE_FOR_aesenc256klu8;
13396 aesdecenc_expand:
13398 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
13399 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
13400 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13402 op0 = expand_normal (arg0);
13403 op1 = expand_normal (arg1);
13404 op2 = expand_normal (arg2);
13406 if (!address_operand (op0, V2DImode))
13408 op0 = convert_memory_address (Pmode, op0);
13409 op0 = copy_addr_to_reg (op0);
13411 op0 = gen_rtx_MEM (V2DImode, op0);
13413 if (!REG_P (op1))
13414 op1 = copy_to_mode_reg (V2DImode, op1);
13416 if (!address_operand (op2, VOIDmode))
13418 op2 = convert_memory_address (Pmode, op2);
13419 op2 = copy_addr_to_reg (op2);
13421 op2 = gen_rtx_MEM (BLKmode, op2);
13423 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13425 if (target == 0)
13426 target = gen_reg_rtx (QImode);
13428 /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
13429 error occurs, and the output should then be cleared for safety. */
13430 rtx_code_label *ok_label;
13431 rtx tmp;
13433 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13434 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13435 ok_label = gen_label_rtx ();
13436 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13437 true, ok_label);
13438 /* The runtime error seldom occurs, so predict the OK path as
13439 hot so it becomes the fallthrough block. */
13440 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13442 emit_insn (gen_rtx_SET (op1, const0_rtx));
13444 emit_label (ok_label);
13445 emit_insn (gen_rtx_SET (target, pat));
13446 emit_insn (gen_rtx_SET (op0, op1));
13448 return target;
13450 case IX86_BUILTIN_AESDECWIDE128KLU8:
13451 icode = CODE_FOR_aesdecwide128klu8;
13452 goto wideaesdecenc_expand;
13454 case IX86_BUILTIN_AESDECWIDE256KLU8:
13455 icode = CODE_FOR_aesdecwide256klu8;
13456 goto wideaesdecenc_expand;
13458 case IX86_BUILTIN_AESENCWIDE128KLU8:
13459 icode = CODE_FOR_aesencwide128klu8;
13460 goto wideaesdecenc_expand;
13462 case IX86_BUILTIN_AESENCWIDE256KLU8:
13463 icode = CODE_FOR_aesencwide256klu8;
13465 wideaesdecenc_expand:
13467 rtx xmm_regs[8];
13468 rtx op;
13470 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13471 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13472 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13474 op0 = expand_normal (arg0);
13475 op1 = expand_normal (arg1);
13476 op2 = expand_normal (arg2);
13478 if (!address_operand (op2, VOIDmode))
13480 op2 = convert_memory_address (Pmode, op2);
13481 op2 = copy_addr_to_reg (op2);
13483 op2 = gen_rtx_MEM (BLKmode, op2);
13485 for (i = 0; i < 8; i++)
13487 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13489 op = gen_rtx_MEM (V2DImode,
13490 plus_constant (Pmode, op1, (i * 16)));
13492 emit_move_insn (xmm_regs[i], op);
13495 emit_insn (GEN_FCN (icode) (op2));
13497 if (target == 0)
13498 target = gen_reg_rtx (QImode);
13500 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13501 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13502 ok_label = gen_label_rtx ();
13503 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13504 true, ok_label);
13505 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13507 for (i = 0; i < 8; i++)
13508 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13510 emit_label (ok_label);
13511 emit_insn (gen_rtx_SET (target, pat));
13513 for (i = 0; i < 8; i++)
13515 op = gen_rtx_MEM (V2DImode,
13516 plus_constant (Pmode, op0, (i * 16)));
13517 emit_move_insn (op, xmm_regs[i]);
13520 return target;
13522 case IX86_BUILTIN_ENCODEKEY128U32:
13524 rtx op, xmm_regs[7];
13526 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13527 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13528 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13530 op0 = expand_normal (arg0);
13531 op1 = expand_normal (arg1);
13532 op2 = expand_normal (arg2);
13534 if (!REG_P (op0))
13535 op0 = copy_to_mode_reg (SImode, op0);
13537 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13538 emit_move_insn (op, op1);
13540 for (i = 0; i < 3; i++)
13541 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13543 if (target == 0)
13544 target = gen_reg_rtx (SImode);
13546 emit_insn (gen_encodekey128u32 (target, op0));
13548 for (i = 0; i < 3; i++)
13550 op = gen_rtx_MEM (V2DImode,
13551 plus_constant (Pmode, op2, (i * 16)));
13552 emit_move_insn (op, xmm_regs[i]);
13555 return target;
13557 case IX86_BUILTIN_ENCODEKEY256U32:
13559 rtx op, xmm_regs[7];
13561 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13562 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13563 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13564 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13566 op0 = expand_normal (arg0);
13567 op1 = expand_normal (arg1);
13568 op2 = expand_normal (arg2);
13569 op3 = expand_normal (arg3);
13571 if (!REG_P (op0))
13572 op0 = copy_to_mode_reg (SImode, op0);
13574 /* Force keylow and keyhi into xmm0 and xmm1.  */
13575 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13576 emit_move_insn (op, op1);
13577 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13578 emit_move_insn (op, op2);
13580 for (i = 0; i < 4; i++)
13581 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13583 if (target == 0)
13584 target = gen_reg_rtx (SImode);
13586 emit_insn (gen_encodekey256u32 (target, op0));
13588 for (i = 0; i < 4; i++)
13590 op = gen_rtx_MEM (V2DImode,
13591 plus_constant (Pmode, op3, (i * 16)));
13592 emit_move_insn (op, xmm_regs[i]);
13595 return target;
13598 case IX86_BUILTIN_PREFETCH:
13600 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13601 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13602 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13603 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13605 op0 = expand_normal (arg0);
13606 op1 = expand_normal (arg1);
13607 op2 = expand_normal (arg2);
13608 op3 = expand_normal (arg3);
13610 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13612 error ("second, third and fourth argument must be a const");
13613 return const0_rtx;
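/* A fourth argument of 1 requests an instruction prefetch (expanded via
gen_prefetchi); any other value falls through to the ordinary data
prefetch expansion below.  */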
13616 if (INTVAL (op3) == 1)
13618 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13620 error ("invalid third argument");
13621 return const0_rtx;
13624 if (TARGET_64BIT && TARGET_PREFETCHI
13625 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13626 emit_insn (gen_prefetchi (op0, op2));
13627 else
13629 warning (0, "instruction prefetch applies when in 64-bit mode"
13630 " with RIP-relative addressing and"
13631 " option %<-mprefetchi%>;"
13632 " they stay NOPs otherwise");
13633 emit_insn (gen_nop ());
13636 else
13638 if (!address_operand (op0, VOIDmode))
13640 op0 = convert_memory_address (Pmode, op0);
13641 op0 = copy_addr_to_reg (op0);
13644 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13646 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13647 op2 = const0_rtx;
13650 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13651 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13652 emit_insn (gen_prefetch (op0, op1, op2));
13653 else if (!MEM_P (op0) && side_effects_p (op0))
13654 /* Don't do anything with direct references to volatile memory,
13655 but generate code to handle other side effects. */
13656 emit_insn (op0);
13659 return 0;
13662 case IX86_BUILTIN_PREFETCHI:
13664 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13665 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13667 op0 = expand_normal (arg0);
13668 op1 = expand_normal (arg1);
13670 if (!CONST_INT_P (op1))
13672 error ("second argument must be a const");
13673 return const0_rtx;
13676 /* GOT/PLT_PIC should not be available for instruction prefetch.
13677 It must be a real instruction address.  */
13678 if (TARGET_64BIT
13679 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13680 emit_insn (gen_prefetchi (op0, op1));
13681 else
13683 /* Ignore the hint. */
13684 warning (0, "instruction prefetch applies when in 64-bit mode"
13685 " with RIP-relative addressing and"
13686 " option %<-mprefetchi%>;"
13687 " they stay NOPs otherwise");
13688 emit_insn (gen_nop ());
13691 return 0;
13694 case IX86_BUILTIN_URDMSR:
13695 case IX86_BUILTIN_UWRMSR:
13697 arg0 = CALL_EXPR_ARG (exp, 0);
13698 op0 = expand_normal (arg0);
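/* The MSR index may stay an immediate only when it fits in 32 bits;
otherwise it is forced into a register, as is any non-constant index.  */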
13700 if (CONST_INT_P (op0))
13702 unsigned HOST_WIDE_INT val = UINTVAL (op0);
13703 if (val > 0xffffffff)
13704 op0 = force_reg (DImode, op0);
13706 else
13707 op0 = force_reg (DImode, op0);
13709 if (fcode == IX86_BUILTIN_UWRMSR)
13711 arg1 = CALL_EXPR_ARG (exp, 1);
13712 op1 = expand_normal (arg1);
13713 op1 = force_reg (DImode, op1);
13714 icode = CODE_FOR_uwrmsr;
13715 target = 0;
13717 else
13719 if (target == 0)
13720 target = gen_reg_rtx (DImode);
13721 icode = CODE_FOR_urdmsr;
13722 op1 = op0;
13723 op0 = target;
13725 emit_insn (GEN_FCN (icode) (op0, op1));
13726 return target;
13729 case IX86_BUILTIN_VEC_INIT_V2SI:
13730 case IX86_BUILTIN_VEC_INIT_V4HI:
13731 case IX86_BUILTIN_VEC_INIT_V8QI:
13732 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13734 case IX86_BUILTIN_VEC_EXT_V2DF:
13735 case IX86_BUILTIN_VEC_EXT_V2DI:
13736 case IX86_BUILTIN_VEC_EXT_V4SF:
13737 case IX86_BUILTIN_VEC_EXT_V4SI:
13738 case IX86_BUILTIN_VEC_EXT_V8HI:
13739 case IX86_BUILTIN_VEC_EXT_V2SI:
13740 case IX86_BUILTIN_VEC_EXT_V4HI:
13741 case IX86_BUILTIN_VEC_EXT_V16QI:
13742 return ix86_expand_vec_ext_builtin (exp, target);
13744 case IX86_BUILTIN_VEC_SET_V2DI:
13745 case IX86_BUILTIN_VEC_SET_V4SF:
13746 case IX86_BUILTIN_VEC_SET_V4SI:
13747 case IX86_BUILTIN_VEC_SET_V8HI:
13748 case IX86_BUILTIN_VEC_SET_V4HI:
13749 case IX86_BUILTIN_VEC_SET_V16QI:
13750 return ix86_expand_vec_set_builtin (exp);
13752 case IX86_BUILTIN_NANQ:
13753 case IX86_BUILTIN_NANSQ:
13754 return expand_call (exp, target, ignore);
13756 case IX86_BUILTIN_RDPID:
13758 op0 = gen_reg_rtx (word_mode);
13760 if (TARGET_64BIT)
13762 insn = gen_rdpid_rex64 (op0);
13763 op0 = convert_to_mode (SImode, op0, 1);
13765 else
13766 insn = gen_rdpid (op0);
13768 emit_insn (insn);
13770 if (target == 0
13771 || !register_operand (target, SImode))
13772 target = gen_reg_rtx (SImode);
13774 emit_move_insn (target, op0);
13775 return target;
13777 case IX86_BUILTIN_2INTERSECTD512:
13778 case IX86_BUILTIN_2INTERSECTQ512:
13779 case IX86_BUILTIN_2INTERSECTD256:
13780 case IX86_BUILTIN_2INTERSECTQ256:
13781 case IX86_BUILTIN_2INTERSECTD128:
13782 case IX86_BUILTIN_2INTERSECTQ128:
13783 arg0 = CALL_EXPR_ARG (exp, 0);
13784 arg1 = CALL_EXPR_ARG (exp, 1);
13785 arg2 = CALL_EXPR_ARG (exp, 2);
13786 arg3 = CALL_EXPR_ARG (exp, 3);
13787 op0 = expand_normal (arg0);
13788 op1 = expand_normal (arg1);
13789 op2 = expand_normal (arg2);
13790 op3 = expand_normal (arg3);
13792 if (!address_operand (op0, VOIDmode))
13794 op0 = convert_memory_address (Pmode, op0);
13795 op0 = copy_addr_to_reg (op0);
13797 if (!address_operand (op1, VOIDmode))
13799 op1 = convert_memory_address (Pmode, op1);
13800 op1 = copy_addr_to_reg (op1);
13803 switch (fcode)
13805 case IX86_BUILTIN_2INTERSECTD512:
13806 mode4 = P2HImode;
13807 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13808 break;
13809 case IX86_BUILTIN_2INTERSECTQ512:
13810 mode4 = P2QImode;
13811 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13812 break;
13813 case IX86_BUILTIN_2INTERSECTD256:
13814 mode4 = P2QImode;
13815 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13816 break;
13817 case IX86_BUILTIN_2INTERSECTQ256:
13818 mode4 = P2QImode;
13819 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13820 break;
13821 case IX86_BUILTIN_2INTERSECTD128:
13822 mode4 = P2QImode;
13823 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13824 break;
13825 case IX86_BUILTIN_2INTERSECTQ128:
13826 mode4 = P2QImode;
13827 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13828 break;
13829 default:
13830 gcc_unreachable ();
13833 mode2 = insn_data[icode].operand[1].mode;
13834 mode3 = insn_data[icode].operand[2].mode;
13835 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13836 op2 = copy_to_mode_reg (mode2, op2);
13837 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13838 op3 = copy_to_mode_reg (mode3, op3);
13840 op4 = gen_reg_rtx (mode4);
13841 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13842 mode0 = mode4 == P2HImode ? HImode : QImode;
13843 emit_move_insn (gen_rtx_MEM (mode0, op0),
13844 gen_lowpart (mode0, op4));
13845 emit_move_insn (gen_rtx_MEM (mode0, op1),
13846 gen_highpart (mode0, op4));
13848 return 0;
13850 case IX86_BUILTIN_RDPMC:
13851 case IX86_BUILTIN_RDTSC:
13852 case IX86_BUILTIN_RDTSCP:
13853 case IX86_BUILTIN_XGETBV:
13855 op0 = gen_reg_rtx (DImode);
13856 op1 = gen_reg_rtx (DImode);
13858 if (fcode == IX86_BUILTIN_RDPMC)
13860 arg0 = CALL_EXPR_ARG (exp, 0);
13861 op2 = expand_normal (arg0);
13862 if (!register_operand (op2, SImode))
13863 op2 = copy_to_mode_reg (SImode, op2);
13865 insn = (TARGET_64BIT
13866 ? gen_rdpmc_rex64 (op0, op1, op2)
13867 : gen_rdpmc (op0, op2));
13868 emit_insn (insn);
13870 else if (fcode == IX86_BUILTIN_XGETBV)
13872 arg0 = CALL_EXPR_ARG (exp, 0);
13873 op2 = expand_normal (arg0);
13874 if (!register_operand (op2, SImode))
13875 op2 = copy_to_mode_reg (SImode, op2);
13877 insn = (TARGET_64BIT
13878 ? gen_xgetbv_rex64 (op0, op1, op2)
13879 : gen_xgetbv (op0, op2));
13880 emit_insn (insn);
13882 else if (fcode == IX86_BUILTIN_RDTSC)
13884 insn = (TARGET_64BIT
13885 ? gen_rdtsc_rex64 (op0, op1)
13886 : gen_rdtsc (op0));
13887 emit_insn (insn);
13889 else
13891 op2 = gen_reg_rtx (SImode);
13893 insn = (TARGET_64BIT
13894 ? gen_rdtscp_rex64 (op0, op1, op2)
13895 : gen_rdtscp (op0, op2));
13896 emit_insn (insn);
13898 arg0 = CALL_EXPR_ARG (exp, 0);
13899 op4 = expand_normal (arg0);
13900 if (!address_operand (op4, VOIDmode))
13902 op4 = convert_memory_address (Pmode, op4);
13903 op4 = copy_addr_to_reg (op4);
13905 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13908 if (target == 0
13909 || !register_operand (target, DImode))
13910 target = gen_reg_rtx (DImode);
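/* In 64-bit mode the low and high 32 bits of the result come back in two
separate registers; shift the high half up and OR the pieces into a
single DImode value.  */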
13912 if (TARGET_64BIT)
13914 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13915 op1, 1, OPTAB_DIRECT);
13916 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13917 op0, 1, OPTAB_DIRECT);
13920 emit_move_insn (target, op0);
13921 return target;
13923 case IX86_BUILTIN_ENQCMD:
13924 case IX86_BUILTIN_ENQCMDS:
13925 case IX86_BUILTIN_MOVDIR64B:
13927 arg0 = CALL_EXPR_ARG (exp, 0);
13928 arg1 = CALL_EXPR_ARG (exp, 1);
13929 op0 = expand_normal (arg0);
13930 op1 = expand_normal (arg1);
13932 op0 = ix86_zero_extend_to_Pmode (op0);
13933 if (!address_operand (op1, VOIDmode))
13935 op1 = convert_memory_address (Pmode, op1);
13936 op1 = copy_addr_to_reg (op1);
13938 op1 = gen_rtx_MEM (XImode, op1);
13940 if (fcode == IX86_BUILTIN_MOVDIR64B)
13942 emit_insn (gen_movdir64b (Pmode, op0, op1));
13943 return 0;
13945 else
13947 if (target == 0
13948 || !register_operand (target, SImode))
13949 target = gen_reg_rtx (SImode);
13951 emit_move_insn (target, const0_rtx);
13952 target = gen_rtx_SUBREG (QImode, target, 0);
13954 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13955 ? UNSPECV_ENQCMD
13956 : UNSPECV_ENQCMDS);
13957 icode = code_for_enqcmd (unspecv, Pmode);
13958 emit_insn (GEN_FCN (icode) (op0, op1));
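/* enqcmd/enqcmds report their status in ZF; the strict_low_part set
below copies that flag into the low byte of the SImode result.  */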
13960 emit_insn
13961 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13962 gen_rtx_fmt_ee (EQ, QImode,
13963 gen_rtx_REG (CCZmode, FLAGS_REG),
13964 const0_rtx)));
13965 return SUBREG_REG (target);
13968 case IX86_BUILTIN_FXSAVE:
13969 case IX86_BUILTIN_FXRSTOR:
13970 case IX86_BUILTIN_FXSAVE64:
13971 case IX86_BUILTIN_FXRSTOR64:
13972 case IX86_BUILTIN_FNSTENV:
13973 case IX86_BUILTIN_FLDENV:
13974 mode0 = BLKmode;
13975 switch (fcode)
13977 case IX86_BUILTIN_FXSAVE:
13978 icode = CODE_FOR_fxsave;
13979 break;
13980 case IX86_BUILTIN_FXRSTOR:
13981 icode = CODE_FOR_fxrstor;
13982 break;
13983 case IX86_BUILTIN_FXSAVE64:
13984 icode = CODE_FOR_fxsave64;
13985 break;
13986 case IX86_BUILTIN_FXRSTOR64:
13987 icode = CODE_FOR_fxrstor64;
13988 break;
13989 case IX86_BUILTIN_FNSTENV:
13990 icode = CODE_FOR_fnstenv;
13991 break;
13992 case IX86_BUILTIN_FLDENV:
13993 icode = CODE_FOR_fldenv;
13994 break;
13995 default:
13996 gcc_unreachable ();
13999 arg0 = CALL_EXPR_ARG (exp, 0);
14000 op0 = expand_normal (arg0);
14002 if (!address_operand (op0, VOIDmode))
14004 op0 = convert_memory_address (Pmode, op0);
14005 op0 = copy_addr_to_reg (op0);
14007 op0 = gen_rtx_MEM (mode0, op0);
14009 pat = GEN_FCN (icode) (op0);
14010 if (pat)
14011 emit_insn (pat);
14012 return 0;
14014 case IX86_BUILTIN_XSETBV:
14015 arg0 = CALL_EXPR_ARG (exp, 0);
14016 arg1 = CALL_EXPR_ARG (exp, 1);
14017 op0 = expand_normal (arg0);
14018 op1 = expand_normal (arg1);
14020 if (!REG_P (op0))
14021 op0 = copy_to_mode_reg (SImode, op0);
14023 op1 = force_reg (DImode, op1);
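/* xsetbv consumes its 64-bit value as an EDX:EAX pair, so in 64-bit mode
the DImode operand is split into two SImode halves explicitly.  */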
14025 if (TARGET_64BIT)
14027 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
14028 NULL, 1, OPTAB_DIRECT);
14030 icode = CODE_FOR_xsetbv_rex64;
14032 op2 = gen_lowpart (SImode, op2);
14033 op1 = gen_lowpart (SImode, op1);
14034 pat = GEN_FCN (icode) (op0, op1, op2);
14036 else
14038 icode = CODE_FOR_xsetbv;
14040 pat = GEN_FCN (icode) (op0, op1);
14042 if (pat)
14043 emit_insn (pat);
14044 return 0;
14046 case IX86_BUILTIN_XSAVE:
14047 case IX86_BUILTIN_XRSTOR:
14048 case IX86_BUILTIN_XSAVE64:
14049 case IX86_BUILTIN_XRSTOR64:
14050 case IX86_BUILTIN_XSAVEOPT:
14051 case IX86_BUILTIN_XSAVEOPT64:
14052 case IX86_BUILTIN_XSAVES:
14053 case IX86_BUILTIN_XRSTORS:
14054 case IX86_BUILTIN_XSAVES64:
14055 case IX86_BUILTIN_XRSTORS64:
14056 case IX86_BUILTIN_XSAVEC:
14057 case IX86_BUILTIN_XSAVEC64:
14058 arg0 = CALL_EXPR_ARG (exp, 0);
14059 arg1 = CALL_EXPR_ARG (exp, 1);
14060 op0 = expand_normal (arg0);
14061 op1 = expand_normal (arg1);
14063 if (!address_operand (op0, VOIDmode))
14065 op0 = convert_memory_address (Pmode, op0);
14066 op0 = copy_addr_to_reg (op0);
14068 op0 = gen_rtx_MEM (BLKmode, op0);
14070 op1 = force_reg (DImode, op1);
14072 if (TARGET_64BIT)
14074 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
14075 NULL, 1, OPTAB_DIRECT);
14076 switch (fcode)
14078 case IX86_BUILTIN_XSAVE:
14079 icode = CODE_FOR_xsave_rex64;
14080 break;
14081 case IX86_BUILTIN_XRSTOR:
14082 icode = CODE_FOR_xrstor_rex64;
14083 break;
14084 case IX86_BUILTIN_XSAVE64:
14085 icode = CODE_FOR_xsave64;
14086 break;
14087 case IX86_BUILTIN_XRSTOR64:
14088 icode = CODE_FOR_xrstor64;
14089 break;
14090 case IX86_BUILTIN_XSAVEOPT:
14091 icode = CODE_FOR_xsaveopt_rex64;
14092 break;
14093 case IX86_BUILTIN_XSAVEOPT64:
14094 icode = CODE_FOR_xsaveopt64;
14095 break;
14096 case IX86_BUILTIN_XSAVES:
14097 icode = CODE_FOR_xsaves_rex64;
14098 break;
14099 case IX86_BUILTIN_XRSTORS:
14100 icode = CODE_FOR_xrstors_rex64;
14101 break;
14102 case IX86_BUILTIN_XSAVES64:
14103 icode = CODE_FOR_xsaves64;
14104 break;
14105 case IX86_BUILTIN_XRSTORS64:
14106 icode = CODE_FOR_xrstors64;
14107 break;
14108 case IX86_BUILTIN_XSAVEC:
14109 icode = CODE_FOR_xsavec_rex64;
14110 break;
14111 case IX86_BUILTIN_XSAVEC64:
14112 icode = CODE_FOR_xsavec64;
14113 break;
14114 default:
14115 gcc_unreachable ();
14118 op2 = gen_lowpart (SImode, op2);
14119 op1 = gen_lowpart (SImode, op1);
14120 pat = GEN_FCN (icode) (op0, op1, op2);
14122 else
14124 switch (fcode)
14126 case IX86_BUILTIN_XSAVE:
14127 icode = CODE_FOR_xsave;
14128 break;
14129 case IX86_BUILTIN_XRSTOR:
14130 icode = CODE_FOR_xrstor;
14131 break;
14132 case IX86_BUILTIN_XSAVEOPT:
14133 icode = CODE_FOR_xsaveopt;
14134 break;
14135 case IX86_BUILTIN_XSAVES:
14136 icode = CODE_FOR_xsaves;
14137 break;
14138 case IX86_BUILTIN_XRSTORS:
14139 icode = CODE_FOR_xrstors;
14140 break;
14141 case IX86_BUILTIN_XSAVEC:
14142 icode = CODE_FOR_xsavec;
14143 break;
14144 default:
14145 gcc_unreachable ();
14147 pat = GEN_FCN (icode) (op0, op1);
14150 if (pat)
14151 emit_insn (pat);
14152 return 0;
14154 case IX86_BUILTIN_LDTILECFG:
14155 case IX86_BUILTIN_STTILECFG:
14156 arg0 = CALL_EXPR_ARG (exp, 0);
14157 op0 = expand_normal (arg0);
14159 if (!address_operand (op0, VOIDmode))
14161 op0 = convert_memory_address (Pmode, op0);
14162 op0 = copy_addr_to_reg (op0);
14164 op0 = gen_rtx_MEM (XImode, op0);
14165 if (fcode == IX86_BUILTIN_LDTILECFG)
14166 icode = CODE_FOR_ldtilecfg;
14167 else
14168 icode = CODE_FOR_sttilecfg;
14169 pat = GEN_FCN (icode) (op0);
14170 emit_insn (pat);
14171 return 0;
14173 case IX86_BUILTIN_LLWPCB:
14174 arg0 = CALL_EXPR_ARG (exp, 0);
14175 op0 = expand_normal (arg0);
14177 if (!register_operand (op0, Pmode))
14178 op0 = ix86_zero_extend_to_Pmode (op0);
14179 emit_insn (gen_lwp_llwpcb (Pmode, op0));
14180 return 0;
14182 case IX86_BUILTIN_SLWPCB:
14183 if (!target
14184 || !register_operand (target, Pmode))
14185 target = gen_reg_rtx (Pmode);
14186 emit_insn (gen_lwp_slwpcb (Pmode, target));
14187 return target;
14189 case IX86_BUILTIN_LWPVAL32:
14190 case IX86_BUILTIN_LWPVAL64:
14191 case IX86_BUILTIN_LWPINS32:
14192 case IX86_BUILTIN_LWPINS64:
14193 mode = ((fcode == IX86_BUILTIN_LWPVAL32
14194 || fcode == IX86_BUILTIN_LWPINS32)
14195 ? SImode : DImode);
14197 if (fcode == IX86_BUILTIN_LWPVAL32
14198 || fcode == IX86_BUILTIN_LWPVAL64)
14199 icode = code_for_lwp_lwpval (mode);
14200 else
14201 icode = code_for_lwp_lwpins (mode);
14203 arg0 = CALL_EXPR_ARG (exp, 0);
14204 arg1 = CALL_EXPR_ARG (exp, 1);
14205 arg2 = CALL_EXPR_ARG (exp, 2);
14206 op0 = expand_normal (arg0);
14207 op1 = expand_normal (arg1);
14208 op2 = expand_normal (arg2);
14209 mode0 = insn_data[icode].operand[0].mode;
14211 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14212 op0 = copy_to_mode_reg (mode0, op0);
14213 if (!insn_data[icode].operand[1].predicate (op1, SImode))
14214 op1 = copy_to_mode_reg (SImode, op1);
14216 if (!CONST_INT_P (op2))
14218 error ("the last argument must be a 32-bit immediate");
14219 return const0_rtx;
14222 emit_insn (GEN_FCN (icode) (op0, op1, op2));
14224 if (fcode == IX86_BUILTIN_LWPINS32
14225 || fcode == IX86_BUILTIN_LWPINS64)
14227 if (target == 0
14228 || !nonimmediate_operand (target, QImode))
14229 target = gen_reg_rtx (QImode);
14231 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14232 const0_rtx);
14233 emit_insn (gen_rtx_SET (target, pat));
14235 return target;
14237 else
14238 return 0;
14240 case IX86_BUILTIN_BEXTRI32:
14241 case IX86_BUILTIN_BEXTRI64:
14242 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
14244 arg0 = CALL_EXPR_ARG (exp, 0);
14245 arg1 = CALL_EXPR_ARG (exp, 1);
14246 op0 = expand_normal (arg0);
14247 op1 = expand_normal (arg1);
14249 if (!CONST_INT_P (op1))
14251 error ("last argument must be an immediate");
14252 return const0_rtx;
14254 else
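/* The control immediate packs the start bit position in bits 7:0 and the
field length in bits 15:8; extractions with zero length or a start past
the operand width fold to a constant zero below, and over-long fields
are clamped to the operand width.  */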
14256 unsigned char lsb_index = UINTVAL (op1);
14257 unsigned char length = UINTVAL (op1) >> 8;
14259 unsigned char bitsize = GET_MODE_BITSIZE (mode);
14261 icode = code_for_tbm_bextri (mode);
14263 mode1 = insn_data[icode].operand[1].mode;
14264 if (!insn_data[icode].operand[1].predicate (op0, mode1))
14265 op0 = copy_to_mode_reg (mode1, op0);
14267 mode0 = insn_data[icode].operand[0].mode;
14268 if (target == 0
14269 || !register_operand (target, mode0))
14270 target = gen_reg_rtx (mode0);
14272 if (length == 0 || lsb_index >= bitsize)
14274 emit_move_insn (target, const0_rtx);
14275 return target;
14278 if (length + lsb_index > bitsize)
14279 length = bitsize - lsb_index;
14281 op1 = GEN_INT (length);
14282 op2 = GEN_INT (lsb_index);
14284 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
14285 return target;
14288 case IX86_BUILTIN_RDRAND16_STEP:
14289 mode = HImode;
14290 goto rdrand_step;
14292 case IX86_BUILTIN_RDRAND32_STEP:
14293 mode = SImode;
14294 goto rdrand_step;
14296 case IX86_BUILTIN_RDRAND64_STEP:
14297 mode = DImode;
14299 rdrand_step:
14300 arg0 = CALL_EXPR_ARG (exp, 0);
14301 op1 = expand_normal (arg0);
14302 if (!address_operand (op1, VOIDmode))
14304 op1 = convert_memory_address (Pmode, op1);
14305 op1 = copy_addr_to_reg (op1);
14308 op0 = gen_reg_rtx (mode);
14309 emit_insn (gen_rdrand (mode, op0));
14311 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
14313 op1 = force_reg (SImode, const1_rtx);
14315 /* Emit SImode conditional move. */
14316 if (mode == HImode)
14318 if (TARGET_ZERO_EXTEND_WITH_AND
14319 && optimize_function_for_speed_p (cfun))
14321 op2 = force_reg (SImode, const0_rtx);
14323 emit_insn (gen_movstricthi
14324 (gen_lowpart (HImode, op2), op0));
14326 else
14328 op2 = gen_reg_rtx (SImode);
14330 emit_insn (gen_zero_extendhisi2 (op2, op0));
14333 else if (mode == SImode)
14334 op2 = op0;
14335 else
14336 op2 = gen_rtx_SUBREG (SImode, op0, 0);
14338 if (target == 0
14339 || !register_operand (target, SImode))
14340 target = gen_reg_rtx (SImode);
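/* rdrand clears its destination when it fails (CF clear), so a
conditional move that selects the zeroed result on failure and the
constant 1 on success yields the step's return value without a
branch.  */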
14342 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
14343 const0_rtx);
14344 emit_insn (gen_rtx_SET (target,
14345 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
14346 return target;
14348 case IX86_BUILTIN_RDSEED16_STEP:
14349 mode = HImode;
14350 goto rdseed_step;
14352 case IX86_BUILTIN_RDSEED32_STEP:
14353 mode = SImode;
14354 goto rdseed_step;
14356 case IX86_BUILTIN_RDSEED64_STEP:
14357 mode = DImode;
14359 rdseed_step:
14360 arg0 = CALL_EXPR_ARG (exp, 0);
14361 op1 = expand_normal (arg0);
14362 if (!address_operand (op1, VOIDmode))
14364 op1 = convert_memory_address (Pmode, op1);
14365 op1 = copy_addr_to_reg (op1);
14368 op0 = gen_reg_rtx (mode);
14369 emit_insn (gen_rdseed (mode, op0));
14371 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
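/* rdseed reports success in CF; the LTU below captures the flag into a
QImode register, which is then zero-extended to the SImode return
value.  */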
14373 op2 = gen_reg_rtx (QImode);
14375 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14376 const0_rtx);
14377 emit_insn (gen_rtx_SET (op2, pat));
14379 if (target == 0
14380 || !register_operand (target, SImode))
14381 target = gen_reg_rtx (SImode);
14383 emit_insn (gen_zero_extendqisi2 (target, op2));
14384 return target;
14386 case IX86_BUILTIN_SBB32:
14387 icode = CODE_FOR_subborrowsi;
14388 icode2 = CODE_FOR_subborrowsi_0;
14389 mode0 = SImode;
14390 mode1 = DImode;
14391 mode2 = CCmode;
14392 goto handlecarry;
14394 case IX86_BUILTIN_SBB64:
14395 icode = CODE_FOR_subborrowdi;
14396 icode2 = CODE_FOR_subborrowdi_0;
14397 mode0 = DImode;
14398 mode1 = TImode;
14399 mode2 = CCmode;
14400 goto handlecarry;
14402 case IX86_BUILTIN_ADDCARRYX32:
14403 icode = CODE_FOR_addcarrysi;
14404 icode2 = CODE_FOR_addcarrysi_0;
14405 mode0 = SImode;
14406 mode1 = DImode;
14407 mode2 = CCCmode;
14408 goto handlecarry;
14410 case IX86_BUILTIN_ADDCARRYX64:
14411 icode = CODE_FOR_addcarrydi;
14412 icode2 = CODE_FOR_addcarrydi_0;
14413 mode0 = DImode;
14414 mode1 = TImode;
14415 mode2 = CCCmode;
14417 handlecarry:
14418 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
14419 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
14420 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
14421 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
14423 op1 = expand_normal (arg0);
14425 op2 = expand_normal (arg1);
14426 if (!register_operand (op2, mode0))
14427 op2 = copy_to_mode_reg (mode0, op2);
14429 op3 = expand_normal (arg2);
14430 if (!register_operand (op3, mode0))
14431 op3 = copy_to_mode_reg (mode0, op3);
14433 op4 = expand_normal (arg3);
14434 if (!address_operand (op4, VOIDmode))
14436 op4 = convert_memory_address (Pmode, op4);
14437 op4 = copy_addr_to_reg (op4);
14440 op0 = gen_reg_rtx (mode0);
14441 if (op1 == const0_rtx)
14443 /* If arg0 is 0, optimize right away into an add or sub
14444 instruction that sets the CCCmode flags.  */
14445 op1 = gen_rtx_REG (mode2, FLAGS_REG);
14446 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
14448 else
14450 /* Generate CF from input operand. */
14451 ix86_expand_carry (op1);
14453 /* Generate instruction that consumes CF. */
14454 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
14455 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
14456 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
14457 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
14460 /* Return current CF value. */
14461 if (target == 0)
14462 target = gen_reg_rtx (QImode);
14464 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
14465 emit_insn (gen_rtx_SET (target, pat));
14467 /* Store the result. */
14468 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
14470 return target;
14472 case IX86_BUILTIN_READ_FLAGS:
14473 if (ignore)
14474 return const0_rtx;
14476 emit_insn (gen_pushfl ());
14478 if (optimize
14479 || target == NULL_RTX
14480 || !nonimmediate_operand (target, word_mode)
14481 || GET_MODE (target) != word_mode)
14482 target = gen_reg_rtx (word_mode);
14484 emit_insn (gen_pop (target));
14485 return target;
14487 case IX86_BUILTIN_WRITE_FLAGS:
14489 arg0 = CALL_EXPR_ARG (exp, 0);
14490 op0 = expand_normal (arg0);
14491 if (!general_no_elim_operand (op0, word_mode))
14492 op0 = copy_to_mode_reg (word_mode, op0);
14494 emit_insn (gen_push (op0));
14495 emit_insn (gen_popfl ());
14496 return 0;
14498 case IX86_BUILTIN_KTESTC8:
14499 icode = CODE_FOR_ktestqi;
14500 mode3 = CCCmode;
14501 goto kortest;
14503 case IX86_BUILTIN_KTESTZ8:
14504 icode = CODE_FOR_ktestqi;
14505 mode3 = CCZmode;
14506 goto kortest;
14508 case IX86_BUILTIN_KTESTC16:
14509 icode = CODE_FOR_ktesthi;
14510 mode3 = CCCmode;
14511 goto kortest;
14513 case IX86_BUILTIN_KTESTZ16:
14514 icode = CODE_FOR_ktesthi;
14515 mode3 = CCZmode;
14516 goto kortest;
14518 case IX86_BUILTIN_KTESTC32:
14519 icode = CODE_FOR_ktestsi;
14520 mode3 = CCCmode;
14521 goto kortest;
14523 case IX86_BUILTIN_KTESTZ32:
14524 icode = CODE_FOR_ktestsi;
14525 mode3 = CCZmode;
14526 goto kortest;
14528 case IX86_BUILTIN_KTESTC64:
14529 icode = CODE_FOR_ktestdi;
14530 mode3 = CCCmode;
14531 goto kortest;
14533 case IX86_BUILTIN_KTESTZ64:
14534 icode = CODE_FOR_ktestdi;
14535 mode3 = CCZmode;
14536 goto kortest;
14538 case IX86_BUILTIN_KORTESTC8:
14539 icode = CODE_FOR_kortestqi;
14540 mode3 = CCCmode;
14541 goto kortest;
14543 case IX86_BUILTIN_KORTESTZ8:
14544 icode = CODE_FOR_kortestqi;
14545 mode3 = CCZmode;
14546 goto kortest;
14548 case IX86_BUILTIN_KORTESTC16:
14549 icode = CODE_FOR_kortesthi;
14550 mode3 = CCCmode;
14551 goto kortest;
14553 case IX86_BUILTIN_KORTESTZ16:
14554 icode = CODE_FOR_kortesthi;
14555 mode3 = CCZmode;
14556 goto kortest;
14558 case IX86_BUILTIN_KORTESTC32:
14559 icode = CODE_FOR_kortestsi;
14560 mode3 = CCCmode;
14561 goto kortest;
14563 case IX86_BUILTIN_KORTESTZ32:
14564 icode = CODE_FOR_kortestsi;
14565 mode3 = CCZmode;
14566 goto kortest;
14568 case IX86_BUILTIN_KORTESTC64:
14569 icode = CODE_FOR_kortestdi;
14570 mode3 = CCCmode;
14571 goto kortest;
14573 case IX86_BUILTIN_KORTESTZ64:
14574 icode = CODE_FOR_kortestdi;
14575 mode3 = CCZmode;
14577 kortest:
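/* mode3 selects which flag the builtin returns: CCCmode reads the carry
flag and CCZmode the zero flag set by ktest/kortest; ix86_expand_setcc
below materializes it.  */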
14578 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14579 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14580 op0 = expand_normal (arg0);
14581 op1 = expand_normal (arg1);
14583 mode0 = insn_data[icode].operand[0].mode;
14584 mode1 = insn_data[icode].operand[1].mode;
14586 if (GET_MODE (op0) != VOIDmode)
14587 op0 = force_reg (GET_MODE (op0), op0);
14589 op0 = gen_lowpart (mode0, op0);
14591 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14592 op0 = copy_to_mode_reg (mode0, op0);
14594 if (GET_MODE (op1) != VOIDmode)
14595 op1 = force_reg (GET_MODE (op1), op1);
14597 op1 = gen_lowpart (mode1, op1);
14599 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14600 op1 = copy_to_mode_reg (mode1, op1);
14602 target = gen_reg_rtx (QImode);
14604 /* Emit kortest. */
14605 emit_insn (GEN_FCN (icode) (op0, op1));
14606 /* And use setcc to return result from flags. */
14607 ix86_expand_setcc (target, EQ,
14608 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14609 return target;
14611 case IX86_BUILTIN_GATHERSIV2DF:
14612 icode = CODE_FOR_avx2_gathersiv2df;
14613 goto gather_gen;
14614 case IX86_BUILTIN_GATHERSIV4DF:
14615 icode = CODE_FOR_avx2_gathersiv4df;
14616 goto gather_gen;
14617 case IX86_BUILTIN_GATHERDIV2DF:
14618 icode = CODE_FOR_avx2_gatherdiv2df;
14619 goto gather_gen;
14620 case IX86_BUILTIN_GATHERDIV4DF:
14621 icode = CODE_FOR_avx2_gatherdiv4df;
14622 goto gather_gen;
14623 case IX86_BUILTIN_GATHERSIV4SF:
14624 icode = CODE_FOR_avx2_gathersiv4sf;
14625 goto gather_gen;
14626 case IX86_BUILTIN_GATHERSIV8SF:
14627 icode = CODE_FOR_avx2_gathersiv8sf;
14628 goto gather_gen;
14629 case IX86_BUILTIN_GATHERDIV4SF:
14630 icode = CODE_FOR_avx2_gatherdiv4sf;
14631 goto gather_gen;
14632 case IX86_BUILTIN_GATHERDIV8SF:
14633 icode = CODE_FOR_avx2_gatherdiv8sf;
14634 goto gather_gen;
14635 case IX86_BUILTIN_GATHERSIV2DI:
14636 icode = CODE_FOR_avx2_gathersiv2di;
14637 goto gather_gen;
14638 case IX86_BUILTIN_GATHERSIV4DI:
14639 icode = CODE_FOR_avx2_gathersiv4di;
14640 goto gather_gen;
14641 case IX86_BUILTIN_GATHERDIV2DI:
14642 icode = CODE_FOR_avx2_gatherdiv2di;
14643 goto gather_gen;
14644 case IX86_BUILTIN_GATHERDIV4DI:
14645 icode = CODE_FOR_avx2_gatherdiv4di;
14646 goto gather_gen;
14647 case IX86_BUILTIN_GATHERSIV4SI:
14648 icode = CODE_FOR_avx2_gathersiv4si;
14649 goto gather_gen;
14650 case IX86_BUILTIN_GATHERSIV8SI:
14651 icode = CODE_FOR_avx2_gathersiv8si;
14652 goto gather_gen;
14653 case IX86_BUILTIN_GATHERDIV4SI:
14654 icode = CODE_FOR_avx2_gatherdiv4si;
14655 goto gather_gen;
14656 case IX86_BUILTIN_GATHERDIV8SI:
14657 icode = CODE_FOR_avx2_gatherdiv8si;
14658 goto gather_gen;
14659 case IX86_BUILTIN_GATHERALTSIV4DF:
14660 icode = CODE_FOR_avx2_gathersiv4df;
14661 goto gather_gen;
14662 case IX86_BUILTIN_GATHERALTDIV8SF:
14663 icode = CODE_FOR_avx2_gatherdiv8sf;
14664 goto gather_gen;
14665 case IX86_BUILTIN_GATHERALTSIV4DI:
14666 icode = CODE_FOR_avx2_gathersiv4di;
14667 goto gather_gen;
14668 case IX86_BUILTIN_GATHERALTDIV8SI:
14669 icode = CODE_FOR_avx2_gatherdiv8si;
14670 goto gather_gen;
14671 case IX86_BUILTIN_GATHER3SIV16SF:
14672 icode = CODE_FOR_avx512f_gathersiv16sf;
14673 goto gather_gen;
14674 case IX86_BUILTIN_GATHER3SIV8DF:
14675 icode = CODE_FOR_avx512f_gathersiv8df;
14676 goto gather_gen;
14677 case IX86_BUILTIN_GATHER3DIV16SF:
14678 icode = CODE_FOR_avx512f_gatherdiv16sf;
14679 goto gather_gen;
14680 case IX86_BUILTIN_GATHER3DIV8DF:
14681 icode = CODE_FOR_avx512f_gatherdiv8df;
14682 goto gather_gen;
14683 case IX86_BUILTIN_GATHER3SIV16SI:
14684 icode = CODE_FOR_avx512f_gathersiv16si;
14685 goto gather_gen;
14686 case IX86_BUILTIN_GATHER3SIV8DI:
14687 icode = CODE_FOR_avx512f_gathersiv8di;
14688 goto gather_gen;
14689 case IX86_BUILTIN_GATHER3DIV16SI:
14690 icode = CODE_FOR_avx512f_gatherdiv16si;
14691 goto gather_gen;
14692 case IX86_BUILTIN_GATHER3DIV8DI:
14693 icode = CODE_FOR_avx512f_gatherdiv8di;
14694 goto gather_gen;
14695 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14696 icode = CODE_FOR_avx512f_gathersiv8df;
14697 goto gather_gen;
14698 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14699 icode = CODE_FOR_avx512f_gatherdiv16sf;
14700 goto gather_gen;
14701 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14702 icode = CODE_FOR_avx512f_gathersiv8di;
14703 goto gather_gen;
14704 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14705 icode = CODE_FOR_avx512f_gatherdiv16si;
14706 goto gather_gen;
14707 case IX86_BUILTIN_GATHER3SIV2DF:
14708 icode = CODE_FOR_avx512vl_gathersiv2df;
14709 goto gather_gen;
14710 case IX86_BUILTIN_GATHER3SIV4DF:
14711 icode = CODE_FOR_avx512vl_gathersiv4df;
14712 goto gather_gen;
14713 case IX86_BUILTIN_GATHER3DIV2DF:
14714 icode = CODE_FOR_avx512vl_gatherdiv2df;
14715 goto gather_gen;
14716 case IX86_BUILTIN_GATHER3DIV4DF:
14717 icode = CODE_FOR_avx512vl_gatherdiv4df;
14718 goto gather_gen;
14719 case IX86_BUILTIN_GATHER3SIV4SF:
14720 icode = CODE_FOR_avx512vl_gathersiv4sf;
14721 goto gather_gen;
14722 case IX86_BUILTIN_GATHER3SIV8SF:
14723 icode = CODE_FOR_avx512vl_gathersiv8sf;
14724 goto gather_gen;
14725 case IX86_BUILTIN_GATHER3DIV4SF:
14726 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14727 goto gather_gen;
14728 case IX86_BUILTIN_GATHER3DIV8SF:
14729 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14730 goto gather_gen;
14731 case IX86_BUILTIN_GATHER3SIV2DI:
14732 icode = CODE_FOR_avx512vl_gathersiv2di;
14733 goto gather_gen;
14734 case IX86_BUILTIN_GATHER3SIV4DI:
14735 icode = CODE_FOR_avx512vl_gathersiv4di;
14736 goto gather_gen;
14737 case IX86_BUILTIN_GATHER3DIV2DI:
14738 icode = CODE_FOR_avx512vl_gatherdiv2di;
14739 goto gather_gen;
14740 case IX86_BUILTIN_GATHER3DIV4DI:
14741 icode = CODE_FOR_avx512vl_gatherdiv4di;
14742 goto gather_gen;
14743 case IX86_BUILTIN_GATHER3SIV4SI:
14744 icode = CODE_FOR_avx512vl_gathersiv4si;
14745 goto gather_gen;
14746 case IX86_BUILTIN_GATHER3SIV8SI:
14747 icode = CODE_FOR_avx512vl_gathersiv8si;
14748 goto gather_gen;
14749 case IX86_BUILTIN_GATHER3DIV4SI:
14750 icode = CODE_FOR_avx512vl_gatherdiv4si;
14751 goto gather_gen;
14752 case IX86_BUILTIN_GATHER3DIV8SI:
14753 icode = CODE_FOR_avx512vl_gatherdiv8si;
14754 goto gather_gen;
14755 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14756 icode = CODE_FOR_avx512vl_gathersiv4df;
14757 goto gather_gen;
14758 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14759 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14760 goto gather_gen;
14761 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14762 icode = CODE_FOR_avx512vl_gathersiv4di;
14763 goto gather_gen;
14764 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14765 icode = CODE_FOR_avx512vl_gatherdiv8si;
14766 goto gather_gen;
14767 case IX86_BUILTIN_SCATTERSIV16SF:
14768 icode = CODE_FOR_avx512f_scattersiv16sf;
14769 goto scatter_gen;
14770 case IX86_BUILTIN_SCATTERSIV8DF:
14771 icode = CODE_FOR_avx512f_scattersiv8df;
14772 goto scatter_gen;
14773 case IX86_BUILTIN_SCATTERDIV16SF:
14774 icode = CODE_FOR_avx512f_scatterdiv16sf;
14775 goto scatter_gen;
14776 case IX86_BUILTIN_SCATTERDIV8DF:
14777 icode = CODE_FOR_avx512f_scatterdiv8df;
14778 goto scatter_gen;
14779 case IX86_BUILTIN_SCATTERSIV16SI:
14780 icode = CODE_FOR_avx512f_scattersiv16si;
14781 goto scatter_gen;
14782 case IX86_BUILTIN_SCATTERSIV8DI:
14783 icode = CODE_FOR_avx512f_scattersiv8di;
14784 goto scatter_gen;
14785 case IX86_BUILTIN_SCATTERDIV16SI:
14786 icode = CODE_FOR_avx512f_scatterdiv16si;
14787 goto scatter_gen;
14788 case IX86_BUILTIN_SCATTERDIV8DI:
14789 icode = CODE_FOR_avx512f_scatterdiv8di;
14790 goto scatter_gen;
14791 case IX86_BUILTIN_SCATTERSIV8SF:
14792 icode = CODE_FOR_avx512vl_scattersiv8sf;
14793 goto scatter_gen;
14794 case IX86_BUILTIN_SCATTERSIV4SF:
14795 icode = CODE_FOR_avx512vl_scattersiv4sf;
14796 goto scatter_gen;
14797 case IX86_BUILTIN_SCATTERSIV4DF:
14798 icode = CODE_FOR_avx512vl_scattersiv4df;
14799 goto scatter_gen;
14800 case IX86_BUILTIN_SCATTERSIV2DF:
14801 icode = CODE_FOR_avx512vl_scattersiv2df;
14802 goto scatter_gen;
14803 case IX86_BUILTIN_SCATTERDIV8SF:
14804 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14805 goto scatter_gen;
14806 case IX86_BUILTIN_SCATTERDIV4SF:
14807 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14808 goto scatter_gen;
14809 case IX86_BUILTIN_SCATTERDIV4DF:
14810 icode = CODE_FOR_avx512vl_scatterdiv4df;
14811 goto scatter_gen;
14812 case IX86_BUILTIN_SCATTERDIV2DF:
14813 icode = CODE_FOR_avx512vl_scatterdiv2df;
14814 goto scatter_gen;
14815 case IX86_BUILTIN_SCATTERSIV8SI:
14816 icode = CODE_FOR_avx512vl_scattersiv8si;
14817 goto scatter_gen;
14818 case IX86_BUILTIN_SCATTERSIV4SI:
14819 icode = CODE_FOR_avx512vl_scattersiv4si;
14820 goto scatter_gen;
14821 case IX86_BUILTIN_SCATTERSIV4DI:
14822 icode = CODE_FOR_avx512vl_scattersiv4di;
14823 goto scatter_gen;
14824 case IX86_BUILTIN_SCATTERSIV2DI:
14825 icode = CODE_FOR_avx512vl_scattersiv2di;
14826 goto scatter_gen;
14827 case IX86_BUILTIN_SCATTERDIV8SI:
14828 icode = CODE_FOR_avx512vl_scatterdiv8si;
14829 goto scatter_gen;
14830 case IX86_BUILTIN_SCATTERDIV4SI:
14831 icode = CODE_FOR_avx512vl_scatterdiv4si;
14832 goto scatter_gen;
14833 case IX86_BUILTIN_SCATTERDIV4DI:
14834 icode = CODE_FOR_avx512vl_scatterdiv4di;
14835 goto scatter_gen;
14836 case IX86_BUILTIN_SCATTERDIV2DI:
14837 icode = CODE_FOR_avx512vl_scatterdiv2di;
14838 goto scatter_gen;
14839 case IX86_BUILTIN_GATHERPFDPD:
14840 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14841 goto vec_prefetch_gen;
14842 case IX86_BUILTIN_SCATTERALTSIV8DF:
14843 icode = CODE_FOR_avx512f_scattersiv8df;
14844 goto scatter_gen;
14845 case IX86_BUILTIN_SCATTERALTDIV16SF:
14846 icode = CODE_FOR_avx512f_scatterdiv16sf;
14847 goto scatter_gen;
14848 case IX86_BUILTIN_SCATTERALTSIV8DI:
14849 icode = CODE_FOR_avx512f_scattersiv8di;
14850 goto scatter_gen;
14851 case IX86_BUILTIN_SCATTERALTDIV16SI:
14852 icode = CODE_FOR_avx512f_scatterdiv16si;
14853 goto scatter_gen;
14854 case IX86_BUILTIN_SCATTERALTSIV4DF:
14855 icode = CODE_FOR_avx512vl_scattersiv4df;
14856 goto scatter_gen;
14857 case IX86_BUILTIN_SCATTERALTDIV8SF:
14858 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14859 goto scatter_gen;
14860 case IX86_BUILTIN_SCATTERALTSIV4DI:
14861 icode = CODE_FOR_avx512vl_scattersiv4di;
14862 goto scatter_gen;
14863 case IX86_BUILTIN_SCATTERALTDIV8SI:
14864 icode = CODE_FOR_avx512vl_scatterdiv8si;
14865 goto scatter_gen;
14866 case IX86_BUILTIN_SCATTERALTSIV2DF:
14867 icode = CODE_FOR_avx512vl_scattersiv2df;
14868 goto scatter_gen;
14869 case IX86_BUILTIN_SCATTERALTDIV4SF:
14870 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14871 goto scatter_gen;
14872 case IX86_BUILTIN_SCATTERALTSIV2DI:
14873 icode = CODE_FOR_avx512vl_scattersiv2di;
14874 goto scatter_gen;
14875 case IX86_BUILTIN_SCATTERALTDIV4SI:
14876 icode = CODE_FOR_avx512vl_scatterdiv4si;
14877 goto scatter_gen;
14878 case IX86_BUILTIN_GATHERPFDPS:
14879 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14880 goto vec_prefetch_gen;
14881 case IX86_BUILTIN_GATHERPFQPD:
14882 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14883 goto vec_prefetch_gen;
14884 case IX86_BUILTIN_GATHERPFQPS:
14885 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14886 goto vec_prefetch_gen;
14887 case IX86_BUILTIN_SCATTERPFDPD:
14888 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14889 goto vec_prefetch_gen;
14890 case IX86_BUILTIN_SCATTERPFDPS:
14891 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14892 goto vec_prefetch_gen;
14893 case IX86_BUILTIN_SCATTERPFQPD:
14894 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14895 goto vec_prefetch_gen;
14896 case IX86_BUILTIN_SCATTERPFQPS:
14897 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14898 goto vec_prefetch_gen;
14900 gather_gen:
14901 rtx half;
14902 rtx (*gen) (rtx, rtx);
14904 arg0 = CALL_EXPR_ARG (exp, 0);
14905 arg1 = CALL_EXPR_ARG (exp, 1);
14906 arg2 = CALL_EXPR_ARG (exp, 2);
14907 arg3 = CALL_EXPR_ARG (exp, 3);
14908 arg4 = CALL_EXPR_ARG (exp, 4);
14909 op0 = expand_normal (arg0);
14910 op1 = expand_normal (arg1);
14911 op2 = expand_normal (arg2);
14912 op3 = expand_normal (arg3);
14913 op4 = expand_normal (arg4);
14914 /* Note the arg order is different from the operand order. */
14915 mode0 = insn_data[icode].operand[1].mode;
14916 mode2 = insn_data[icode].operand[3].mode;
14917 mode3 = insn_data[icode].operand[4].mode;
14918 mode4 = insn_data[icode].operand[5].mode;
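/* mode0/mode2/mode3/mode4 are the modes of pattern operands 1 (merge
source), 3 (index vector), 4 (mask) and 5 (scale); they correspond to
builtin arguments 0, 2, 3 and 4, while argument 1 supplies the base
address (operand 2).  */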
14920 if (target == NULL_RTX
14921 || GET_MODE (target) != insn_data[icode].operand[0].mode
14922 || !insn_data[icode].operand[0].predicate (target,
14923 GET_MODE (target)))
14924 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14925 else
14926 subtarget = target;
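/* For the "alt" gather variants the index vector has twice as many
elements as the data vector (or vice versa), so only the low half of the
wider operand is used.  */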
14928 switch (fcode)
14930 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14931 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14932 half = gen_reg_rtx (V8SImode);
14933 if (!nonimmediate_operand (op2, V16SImode))
14934 op2 = copy_to_mode_reg (V16SImode, op2);
14935 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14936 op2 = half;
14937 break;
14938 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14939 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14940 case IX86_BUILTIN_GATHERALTSIV4DF:
14941 case IX86_BUILTIN_GATHERALTSIV4DI:
14942 half = gen_reg_rtx (V4SImode);
14943 if (!nonimmediate_operand (op2, V8SImode))
14944 op2 = copy_to_mode_reg (V8SImode, op2);
14945 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14946 op2 = half;
14947 break;
14948 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14949 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14950 half = gen_reg_rtx (mode0);
14951 if (mode0 == V8SFmode)
14952 gen = gen_vec_extract_lo_v16sf;
14953 else
14954 gen = gen_vec_extract_lo_v16si;
14955 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14956 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14957 emit_insn (gen (half, op0));
14958 op0 = half;
14959 op3 = lowpart_subreg (QImode, op3, HImode);
14960 break;
14961 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14962 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14963 case IX86_BUILTIN_GATHERALTDIV8SF:
14964 case IX86_BUILTIN_GATHERALTDIV8SI:
14965 half = gen_reg_rtx (mode0);
14966 if (mode0 == V4SFmode)
14967 gen = gen_vec_extract_lo_v8sf;
14968 else
14969 gen = gen_vec_extract_lo_v8si;
14970 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14971 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14972 emit_insn (gen (half, op0));
14973 op0 = half;
14974 if (VECTOR_MODE_P (GET_MODE (op3)))
14976 half = gen_reg_rtx (mode0);
14977 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14978 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14979 emit_insn (gen (half, op3));
14980 op3 = half;
14982 break;
14983 default:
14984 break;
14987 /* Force the memory operand to be addressed through a base
14988 register only.  We don't want to do this for the memory
14989 operands of other builtin functions.  */
14990 op1 = ix86_zero_extend_to_Pmode (op1);
14992 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14993 op0 = copy_to_mode_reg (mode0, op0);
14994 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14995 op1 = copy_to_mode_reg (Pmode, op1);
14996 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14997 op2 = copy_to_mode_reg (mode2, op2);
14999 op3 = fixup_modeless_constant (op3, mode3);
15001 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
15003 if (!insn_data[icode].operand[4].predicate (op3, mode3))
15004 op3 = copy_to_mode_reg (mode3, op3);
15006 else
15008 op3 = copy_to_reg (op3);
15009 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
15011 if (!insn_data[icode].operand[5].predicate (op4, mode4))
15013 error ("the last argument must be scale 1, 2, 4, 8");
15014 return const0_rtx;
15017 /* Optimize. If mask is known to have all high bits set,
15018 replace op0 with pc_rtx to signal that the instruction
15019 overwrites the whole destination and doesn't use its
15020 previous contents. */
15021 if (optimize)
15023 if (TREE_CODE (arg3) == INTEGER_CST)
15025 if (integer_all_onesp (arg3))
15026 op0 = pc_rtx;
15028 else if (TREE_CODE (arg3) == VECTOR_CST)
15030 unsigned int negative = 0;
15031 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
15033 tree cst = VECTOR_CST_ELT (arg3, i);
15034 if (TREE_CODE (cst) == INTEGER_CST
15035 && tree_int_cst_sign_bit (cst))
15036 negative++;
15037 else if (TREE_CODE (cst) == REAL_CST
15038 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
15039 negative++;
15041 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
15042 op0 = pc_rtx;
15044 else if (TREE_CODE (arg3) == SSA_NAME
15045 && VECTOR_TYPE_P (TREE_TYPE (arg3)))
15047 /* Recognize also when mask is like:
15048 __v2df src = _mm_setzero_pd ();
15049 __v2df mask = _mm_cmpeq_pd (src, src);
15051 __v8sf src = _mm256_setzero_ps ();
15052 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
15053 as that is a cheaper way to load all ones into
15054 a register than having to load a constant from
15055 memory. */
15056 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
15057 if (is_gimple_call (def_stmt))
15059 tree fndecl = gimple_call_fndecl (def_stmt);
15060 if (fndecl
15061 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
15062 switch (DECL_MD_FUNCTION_CODE (fndecl))
15064 case IX86_BUILTIN_CMPPD:
15065 case IX86_BUILTIN_CMPPS:
15066 case IX86_BUILTIN_CMPPD256:
15067 case IX86_BUILTIN_CMPPS256:
15068 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
15069 break;
15070 /* FALLTHRU */
15071 case IX86_BUILTIN_CMPEQPD:
15072 case IX86_BUILTIN_CMPEQPS:
15073 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
15074 && initializer_zerop (gimple_call_arg (def_stmt,
15075 1)))
15076 op0 = pc_rtx;
15077 break;
15078 default:
15079 break;
15085 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
15086 if (! pat)
15087 return const0_rtx;
15088 emit_insn (pat);
15090 switch (fcode)
15092 case IX86_BUILTIN_GATHER3DIV16SF:
15093 if (target == NULL_RTX)
15094 target = gen_reg_rtx (V8SFmode);
15095 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
15096 break;
15097 case IX86_BUILTIN_GATHER3DIV16SI:
15098 if (target == NULL_RTX)
15099 target = gen_reg_rtx (V8SImode);
15100 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
15101 break;
15102 case IX86_BUILTIN_GATHER3DIV8SF:
15103 case IX86_BUILTIN_GATHERDIV8SF:
15104 if (target == NULL_RTX)
15105 target = gen_reg_rtx (V4SFmode);
15106 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
15107 break;
15108 case IX86_BUILTIN_GATHER3DIV8SI:
15109 case IX86_BUILTIN_GATHERDIV8SI:
15110 if (target == NULL_RTX)
15111 target = gen_reg_rtx (V4SImode);
15112 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
15113 break;
15114 default:
15115 target = subtarget;
15116 break;
15118 return target;
15120 scatter_gen:
15121 arg0 = CALL_EXPR_ARG (exp, 0);
15122 arg1 = CALL_EXPR_ARG (exp, 1);
15123 arg2 = CALL_EXPR_ARG (exp, 2);
15124 arg3 = CALL_EXPR_ARG (exp, 3);
15125 arg4 = CALL_EXPR_ARG (exp, 4);
15126 op0 = expand_normal (arg0);
15127 op1 = expand_normal (arg1);
15128 op2 = expand_normal (arg2);
15129 op3 = expand_normal (arg3);
15130 op4 = expand_normal (arg4);
15131 mode1 = insn_data[icode].operand[1].mode;
15132 mode2 = insn_data[icode].operand[2].mode;
15133 mode3 = insn_data[icode].operand[3].mode;
15134 mode4 = insn_data[icode].operand[4].mode;
15136 /* The scatter instruction stores operand op3 to memory with
15137 indices from op2 and scale from op4 under writemask op1.
15138 If the index operand op2 has more elements than the source
15139 operand op3, only its low half is used, and vice versa.  */
15140 switch (fcode)
15142 case IX86_BUILTIN_SCATTERALTSIV8DF:
15143 case IX86_BUILTIN_SCATTERALTSIV8DI:
15144 half = gen_reg_rtx (V8SImode);
15145 if (!nonimmediate_operand (op2, V16SImode))
15146 op2 = copy_to_mode_reg (V16SImode, op2);
15147 emit_insn (gen_vec_extract_lo_v16si (half, op2));
15148 op2 = half;
15149 break;
15150 case IX86_BUILTIN_SCATTERALTDIV16SF:
15151 case IX86_BUILTIN_SCATTERALTDIV16SI:
15152 half = gen_reg_rtx (mode3);
15153 if (mode3 == V8SFmode)
15154 gen = gen_vec_extract_lo_v16sf;
15155 else
15156 gen = gen_vec_extract_lo_v16si;
15157 if (!nonimmediate_operand (op3, GET_MODE (op3)))
15158 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
15159 emit_insn (gen (half, op3));
15160 op3 = half;
15161 break;
15162 case IX86_BUILTIN_SCATTERALTSIV4DF:
15163 case IX86_BUILTIN_SCATTERALTSIV4DI:
15164 half = gen_reg_rtx (V4SImode);
15165 if (!nonimmediate_operand (op2, V8SImode))
15166 op2 = copy_to_mode_reg (V8SImode, op2);
15167 emit_insn (gen_vec_extract_lo_v8si (half, op2));
15168 op2 = half;
15169 break;
15170 case IX86_BUILTIN_SCATTERALTDIV8SF:
15171 case IX86_BUILTIN_SCATTERALTDIV8SI:
15172 half = gen_reg_rtx (mode3);
15173 if (mode3 == V4SFmode)
15174 gen = gen_vec_extract_lo_v8sf;
15175 else
15176 gen = gen_vec_extract_lo_v8si;
15177 if (!nonimmediate_operand (op3, GET_MODE (op3)))
15178 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
15179 emit_insn (gen (half, op3));
15180 op3 = half;
15181 break;
15182 case IX86_BUILTIN_SCATTERALTSIV2DF:
15183 case IX86_BUILTIN_SCATTERALTSIV2DI:
15184 if (!nonimmediate_operand (op2, V4SImode))
15185 op2 = copy_to_mode_reg (V4SImode, op2);
15186 break;
15187 case IX86_BUILTIN_SCATTERALTDIV4SF:
15188 case IX86_BUILTIN_SCATTERALTDIV4SI:
15189 if (!nonimmediate_operand (op3, GET_MODE (op3)))
15190 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
15191 break;
15192 default:
15193 break;
15196 /* Force the memory operand to be addressed through a base
15197 register only.  We don't want to do this for the memory
15198 operands of other builtin functions.  */
15199 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
15201 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
15202 op0 = copy_to_mode_reg (Pmode, op0);
15204 op1 = fixup_modeless_constant (op1, mode1);
15206 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
15208 if (!insn_data[icode].operand[1].predicate (op1, mode1))
15209 op1 = copy_to_mode_reg (mode1, op1);
15211 else
15213 op1 = copy_to_reg (op1);
15214 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
15217 if (!insn_data[icode].operand[2].predicate (op2, mode2))
15218 op2 = copy_to_mode_reg (mode2, op2);
15220 if (!insn_data[icode].operand[3].predicate (op3, mode3))
15221 op3 = copy_to_mode_reg (mode3, op3);
15223 if (!insn_data[icode].operand[4].predicate (op4, mode4))
15225 error ("the last argument must be scale 1, 2, 4, 8");
15226 return const0_rtx;
15229 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
15230 if (! pat)
15231 return const0_rtx;
15233 emit_insn (pat);
15234 return 0;
15236 vec_prefetch_gen:
15237 arg0 = CALL_EXPR_ARG (exp, 0);
15238 arg1 = CALL_EXPR_ARG (exp, 1);
15239 arg2 = CALL_EXPR_ARG (exp, 2);
15240 arg3 = CALL_EXPR_ARG (exp, 3);
15241 arg4 = CALL_EXPR_ARG (exp, 4);
15242 op0 = expand_normal (arg0);
15243 op1 = expand_normal (arg1);
15244 op2 = expand_normal (arg2);
15245 op3 = expand_normal (arg3);
15246 op4 = expand_normal (arg4);
15247 mode0 = insn_data[icode].operand[0].mode;
15248 mode1 = insn_data[icode].operand[1].mode;
15249 mode3 = insn_data[icode].operand[3].mode;
15250 mode4 = insn_data[icode].operand[4].mode;
15252 op0 = fixup_modeless_constant (op0, mode0);
15254 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
15256 if (!insn_data[icode].operand[0].predicate (op0, mode0))
15257 op0 = copy_to_mode_reg (mode0, op0);
15259 else
15261 op0 = copy_to_reg (op0);
15262 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
15265 if (!insn_data[icode].operand[1].predicate (op1, mode1))
15266 op1 = copy_to_mode_reg (mode1, op1);
15268 /* Force the memory operand to be addressed through a base
15269 register only.  We don't want to do this for the memory
15270 operands of other builtin functions.  */
15271 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
15273 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
15274 op2 = copy_to_mode_reg (Pmode, op2);
15276 if (!insn_data[icode].operand[3].predicate (op3, mode3))
15278 error ("the forth argument must be scale 1, 2, 4, 8");
15279 return const0_rtx;
15282 if (!insn_data[icode].operand[4].predicate (op4, mode4))
15284 error ("incorrect hint operand");
15285 return const0_rtx;
15288 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
15289 if (! pat)
15290 return const0_rtx;
15292 emit_insn (pat);
15294 return 0;
15296 case IX86_BUILTIN_XABORT:
15297 icode = CODE_FOR_xabort;
15298 arg0 = CALL_EXPR_ARG (exp, 0);
15299 op0 = expand_normal (arg0);
15300 mode0 = insn_data[icode].operand[0].mode;
15301 if (!insn_data[icode].operand[0].predicate (op0, mode0))
15303 error ("the argument to %<xabort%> intrinsic must "
15304 "be an 8-bit immediate");
15305 return const0_rtx;
15307 emit_insn (gen_xabort (op0));
15308 return 0;
15310 case IX86_BUILTIN_RDSSPD:
15311 case IX86_BUILTIN_RDSSPQ:
15312 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
15314 if (target == 0
15315 || !register_operand (target, mode))
15316 target = gen_reg_rtx (mode);
15318 op0 = force_reg (mode, const0_rtx);
15320 emit_insn (gen_rdssp (mode, target, op0));
15321 return target;
15323 case IX86_BUILTIN_INCSSPD:
15324 case IX86_BUILTIN_INCSSPQ:
15325 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
15327 arg0 = CALL_EXPR_ARG (exp, 0);
15328 op0 = expand_normal (arg0);
15330 op0 = force_reg (mode, op0);
15332 emit_insn (gen_incssp (mode, op0));
15333 return 0;
15335 case IX86_BUILTIN_HRESET:
15336 icode = CODE_FOR_hreset;
15337 arg0 = CALL_EXPR_ARG (exp, 0);
15338 op0 = expand_normal (arg0);
15339 op0 = force_reg (SImode, op0);
15340 emit_insn (gen_hreset (op0));
15341 return 0;
15343 case IX86_BUILTIN_RSTORSSP:
15344 case IX86_BUILTIN_CLRSSBSY:
15345 arg0 = CALL_EXPR_ARG (exp, 0);
15346 op0 = expand_normal (arg0);
15347 icode = (fcode == IX86_BUILTIN_RSTORSSP
15348 ? CODE_FOR_rstorssp
15349 : CODE_FOR_clrssbsy);
15351 if (!address_operand (op0, VOIDmode))
15353 op0 = convert_memory_address (Pmode, op0);
15354 op0 = copy_addr_to_reg (op0);
15356 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
15357 return 0;
15359 case IX86_BUILTIN_WRSSD:
15360 case IX86_BUILTIN_WRSSQ:
15361 case IX86_BUILTIN_WRUSSD:
15362 case IX86_BUILTIN_WRUSSQ:
15363 mode = ((fcode == IX86_BUILTIN_WRSSD
15364 || fcode == IX86_BUILTIN_WRUSSD)
15365 ? SImode : DImode);
15367 arg0 = CALL_EXPR_ARG (exp, 0);
15368 op0 = expand_normal (arg0);
15369 arg1 = CALL_EXPR_ARG (exp, 1);
15370 op1 = expand_normal (arg1);
15372 op0 = force_reg (mode, op0);
15374 if (!address_operand (op1, VOIDmode))
15376 op1 = convert_memory_address (Pmode, op1);
15377 op1 = copy_addr_to_reg (op1);
15379 op1 = gen_rtx_MEM (mode, op1);
15381 icode = ((fcode == IX86_BUILTIN_WRSSD
15382 || fcode == IX86_BUILTIN_WRSSQ)
15383 ? code_for_wrss (mode)
15384 : code_for_wruss (mode));
15385 emit_insn (GEN_FCN (icode) (op0, op1));
15387 return 0;
15389 default:
15390 break;
15393 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
15394 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
15396 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
15397 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
15398 target);
15401 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
15402 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
15404 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
15405 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
15406 target);
15409 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
15410 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
15412 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
15413 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
15414 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
15415 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
15416 int masked = 1;
15417 machine_mode mode, wide_mode, nar_mode;
15419 nar_mode = V4SFmode;
15420 mode = V16SFmode;
15421 wide_mode = V64SFmode;
15422 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
15423 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
15425 switch (fcode)
15427 case IX86_BUILTIN_4FMAPS:
15428 fcn = gen_avx5124fmaddps_4fmaddps;
15429 masked = 0;
15430 goto v4fma_expand;
15432 case IX86_BUILTIN_4DPWSSD:
15433 nar_mode = V4SImode;
15434 mode = V16SImode;
15435 wide_mode = V64SImode;
15436 fcn = gen_avx5124vnniw_vp4dpwssd;
15437 masked = 0;
15438 goto v4fma_expand;
15440 case IX86_BUILTIN_4DPWSSDS:
15441 nar_mode = V4SImode;
15442 mode = V16SImode;
15443 wide_mode = V64SImode;
15444 fcn = gen_avx5124vnniw_vp4dpwssds;
15445 masked = 0;
15446 goto v4fma_expand;
15448 case IX86_BUILTIN_4FNMAPS:
15449 fcn = gen_avx5124fmaddps_4fnmaddps;
15450 masked = 0;
15451 goto v4fma_expand;
15453 case IX86_BUILTIN_4FNMAPS_MASK:
15454 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
15455 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
15456 goto v4fma_expand;
15458 case IX86_BUILTIN_4DPWSSD_MASK:
15459 nar_mode = V4SImode;
15460 mode = V16SImode;
15461 wide_mode = V64SImode;
15462 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
15463 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
15464 goto v4fma_expand;
15466 case IX86_BUILTIN_4DPWSSDS_MASK:
15467 nar_mode = V4SImode;
15468 mode = V16SImode;
15469 wide_mode = V64SImode;
15470 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15471 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15472 goto v4fma_expand;
15474 case IX86_BUILTIN_4FMAPS_MASK:
15476 tree args[4];
15477 rtx ops[4];
15478 rtx wide_reg;
15479 rtx accum;
15480 rtx addr;
15481 rtx mem;
15483 v4fma_expand:
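/* The four vector sources are packed back-to-back into one wide register
(four times the vector mode); the 4FMAPS/4VNNIW instructions consume
them as a block of four consecutive registers.  */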
15484 wide_reg = gen_reg_rtx (wide_mode);
15485 for (i = 0; i < 4; i++)
15487 args[i] = CALL_EXPR_ARG (exp, i);
15488 ops[i] = expand_normal (args[i]);
15490 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15491 ops[i]);
15494 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15495 accum = force_reg (mode, accum);
15497 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15498 addr = force_reg (Pmode, addr);
15500 mem = gen_rtx_MEM (nar_mode, addr);
15502 target = gen_reg_rtx (mode);
15504 emit_move_insn (target, accum);
15506 if (! masked)
15507 emit_insn (fcn (target, accum, wide_reg, mem));
15508 else
15510 rtx merge, mask;
15511 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15513 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15515 if (CONST_INT_P (mask))
15516 mask = fixup_modeless_constant (mask, HImode);
15518 mask = force_reg (HImode, mask);
15520 if (GET_MODE (mask) != HImode)
15521 mask = gen_rtx_SUBREG (HImode, mask, 0);
15523 /* If merge is 0 then we're about to emit z-masked variant. */
15524 if (const0_operand (merge, mode))
15525 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15526 /* If merge is the same as accum then emit merge-masked variant. */
15527 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15529 merge = force_reg (mode, merge);
15530 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15532 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15533 else
15535 target = gen_reg_rtx (mode);
15536 emit_move_insn (target, merge);
15537 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15540 return target;
15543 case IX86_BUILTIN_4FNMASS:
15544 fcn = gen_avx5124fmaddps_4fnmaddss;
15545 masked = 0;
15546 goto s4fma_expand;
15548 case IX86_BUILTIN_4FMASS:
15549 fcn = gen_avx5124fmaddps_4fmaddss;
15550 masked = 0;
15551 goto s4fma_expand;
15553 case IX86_BUILTIN_4FNMASS_MASK:
15554 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15555 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15556 goto s4fma_expand;
15558 case IX86_BUILTIN_4FMASS_MASK:
15560 tree args[4];
15561 rtx ops[4];
15562 rtx wide_reg;
15563 rtx accum;
15564 rtx addr;
15565 rtx mem;
15567 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15568 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15570 s4fma_expand:
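/* Scalar forms: only the low SFmode element of each source operand
matters, so copy just that element into the corresponding 512-bit chunk
of the wide register group.  */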
15571 mode = V4SFmode;
15572 wide_reg = gen_reg_rtx (V64SFmode);
15573 for (i = 0; i < 4; i++)
15575 rtx tmp;
15576 args[i] = CALL_EXPR_ARG (exp, i);
15577 ops[i] = expand_normal (args[i]);
15579 tmp = gen_reg_rtx (SFmode);
15580 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15582 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15583 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15586 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15587 accum = force_reg (V4SFmode, accum);
15589 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15590 addr = force_reg (Pmode, addr);
15592 mem = gen_rtx_MEM (V4SFmode, addr);
15594 target = gen_reg_rtx (V4SFmode);
15596 emit_move_insn (target, accum);
15598 if (! masked)
15599 emit_insn (fcn (target, accum, wide_reg, mem));
15600 else
15602 rtx merge, mask;
15603 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15605 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15607 if (CONST_INT_P (mask))
15608 mask = fixup_modeless_constant (mask, QImode);
15610 mask = force_reg (QImode, mask);
15612 if (GET_MODE (mask) != QImode)
15613 mask = gen_rtx_SUBREG (QImode, mask, 0);
15615 /* If merge is 0 then we're about to emit the z-masked variant. */
15616 if (const0_operand (merge, mode))
15617 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15618 /* If merge is the same as accum then emit the merge-masked
15619 variant. */
15620 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15622 merge = force_reg (mode, merge);
15623 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15625 /* Merging with something unknown might happen if we z-mask
15626 with -O0. */
15627 else
15629 target = gen_reg_rtx (mode);
15630 emit_move_insn (target, merge);
15631 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15634 return target;
15636 case IX86_BUILTIN_RDPID:
15637 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15638 target);
15639 case IX86_BUILTIN_FABSQ:
15640 case IX86_BUILTIN_COPYSIGNQ:
15641 if (!TARGET_SSE)
15642 /* Emit a normal call if SSE isn't available. */
15643 return expand_call (exp, target, ignore);
15644 /* FALLTHRU */
15645 default:
15646 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15650 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15651 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15653 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15654 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15657 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15658 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15660 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15661 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15664 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15665 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15667 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15668 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15671 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15672 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15674 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15675 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15678 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15679 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15681 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15682 const struct builtin_description *d = bdesc_multi_arg + i;
15683 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15684 (enum ix86_builtin_func_type)
15685 d->flag, d->comparison);
15688 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15689 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15691 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15692 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15693 target);
15696 gcc_unreachable ();
15699 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15700 fill target with val via vec_duplicate. */
15702 static bool
15703 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15705 bool ok;
15706 rtx_insn *insn;
15707 rtx dup;
15708 /* Save/restore recog_data in case this is called from splitters
15709 or other routines where recog_data needs to stay valid across
15710 force_reg. See PR106577. */
15711 recog_data_d recog_data_save = recog_data;
15713 /* First attempt to recognize VAL as-is. */
15714 dup = gen_vec_duplicate (mode, val);
15715 insn = emit_insn (gen_rtx_SET (target, dup));
15716 if (recog_memoized (insn) < 0)
15718 rtx_insn *seq;
15719 machine_mode innermode = GET_MODE_INNER (mode);
15720 rtx reg;
15722 /* If that fails, force VAL into a register. */
15724 start_sequence ();
15725 reg = force_reg (innermode, val);
15726 if (GET_MODE (reg) != innermode)
15727 reg = gen_lowpart (innermode, reg);
15728 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15729 seq = get_insns ();
15730 end_sequence ();
15731 if (seq)
15732 emit_insn_before (seq, insn);
15734 ok = recog_memoized (insn) >= 0;
15735 gcc_assert (ok);
15737 recog_data = recog_data_save;
15738 return true;
15741 /* Get a vector mode of the same size as the original but with elements
15742 twice as wide. This is only guaranteed to apply to integral vectors. */
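/* For example, V16QImode (sixteen QImode elements) maps to V8HImode
   and V8SImode maps to V4DImode: the total size stays the same while
   the element count is halved and each element doubles in width.  */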
15744 static machine_mode
15745 get_mode_wider_vector (machine_mode o)
15747 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15748 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15749 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15750 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15751 return n;
15754 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15755 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15757 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15758 with all elements equal to VAR. Return true if successful. */
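/* For example, when VAL is the CONST_INT 0x1234567812345678, whose two
   32-bit halves are equal, the V2DImode and V4DImode cases below
   broadcast 0x12345678 in V4SImode (resp. V8SImode) and reuse the
   result through a lowpart move.  */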
15760 bool
15761 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15762 rtx target, rtx val)
15764 bool ok;
15766 switch (mode)
15768 case E_V2DImode:
15769 if (CONST_INT_P (val))
15771 int tmp = (int)INTVAL (val);
15772 if (tmp == (int)(INTVAL (val) >> 32))
15774 rtx reg = gen_reg_rtx (V4SImode);
15775 ok = ix86_vector_duplicate_value (V4SImode, reg,
15776 GEN_INT (tmp));
15777 if (ok)
15779 emit_move_insn (target, gen_lowpart (V2DImode, reg));
15780 return true;
15784 return ix86_vector_duplicate_value (mode, target, val);
15786 case E_V4DImode:
15787 if (CONST_INT_P (val))
15789 int tmp = (int)INTVAL (val);
15790 if (tmp == (int)(INTVAL (val) >> 32))
15792 rtx reg = gen_reg_rtx (V8SImode);
15793 ok = ix86_vector_duplicate_value (V8SImode, reg,
15794 GEN_INT (tmp));
15795 if (ok)
15797 emit_move_insn (target, gen_lowpart (V4DImode, reg));
15798 return true;
15802 return ix86_vector_duplicate_value (mode, target, val);
15804 case E_V2SImode:
15805 case E_V2SFmode:
15806 if (!mmx_ok)
15807 return false;
15808 /* FALLTHRU */
15810 case E_V4DFmode:
15811 case E_V8SFmode:
15812 case E_V8SImode:
15813 case E_V2DFmode:
15814 case E_V4SFmode:
15815 case E_V4SImode:
15816 case E_V16SImode:
15817 case E_V8DImode:
15818 case E_V16SFmode:
15819 case E_V8DFmode:
15820 return ix86_vector_duplicate_value (mode, target, val);
15822 case E_V4HImode:
15823 if (!mmx_ok)
15824 return false;
15825 if (TARGET_SSE || TARGET_3DNOW_A)
15827 rtx x;
15829 val = gen_lowpart (SImode, val);
15830 if (CONST_INT_P (val))
15831 return false;
15832 x = gen_rtx_TRUNCATE (HImode, val);
15833 x = gen_rtx_VEC_DUPLICATE (mode, x);
15834 emit_insn (gen_rtx_SET (target, x));
15835 return true;
15837 goto widen;
15839 case E_V4HFmode:
15840 case E_V4BFmode:
15841 if (TARGET_MMX_WITH_SSE)
15843 val = force_reg (GET_MODE_INNER (mode), val);
15844 rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
15845 emit_insn (gen_rtx_SET (target, x));
15846 return true;
15848 return false;
15850 case E_V2HImode:
15851 if (TARGET_SSE2)
15853 rtx x;
15855 val = gen_lowpart (SImode, val);
15856 if (CONST_INT_P (val))
15857 return false;
15858 x = gen_rtx_TRUNCATE (HImode, val);
15859 x = gen_rtx_VEC_DUPLICATE (mode, x);
15860 emit_insn (gen_rtx_SET (target, x));
15861 return true;
15863 return false;
15865 case E_V2HFmode:
15866 case E_V2BFmode:
15867 if (TARGET_SSE2)
15869 val = force_reg (GET_MODE_INNER (mode), val);
15870 rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
15871 emit_insn (gen_rtx_SET (target, x));
15872 return true;
15874 return false;
15876 case E_V8QImode:
15877 case E_V4QImode:
15878 if (!mmx_ok)
15879 return false;
15880 goto widen;
15882 case E_V8HImode:
15883 if (CONST_INT_P (val))
15884 goto widen;
15885 /* FALLTHRU */
15887 case E_V8HFmode:
15888 case E_V8BFmode:
15889 if (TARGET_AVX2)
15890 return ix86_vector_duplicate_value (mode, target, val);
15892 if (TARGET_SSE2)
15894 struct expand_vec_perm_d dperm;
15895 rtx tmp1, tmp2;
15897 permute:
15898 memset (&dperm, 0, sizeof (dperm));
15899 dperm.target = target;
15900 dperm.vmode = mode;
15901 dperm.nelt = GET_MODE_NUNITS (mode);
15902 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15903 dperm.one_operand_p = true;
15905 if (mode == V8HFmode || mode == V8BFmode)
15907 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15908 tmp2 = gen_reg_rtx (mode);
15909 emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
15910 tmp1 = gen_lowpart (mode, tmp2);
15912 else
15914 /* Extend to SImode using a paradoxical SUBREG. */
15915 tmp1 = gen_reg_rtx (SImode);
15916 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15918 /* Insert the SImode value as the
15919 low element of a V4SImode vector. */
15920 tmp2 = gen_reg_rtx (V4SImode);
15921 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15922 tmp1 = gen_lowpart (mode, tmp2);
15925 emit_move_insn (dperm.op0, tmp1);
15926 ok = (expand_vec_perm_1 (&dperm)
15927 || expand_vec_perm_broadcast_1 (&dperm));
15928 gcc_assert (ok);
15929 return ok;
15931 goto widen;
15933 case E_V16QImode:
15934 if (CONST_INT_P (val))
15935 goto widen;
15936 if (TARGET_AVX2)
15937 return ix86_vector_duplicate_value (mode, target, val);
15939 if (TARGET_SSE2)
15940 goto permute;
15941 goto widen;
15943 widen:
15944 /* Replicate the value once into the next wider mode and recurse. */
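/* For example, duplicating a QImode value X into V16QImode first forms
   the HImode value (X << 8) | X and duplicates that into V8HImode;
   the result is then viewed in the original mode via gen_lowpart.  */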
15946 machine_mode smode, wsmode, wvmode;
15947 rtx x;
15949 smode = GET_MODE_INNER (mode);
15950 wvmode = get_mode_wider_vector (mode);
15951 wsmode = GET_MODE_INNER (wvmode);
15953 val = convert_modes (wsmode, smode, val, true);
15955 if (CONST_INT_P (val))
15957 x = simplify_binary_operation (ASHIFT, wsmode, val,
15958 GEN_INT (GET_MODE_BITSIZE (smode)));
15959 val = simplify_binary_operation (IOR, wsmode, val, x);
15961 else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15962 emit_insn (gen_insv_1 (wsmode, val, val));
15963 else
15965 x = expand_simple_binop (wsmode, ASHIFT, val,
15966 GEN_INT (GET_MODE_BITSIZE (smode)),
15967 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15968 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15969 OPTAB_LIB_WIDEN);
15972 x = gen_reg_rtx (wvmode);
15973 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15974 if (!ok)
15975 return false;
15976 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15977 return true;
15980 case E_V16HImode:
15981 case E_V32QImode:
15982 if (CONST_INT_P (val))
15983 goto widen;
15984 /* FALLTHRU */
15986 case E_V16HFmode:
15987 case E_V16BFmode:
15988 if (TARGET_AVX2)
15989 return ix86_vector_duplicate_value (mode, target, val);
15990 else
15992 machine_mode hvmode;
15993 switch (mode)
15995 case V16HImode:
15996 hvmode = V8HImode;
15997 break;
15998 case V16HFmode:
15999 hvmode = V8HFmode;
16000 break;
16001 case V16BFmode:
16002 hvmode = V8BFmode;
16003 break;
16004 case V32QImode:
16005 hvmode = V16QImode;
16006 break;
16007 default:
16008 gcc_unreachable ();
16010 rtx x = gen_reg_rtx (hvmode);
16012 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
16013 if (!ok)
16014 return false;
16016 x = gen_rtx_VEC_CONCAT (mode, x, x);
16017 emit_insn (gen_rtx_SET (target, x));
16019 return true;
16021 case E_V32HImode:
16022 case E_V32HFmode:
16023 case E_V32BFmode:
16024 case E_V64QImode:
16025 gcc_assert (TARGET_EVEX512);
16026 if (TARGET_AVX512BW)
16027 return ix86_vector_duplicate_value (mode, target, val);
16028 else
16030 machine_mode hvmode;
16031 switch (mode)
16033 case V32HImode:
16034 hvmode = V16HImode;
16035 break;
16036 case V32HFmode:
16037 hvmode = V16HFmode;
16038 break;
16039 case V32BFmode:
16040 hvmode = V16BFmode;
16041 break;
16042 case V64QImode:
16043 hvmode = V32QImode;
16044 break;
16045 default:
16046 gcc_unreachable ();
16048 rtx x = gen_reg_rtx (hvmode);
16050 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
16051 if (!ok)
16052 return false;
16054 x = gen_rtx_VEC_CONCAT (mode, x, x);
16055 emit_insn (gen_rtx_SET (target, x));
16057 return true;
16059 default:
16060 return false;
16064 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
16065 whose ONE_VAR element is VAR, and other elements are zero. Return true
16066 if successful. */
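/* For example, with MODE == V4SImode, ONE_VAR == 2 and VAR == x this
   builds { 0, 0, x, 0 }, either with a vec_set-style pattern or by
   broadcasting x and shuffling it into place as in the
   V4SFmode/V4SImode code below.  */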
16068 static bool
16069 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
16070 rtx target, rtx var, int one_var)
16072 machine_mode vsimode;
16073 rtx new_target;
16074 rtx x, tmp;
16075 bool use_vector_set = false;
16076 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
16078 if (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
16079 return false;
16081 switch (mode)
16083 case E_V2DImode:
16084 /* For SSE4.1, we normally use vector set. But if the second
16085 element is zero and inter-unit moves are OK, we use movq
16086 instead. */
16087 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
16088 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
16089 && one_var == 0));
16090 break;
16091 case E_V16QImode:
16092 case E_V4SImode:
16093 case E_V4SFmode:
16094 use_vector_set = TARGET_SSE4_1;
16095 break;
16096 case E_V8HImode:
16097 use_vector_set = TARGET_SSE2;
16098 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
16099 ? gen_vec_setv8hi_0 : NULL;
16100 break;
16101 case E_V8QImode:
16102 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16103 break;
16104 case E_V4HImode:
16105 case E_V4HFmode:
16106 case E_V4BFmode:
16107 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
16108 break;
16109 case E_V4QImode:
16110 use_vector_set = TARGET_SSE4_1;
16111 break;
16112 case E_V32QImode:
16113 use_vector_set = TARGET_AVX;
16114 break;
16115 case E_V16HImode:
16116 use_vector_set = TARGET_AVX;
16117 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
16118 ? gen_vec_setv16hi_0 : NULL;
16119 break;
16120 case E_V8SImode:
16121 use_vector_set = TARGET_AVX;
16122 gen_vec_set_0 = gen_vec_setv8si_0;
16123 break;
16124 case E_V8SFmode:
16125 use_vector_set = TARGET_AVX;
16126 gen_vec_set_0 = gen_vec_setv8sf_0;
16127 break;
16128 case E_V4DFmode:
16129 use_vector_set = TARGET_AVX;
16130 gen_vec_set_0 = gen_vec_setv4df_0;
16131 break;
16132 case E_V4DImode:
16133 /* Use ix86_expand_vector_set in 64bit mode only. */
16134 use_vector_set = TARGET_AVX && TARGET_64BIT;
16135 gen_vec_set_0 = gen_vec_setv4di_0;
16136 break;
16137 case E_V16SImode:
16138 use_vector_set = TARGET_AVX512F && one_var == 0;
16139 gen_vec_set_0 = gen_vec_setv16si_0;
16140 break;
16141 case E_V16SFmode:
16142 use_vector_set = TARGET_AVX512F && one_var == 0;
16143 gen_vec_set_0 = gen_vec_setv16sf_0;
16144 break;
16145 case E_V8DFmode:
16146 use_vector_set = TARGET_AVX512F && one_var == 0;
16147 gen_vec_set_0 = gen_vec_setv8df_0;
16148 break;
16149 case E_V8DImode:
16150 /* Use ix86_expand_vector_set in 64bit mode only. */
16151 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
16152 gen_vec_set_0 = gen_vec_setv8di_0;
16153 break;
16154 case E_V8HFmode:
16155 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16156 gen_vec_set_0 = gen_vec_setv8hf_0;
16157 break;
16158 case E_V16HFmode:
16159 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16160 gen_vec_set_0 = gen_vec_setv16hf_0;
16161 break;
16162 case E_V32HFmode:
16163 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16164 gen_vec_set_0 = gen_vec_setv32hf_0;
16165 break;
16166 case E_V8BFmode:
16167 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16168 gen_vec_set_0 = gen_vec_setv8bf_0;
16169 break;
16170 case E_V16BFmode:
16171 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16172 gen_vec_set_0 = gen_vec_setv16bf_0;
16173 break;
16174 case E_V32BFmode:
16175 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16176 gen_vec_set_0 = gen_vec_setv32bf_0;
16177 break;
16178 case E_V32HImode:
16179 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
16180 gen_vec_set_0 = gen_vec_setv32hi_0;
16181 default:
16182 break;
16185 if (use_vector_set)
16187 if (gen_vec_set_0 && one_var == 0)
16189 var = force_reg (GET_MODE_INNER (mode), var);
16190 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
16191 return true;
16193 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
16194 var = force_reg (GET_MODE_INNER (mode), var);
16195 ix86_expand_vector_set (mmx_ok, target, var, one_var);
16196 return true;
16199 switch (mode)
16201 case E_V2SFmode:
16202 case E_V2SImode:
16203 if (!mmx_ok)
16204 return false;
16205 /* FALLTHRU */
16207 case E_V2DFmode:
16208 case E_V2DImode:
16209 if (one_var != 0)
16210 return false;
16211 var = force_reg (GET_MODE_INNER (mode), var);
16212 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
16213 emit_insn (gen_rtx_SET (target, x));
16214 return true;
16216 case E_V4SFmode:
16217 case E_V4SImode:
16218 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
16219 new_target = gen_reg_rtx (mode);
16220 else
16221 new_target = target;
16222 var = force_reg (GET_MODE_INNER (mode), var);
16223 x = gen_rtx_VEC_DUPLICATE (mode, var);
16224 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
16225 emit_insn (gen_rtx_SET (new_target, x));
16226 if (one_var != 0)
16228 /* We need to shuffle the value to the correct position, so
16229 create a new pseudo to store the intermediate result. */
16231 /* With SSE2, we can use the integer shuffle insns. */
16232 if (mode != V4SFmode && TARGET_SSE2)
16234 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
16235 const1_rtx,
16236 GEN_INT (one_var == 1 ? 0 : 1),
16237 GEN_INT (one_var == 2 ? 0 : 1),
16238 GEN_INT (one_var == 3 ? 0 : 1)));
16239 if (target != new_target)
16240 emit_move_insn (target, new_target);
16241 return true;
16244 /* Otherwise convert the intermediate result to V4SFmode and
16245 use the SSE1 shuffle instructions. */
16246 if (mode != V4SFmode)
16248 tmp = gen_reg_rtx (V4SFmode);
16249 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
16251 else
16252 tmp = new_target;
16254 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
16255 const1_rtx,
16256 GEN_INT (one_var == 1 ? 0 : 1),
16257 GEN_INT (one_var == 2 ? 0+4 : 1+4),
16258 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
16260 if (mode != V4SFmode)
16261 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
16262 else if (tmp != target)
16263 emit_move_insn (target, tmp);
16265 else if (target != new_target)
16266 emit_move_insn (target, new_target);
16267 return true;
16269 case E_V8HImode:
16270 case E_V16QImode:
16271 vsimode = V4SImode;
16272 goto widen;
16273 case E_V4HImode:
16274 case E_V8QImode:
16275 if (!mmx_ok)
16276 return false;
16277 vsimode = V2SImode;
16278 goto widen;
16279 widen:
16280 if (one_var != 0)
16281 return false;
16283 /* Zero extend the variable element to SImode and recurse. */
16284 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
16286 x = gen_reg_rtx (vsimode);
16287 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
16288 var, one_var))
16289 gcc_unreachable ();
16291 emit_move_insn (target, gen_lowpart (mode, x));
16292 return true;
16294 default:
16295 return false;
16299 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
16300 consisting of the values in VALS. It is known that all elements
16301 except ONE_VAR are constants. Return true if successful. */
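/* For example, for (V4SI){ 1, 2, x, 4 } with ONE_VAR == 2 this loads
   the constant vector { 1, 2, 0, 4 } (the variable slot zeroed out)
   and then overwrites element 2 with x via ix86_expand_vector_set.  */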
16303 static bool
16304 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
16305 rtx target, rtx vals, int one_var)
16307 rtx var = XVECEXP (vals, 0, one_var);
16308 machine_mode wmode;
16309 rtx const_vec, x;
16311 const_vec = copy_rtx (vals);
16312 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
16313 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
16315 switch (mode)
16317 case E_V2DFmode:
16318 case E_V2DImode:
16319 case E_V2SFmode:
16320 case E_V2SImode:
16321 /* For the two element vectors, it's just as easy to use
16322 the general case. */
16323 return false;
16325 case E_V4DImode:
16326 /* Use ix86_expand_vector_set in 64bit mode only. */
16327 if (!TARGET_64BIT)
16328 return false;
16329 /* FALLTHRU */
16330 case E_V8HFmode:
16331 case E_V16HFmode:
16332 case E_V8BFmode:
16333 case E_V16BFmode:
16334 case E_V4DFmode:
16335 case E_V8SFmode:
16336 case E_V8SImode:
16337 case E_V16HImode:
16338 case E_V32QImode:
16339 case E_V4SFmode:
16340 case E_V4SImode:
16341 case E_V8HImode:
16342 case E_V4HImode:
16343 case E_V4HFmode:
16344 case E_V4BFmode:
16345 break;
16347 case E_V16QImode:
16348 if (TARGET_SSE4_1)
16349 break;
16350 wmode = V8HImode;
16351 goto widen;
16352 case E_V8QImode:
16353 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
16354 break;
16355 wmode = V4HImode;
16356 goto widen;
16357 case E_V4QImode:
16358 if (TARGET_SSE4_1)
16359 break;
16360 wmode = V2HImode;
16361 widen:
16362 /* There's no way to set one QImode entry easily. Combine
16363 the variable value with its adjacent constant value, and
16364 promote to an HImode set. */
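/* For example, setting byte 2 of a V16QImode vector combines bytes 2
   and 3 into a single HImode value (the variable byte in the low half
   and the neighbouring constant shifted into the high half, or the
   other way round for an odd index) and then performs a V8HImode set
   at index ONE_VAR >> 1, i.e. 1.  */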
16365 x = XVECEXP (vals, 0, one_var ^ 1);
16366 if (one_var & 1)
16368 var = convert_modes (HImode, QImode, var, true);
16369 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
16370 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16371 x = GEN_INT (INTVAL (x) & 0xff);
16373 else
16375 var = convert_modes (HImode, QImode, var, true);
16376 x = gen_int_mode (UINTVAL (x) << 8, HImode);
16378 if (x != const0_rtx)
16379 var = expand_simple_binop (HImode, IOR, var, x, var,
16380 1, OPTAB_LIB_WIDEN);
16382 x = gen_reg_rtx (wmode);
16383 emit_move_insn (x, gen_lowpart (wmode, const_vec));
16384 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
16386 emit_move_insn (target, gen_lowpart (mode, x));
16387 return true;
16389 default:
16390 return false;
16393 emit_move_insn (target, const_vec);
16394 ix86_expand_vector_set (mmx_ok, target, var, one_var);
16395 return true;
16398 /* A subroutine of ix86_expand_vector_init_general. Use vector
16399 concatenate to handle the most general case: all values variable,
16400 and none identical. */
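/* For example, (V4SI){ a, b, c, d } is built by first constructing the
   V2SImode halves { a, b } and { c, d } and then concatenating them
   with a VEC_CONCAT; wider vectors recurse through the "half" cases
   below.  */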
16402 static void
16403 ix86_expand_vector_init_concat (machine_mode mode,
16404 rtx target, rtx *ops, int n)
16406 machine_mode half_mode = VOIDmode;
16407 rtx half[2];
16408 rtvec v;
16409 int i, j;
16411 switch (n)
16413 case 2:
16414 switch (mode)
16416 case E_V32HFmode:
16417 half_mode = V16HFmode;
16418 break;
16419 case E_V32BFmode:
16420 half_mode = V16BFmode;
16421 break;
16422 case E_V16SImode:
16423 half_mode = V8SImode;
16424 break;
16425 case E_V16SFmode:
16426 half_mode = V8SFmode;
16427 break;
16428 case E_V8DImode:
16429 half_mode = V4DImode;
16430 break;
16431 case E_V8DFmode:
16432 half_mode = V4DFmode;
16433 break;
16434 case E_V16HFmode:
16435 half_mode = V8HFmode;
16436 break;
16437 case E_V16BFmode:
16438 half_mode = V8BFmode;
16439 break;
16440 case E_V8SImode:
16441 half_mode = V4SImode;
16442 break;
16443 case E_V8SFmode:
16444 half_mode = V4SFmode;
16445 break;
16446 case E_V4DImode:
16447 half_mode = V2DImode;
16448 break;
16449 case E_V4DFmode:
16450 half_mode = V2DFmode;
16451 break;
16452 case E_V4SImode:
16453 half_mode = V2SImode;
16454 break;
16455 case E_V4SFmode:
16456 half_mode = V2SFmode;
16457 break;
16458 case E_V2DImode:
16459 half_mode = DImode;
16460 break;
16461 case E_V2SImode:
16462 half_mode = SImode;
16463 break;
16464 case E_V2DFmode:
16465 half_mode = DFmode;
16466 break;
16467 case E_V2SFmode:
16468 half_mode = SFmode;
16469 break;
16470 default:
16471 gcc_unreachable ();
16474 if (!register_operand (ops[1], half_mode))
16475 ops[1] = force_reg (half_mode, ops[1]);
16476 if (!register_operand (ops[0], half_mode))
16477 ops[0] = force_reg (half_mode, ops[0]);
16478 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
16479 ops[1])));
16480 break;
16482 case 4:
16483 switch (mode)
16485 case E_V4DImode:
16486 half_mode = V2DImode;
16487 break;
16488 case E_V4DFmode:
16489 half_mode = V2DFmode;
16490 break;
16491 case E_V4SImode:
16492 half_mode = V2SImode;
16493 break;
16494 case E_V4SFmode:
16495 half_mode = V2SFmode;
16496 break;
16497 default:
16498 gcc_unreachable ();
16500 goto half;
16502 case 8:
16503 switch (mode)
16505 case E_V8DImode:
16506 half_mode = V4DImode;
16507 break;
16508 case E_V8DFmode:
16509 half_mode = V4DFmode;
16510 break;
16511 case E_V8SImode:
16512 half_mode = V4SImode;
16513 break;
16514 case E_V8SFmode:
16515 half_mode = V4SFmode;
16516 break;
16517 default:
16518 gcc_unreachable ();
16520 goto half;
16522 case 16:
16523 switch (mode)
16525 case E_V16SImode:
16526 half_mode = V8SImode;
16527 break;
16528 case E_V16SFmode:
16529 half_mode = V8SFmode;
16530 break;
16531 default:
16532 gcc_unreachable ();
16534 goto half;
16536 half:
16537 /* FIXME: We process inputs backward to help RA. PR 36222. */
16538 i = n - 1;
16539 for (j = 1; j != -1; j--)
16541 half[j] = gen_reg_rtx (half_mode);
16542 switch (n >> 1)
16544 case 2:
16545 v = gen_rtvec (2, ops[i-1], ops[i]);
16546 i -= 2;
16547 break;
16548 case 4:
16549 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
16550 i -= 4;
16551 break;
16552 case 8:
16553 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
16554 ops[i-3], ops[i-2], ops[i-1], ops[i]);
16555 i -= 8;
16556 break;
16557 default:
16558 gcc_unreachable ();
16560 ix86_expand_vector_init (false, half[j],
16561 gen_rtx_PARALLEL (half_mode, v));
16564 ix86_expand_vector_init_concat (mode, target, half, 2);
16565 break;
16567 default:
16568 gcc_unreachable ();
16572 /* A subroutine of ix86_expand_vector_init_general. Use vector
16573 interleave to handle the most general case: all values variable,
16574 and none identical. */
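/* For example, a V8HImode vector { a, b, c, d, e, f, g, h } is built
   by loading the pairs (a,b), (c,d), (e,f), (g,h) into four vectors
   and then interleaving their low parts first as V4SImode and then as
   V2DImode vectors; each step doubles the number of initialized
   elements per register.  */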
16576 static void
16577 ix86_expand_vector_init_interleave (machine_mode mode,
16578 rtx target, rtx *ops, int n)
16580 machine_mode first_imode, second_imode, third_imode, inner_mode;
16581 int i, j;
16582 rtx op, op0, op1;
16583 rtx (*gen_load_even) (rtx, rtx, rtx);
16584 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16585 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16587 switch (mode)
16589 case E_V8HFmode:
16590 gen_load_even = gen_vec_interleave_lowv8hf;
16591 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16592 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16593 inner_mode = HFmode;
16594 first_imode = V4SImode;
16595 second_imode = V2DImode;
16596 third_imode = VOIDmode;
16597 break;
16598 case E_V8BFmode:
16599 gen_load_even = gen_vec_interleave_lowv8bf;
16600 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16601 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16602 inner_mode = BFmode;
16603 first_imode = V4SImode;
16604 second_imode = V2DImode;
16605 third_imode = VOIDmode;
16606 break;
16607 case E_V8HImode:
16608 gen_load_even = gen_vec_setv8hi;
16609 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16610 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16611 inner_mode = HImode;
16612 first_imode = V4SImode;
16613 second_imode = V2DImode;
16614 third_imode = VOIDmode;
16615 break;
16616 case E_V16QImode:
16617 gen_load_even = gen_vec_setv16qi;
16618 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16619 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16620 inner_mode = QImode;
16621 first_imode = V8HImode;
16622 second_imode = V4SImode;
16623 third_imode = V2DImode;
16624 break;
16625 default:
16626 gcc_unreachable ();
16629 for (i = 0; i < n; i++)
16631 op = ops [i + i];
16632 if (inner_mode == HFmode || inner_mode == BFmode)
16634 rtx even, odd;
16635 /* Use vpunpcklwd to pack two HFmode or BFmode values. */
16636 machine_mode vec_mode =
16637 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16638 op0 = gen_reg_rtx (vec_mode);
16639 even = lowpart_subreg (vec_mode,
16640 force_reg (inner_mode, op), inner_mode);
16641 odd = lowpart_subreg (vec_mode,
16642 force_reg (inner_mode, ops[i + i + 1]),
16643 inner_mode);
16644 emit_insn (gen_load_even (op0, even, odd));
16646 else
16648 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16649 op0 = gen_reg_rtx (SImode);
16650 emit_move_insn (op0, gen_lowpart (SImode, op));
16652 /* Insert the SImode value as the low element of a V4SImode vector. */
16653 op1 = gen_reg_rtx (V4SImode);
16654 op0 = gen_rtx_VEC_MERGE (V4SImode,
16655 gen_rtx_VEC_DUPLICATE (V4SImode,
16656 op0),
16657 CONST0_RTX (V4SImode),
16658 const1_rtx);
16659 emit_insn (gen_rtx_SET (op1, op0));
16661 /* Cast the V4SImode vector back to a vector in the original mode. */
16662 op0 = gen_reg_rtx (mode);
16663 emit_move_insn (op0, gen_lowpart (mode, op1));
16665 /* Load even elements into the second position. */
16666 emit_insn (gen_load_even (op0,
16667 force_reg (inner_mode,
16668 ops[i + i + 1]),
16669 const1_rtx));
16672 /* Cast vector to FIRST_IMODE vector. */
16673 ops[i] = gen_reg_rtx (first_imode);
16674 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16677 /* Interleave low FIRST_IMODE vectors. */
16678 for (i = j = 0; i < n; i += 2, j++)
16680 op0 = gen_reg_rtx (first_imode);
16681 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16683 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16684 ops[j] = gen_reg_rtx (second_imode);
16685 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16688 /* Interleave low SECOND_IMODE vectors. */
16689 switch (second_imode)
16691 case E_V4SImode:
16692 for (i = j = 0; i < n / 2; i += 2, j++)
16694 op0 = gen_reg_rtx (second_imode);
16695 emit_insn (gen_interleave_second_low (op0, ops[i],
16696 ops[i + 1]));
16698 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16699 vector. */
16700 ops[j] = gen_reg_rtx (third_imode);
16701 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16703 second_imode = V2DImode;
16704 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16705 /* FALLTHRU */
16707 case E_V2DImode:
16708 op0 = gen_reg_rtx (second_imode);
16709 emit_insn (gen_interleave_second_low (op0, ops[0],
16710 ops[1]));
16712 /* Cast the SECOND_IMODE vector back to a vector in the
16713 original mode. */
16714 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16715 break;
16717 default:
16718 gcc_unreachable ();
16722 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16723 all values variable, and none identical. */
16725 static void
16726 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16727 rtx target, rtx vals)
16729 rtx ops[64], op0, op1, op2, op3, op4, op5;
16730 machine_mode half_mode = VOIDmode;
16731 machine_mode quarter_mode = VOIDmode;
16732 machine_mode int_inner_mode = VOIDmode;
16733 int n, i;
16735 switch (mode)
16737 case E_V2SFmode:
16738 case E_V2SImode:
16739 if (!mmx_ok && !TARGET_SSE)
16740 break;
16741 /* FALLTHRU */
16743 case E_V16SImode:
16744 case E_V16SFmode:
16745 case E_V8DFmode:
16746 case E_V8DImode:
16747 case E_V8SFmode:
16748 case E_V8SImode:
16749 case E_V4DFmode:
16750 case E_V4DImode:
16751 case E_V4SFmode:
16752 case E_V4SImode:
16753 case E_V2DFmode:
16754 case E_V2DImode:
16755 n = GET_MODE_NUNITS (mode);
16756 for (i = 0; i < n; i++)
16757 ops[i] = XVECEXP (vals, 0, i);
16758 ix86_expand_vector_init_concat (mode, target, ops, n);
16759 return;
16761 case E_V2TImode:
16762 for (i = 0; i < 2; i++)
16763 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16764 op0 = gen_reg_rtx (V4DImode);
16765 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16766 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16767 return;
16769 case E_V4TImode:
16770 for (i = 0; i < 4; i++)
16771 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16772 ops[4] = gen_reg_rtx (V4DImode);
16773 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16774 ops[5] = gen_reg_rtx (V4DImode);
16775 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16776 op0 = gen_reg_rtx (V8DImode);
16777 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16778 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16779 return;
16781 case E_V32QImode:
16782 half_mode = V16QImode;
16783 goto half;
16785 case E_V16HImode:
16786 half_mode = V8HImode;
16787 goto half;
16789 case E_V16HFmode:
16790 half_mode = V8HFmode;
16791 goto half;
16793 case E_V16BFmode:
16794 half_mode = V8BFmode;
16795 goto half;
16797 half:
16798 n = GET_MODE_NUNITS (mode);
16799 for (i = 0; i < n; i++)
16800 ops[i] = XVECEXP (vals, 0, i);
16801 op0 = gen_reg_rtx (half_mode);
16802 op1 = gen_reg_rtx (half_mode);
16803 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16804 n >> 2);
16805 ix86_expand_vector_init_interleave (half_mode, op1,
16806 &ops [n >> 1], n >> 2);
16807 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16808 return;
16810 case E_V64QImode:
16811 quarter_mode = V16QImode;
16812 half_mode = V32QImode;
16813 goto quarter;
16815 case E_V32HImode:
16816 quarter_mode = V8HImode;
16817 half_mode = V16HImode;
16818 goto quarter;
16820 case E_V32HFmode:
16821 quarter_mode = V8HFmode;
16822 half_mode = V16HFmode;
16823 goto quarter;
16825 case E_V32BFmode:
16826 quarter_mode = V8BFmode;
16827 half_mode = V16BFmode;
16828 goto quarter;
16830 quarter:
16831 n = GET_MODE_NUNITS (mode);
16832 for (i = 0; i < n; i++)
16833 ops[i] = XVECEXP (vals, 0, i);
16834 op0 = gen_reg_rtx (quarter_mode);
16835 op1 = gen_reg_rtx (quarter_mode);
16836 op2 = gen_reg_rtx (quarter_mode);
16837 op3 = gen_reg_rtx (quarter_mode);
16838 op4 = gen_reg_rtx (half_mode);
16839 op5 = gen_reg_rtx (half_mode);
16840 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16841 n >> 3);
16842 ix86_expand_vector_init_interleave (quarter_mode, op1,
16843 &ops [n >> 2], n >> 3);
16844 ix86_expand_vector_init_interleave (quarter_mode, op2,
16845 &ops [n >> 1], n >> 3);
16846 ix86_expand_vector_init_interleave (quarter_mode, op3,
16847 &ops [(n >> 1) | (n >> 2)], n >> 3);
16848 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16849 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16850 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16851 return;
16853 case E_V16QImode:
16854 if (!TARGET_SSE4_1)
16855 break;
16856 /* FALLTHRU */
16858 case E_V8HImode:
16859 if (!TARGET_SSE2)
16860 break;
16862 /* Don't use ix86_expand_vector_init_interleave if we can't
16863 move from GPR to SSE register directly. */
16864 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16865 break;
16866 /* FALLTHRU */
16868 case E_V8HFmode:
16869 case E_V8BFmode:
16871 n = GET_MODE_NUNITS (mode);
16872 for (i = 0; i < n; i++)
16873 ops[i] = XVECEXP (vals, 0, i);
16874 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16875 return;
16877 case E_V4HFmode:
16878 case E_V4BFmode:
16879 case E_V2HFmode:
16880 case E_V2BFmode:
16881 int_inner_mode = HImode;
16882 break;
16884 case E_V4HImode:
16885 case E_V8QImode:
16887 case E_V2HImode:
16888 case E_V4QImode:
16889 break;
16891 default:
16892 gcc_unreachable ();
16896 int i, j, n_elts, n_words, n_elt_per_word;
16897 machine_mode tmp_mode, inner_mode;
16898 rtx words[4], shift;
16900 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16902 inner_mode = GET_MODE_INNER (mode);
16903 n_elts = GET_MODE_NUNITS (mode);
16904 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16905 n_elt_per_word = n_elts / n_words;
16906 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16908 for (i = 0; i < n_words; ++i)
16910 rtx word = NULL_RTX;
16912 for (j = 0; j < n_elt_per_word; ++j)
16914 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16915 if (int_inner_mode != E_VOIDmode)
16917 gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
16918 rtx tmp = gen_reg_rtx (int_inner_mode);
16919 elt = lowpart_subreg (int_inner_mode,
16920 force_reg (inner_mode, elt),
16921 inner_mode);
16922 emit_move_insn (tmp, elt);
16923 elt = tmp;
16925 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16927 if (j == 0)
16928 word = elt;
16929 else
16931 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16932 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16933 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16934 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16938 words[i] = word;
16941 if (n_words == 1)
16942 emit_move_insn (target, gen_lowpart (mode, words[0]));
16943 else if (n_words == 2)
16945 gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
16946 machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
16947 rtx tmp = gen_reg_rtx (concat_mode);
16948 vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
16949 ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
16950 emit_move_insn (target, gen_lowpart (mode, tmp));
16952 else if (n_words == 4)
16954 rtx tmp = gen_reg_rtx (V4SImode);
16955 gcc_assert (tmp_mode == SImode);
16956 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16957 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16958 emit_move_insn (target, gen_lowpart (mode, tmp));
16960 else
16961 gcc_unreachable ();
16965 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16966 instructions unless MMX_OK is true. */
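/* Strategy, as implemented below: if VALS has fewer, wider elements
   than TARGET, build it as a concatenation of sub-vectors; if all
   elements are identical, broadcast one of them; if all are constant,
   load the whole vector from the constant pool; if exactly one element
   is variable, use the one-nonzero/one-var helpers; otherwise fall
   back to ix86_expand_vector_init_general.  */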
16968 void
16969 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16971 machine_mode mode = GET_MODE (target);
16972 machine_mode inner_mode = GET_MODE_INNER (mode);
16973 int n_elts = GET_MODE_NUNITS (mode);
16974 int n_var = 0, one_var = -1;
16975 bool all_same = true, all_const_zero = true;
16976 int i;
16977 rtx x;
16979 /* Handle first initialization from vector elts. */
16980 if (n_elts != XVECLEN (vals, 0))
16982 rtx subtarget = target;
16983 x = XVECEXP (vals, 0, 0);
16984 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16985 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16987 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16988 if (inner_mode == QImode
16989 || inner_mode == HImode
16990 || inner_mode == TImode
16991 || inner_mode == HFmode
16992 || inner_mode == BFmode)
16994 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16995 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16996 n_bits /= GET_MODE_SIZE (elt_mode);
16997 mode = mode_for_vector (elt_mode, n_bits).require ();
16998 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16999 ops[0] = gen_lowpart (inner_mode, ops[0]);
17000 ops[1] = gen_lowpart (inner_mode, ops[1]);
17001 subtarget = gen_reg_rtx (mode);
17003 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
17004 if (subtarget != target)
17005 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
17006 return;
17008 gcc_unreachable ();
17011 for (i = 0; i < n_elts; ++i)
17013 x = XVECEXP (vals, 0, i);
17014 if (!(CONST_SCALAR_INT_P (x)
17015 || CONST_DOUBLE_P (x)
17016 || CONST_FIXED_P (x)))
17017 n_var++, one_var = i;
17018 else if (x != CONST0_RTX (inner_mode))
17019 all_const_zero = false;
17020 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
17021 all_same = false;
17024 /* If all values are identical, broadcast the value. */
17025 if (all_same
17026 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
17027 XVECEXP (vals, 0, 0)))
17028 return;
17030 /* Constants are best loaded from the constant pool. */
17031 if (n_var == 0)
17033 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
17034 return;
17037 /* Values where only one field is non-constant are best loaded from
17038 the pool and overwritten via move later. */
17039 if (n_var == 1)
17041 if (all_const_zero
17042 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
17043 XVECEXP (vals, 0, one_var),
17044 one_var))
17045 return;
17047 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
17048 return;
17051 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
17054 /* Implemented as
17055 V setg (V v, int idx, T val)
17057 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
17058 V valv = (V){val, val, val, val, val, val, val, val};
17059 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
17060 v = (v & ~mask) | (valv & mask);
17061 return v;
17062 }. */
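/* For example, with V4SImode, idx == 2 and val == x:
   mask = ((V4SI){ 0, 1, 2, 3 } == (V4SI){ 2, 2, 2, 2 })
        = (V4SI){ 0, 0, -1, 0 },
   so v = (v & ~mask) | (valv & mask) keeps elements 0, 1 and 3 and
   replaces element 2 with x.  */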
17063 void
17064 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
17066 rtx vec[64];
17067 machine_mode mode = GET_MODE (target);
17068 machine_mode cmp_mode = mode;
17069 int n_elts = GET_MODE_NUNITS (mode);
17070 rtx valv, idxv, constv, idx_tmp;
17071 bool ok = false;
17073 /* 512-bit vector byte/word broadcast and comparison are only available
17074 under TARGET_AVX512BW, so without TARGET_AVX512BW break the 512-bit
17075 vector into two 256-bit vectors. */
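/* For example, for V64QImode without TARGET_AVX512BW and a run-time
   index of 40, the low V32QImode half is compared against indices
   0..31 (no match, so it is left unchanged) while the high half is
   handled with idx_hi == 40 - 32 == 8, replacing byte 8 of the high
   half, i.e. byte 40 of the full vector, before the halves are
   concatenated back.  */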
17076 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
17077 || mode == V64QImode)
17078 && !TARGET_AVX512BW)
17080 gcc_assert (TARGET_AVX512F);
17081 rtx vhi, vlo, idx_hi;
17082 machine_mode half_mode;
17083 rtx (*extract_hi)(rtx, rtx);
17084 rtx (*extract_lo)(rtx, rtx);
17086 if (mode == V32HImode)
17088 half_mode = V16HImode;
17089 extract_hi = gen_vec_extract_hi_v32hi;
17090 extract_lo = gen_vec_extract_lo_v32hi;
17092 else if (mode == V32HFmode)
17094 half_mode = V16HFmode;
17095 extract_hi = gen_vec_extract_hi_v32hf;
17096 extract_lo = gen_vec_extract_lo_v32hf;
17098 else if (mode == V32BFmode)
17100 half_mode = V16BFmode;
17101 extract_hi = gen_vec_extract_hi_v32bf;
17102 extract_lo = gen_vec_extract_lo_v32bf;
17104 else
17106 half_mode = V32QImode;
17107 extract_hi = gen_vec_extract_hi_v64qi;
17108 extract_lo = gen_vec_extract_lo_v64qi;
17111 vhi = gen_reg_rtx (half_mode);
17112 vlo = gen_reg_rtx (half_mode);
17113 idx_hi = gen_reg_rtx (GET_MODE (idx));
17114 emit_insn (extract_hi (vhi, target));
17115 emit_insn (extract_lo (vlo, target));
17116 vec[0] = idx_hi;
17117 vec[1] = idx;
17118 vec[2] = GEN_INT (n_elts/2);
17119 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
17120 ix86_expand_vector_set_var (vhi, val, idx_hi);
17121 ix86_expand_vector_set_var (vlo, val, idx);
17122 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
17123 return;
17126 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
17128 switch (mode)
17130 case E_V2DFmode:
17131 cmp_mode = V2DImode;
17132 break;
17133 case E_V4DFmode:
17134 cmp_mode = V4DImode;
17135 break;
17136 case E_V8DFmode:
17137 cmp_mode = V8DImode;
17138 break;
17139 case E_V2SFmode:
17140 cmp_mode = V2SImode;
17141 break;
17142 case E_V4SFmode:
17143 cmp_mode = V4SImode;
17144 break;
17145 case E_V8SFmode:
17146 cmp_mode = V8SImode;
17147 break;
17148 case E_V16SFmode:
17149 cmp_mode = V16SImode;
17150 break;
17151 case E_V2HFmode:
17152 case E_V2BFmode:
17153 cmp_mode = V2HImode;
17154 break;
17155 case E_V4HFmode:
17156 case E_V4BFmode:
17157 cmp_mode = V4HImode;
17158 break;
17159 case E_V8HFmode:
17160 cmp_mode = V8HImode;
17161 break;
17162 case E_V16HFmode:
17163 cmp_mode = V16HImode;
17164 break;
17165 case E_V32HFmode:
17166 cmp_mode = V32HImode;
17167 break;
17168 case E_V8BFmode:
17169 cmp_mode = V8HImode;
17170 break;
17171 case E_V16BFmode:
17172 cmp_mode = V16HImode;
17173 break;
17174 case E_V32BFmode:
17175 cmp_mode = V32HImode;
17176 break;
17177 default:
17178 gcc_unreachable ();
17182 for (int i = 0; i != n_elts; i++)
17183 vec[i] = GEN_INT (i);
17184 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
17185 valv = gen_reg_rtx (mode);
17186 idxv = gen_reg_rtx (cmp_mode);
17187 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
17189 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
17190 mode, valv, val);
17191 gcc_assert (ok);
17192 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
17193 cmp_mode, idxv, idx_tmp);
17194 gcc_assert (ok);
17195 vec[0] = target;
17196 vec[1] = valv;
17197 vec[2] = target;
17198 vec[3] = gen_rtx_EQ (mode, idxv, constv);
17199 vec[4] = idxv;
17200 vec[5] = constv;
17201 ok = ix86_expand_int_vcond (vec);
17202 gcc_assert (ok);
17205 void
17206 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
17208 machine_mode mode = GET_MODE (target);
17209 machine_mode inner_mode = GET_MODE_INNER (mode);
17210 machine_mode half_mode;
17211 bool use_vec_merge = false;
17212 bool blendm_const = false;
17213 rtx tmp;
17214 static rtx (*gen_extract[8][2]) (rtx, rtx)
17216 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
17217 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
17218 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
17219 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
17220 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
17221 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
17222 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
17223 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
17225 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
17227 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
17228 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
17229 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
17230 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
17231 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
17232 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
17233 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
17234 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
17236 int i, j, n;
17237 machine_mode mmode = VOIDmode;
17238 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
17240 switch (mode)
17242 case E_V2SImode:
17243 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17244 if (use_vec_merge)
17245 break;
17246 /* FALLTHRU */
17248 case E_V2SFmode:
17249 if (mmx_ok)
17251 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
17252 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
17253 if (elt == 0)
17254 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
17255 else
17256 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
17257 emit_insn (gen_rtx_SET (target, tmp));
17258 return;
17260 break;
17262 case E_V2DImode:
17263 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
17264 if (use_vec_merge)
17265 break;
17267 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
17268 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
17269 if (elt == 0)
17270 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
17271 else
17272 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
17273 emit_insn (gen_rtx_SET (target, tmp));
17274 return;
17276 case E_V2DFmode:
17277 /* NB: For ELT == 0, use standard scalar operation patterns which
17278 preserve the rest of the vector for combiner:
17280 (vec_merge:V2DF
17281 (vec_duplicate:V2DF (reg:DF))
17282 (reg:V2DF)
17283 (const_int 1))
17285 if (elt == 0)
17286 goto do_vec_merge;
17289 rtx op0, op1;
17291 /* For the two element vectors, we implement a VEC_CONCAT with
17292 the extraction of the other element. */
17294 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
17295 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
17297 if (elt == 0)
17298 op0 = val, op1 = tmp;
17299 else
17300 op0 = tmp, op1 = val;
17302 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
17303 emit_insn (gen_rtx_SET (target, tmp));
17305 return;
17307 case E_V4SFmode:
17308 use_vec_merge = TARGET_SSE4_1;
17309 if (use_vec_merge)
17310 break;
17312 switch (elt)
17314 case 0:
17315 use_vec_merge = true;
17316 break;
17318 case 1:
17319 /* tmp = target = A B C D */
17320 tmp = copy_to_reg (target);
17321 /* target = A A B B */
17322 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
17323 /* target = X A B B */
17324 ix86_expand_vector_set (false, target, val, 0);
17325 /* target = A X C D */
17326 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17327 const1_rtx, const0_rtx,
17328 GEN_INT (2+4), GEN_INT (3+4)));
17329 return;
17331 case 2:
17332 /* tmp = target = A B C D */
17333 tmp = copy_to_reg (target);
17334 /* tmp = X B C D */
17335 ix86_expand_vector_set (false, tmp, val, 0);
17336 /* target = A B X D */
17337 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17338 const0_rtx, const1_rtx,
17339 GEN_INT (0+4), GEN_INT (3+4)));
17340 return;
17342 case 3:
17343 /* tmp = target = A B C D */
17344 tmp = copy_to_reg (target);
17345 /* tmp = X B C D */
17346 ix86_expand_vector_set (false, tmp, val, 0);
17347 /* target = A B X D */
17348 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17349 const0_rtx, const1_rtx,
17350 GEN_INT (2+4), GEN_INT (0+4)));
17351 return;
17353 default:
17354 gcc_unreachable ();
17356 break;
17358 case E_V4SImode:
17359 use_vec_merge = TARGET_SSE4_1;
17360 if (use_vec_merge)
17361 break;
17363 /* Element 0 handled by vec_merge below. */
17364 if (elt == 0)
17366 use_vec_merge = true;
17367 break;
17370 if (TARGET_SSE2)
17372 /* With SSE2, use integer shuffles to swap element 0 and ELT,
17373 store into element 0, then shuffle them back. */
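/* For example, for ELT == 2 the pshufd order is { 2, 1, 0, 3 }, which
   swaps elements 0 and 2; the new value is then written to element 0
   and the same permutation (its own inverse) is applied again to move
   it back to element 2.  */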
17375 rtx order[4];
17377 order[0] = GEN_INT (elt);
17378 order[1] = const1_rtx;
17379 order[2] = const2_rtx;
17380 order[3] = GEN_INT (3);
17381 order[elt] = const0_rtx;
17383 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
17384 order[1], order[2], order[3]));
17386 ix86_expand_vector_set (false, target, val, 0);
17388 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
17389 order[1], order[2], order[3]));
17391 else
17393 /* For SSE1, we have to reuse the V4SF code. */
17394 rtx t = gen_reg_rtx (V4SFmode);
17395 emit_move_insn (t, gen_lowpart (V4SFmode, target));
17396 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
17397 emit_move_insn (target, gen_lowpart (mode, t));
17399 return;
17401 case E_V8HImode:
17402 case E_V8HFmode:
17403 case E_V8BFmode:
17404 case E_V2HImode:
17405 case E_V2HFmode:
17406 case E_V2BFmode:
17407 use_vec_merge = TARGET_SSE2;
17408 break;
17409 case E_V4HImode:
17410 case E_V4HFmode:
17411 case E_V4BFmode:
17412 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17413 break;
17415 case E_V16QImode:
17416 case E_V4QImode:
17417 use_vec_merge = TARGET_SSE4_1;
17418 break;
17420 case E_V8QImode:
17421 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17422 break;
17424 case E_V32QImode:
17425 half_mode = V16QImode;
17426 j = 0;
17427 n = 16;
17428 goto half;
17430 case E_V16HFmode:
17431 case E_V16BFmode:
17432 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
17433 if (TARGET_AVX2 && elt != 0)
17435 mmode = SImode;
17436 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
17437 : gen_avx2_pblendbf_1);
17438 blendm_const = true;
17439 break;
17441 else
17443 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
17444 j = ((mode == E_V16HFmode) ? 6 : 7);
17445 n = 8;
17446 goto half;
17449 case E_V16HImode:
17450 half_mode = V8HImode;
17451 j = 1;
17452 n = 8;
17453 goto half;
17455 case E_V8SImode:
17456 half_mode = V4SImode;
17457 j = 2;
17458 n = 4;
17459 goto half;
17461 case E_V4DImode:
17462 half_mode = V2DImode;
17463 j = 3;
17464 n = 2;
17465 goto half;
17467 case E_V8SFmode:
17468 half_mode = V4SFmode;
17469 j = 4;
17470 n = 4;
17471 goto half;
17473 case E_V4DFmode:
17474 half_mode = V2DFmode;
17475 j = 5;
17476 n = 2;
17477 goto half;
17479 half:
17480 /* Compute offset. */
17481 i = elt / n;
17482 elt %= n;
17484 gcc_assert (i <= 1);
17486 /* Extract the half. */
17487 tmp = gen_reg_rtx (half_mode);
17488 emit_insn (gen_extract[j][i] (tmp, target));
17490 /* Put val in tmp at elt. */
17491 ix86_expand_vector_set (false, tmp, val, elt);
17493 /* Put it back. */
17494 emit_insn (gen_insert[j][i] (target, target, tmp));
17495 return;
17497 case E_V8DFmode:
17498 if (TARGET_AVX512F)
17500 mmode = QImode;
17501 gen_blendm = gen_avx512f_blendmv8df;
17503 break;
17505 case E_V8DImode:
17506 if (TARGET_AVX512F)
17508 mmode = QImode;
17509 gen_blendm = gen_avx512f_blendmv8di;
17511 break;
17513 case E_V16SFmode:
17514 if (TARGET_AVX512F)
17516 mmode = HImode;
17517 gen_blendm = gen_avx512f_blendmv16sf;
17519 break;
17521 case E_V16SImode:
17522 if (TARGET_AVX512F)
17524 mmode = HImode;
17525 gen_blendm = gen_avx512f_blendmv16si;
17527 break;
17529 case E_V32HFmode:
17530 if (TARGET_AVX512BW)
17532 mmode = SImode;
17533 gen_blendm = gen_avx512bw_blendmv32hf;
17535 break;
17536 case E_V32BFmode:
17537 if (TARGET_AVX512BW)
17539 mmode = SImode;
17540 gen_blendm = gen_avx512bw_blendmv32bf;
17542 break;
17543 case E_V32HImode:
17544 if (TARGET_AVX512BW)
17546 mmode = SImode;
17547 gen_blendm = gen_avx512bw_blendmv32hi;
17549 else if (TARGET_AVX512F)
17551 half_mode = E_V8HImode;
17552 n = 8;
17553 goto quarter;
17555 break;
17557 case E_V64QImode:
17558 if (TARGET_AVX512BW)
17560 mmode = DImode;
17561 gen_blendm = gen_avx512bw_blendmv64qi;
17563 else if (TARGET_AVX512F)
17565 half_mode = E_V16QImode;
17566 n = 16;
17567 goto quarter;
17569 break;
17571 quarter:
17572 /* Compute offset. */
17573 i = elt / n;
17574 elt %= n;
17576 gcc_assert (i <= 3);
17579 /* Extract the quarter. */
17580 tmp = gen_reg_rtx (V4SImode);
17581 rtx tmp2 = gen_lowpart (V16SImode, target);
17582 rtx mask = gen_reg_rtx (QImode);
17584 emit_move_insn (mask, constm1_rtx);
17585 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
17586 tmp, mask));
17588 tmp2 = gen_reg_rtx (half_mode);
17589 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17590 tmp = tmp2;
17592 /* Put val in tmp at elt. */
17593 ix86_expand_vector_set (false, tmp, val, elt);
17595 /* Put it back. */
17596 tmp2 = gen_reg_rtx (V16SImode);
17597 rtx tmp3 = gen_lowpart (V16SImode, target);
17598 mask = gen_reg_rtx (HImode);
17599 emit_move_insn (mask, constm1_rtx);
17600 tmp = gen_lowpart (V4SImode, tmp);
17601 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17602 tmp3, mask));
17603 emit_move_insn (target, gen_lowpart (mode, tmp2));
17605 return;
17607 default:
17608 break;
17611 if (mmode != VOIDmode)
17613 tmp = gen_reg_rtx (mode);
17614 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
17615 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
17616 /* The avx512*_blendm<mode> expanders have a different operand order
17617 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17618 elements where the mask is set and the second input operand otherwise;
17619 in {sse,avx}*_*blend* the first input operand is used for elements
17620 where the mask is clear and the second input operand otherwise. */
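/* Concretely: where bit I of the mask is set, VEC_MERGE selects
   element I of its first input, while the blendm expanders select
   element I of their second input; that is why TMP (the broadcast
   new value) is passed as the second input below.  */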
17621 if (!blendm_const)
17622 merge_mask = force_reg (mmode, merge_mask);
17623 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17625 else if (use_vec_merge)
17627 do_vec_merge:
17628 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17629 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17630 GEN_INT (HOST_WIDE_INT_1U << elt));
17631 emit_insn (gen_rtx_SET (target, tmp));
17633 else
17635 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17637 emit_move_insn (mem, target);
17639 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17640 emit_move_insn (tmp, val);
17642 emit_move_insn (target, mem);
17646 void
17647 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17649 machine_mode mode = GET_MODE (vec);
17650 machine_mode inner_mode = GET_MODE_INNER (mode);
17651 bool use_vec_extr = false;
17652 rtx tmp;
17654 switch (mode)
17656 case E_V2SImode:
17657 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17658 if (use_vec_extr)
17659 break;
17660 /* FALLTHRU */
17662 case E_V2SFmode:
17663 if (!mmx_ok)
17664 break;
17665 /* FALLTHRU */
17667 case E_V2DFmode:
17668 case E_V2DImode:
17669 case E_V2TImode:
17670 case E_V4TImode:
17671 use_vec_extr = true;
17672 break;
17674 case E_V4SFmode:
17675 use_vec_extr = TARGET_SSE4_1;
17676 if (use_vec_extr)
17677 break;
17679 switch (elt)
17681 case 0:
17682 tmp = vec;
17683 break;
17685 case 1:
17686 case 3:
17687 tmp = gen_reg_rtx (mode);
17688 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17689 GEN_INT (elt), GEN_INT (elt),
17690 GEN_INT (elt+4), GEN_INT (elt+4)));
17691 break;
17693 case 2:
17694 tmp = gen_reg_rtx (mode);
17695 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17696 break;
17698 default:
17699 gcc_unreachable ();
17701 vec = tmp;
17702 use_vec_extr = true;
17703 elt = 0;
17704 break;
17706 case E_V4SImode:
17707 use_vec_extr = TARGET_SSE4_1;
17708 if (use_vec_extr)
17709 break;
17711 if (TARGET_SSE2)
17713 switch (elt)
17715 case 0:
17716 tmp = vec;
17717 break;
17719 case 1:
17720 case 3:
17721 tmp = gen_reg_rtx (mode);
17722 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17723 GEN_INT (elt), GEN_INT (elt),
17724 GEN_INT (elt), GEN_INT (elt)));
17725 break;
17727 case 2:
17728 tmp = gen_reg_rtx (mode);
17729 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17730 break;
17732 default:
17733 gcc_unreachable ();
17735 vec = tmp;
17736 use_vec_extr = true;
17737 elt = 0;
17739 else
17741 /* For SSE1, we have to reuse the V4SF code. */
17742 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17743 gen_lowpart (V4SFmode, vec), elt);
17744 return;
17746 break;
17748 case E_V8HImode:
17749 case E_V8HFmode:
17750 case E_V8BFmode:
17751 case E_V2HImode:
17752 case E_V2HFmode:
17753 case E_V2BFmode:
17754 use_vec_extr = TARGET_SSE2;
17755 break;
17756 case E_V4HImode:
17757 case E_V4HFmode:
17758 case E_V4BFmode:
17759 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17760 break;
17762 case E_V16QImode:
17763 use_vec_extr = TARGET_SSE4_1;
17764 if (!use_vec_extr
17765 && TARGET_SSE2
17766 && elt == 0
17767 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17769 tmp = gen_reg_rtx (SImode);
17770 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17772 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17773 return;
17775 break;
17776 case E_V4QImode:
17777 use_vec_extr = TARGET_SSE4_1;
17778 break;
17780 case E_V8SFmode:
17781 if (TARGET_AVX)
17783 tmp = gen_reg_rtx (V4SFmode);
17784 if (elt < 4)
17785 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17786 else
17787 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17788 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17789 return;
17791 break;
17793 case E_V4DFmode:
17794 if (TARGET_AVX)
17796 tmp = gen_reg_rtx (V2DFmode);
17797 if (elt < 2)
17798 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17799 else
17800 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17801 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17802 return;
17804 break;
17806 case E_V32QImode:
17807 if (TARGET_AVX)
17809 tmp = gen_reg_rtx (V16QImode);
17810 if (elt < 16)
17811 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17812 else
17813 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17814 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17815 return;
17817 break;
17819 case E_V16HImode:
17820 if (TARGET_AVX)
17822 tmp = gen_reg_rtx (V8HImode);
17823 if (elt < 8)
17824 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17825 else
17826 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17827 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17828 return;
17830 break;
17832 case E_V8SImode:
17833 if (TARGET_AVX)
17835 tmp = gen_reg_rtx (V4SImode);
17836 if (elt < 4)
17837 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17838 else
17839 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17840 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17841 return;
17843 break;
17845 case E_V4DImode:
17846 if (TARGET_AVX)
17848 tmp = gen_reg_rtx (V2DImode);
17849 if (elt < 2)
17850 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17851 else
17852 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17853 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17854 return;
17856 break;
17858 case E_V32HImode:
17859 if (TARGET_AVX512BW)
17861 tmp = gen_reg_rtx (V16HImode);
17862 if (elt < 16)
17863 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17864 else
17865 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17866 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17867 return;
17869 break;
17871 case E_V64QImode:
17872 if (TARGET_AVX512BW)
17874 tmp = gen_reg_rtx (V32QImode);
17875 if (elt < 32)
17876 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17877 else
17878 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17879 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17880 return;
17882 break;
17884 case E_V16SFmode:
17885 tmp = gen_reg_rtx (V8SFmode);
17886 if (elt < 8)
17887 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17888 else
17889 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17890 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17891 return;
17893 case E_V8DFmode:
17894 tmp = gen_reg_rtx (V4DFmode);
17895 if (elt < 4)
17896 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17897 else
17898 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17899 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17900 return;
17902 case E_V16SImode:
17903 tmp = gen_reg_rtx (V8SImode);
17904 if (elt < 8)
17905 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17906 else
17907 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17908 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17909 return;
17911 case E_V8DImode:
17912 tmp = gen_reg_rtx (V4DImode);
17913 if (elt < 4)
17914 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17915 else
17916 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17917 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17918 return;
17920 case E_V32HFmode:
17921 case E_V32BFmode:
17922 if (TARGET_AVX512BW)
17924 tmp = (mode == E_V32HFmode
17925 ? gen_reg_rtx (V16HFmode)
17926 : gen_reg_rtx (V16BFmode));
17927 if (elt < 16)
17928 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17929 else
17930 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17931 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17932 return;
17934 break;
17936 case E_V16HFmode:
17937 case E_V16BFmode:
17938 if (TARGET_AVX)
17940 tmp = (mode == E_V16HFmode
17941 ? gen_reg_rtx (V8HFmode)
17942 : gen_reg_rtx (V8BFmode));
17943 if (elt < 8)
17944 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17945 else
17946 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17947 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17948 return;
17950 break;
17952 case E_V8QImode:
17953 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17954 /* ??? Could extract the appropriate HImode element and shift. */
17955 break;
17957 default:
17958 break;
17961 if (use_vec_extr)
17963 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17964 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17966 /* Let the rtl optimizers know about the zero extension performed. */
17967 if (inner_mode == QImode || inner_mode == HImode)
17969 rtx reg = gen_reg_rtx (SImode);
17970 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17971 emit_move_insn (reg, tmp);
17972 tmp = gen_lowpart (inner_mode, reg);
17973 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17974 SUBREG_PROMOTED_SET (tmp, 1);
17977 emit_move_insn (target, tmp);
17979 else
17981 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17983 emit_move_insn (mem, vec);
17985 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17986 emit_move_insn (target, tmp);
17990 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17991 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17992 The upper bits of DEST are undefined, though they shouldn't cause
17993 exceptions (some bits from SRC or all zeros are ok). */
17995 static void
17996 emit_reduc_half (rtx dest, rtx src, int i)
17998 rtx tem, d = dest;
17999 switch (GET_MODE (src))
18001 case E_V4SFmode:
18002 if (i == 128)
18003 tem = gen_sse_movhlps (dest, src, src);
18004 else
18005 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
18006 GEN_INT (1 + 4), GEN_INT (1 + 4));
18007 break;
18008 case E_V2DFmode:
18009 tem = gen_vec_interleave_highv2df (dest, src, src);
18010 break;
18011 case E_V4QImode:
18012 d = gen_reg_rtx (V1SImode);
18013 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
18014 GEN_INT (i / 2));
18015 break;
18016 case E_V8QImode:
18017 case E_V4HImode:
18018 d = gen_reg_rtx (V1DImode);
18019 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
18020 GEN_INT (i / 2));
18021 break;
18022 case E_V16QImode:
18023 case E_V8HImode:
18024 case E_V8HFmode:
18025 case E_V4SImode:
18026 case E_V2DImode:
18027 d = gen_reg_rtx (V1TImode);
18028 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
18029 GEN_INT (i / 2));
18030 break;
18031 case E_V8SFmode:
18032 if (i == 256)
18033 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
18034 else
18035 tem = gen_avx_shufps256 (dest, src, src,
18036 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
18037 break;
18038 case E_V4DFmode:
18039 if (i == 256)
18040 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
18041 else
18042 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
18043 break;
18044 case E_V32QImode:
18045 case E_V16HImode:
18046 case E_V16HFmode:
18047 case E_V8SImode:
18048 case E_V4DImode:
18049 if (i == 256)
18051 if (GET_MODE (dest) != V4DImode)
18052 d = gen_reg_rtx (V4DImode);
18053 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
18054 gen_lowpart (V4DImode, src),
18055 const1_rtx);
18057 else
18059 d = gen_reg_rtx (V2TImode);
18060 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
18061 GEN_INT (i / 2));
18063 break;
18064 case E_V64QImode:
18065 case E_V32HImode:
18066 case E_V32HFmode:
18067 if (i < 64)
18069 d = gen_reg_rtx (V4TImode);
18070 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
18071 GEN_INT (i / 2));
18072 break;
18074 /* FALLTHRU */
18075 case E_V16SImode:
18076 case E_V16SFmode:
18077 case E_V8DImode:
18078 case E_V8DFmode:
18079 if (i > 128)
18080 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
18081 gen_lowpart (V16SImode, src),
18082 gen_lowpart (V16SImode, src),
18083 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
18084 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
18085 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
18086 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
18087 GEN_INT (0xC), GEN_INT (0xD),
18088 GEN_INT (0xE), GEN_INT (0xF),
18089 GEN_INT (0x10), GEN_INT (0x11),
18090 GEN_INT (0x12), GEN_INT (0x13),
18091 GEN_INT (0x14), GEN_INT (0x15),
18092 GEN_INT (0x16), GEN_INT (0x17));
18093 else
18094 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
18095 gen_lowpart (V16SImode, src),
18096 GEN_INT (i == 128 ? 0x2 : 0x1),
18097 GEN_INT (0x3),
18098 GEN_INT (0x3),
18099 GEN_INT (0x3),
18100 GEN_INT (i == 128 ? 0x6 : 0x5),
18101 GEN_INT (0x7),
18102 GEN_INT (0x7),
18103 GEN_INT (0x7),
18104 GEN_INT (i == 128 ? 0xA : 0x9),
18105 GEN_INT (0xB),
18106 GEN_INT (0xB),
18107 GEN_INT (0xB),
18108 GEN_INT (i == 128 ? 0xE : 0xD),
18109 GEN_INT (0xF),
18110 GEN_INT (0xF),
18111 GEN_INT (0xF));
18112 break;
18113 default:
18114 gcc_unreachable ();
18116 emit_insn (tem);
18117 if (d != dest)
18118 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
18121 /* Expand a vector reduction. FN is the binary pattern to reduce;
18122 DEST is the destination; IN is the input vector. */
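/* Illustrative C-like sketch of the strategy below for a 4-element vector V
   and binary operation OP (names V, OP and half are hypothetical; '?' lanes
   are don't-care values, matching emit_reduc_half's undefined upper bits):

     half = { V[2], V[3], ?, ? };      // first halving step
     V    = OP (half, V);
     half = { V[1], ?, ?, ? };         // second halving step
     V    = OP (half, V);              // V[0] now holds the full reduction
*/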
18124 void
18125 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
18127 rtx half, dst, vec = in;
18128 machine_mode mode = GET_MODE (in);
18129 int i;
18131 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
18132 if (TARGET_SSE4_1
18133 && mode == V8HImode
18134 && fn == gen_uminv8hi3)
18136 emit_insn (gen_sse4_1_phminposuw (dest, in));
18137 return;
18140 for (i = GET_MODE_BITSIZE (mode);
18141 i > GET_MODE_UNIT_BITSIZE (mode);
18142 i >>= 1)
18144 half = gen_reg_rtx (mode);
18145 emit_reduc_half (half, vec, i);
18146 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
18147 dst = dest;
18148 else
18149 dst = gen_reg_rtx (mode);
18150 emit_insn (fn (dst, half, vec));
18151 vec = dst;
18155 /* Output code to perform a conditional jump to LABEL if the C2 flag in
18156 the FP status register is set. */
18158 void
18159 ix86_emit_fp_unordered_jump (rtx label)
18161 rtx reg = gen_reg_rtx (HImode);
18162 rtx_insn *insn;
18163 rtx temp;
18165 emit_insn (gen_x86_fnstsw_1 (reg));
18167 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
18169 emit_insn (gen_x86_sahf_1 (reg));
18171 temp = gen_rtx_REG (CCmode, FLAGS_REG);
18172 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
18174 else
18176 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
18178 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
18179 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
18182 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
18183 gen_rtx_LABEL_REF (VOIDmode, label),
18184 pc_rtx);
18185 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
18186 predict_jump (REG_BR_PROB_BASE * 10 / 100);
18187 JUMP_LABEL (insn) = label;
18190 /* Output code to perform a sinh XFmode calculation. */
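/* Illustrative C sketch of the sequence emitted below (expm1 keeps the
   result accurate for small |op1|; <math.h> names, no special Inf/NaN
   handling shown):

     e = expm1 (fabs (x));
     r = 0.5 * (e / (e + 1.0) + e);    // equals sinh (fabs (x))
     return signbit (x) ? -r : r;
*/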
18192 void
18193 ix86_emit_i387_sinh (rtx op0, rtx op1)
18195 rtx e1 = gen_reg_rtx (XFmode);
18196 rtx e2 = gen_reg_rtx (XFmode);
18197 rtx scratch = gen_reg_rtx (HImode);
18198 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18199 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18200 rtx cst1, tmp;
18201 rtx_code_label *jump_label = gen_label_rtx ();
18202 rtx_insn *insn;
18204 /* scratch = fxam (op1) */
18205 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18207 /* e1 = expm1 (|op1|) */
18208 emit_insn (gen_absxf2 (e2, op1));
18209 emit_insn (gen_expm1xf2 (e1, e2));
18211 /* e2 = e1 / (e1 + 1.0) + e1 */
18212 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18213 emit_insn (gen_addxf3 (e2, e1, cst1));
18214 emit_insn (gen_divxf3 (e2, e1, e2));
18215 emit_insn (gen_addxf3 (e2, e2, e1));
18217 /* flags = signbit (op1) */
18218 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18220 /* if (flags) then e2 = -e2 */
18221 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18222 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18223 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18224 pc_rtx);
18225 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18226 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18227 JUMP_LABEL (insn) = jump_label;
18229 emit_insn (gen_negxf2 (e2, e2));
18231 emit_label (jump_label);
18232 LABEL_NUSES (jump_label) = 1;
18234 /* op0 = 0.5 * e2 */
18235 half = force_reg (XFmode, half);
18236 emit_insn (gen_mulxf3 (op0, e2, half));
18239 /* Output code to perform a cosh XFmode calculation. */
18241 void
18242 ix86_emit_i387_cosh (rtx op0, rtx op1)
18244 rtx e1 = gen_reg_rtx (XFmode);
18245 rtx e2 = gen_reg_rtx (XFmode);
18246 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18247 rtx cst1;
18249 /* e1 = exp (op1) */
18250 emit_insn (gen_expxf2 (e1, op1));
18252 /* e2 = e1 + 1.0 / e1 */
18253 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18254 emit_insn (gen_divxf3 (e2, cst1, e1));
18255 emit_insn (gen_addxf3 (e2, e1, e2));
18257 /* op0 = 0.5 * e2 */
18258 half = force_reg (XFmode, half);
18259 emit_insn (gen_mulxf3 (op0, e2, half));
18262 /* Output code to perform a tanh XFmode calculation. */
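/* Illustrative C sketch of the sequence emitted below (the expm1 form
   avoids overflow of exp (2*x) and cancellation near zero; <math.h>
   names, no special Inf/NaN handling shown):

     e = expm1 (-fabs (x + x));
     r = e / (e + 2.0);                // equals -tanh (fabs (x))
     return signbit (x) ? r : -r;
*/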
18264 void
18265 ix86_emit_i387_tanh (rtx op0, rtx op1)
18267 rtx e1 = gen_reg_rtx (XFmode);
18268 rtx e2 = gen_reg_rtx (XFmode);
18269 rtx scratch = gen_reg_rtx (HImode);
18270 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18271 rtx cst2, tmp;
18272 rtx_code_label *jump_label = gen_label_rtx ();
18273 rtx_insn *insn;
18275 /* scratch = fxam (op1) */
18276 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18278 /* e1 = expm1 (-|2 * op1|) */
18279 emit_insn (gen_addxf3 (e2, op1, op1));
18280 emit_insn (gen_absxf2 (e2, e2));
18281 emit_insn (gen_negxf2 (e2, e2));
18282 emit_insn (gen_expm1xf2 (e1, e2));
18284 /* e2 = e1 / (e1 + 2.0) */
18285 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
18286 emit_insn (gen_addxf3 (e2, e1, cst2));
18287 emit_insn (gen_divxf3 (e2, e1, e2));
18289 /* flags = signbit (op1) */
18290 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18292 /* if (!flags) then e2 = -e2 */
18293 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18294 gen_rtx_NE (VOIDmode, flags, const0_rtx),
18295 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18296 pc_rtx);
18297 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18298 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18299 JUMP_LABEL (insn) = jump_label;
18301 emit_insn (gen_negxf2 (e2, e2));
18303 emit_label (jump_label);
18304 LABEL_NUSES (jump_label) = 1;
18306 emit_move_insn (op0, e2);
18309 /* Output code to perform an asinh XFmode calculation. */
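/* Illustrative C sketch of the sequence emitted below; it uses
   x*x / (sqrt (x*x + 1) + 1) == sqrt (x*x + 1) - 1 so that the log1p
   argument stays accurate for small |x| (<math.h> names):

     t = x * x;
     r = log1p (t / (sqrt (t + 1.0) + 1.0) + fabs (x));
     return signbit (x) ? -r : r;      // r equals asinh (fabs (x))
*/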
18311 void
18312 ix86_emit_i387_asinh (rtx op0, rtx op1)
18314 rtx e1 = gen_reg_rtx (XFmode);
18315 rtx e2 = gen_reg_rtx (XFmode);
18316 rtx scratch = gen_reg_rtx (HImode);
18317 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18318 rtx cst1, tmp;
18319 rtx_code_label *jump_label = gen_label_rtx ();
18320 rtx_insn *insn;
18322 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
18323 emit_insn (gen_mulxf3 (e1, op1, op1));
18324 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18325 emit_insn (gen_addxf3 (e2, e1, cst1));
18326 emit_insn (gen_sqrtxf2 (e2, e2));
18327 emit_insn (gen_addxf3 (e2, e2, cst1));
18329 /* e1 = e1 / e2 */
18330 emit_insn (gen_divxf3 (e1, e1, e2));
18332 /* scratch = fxam (op1) */
18333 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18335 /* e1 = e1 + |op1| */
18336 emit_insn (gen_absxf2 (e2, op1));
18337 emit_insn (gen_addxf3 (e1, e1, e2));
18339 /* e2 = log1p (e1) */
18340 ix86_emit_i387_log1p (e2, e1);
18342 /* flags = signbit (op1) */
18343 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18345 /* if (flags) then e2 = -e2 */
18346 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18347 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18348 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18349 pc_rtx);
18350 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18351 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18352 JUMP_LABEL (insn) = jump_label;
18354 emit_insn (gen_negxf2 (e2, e2));
18356 emit_label (jump_label);
18357 LABEL_NUSES (jump_label) = 1;
18359 emit_move_insn (op0, e2);
18362 /* Output code to perform an acosh XFmode calculation. */
18364 void
18365 ix86_emit_i387_acosh (rtx op0, rtx op1)
18367 rtx e1 = gen_reg_rtx (XFmode);
18368 rtx e2 = gen_reg_rtx (XFmode);
18369 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18371 /* e2 = sqrt (op1 + 1.0) */
18372 emit_insn (gen_addxf3 (e2, op1, cst1));
18373 emit_insn (gen_sqrtxf2 (e2, e2));
18375 /* e1 = sqrt (op1 - 1.0) */
18376 emit_insn (gen_subxf3 (e1, op1, cst1));
18377 emit_insn (gen_sqrtxf2 (e1, e1));
18379 /* e1 = e1 * e2 */
18380 emit_insn (gen_mulxf3 (e1, e1, e2));
18382 /* e1 = e1 + op1 */
18383 emit_insn (gen_addxf3 (e1, e1, op1));
18385 /* op0 = log (e1) */
18386 emit_insn (gen_logxf2 (op0, e1));
18389 /* Output code to perform an atanh XFmode calculation. */
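/* Illustrative C sketch of the sequence emitted below, based on
   atanh (|x|) == -0.5 * log1p (-2*|x| / (|x| + 1)) (<math.h> names):

     a = fabs (x);
     r = -0.5 * log1p (-(a + a) / (a + 1.0));
     return signbit (x) ? -r : r;      // r equals atanh (fabs (x))
*/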
18391 void
18392 ix86_emit_i387_atanh (rtx op0, rtx op1)
18394 rtx e1 = gen_reg_rtx (XFmode);
18395 rtx e2 = gen_reg_rtx (XFmode);
18396 rtx scratch = gen_reg_rtx (HImode);
18397 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18398 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18399 rtx cst1, tmp;
18400 rtx_code_label *jump_label = gen_label_rtx ();
18401 rtx_insn *insn;
18403 /* scratch = fxam (op1) */
18404 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18406 /* e2 = |op1| */
18407 emit_insn (gen_absxf2 (e2, op1));
18409 /* e1 = -(e2 + e2) / (e2 + 1.0) */
18410 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18411 emit_insn (gen_addxf3 (e1, e2, cst1));
18412 emit_insn (gen_addxf3 (e2, e2, e2));
18413 emit_insn (gen_negxf2 (e2, e2));
18414 emit_insn (gen_divxf3 (e1, e2, e1));
18416 /* e2 = log1p (e1) */
18417 ix86_emit_i387_log1p (e2, e1);
18419 /* flags = signbit (op1) */
18420 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18422 /* if (!flags) then e2 = -e2 */
18423 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18424 gen_rtx_NE (VOIDmode, flags, const0_rtx),
18425 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18426 pc_rtx);
18427 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18428 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18429 JUMP_LABEL (insn) = jump_label;
18431 emit_insn (gen_negxf2 (e2, e2));
18433 emit_label (jump_label);
18434 LABEL_NUSES (jump_label) = 1;
18436 /* op0 = 0.5 * e2 */
18437 half = force_reg (XFmode, half);
18438 emit_insn (gen_mulxf3 (op0, e2, half));
18441 /* Output code to perform a log1p XFmode calculation. */
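/* Illustrative C sketch of the dispatch below; the threshold is
   approximately 1 - sqrt(2)/2, the operand range for which the i387
   fyl2xp1 instruction is specified (log2p1 here just denotes
   log2 (1 + x) computed without forming 1 + x explicitly):

     if (fabs (x) < 0.29289321881345247561810596348408353)
       return M_LN2 * log2p1 (x);      // fyl2xp1 path, accurate near 0
     else
       return M_LN2 * log2 (x + 1.0);  // fyl2x path
*/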
18443 void
18444 ix86_emit_i387_log1p (rtx op0, rtx op1)
18446 rtx_code_label *label1 = gen_label_rtx ();
18447 rtx_code_label *label2 = gen_label_rtx ();
18449 rtx tmp = gen_reg_rtx (XFmode);
18450 rtx res = gen_reg_rtx (XFmode);
18451 rtx cst, cstln2, cst1;
18452 rtx_insn *insn;
18454 /* The emit_jump call emits the pending stack adjust; make sure it is
18455 emitted before the conditional jump, otherwise the stack adjustment
18456 will only be conditional. */
18457 do_pending_stack_adjust ();
18459 cst = const_double_from_real_value
18460 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
18461 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
18463 emit_insn (gen_absxf2 (tmp, op1));
18465 cst = force_reg (XFmode, cst);
18466 ix86_expand_branch (GE, tmp, cst, label1);
18467 predict_jump (REG_BR_PROB_BASE * 10 / 100);
18468 insn = get_last_insn ();
18469 JUMP_LABEL (insn) = label1;
18471 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
18472 emit_jump (label2);
18474 emit_label (label1);
18475 LABEL_NUSES (label1) = 1;
18477 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18478 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
18479 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
18481 emit_label (label2);
18482 LABEL_NUSES (label2) = 1;
18484 emit_move_insn (op0, res);
18487 /* Emit code for round calculation. */
18488 void
18489 ix86_emit_i387_round (rtx op0, rtx op1)
18491 machine_mode inmode = GET_MODE (op1);
18492 machine_mode outmode = GET_MODE (op0);
18493 rtx e1 = gen_reg_rtx (XFmode);
18494 rtx e2 = gen_reg_rtx (XFmode);
18495 rtx scratch = gen_reg_rtx (HImode);
18496 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18497 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18498 rtx res = gen_reg_rtx (outmode);
18499 rtx_code_label *jump_label = gen_label_rtx ();
18500 rtx (*floor_insn) (rtx, rtx);
18501 rtx (*neg_insn) (rtx, rtx);
18502 rtx_insn *insn;
18503 rtx tmp;
18505 switch (inmode)
18507 case E_SFmode:
18508 case E_DFmode:
18509 tmp = gen_reg_rtx (XFmode);
18511 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
18512 op1 = tmp;
18513 break;
18514 case E_XFmode:
18515 break;
18516 default:
18517 gcc_unreachable ();
18520 switch (outmode)
18522 case E_SFmode:
18523 floor_insn = gen_frndintxf2_floor;
18524 neg_insn = gen_negsf2;
18525 break;
18526 case E_DFmode:
18527 floor_insn = gen_frndintxf2_floor;
18528 neg_insn = gen_negdf2;
18529 break;
18530 case E_XFmode:
18531 floor_insn = gen_frndintxf2_floor;
18532 neg_insn = gen_negxf2;
18533 break;
18534 case E_HImode:
18535 floor_insn = gen_lfloorxfhi2;
18536 neg_insn = gen_neghi2;
18537 break;
18538 case E_SImode:
18539 floor_insn = gen_lfloorxfsi2;
18540 neg_insn = gen_negsi2;
18541 break;
18542 case E_DImode:
18543 floor_insn = gen_lfloorxfdi2;
18544 neg_insn = gen_negdi2;
18545 break;
18546 default:
18547 gcc_unreachable ();
18550 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
18552 /* scratch = fxam(op1) */
18553 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18555 /* e1 = fabs(op1) */
18556 emit_insn (gen_absxf2 (e1, op1));
18558 /* e2 = e1 + 0.5 */
18559 half = force_reg (XFmode, half);
18560 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
18562 /* res = floor(e2) */
18563 switch (outmode)
18565 case E_SFmode:
18566 case E_DFmode:
18568 tmp = gen_reg_rtx (XFmode);
18570 emit_insn (floor_insn (tmp, e2));
18571 emit_insn (gen_rtx_SET (res,
18572 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
18573 UNSPEC_TRUNC_NOOP)));
18575 break;
18576 default:
18577 emit_insn (floor_insn (res, e2));
18580 /* flags = signbit(a) */
18581 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18583 /* if (flags) then res = -res */
18584 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18585 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18586 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18587 pc_rtx);
18588 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18589 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18590 JUMP_LABEL (insn) = jump_label;
18592 emit_insn (neg_insn (res, res));
18594 emit_label (jump_label);
18595 LABEL_NUSES (jump_label) = 1;
18597 emit_move_insn (op0, res);
18600 /* Output code to perform a Newton-Raphson approximation of a single precision
18601 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
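/* Illustrative C sketch of the emitted sequence: one Newton-Raphson
   refinement of the hardware reciprocal estimate (rcp_estimate is a
   hypothetical stand-in for rcpps / vrcp14 / vrcp28):

     x0 = rcp_estimate (b);            // ~ 1/b, low precision
     x1 = (x0 + x0) - b * x0 * x0;     // == x0 * (2 - b * x0)
     return a * x1;                    // ~ a / b
*/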
18603 void
18604 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
18606 rtx x0, x1, e0, e1;
18608 x0 = gen_reg_rtx (mode);
18609 e0 = gen_reg_rtx (mode);
18610 e1 = gen_reg_rtx (mode);
18611 x1 = gen_reg_rtx (mode);
18613 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
18615 b = force_reg (mode, b);
18617 /* x0 = rcp(b) estimate */
18618 if (mode == V16SFmode || mode == V8DFmode)
18620 if (TARGET_AVX512ER)
18622 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18623 UNSPEC_RCP28)));
18624 /* res = a * x0 */
18625 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18626 return;
18628 else
18629 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18630 UNSPEC_RCP14)));
18632 else
18633 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18634 UNSPEC_RCP)));
18636 /* e0 = x0 * b */
18637 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18639 /* e0 = x0 * e0 */
18640 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18642 /* e1 = x0 + x0 */
18643 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18645 /* x1 = e1 - e0 */
18646 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18648 /* res = a * x1 */
18649 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18652 /* Output code to perform a Newton-Raphson approximation of a
18653 single precision floating point [reciprocal] square root. */
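/* Illustrative C sketch of the emitted sequence: one Newton-Raphson
   refinement of the hardware rsqrt estimate (rsqrt_estimate is a
   hypothetical stand-in for rsqrtps / vrsqrt14):

     x0 = rsqrt_estimate (a);          // ~ 1/sqrt(a)
     e0 = a * x0;
     e2 = e0 * x0 - 3.0;
     return recip ? x0 * -0.5 * e2     // ~ 1/sqrt(a)
                  : e0 * -0.5 * e2;    // ~ sqrt(a)
*/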
18655 void
18656 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18658 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18659 REAL_VALUE_TYPE r;
18660 int unspec;
18662 x0 = gen_reg_rtx (mode);
18663 e0 = gen_reg_rtx (mode);
18664 e1 = gen_reg_rtx (mode);
18665 e2 = gen_reg_rtx (mode);
18666 e3 = gen_reg_rtx (mode);
18668 if (TARGET_AVX512ER && mode == V16SFmode)
18670 if (recip)
18671 /* res = rsqrt28(a) estimate */
18672 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18673 UNSPEC_RSQRT28)));
18674 else
18676 /* x0 = rsqrt28(a) estimate */
18677 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18678 UNSPEC_RSQRT28)));
18679 /* res = rcp28(x0) estimate */
18680 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18681 UNSPEC_RCP28)));
18683 return;
18686 real_from_integer (&r, VOIDmode, -3, SIGNED);
18687 mthree = const_double_from_real_value (r, SFmode);
18689 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18690 mhalf = const_double_from_real_value (r, SFmode);
18691 unspec = UNSPEC_RSQRT;
18693 if (VECTOR_MODE_P (mode))
18695 mthree = ix86_build_const_vector (mode, true, mthree);
18696 mhalf = ix86_build_const_vector (mode, true, mhalf);
18697 /* There is no 512-bit rsqrt; there is, however, rsqrt14. */
18698 if (GET_MODE_SIZE (mode) == 64)
18699 unspec = UNSPEC_RSQRT14;
18702 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18703 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18705 a = force_reg (mode, a);
18707 /* x0 = rsqrt(a) estimate */
18708 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18709 unspec)));
18711 /* If a == 0.0, filter out the infinity to prevent NaN for sqrt(0.0). */
18712 if (!recip)
18714 rtx zero = force_reg (mode, CONST0_RTX(mode));
18715 rtx mask;
18717 /* Handle masked compare. */
18718 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18720 mask = gen_reg_rtx (HImode);
18721 /* Imm value 0x4 corresponds to not-equal comparison. */
18722 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18723 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18725 else
18727 mask = gen_reg_rtx (mode);
18728 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18729 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18733 mthree = force_reg (mode, mthree);
18735 /* e0 = x0 * a */
18736 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18738 unsigned vector_size = GET_MODE_SIZE (mode);
18739 if (TARGET_FMA
18740 || (TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64)
18741 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18742 emit_insn (gen_rtx_SET (e2,
18743 gen_rtx_FMA (mode, e0, x0, mthree)));
18744 else
18746 /* e1 = e0 * x0 */
18747 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18749 /* e2 = e1 - 3. */
18750 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18753 mhalf = force_reg (mode, mhalf);
18754 if (recip)
18755 /* e3 = -.5 * x0 */
18756 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18757 else
18758 /* e3 = -.5 * e0 */
18759 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18760 /* ret = e2 * e3 */
18761 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18764 /* Expand fabs (OP0) and return a new rtx that holds the result. The
18765 mask for masking out the sign-bit is stored in *SMASK, if that is
18766 non-null. */
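/* Illustrative sketch of the bit trick below: the absolute value is an
   AND with the inverted sign-bit mask (shown for DFmode; the constant is
   conceptual, the real mask lives in a vector register):

     xa = op0 & 0x7fffffffffffffff;    // clear the sign bit

   and the same mask is returned via *SMASK so callers can later restore
   the original sign. */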
18768 static rtx
18769 ix86_expand_sse_fabs (rtx op0, rtx *smask)
18771 machine_mode vmode, mode = GET_MODE (op0);
18772 rtx xa, mask;
18774 xa = gen_reg_rtx (mode);
18775 if (mode == SFmode)
18776 vmode = V4SFmode;
18777 else if (mode == DFmode)
18778 vmode = V2DFmode;
18779 else
18780 vmode = mode;
18781 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18782 if (!VECTOR_MODE_P (mode))
18784 /* We need to generate a scalar mode mask in this case. */
18785 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18786 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18787 mask = gen_reg_rtx (mode);
18788 emit_insn (gen_rtx_SET (mask, tmp));
18790 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18792 if (smask)
18793 *smask = mask;
18795 return xa;
18798 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
18799 swapping the operands if SWAP_OPERANDS is true. The expanded
18800 code is a forward jump to a newly created label in case the
18801 comparison is true. The generated label rtx is returned. */
18802 static rtx_code_label *
18803 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18804 bool swap_operands)
18806 bool unordered_compare = ix86_unordered_fp_compare (code);
18807 rtx_code_label *label;
18808 rtx tmp, reg;
18810 if (swap_operands)
18811 std::swap (op0, op1);
18813 label = gen_label_rtx ();
18814 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18815 if (unordered_compare)
18816 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18817 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18818 emit_insn (gen_rtx_SET (reg, tmp));
18819 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18820 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18821 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18822 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18823 JUMP_LABEL (tmp) = label;
18825 return label;
18828 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18829 using comparison code CODE. Operands are swapped for the comparison if
18830 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18831 static rtx
18832 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18833 bool swap_operands)
18835 rtx (*insn)(rtx, rtx, rtx, rtx);
18836 machine_mode mode = GET_MODE (op0);
18837 rtx mask = gen_reg_rtx (mode);
18839 if (swap_operands)
18840 std::swap (op0, op1);
18842 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18844 emit_insn (insn (mask, op0, op1,
18845 gen_rtx_fmt_ee (code, mode, op0, op1)));
18846 return mask;
18849 /* Expand copysign from SIGN to the positive value ABS_VALUE
18850 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18851 the sign-bit. */
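/* Illustrative sketch of the bit manipulation below, on the raw
   representation (ABS_VALUE is assumed to already have its sign bit
   clear, e.g. the output of ix86_expand_sse_fabs; SIGN_BIT is the usual
   IEEE sign-bit mask):

     sgn    = sign & SIGN_BIT;         // isolate the sign of SIGN
     result = abs_value | sgn;         // copy it onto the magnitude
*/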
18853 static void
18854 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18856 machine_mode mode = GET_MODE (sign);
18857 rtx sgn = gen_reg_rtx (mode);
18858 if (mask == NULL_RTX)
18860 machine_mode vmode;
18862 if (mode == SFmode)
18863 vmode = V4SFmode;
18864 else if (mode == DFmode)
18865 vmode = V2DFmode;
18866 else if (mode == HFmode)
18867 vmode = V8HFmode;
18868 else
18869 vmode = mode;
18871 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18872 if (!VECTOR_MODE_P (mode))
18874 /* We need to generate a scalar mode mask in this case. */
18875 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18876 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18877 mask = gen_reg_rtx (mode);
18878 emit_insn (gen_rtx_SET (mask, tmp));
18881 else
18882 mask = gen_rtx_NOT (mode, mask);
18883 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18884 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18887 /* Expand SSE sequence for computing lround from OP1 storing
18888 into OP0. */
18890 void
18891 ix86_expand_lround (rtx op0, rtx op1)
18893 /* C code for the stuff we're doing below:
18894 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18895 return (long)tmp;
18897 machine_mode mode = GET_MODE (op1);
18898 const struct real_format *fmt;
18899 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18900 rtx adj;
18902 /* load nextafter (0.5, 0.0) */
18903 fmt = REAL_MODE_FORMAT (mode);
18904 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18905 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18907 /* adj = copysign (0.5, op1) */
18908 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18909 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18911 /* adj = op1 + adj */
18912 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18914 /* op0 = (imode)adj */
18915 expand_fix (op0, adj, 0);
18918 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
18919 storing into OPERAND0. */
18921 void
18922 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18924 /* C code for the stuff we're doing below (for do_floor):
18925 xi = (long)op1;
18926 xi -= (double)xi > op1 ? 1 : 0;
18927 return xi;
18929 machine_mode fmode = GET_MODE (op1);
18930 machine_mode imode = GET_MODE (op0);
18931 rtx ireg, freg, tmp;
18932 rtx_code_label *label;
18934 /* reg = (long)op1 */
18935 ireg = gen_reg_rtx (imode);
18936 expand_fix (ireg, op1, 0);
18938 /* freg = (double)reg */
18939 freg = gen_reg_rtx (fmode);
18940 expand_float (freg, ireg, 0);
18942 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18943 label = ix86_expand_sse_compare_and_jump (UNLE,
18944 freg, op1, !do_floor);
18945 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18946 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18947 emit_move_insn (ireg, tmp);
18949 emit_label (label);
18950 LABEL_NUSES (label) = 1;
18952 emit_move_insn (op0, ireg);
18955 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18956 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18958 static rtx
18959 ix86_gen_TWO52 (machine_mode mode)
18961 const struct real_format *fmt;
18962 REAL_VALUE_TYPE TWO52r;
18963 rtx TWO52;
18965 fmt = REAL_MODE_FORMAT (mode);
18966 real_2expN (&TWO52r, fmt->p - 1, mode);
18967 TWO52 = const_double_from_real_value (TWO52r, mode);
18968 TWO52 = force_reg (mode, TWO52);
18970 return TWO52;
18973 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18975 void
18976 ix86_expand_rint (rtx operand0, rtx operand1)
18978 /* C code for the stuff we're doing below:
18979 xa = fabs (operand1);
18980 if (!isless (xa, 2**52))
18981 return operand1;
18982 two52 = 2**52;
18983 if (flag_rounding_math)
18985 two52 = copysign (two52, operand1);
18986 xa = operand1;
18988 xa = xa + two52 - two52;
18989 return copysign (xa, operand1);
18991 machine_mode mode = GET_MODE (operand0);
18992 rtx res, xa, TWO52, mask;
18993 rtx_code_label *label;
18995 TWO52 = ix86_gen_TWO52 (mode);
18997 /* Temporary for holding the result, initialized to the input
18998 operand to ease control flow. */
18999 res = copy_to_reg (operand1);
19001 /* xa = abs (operand1) */
19002 xa = ix86_expand_sse_fabs (res, &mask);
19004 /* if (!isless (xa, TWO52)) goto label; */
19005 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19007 if (flag_rounding_math)
19009 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
19010 xa = res;
19013 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
19014 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
19016 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19017 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
19018 xa = ix86_expand_sse_fabs (xa, NULL);
19020 ix86_sse_copysign_to_positive (res, xa, res, mask);
19022 emit_label (label);
19023 LABEL_NUSES (label) = 1;
19025 emit_move_insn (operand0, res);
19028 /* Expand SSE2 sequence for computing floor or ceil
19029 from OPERAND1 storing into OPERAND0. */
19030 void
19031 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
19033 /* C code for the stuff we expand below.
19034 double xa = fabs (x), x2;
19035 if (!isless (xa, TWO52))
19036 return x;
19037 x2 = (double)(long)x;
19039 Compensate. Floor:
19040 if (x2 > x)
19041 x2 -= 1;
19042 Compensate. Ceil:
19043 if (x2 < x)
19044 x2 += 1;
19046 if (HONOR_SIGNED_ZEROS (mode))
19047 return copysign (x2, x);
19048 return x2;
19050 machine_mode mode = GET_MODE (operand0);
19051 rtx xa, xi, TWO52, tmp, one, res, mask;
19052 rtx_code_label *label;
19054 TWO52 = ix86_gen_TWO52 (mode);
19056 /* Temporary for holding the result, initialized to the input
19057 operand to ease control flow. */
19058 res = copy_to_reg (operand1);
19060 /* xa = abs (operand1) */
19061 xa = ix86_expand_sse_fabs (res, &mask);
19063 /* if (!isless (xa, TWO52)) goto label; */
19064 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19066 /* xa = (double)(long)x */
19067 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
19068 expand_fix (xi, res, 0);
19069 expand_float (xa, xi, 0);
19071 /* generate 1.0 */
19072 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
19074 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
19075 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
19076 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
19077 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
19078 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19079 if (HONOR_SIGNED_ZEROS (mode))
19081 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19082 if (do_floor && flag_rounding_math)
19083 tmp = ix86_expand_sse_fabs (tmp, NULL);
19085 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
19087 emit_move_insn (res, tmp);
19089 emit_label (label);
19090 LABEL_NUSES (label) = 1;
19092 emit_move_insn (operand0, res);
19095 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
19096 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19097 that is only available on 64bit targets. */
19098 void
19099 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
19101 /* C code for the stuff we expand below.
19102 double xa = fabs (x), x2;
19103 if (!isless (xa, TWO52))
19104 return x;
19105 xa = xa + TWO52 - TWO52;
19106 x2 = copysign (xa, x);
19108 Compensate. Floor:
19109 if (x2 > x)
19110 x2 -= 1;
19111 Compensate. Ceil:
19112 if (x2 < x)
19113 x2 += 1;
19115 if (HONOR_SIGNED_ZEROS (mode))
19116 x2 = copysign (x2, x);
19117 return x2;
19119 machine_mode mode = GET_MODE (operand0);
19120 rtx xa, TWO52, tmp, one, res, mask;
19121 rtx_code_label *label;
19123 TWO52 = ix86_gen_TWO52 (mode);
19125 /* Temporary for holding the result, initialized to the input
19126 operand to ease control flow. */
19127 res = copy_to_reg (operand1);
19129 /* xa = abs (operand1) */
19130 xa = ix86_expand_sse_fabs (res, &mask);
19132 /* if (!isless (xa, TWO52)) goto label; */
19133 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19135 /* xa = xa + TWO52 - TWO52; */
19136 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
19137 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
19139 /* xa = copysign (xa, operand1) */
19140 ix86_sse_copysign_to_positive (xa, xa, res, mask);
19142 /* generate 1.0 */
19143 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
19145 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
19146 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
19147 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
19148 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
19149 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19150 if (HONOR_SIGNED_ZEROS (mode))
19152 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19153 if (do_floor && flag_rounding_math)
19154 tmp = ix86_expand_sse_fabs (tmp, NULL);
19156 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
19158 emit_move_insn (res, tmp);
19160 emit_label (label);
19161 LABEL_NUSES (label) = 1;
19163 emit_move_insn (operand0, res);
19166 /* Expand SSE sequence for computing trunc
19167 from OPERAND1 storing into OPERAND0. */
19168 void
19169 ix86_expand_trunc (rtx operand0, rtx operand1)
19171 /* C code for SSE variant we expand below.
19172 double xa = fabs (x), x2;
19173 if (!isless (xa, TWO52))
19174 return x;
19175 x2 = (double)(long)x;
19176 if (HONOR_SIGNED_ZEROS (mode))
19177 return copysign (x2, x);
19178 return x2;
19180 machine_mode mode = GET_MODE (operand0);
19181 rtx xa, xi, TWO52, res, mask;
19182 rtx_code_label *label;
19184 TWO52 = ix86_gen_TWO52 (mode);
19186 /* Temporary for holding the result, initialized to the input
19187 operand to ease control flow. */
19188 res = copy_to_reg (operand1);
19190 /* xa = abs (operand1) */
19191 xa = ix86_expand_sse_fabs (res, &mask);
19193 /* if (!isless (xa, TWO52)) goto label; */
19194 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19196 /* xa = (double)(long)x */
19197 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
19198 expand_fix (xi, res, 0);
19199 expand_float (xa, xi, 0);
19201 if (HONOR_SIGNED_ZEROS (mode))
19202 ix86_sse_copysign_to_positive (xa, xa, res, mask);
19204 emit_move_insn (res, xa);
19206 emit_label (label);
19207 LABEL_NUSES (label) = 1;
19209 emit_move_insn (operand0, res);
19212 /* Expand SSE sequence for computing trunc from OPERAND1 storing
19213 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19214 that is only available on 64bit targets. */
19215 void
19216 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
19218 machine_mode mode = GET_MODE (operand0);
19219 rtx xa, xa2, TWO52, tmp, one, res, mask;
19220 rtx_code_label *label;
19222 /* C code for SSE variant we expand below.
19223 double xa = fabs (x), x2;
19224 if (!isless (xa, TWO52))
19225 return x;
19226 xa2 = xa + TWO52 - TWO52;
19227 Compensate:
19228 if (xa2 > xa)
19229 xa2 -= 1.0;
19230 x2 = copysign (xa2, x);
19231 return x2;
19234 TWO52 = ix86_gen_TWO52 (mode);
19236 /* Temporary for holding the result, initialized to the input
19237 operand to ease control flow. */
19238 res = copy_to_reg (operand1);
19240 /* xa = abs (operand1) */
19241 xa = ix86_expand_sse_fabs (res, &mask);
19243 /* if (!isless (xa, TWO52)) goto label; */
19244 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19246 /* xa2 = xa + TWO52 - TWO52; */
19247 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
19248 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
19250 /* generate 1.0 */
19251 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
19253 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
19254 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
19255 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
19256 tmp = expand_simple_binop (mode, MINUS,
19257 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19258 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
19259 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
19260 tmp = ix86_expand_sse_fabs (tmp, NULL);
19262 /* res = copysign (xa2, operand1) */
19263 ix86_sse_copysign_to_positive (res, tmp, res, mask);
19265 emit_label (label);
19266 LABEL_NUSES (label) = 1;
19268 emit_move_insn (operand0, res);
19271 /* Expand SSE sequence for computing round
19272 from OPERAND1 storing into OPERAND0. */
19273 void
19274 ix86_expand_round (rtx operand0, rtx operand1)
19276 /* C code for the stuff we're doing below:
19277 double xa = fabs (x);
19278 if (!isless (xa, TWO52))
19279 return x;
19280 xa = (double)(long)(xa + nextafter (0.5, 0.0));
19281 return copysign (xa, x);
19283 machine_mode mode = GET_MODE (operand0);
19284 rtx res, TWO52, xa, xi, half, mask;
19285 rtx_code_label *label;
19286 const struct real_format *fmt;
19287 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
19289 /* Temporary for holding the result, initialized to the input
19290 operand to ease control flow. */
19291 res = copy_to_reg (operand1);
19293 TWO52 = ix86_gen_TWO52 (mode);
19294 xa = ix86_expand_sse_fabs (res, &mask);
19295 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19297 /* load nextafter (0.5, 0.0) */
19298 fmt = REAL_MODE_FORMAT (mode);
19299 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
19300 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
19302 /* xa = xa + 0.5 */
19303 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
19304 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
19306 /* xa = (double)(int64_t)xa */
19307 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
19308 expand_fix (xi, xa, 0);
19309 expand_float (xa, xi, 0);
19311 /* res = copysign (xa, operand1) */
19312 ix86_sse_copysign_to_positive (res, xa, res, mask);
19314 emit_label (label);
19315 LABEL_NUSES (label) = 1;
19317 emit_move_insn (operand0, res);
19320 /* Expand SSE sequence for computing round from OPERAND1 storing
19321 into OPERAND0 without relying on DImode truncation via cvttsd2siq
19322 that is only available on 64bit targets. */
19323 void
19324 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
19326 /* C code for the stuff we expand below.
19327 double xa = fabs (x), xa2, x2;
19328 if (!isless (xa, TWO52))
19329 return x;
19330 Using the absolute value and copying back sign makes
19331 -0.0 -> -0.0 correct.
19332 xa2 = xa + TWO52 - TWO52;
19333 Compensate.
19334 dxa = xa2 - xa;
19335 if (dxa <= -0.5)
19336 xa2 += 1;
19337 else if (dxa > 0.5)
19338 xa2 -= 1;
19339 x2 = copysign (xa2, x);
19340 return x2;
19342 machine_mode mode = GET_MODE (operand0);
19343 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
19344 rtx_code_label *label;
19346 TWO52 = ix86_gen_TWO52 (mode);
19348 /* Temporary for holding the result, initialized to the input
19349 operand to ease control flow. */
19350 res = copy_to_reg (operand1);
19352 /* xa = abs (operand1) */
19353 xa = ix86_expand_sse_fabs (res, &mask);
19355 /* if (!isless (xa, TWO52)) goto label; */
19356 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19358 /* xa2 = xa + TWO52 - TWO52; */
19359 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
19360 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
19362 /* dxa = xa2 - xa; */
19363 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
19365 /* generate 0.5, 1.0 and -0.5 */
19366 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
19367 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
19368 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
19369 0, OPTAB_DIRECT);
19371 /* Compensate. */
19372 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
19373 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
19374 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
19375 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19376 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
19377 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
19378 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
19379 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19381 /* res = copysign (xa2, operand1) */
19382 ix86_sse_copysign_to_positive (res, xa2, res, mask);
19384 emit_label (label);
19385 LABEL_NUSES (label) = 1;
19387 emit_move_insn (operand0, res);
19390 /* Expand SSE sequence for computing round
19391 from OP1 storing into OP0 using sse4 round insn. */
19392 void
19393 ix86_expand_round_sse4 (rtx op0, rtx op1)
19395 machine_mode mode = GET_MODE (op0);
19396 rtx e1, e2, res, half;
19397 const struct real_format *fmt;
19398 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
19399 rtx (*gen_copysign) (rtx, rtx, rtx);
19400 rtx (*gen_round) (rtx, rtx, rtx);
19402 switch (mode)
19404 case E_HFmode:
19405 gen_copysign = gen_copysignhf3;
19406 gen_round = gen_sse4_1_roundhf2;
19407 break;
19408 case E_SFmode:
19409 gen_copysign = gen_copysignsf3;
19410 gen_round = gen_sse4_1_roundsf2;
19411 break;
19412 case E_DFmode:
19413 gen_copysign = gen_copysigndf3;
19414 gen_round = gen_sse4_1_rounddf2;
19415 break;
19416 default:
19417 gcc_unreachable ();
19420 /* round (a) = trunc (a + copysign (0.5, a)) */
19422 /* load nextafter (0.5, 0.0) */
19423 fmt = REAL_MODE_FORMAT (mode);
19424 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
19425 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
19426 half = const_double_from_real_value (pred_half, mode);
19428 /* e1 = copysign (0.5, op1) */
19429 e1 = gen_reg_rtx (mode);
19430 emit_insn (gen_copysign (e1, half, op1));
19432 /* e2 = op1 + e1 */
19433 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
19435 /* res = trunc (e2) */
19436 res = gen_reg_rtx (mode);
19437 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
19439 emit_move_insn (op0, res);
19442 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
19443 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
19444 insn every time. */
19446 static GTY(()) rtx_insn *vselect_insn;
19448 /* Initialize vselect_insn. */
19450 static void
19451 init_vselect_insn (void)
19453 unsigned i;
19454 rtx x;
19456 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
19457 for (i = 0; i < MAX_VECT_LEN; ++i)
19458 XVECEXP (x, 0, i) = const0_rtx;
19459 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
19460 const0_rtx), x);
19461 x = gen_rtx_SET (const0_rtx, x);
19462 start_sequence ();
19463 vselect_insn = emit_insn (x);
19464 end_sequence ();
19467 /* Construct (set target (vec_select op0 (parallel perm))) and
19468 return true if that's a valid instruction in the active ISA. */
19470 static bool
19471 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
19472 unsigned nelt, bool testing_p)
19474 unsigned int i;
19475 rtx x, save_vconcat;
19476 int icode;
19478 if (vselect_insn == NULL_RTX)
19479 init_vselect_insn ();
19481 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
19482 PUT_NUM_ELEM (XVEC (x, 0), nelt);
19483 for (i = 0; i < nelt; ++i)
19484 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
19485 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19486 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
19487 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
19488 SET_DEST (PATTERN (vselect_insn)) = target;
19489 icode = recog_memoized (vselect_insn);
19491 if (icode >= 0 && !testing_p)
19492 emit_insn (copy_rtx (PATTERN (vselect_insn)));
19494 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
19495 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
19496 INSN_CODE (vselect_insn) = -1;
19498 return icode >= 0;
19501 /* Similar, but generate a vec_concat from op0 and op1 as well. */
19503 static bool
19504 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
19505 const unsigned char *perm, unsigned nelt,
19506 bool testing_p)
19508 machine_mode v2mode;
19509 rtx x;
19510 bool ok;
19512 if (vselect_insn == NULL_RTX)
19513 init_vselect_insn ();
19515 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
19516 return false;
19517 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19518 PUT_MODE (x, v2mode);
19519 XEXP (x, 0) = op0;
19520 XEXP (x, 1) = op1;
19521 ok = expand_vselect (target, x, perm, nelt, testing_p);
19522 XEXP (x, 0) = const0_rtx;
19523 XEXP (x, 1) = const0_rtx;
19524 return ok;
19527 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19528 using movss or movsd. */
19529 static bool
19530 expand_vec_perm_movs (struct expand_vec_perm_d *d)
19532 machine_mode vmode = d->vmode;
19533 unsigned i, nelt = d->nelt;
19534 rtx x;
19536 if (d->one_operand_p)
19537 return false;
19539 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
19540 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
19541 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
19542 return false;
19544 /* Only the first element is changed. */
19545 if (d->perm[0] != nelt && d->perm[0] != 0)
19546 return false;
19547 for (i = 1; i < nelt; ++i)
19548 if (d->perm[i] != i + nelt - d->perm[0])
19549 return false;
19551 if (d->testing_p)
19552 return true;
19554 if (d->perm[0] == nelt)
19555 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
19556 else
19557 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
19559 emit_insn (gen_rtx_SET (d->target, x));
19561 return true;
19564 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19565 using insertps. */
19566 static bool
19567 expand_vec_perm_insertps (struct expand_vec_perm_d *d)
19569 machine_mode vmode = d->vmode;
19570 unsigned i, cnt_s, nelt = d->nelt;
19571 int cnt_d = -1;
19572 rtx src, dst;
19574 if (d->one_operand_p)
19575 return false;
19577 if (!(TARGET_SSE4_1
19578 && (vmode == V4SFmode || vmode == V4SImode
19579 || (TARGET_MMX_WITH_SSE
19580 && (vmode == V2SFmode || vmode == V2SImode)))))
19581 return false;
19583 for (i = 0; i < nelt; ++i)
19585 if (d->perm[i] == i)
19586 continue;
19587 if (cnt_d != -1)
19589 cnt_d = -1;
19590 break;
19592 cnt_d = i;
19595 if (cnt_d == -1)
19597 for (i = 0; i < nelt; ++i)
19599 if (d->perm[i] == i + nelt)
19600 continue;
19601 if (cnt_d != -1)
19602 return false;
19603 cnt_d = i;
19606 if (cnt_d == -1)
19607 return false;
19610 if (d->testing_p)
19611 return true;
19613 gcc_assert (cnt_d != -1);
19615 cnt_s = d->perm[cnt_d];
19616 if (cnt_s < nelt)
19618 src = d->op0;
19619 dst = d->op1;
19621 else
19623 cnt_s -= nelt;
19624 src = d->op1;
19625 dst = d->op0;
19627 gcc_assert (cnt_s < nelt);
19629 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19630 GEN_INT (cnt_s << 6 | cnt_d << 4));
19631 emit_insn (x);
19633 return true;
19636 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19637 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19639 static bool
19640 expand_vec_perm_blend (struct expand_vec_perm_d *d)
19642 machine_mode mmode, vmode = d->vmode;
19643 unsigned i, nelt = d->nelt;
19644 unsigned HOST_WIDE_INT mask;
19645 rtx target, op0, op1, maskop, x;
19646 rtx rperm[32], vperm;
19648 if (d->one_operand_p)
19649 return false;
19650 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19651 && (TARGET_AVX512BW
19652 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19654 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19656 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19658 else if (TARGET_SSE4_1
19659 && (GET_MODE_SIZE (vmode) == 16
19660 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19661 || GET_MODE_SIZE (vmode) == 4))
19663 else
19664 return false;
19666 /* This is a blend, not a permute. Elements must stay in their
19667 respective lanes. */
19668 for (i = 0; i < nelt; ++i)
19670 unsigned e = d->perm[i];
19671 if (!(e == i || e == i + nelt))
19672 return false;
19675 if (d->testing_p)
19676 return true;
19678 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19679 decision should be extracted elsewhere, so that we only try that
19680 sequence once all budget==3 options have been tried. */
19681 target = d->target;
19682 op0 = d->op0;
19683 op1 = d->op1;
19684 mask = 0;
19686 switch (vmode)
19688 case E_V8DFmode:
19689 case E_V16SFmode:
19690 case E_V4DFmode:
19691 case E_V8SFmode:
19692 case E_V2DFmode:
19693 case E_V4SFmode:
19694 case E_V2SFmode:
19695 case E_V2HImode:
19696 case E_V4HImode:
19697 case E_V8HImode:
19698 case E_V8SImode:
19699 case E_V32HImode:
19700 case E_V64QImode:
19701 case E_V16SImode:
19702 case E_V8DImode:
19703 for (i = 0; i < nelt; ++i)
19704 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19705 break;
19707 case E_V2DImode:
19708 for (i = 0; i < 2; ++i)
19709 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19710 vmode = V8HImode;
19711 goto do_subreg;
19713 case E_V2SImode:
19714 for (i = 0; i < 2; ++i)
19715 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19716 vmode = V4HImode;
19717 goto do_subreg;
19719 case E_V4SImode:
19720 if (TARGET_AVX2)
19722 /* Use vpblendd instead of vpblendw. */
19723 for (i = 0; i < nelt; ++i)
19724 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19725 break;
19727 else
19729 for (i = 0; i < 4; ++i)
19730 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19731 vmode = V8HImode;
19732 goto do_subreg;
19735 case E_V16QImode:
19736 /* See if bytes move in pairs so we can use pblendw with
19737 an immediate argument, rather than pblendvb with a vector
19738 argument. */
19739 for (i = 0; i < 16; i += 2)
19740 if (d->perm[i] + 1 != d->perm[i + 1])
19742 use_pblendvb:
19743 for (i = 0; i < nelt; ++i)
19744 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
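/* pblendvb selects each byte based on the most significant bit of the
   corresponding mask byte, so 0 picks op0 and -1 picks op1.  */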
19746 finish_pblendvb:
19747 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19748 vperm = force_reg (vmode, vperm);
19750 if (GET_MODE_SIZE (vmode) == 4)
19751 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19752 else if (GET_MODE_SIZE (vmode) == 8)
19753 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19754 else if (GET_MODE_SIZE (vmode) == 16)
19755 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19756 else
19757 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19758 if (target != d->target)
19759 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19760 return true;
19763 for (i = 0; i < 8; ++i)
19764 mask |= (d->perm[i * 2] >= 16) << i;
19765 vmode = V8HImode;
19766 /* FALLTHRU */
19768 do_subreg:
19769 target = gen_reg_rtx (vmode);
19770 op0 = gen_lowpart (vmode, op0);
19771 op1 = gen_lowpart (vmode, op1);
19772 break;
19774 case E_V8QImode:
19775 for (i = 0; i < 8; i += 2)
19776 if (d->perm[i] + 1 != d->perm[i + 1])
19777 goto use_pblendvb;
19779 for (i = 0; i < 4; ++i)
19780 mask |= (d->perm[i * 2] >= 8) << i;
19781 vmode = V4HImode;
19782 goto do_subreg;
19784 case E_V4QImode:
19785 for (i = 0; i < 4; i += 2)
19786 if (d->perm[i] + 1 != d->perm[i + 1])
19787 goto use_pblendvb;
19789 for (i = 0; i < 2; ++i)
19790 mask |= (d->perm[i * 2] >= 4) << i;
19791 vmode = V2HImode;
19792 goto do_subreg;
19794 case E_V32QImode:
19795 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19796 for (i = 0; i < 32; i += 2)
19797 if (d->perm[i] + 1 != d->perm[i + 1])
19798 goto use_pblendvb;
19799 /* See if bytes move in quadruplets. If yes, vpblendd
19800 with immediate can be used. */
19801 for (i = 0; i < 32; i += 4)
19802 if (d->perm[i] + 2 != d->perm[i + 2])
19803 break;
19804 if (i < 32)
19806 /* See if bytes move the same in both lanes. If yes,
19807 vpblendw with immediate can be used. */
19808 for (i = 0; i < 16; i += 2)
19809 if (d->perm[i] + 16 != d->perm[i + 16])
19810 goto use_pblendvb;
19812 /* Use vpblendw. */
19813 for (i = 0; i < 16; ++i)
19814 mask |= (d->perm[i * 2] >= 32) << i;
19815 vmode = V16HImode;
19816 goto do_subreg;
19819 /* Use vpblendd. */
19820 for (i = 0; i < 8; ++i)
19821 mask |= (d->perm[i * 4] >= 32) << i;
19822 vmode = V8SImode;
19823 goto do_subreg;
19825 case E_V16HImode:
19826 /* See if words move in pairs. If yes, vpblendd can be used. */
19827 for (i = 0; i < 16; i += 2)
19828 if (d->perm[i] + 1 != d->perm[i + 1])
19829 break;
19830 if (i < 16)
19832 /* See if words move the same in both lanes. If not,
19833 vpblendvb must be used. */
19834 for (i = 0; i < 8; i++)
19835 if (d->perm[i] + 8 != d->perm[i + 8])
19837 /* Use vpblendvb. */
19838 for (i = 0; i < 32; ++i)
19839 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19841 vmode = V32QImode;
19842 nelt = 32;
19843 target = gen_reg_rtx (vmode);
19844 op0 = gen_lowpart (vmode, op0);
19845 op1 = gen_lowpart (vmode, op1);
19846 goto finish_pblendvb;
19849 /* Use vpblendw. */
19850 for (i = 0; i < 16; ++i)
19851 mask |= (d->perm[i] >= 16) << i;
19852 break;
19855 /* Use vpblendd. */
19856 for (i = 0; i < 8; ++i)
19857 mask |= (d->perm[i * 2] >= 16) << i;
19858 vmode = V8SImode;
19859 goto do_subreg;
19861 case E_V4DImode:
19862 /* Use vpblendd. */
19863 for (i = 0; i < 4; ++i)
19864 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19865 vmode = V8SImode;
19866 goto do_subreg;
19868 default:
19869 gcc_unreachable ();
19872 switch (vmode)
19874 case E_V8DFmode:
19875 case E_V8DImode:
19876 mmode = QImode;
19877 break;
19878 case E_V16SFmode:
19879 case E_V16SImode:
19880 mmode = HImode;
19881 break;
19882 case E_V32HImode:
19883 mmode = SImode;
19884 break;
19885 case E_V64QImode:
19886 mmode = DImode;
19887 break;
19888 default:
19889 mmode = VOIDmode;
19892 /* Canonicalize vec_merge. */
19893 if (swap_commutative_operands_p (op1, op0)
19894 /* If the two operands have the same precedence, then the
19895 first bit of the mask selects the first operand. */
19896 || (!swap_commutative_operands_p (op0, op1)
19897 && !(mask & 1)))
19899 unsigned n_elts = GET_MODE_NUNITS (vmode);
19900 std::swap (op0, op1);
19901 unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
19902 if (n_elts == HOST_BITS_PER_WIDE_INT)
19903 mask_all = -1;
19904 else
19905 mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
19906 mask = ~mask & mask_all;
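/* Worked example: a V4SF blend with perm {0, 5, 6, 3} yields mask
   0b0110 above.  When neither operand takes precedence, the clear low
   bit triggers the swap and the mask is complemented to 0b1001,
   putting the vec_merge into canonical form.  */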
19909 if (mmode != VOIDmode)
19910 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19911 else
19912 maskop = GEN_INT (mask);
19914 /* The vec_merge below matches one of five different patterns, depending on the mode. */
19915 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19916 x = gen_rtx_SET (target, x);
19917 emit_insn (x);
19918 if (target != d->target)
19919 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19921 return true;
19924 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19925 in terms of the variable form of vpermilps.
19927 Note that we will have already failed the immediate input vpermilps,
19928 which requires that the high and low part shuffle be identical; the
19929 variable form doesn't require that. */
19931 static bool
19932 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19934 rtx rperm[8], vperm;
19935 unsigned i;
19937 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19938 return false;
19940 /* We can only permute within the 128-bit lane. */
19941 for (i = 0; i < 8; ++i)
19943 unsigned e = d->perm[i];
19944 if (i < 4 ? e >= 4 : e < 4)
19945 return false;
19948 if (d->testing_p)
19949 return true;
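/* The variable vpermilps control uses only the low two bits of each
   32-bit element to pick a source element within the same 128-bit
   lane; the adjustment below just brings each index to the right
   value modulo 4.  */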
19951 for (i = 0; i < 8; ++i)
19953 unsigned e = d->perm[i];
19955 /* Within each 128-bit lane, the elements of op0 are numbered
19956 from 0 and the elements of op1 are numbered from 4. */
19957 if (e >= 8 + 4)
19958 e -= 8;
19959 else if (e >= 4)
19960 e -= 4;
19962 rperm[i] = GEN_INT (e);
19965 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19966 vperm = force_reg (V8SImode, vperm);
19967 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19969 return true;
19972 /* For V*[QHS]Imode permutations, check whether the same permutation
19973 can be performed in a 2x, 4x or 8x wider inner mode. */
19975 static bool
19976 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19977 struct expand_vec_perm_d *nd)
19979 int i;
19980 machine_mode mode = VOIDmode;
19982 switch (d->vmode)
19984 case E_V8QImode: mode = V4HImode; break;
19985 case E_V16QImode: mode = V8HImode; break;
19986 case E_V32QImode: mode = V16HImode; break;
19987 case E_V64QImode: mode = V32HImode; break;
19988 case E_V4HImode: mode = V2SImode; break;
19989 case E_V8HImode: mode = V4SImode; break;
19990 case E_V16HImode: mode = V8SImode; break;
19991 case E_V32HImode: mode = V16SImode; break;
19992 case E_V4SImode: mode = V2DImode; break;
19993 case E_V8SImode: mode = V4DImode; break;
19994 case E_V16SImode: mode = V8DImode; break;
19995 default: return false;
19997 for (i = 0; i < d->nelt; i += 2)
19998 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19999 return false;
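/* E.g. the V4SImode permutation {2, 3, 0, 1} moves elements in
   aligned even/odd pairs and is narrowed here to the V2DImode
   permutation {1, 0}.  */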
20000 nd->vmode = mode;
20001 nd->nelt = d->nelt / 2;
20002 for (i = 0; i < nd->nelt; i++)
20003 nd->perm[i] = d->perm[2 * i] / 2;
20004 if (GET_MODE_INNER (mode) != DImode)
20005 canonicalize_vector_int_perm (nd, nd);
20006 if (nd != d)
20008 nd->one_operand_p = d->one_operand_p;
20009 nd->testing_p = d->testing_p;
20010 if (d->op0 == d->op1)
20011 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
20012 else
20014 nd->op0 = gen_lowpart (nd->vmode, d->op0);
20015 nd->op1 = gen_lowpart (nd->vmode, d->op1);
20017 if (d->testing_p)
20018 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
20019 else
20020 nd->target = gen_reg_rtx (nd->vmode);
20022 return true;
20025 /* Return true if permutation D can be performed as VMODE permutation
20026 instead. */
20028 static bool
20029 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
20031 unsigned int i, j, chunk;
20033 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
20034 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
20035 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
20036 return false;
20038 if (GET_MODE_NUNITS (vmode) >= d->nelt)
20039 return true;
20041 chunk = d->nelt / GET_MODE_NUNITS (vmode);
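/* Each group of CHUNK consecutive narrow elements must be an aligned,
   consecutive run so that it maps onto a single element of VMODE.  */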
20042 for (i = 0; i < d->nelt; i += chunk)
20043 if (d->perm[i] & (chunk - 1))
20044 return false;
20045 else
20046 for (j = 1; j < chunk; ++j)
20047 if (d->perm[i] + j != d->perm[i + j])
20048 return false;
20050 return true;
20053 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20054 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
20056 static bool
20057 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
20059 unsigned i, nelt, eltsz, mask;
20060 unsigned char perm[64];
20061 machine_mode vmode;
20062 struct expand_vec_perm_d nd;
20063 rtx rperm[64], vperm, target, op0, op1;
20065 nelt = d->nelt;
20067 if (!d->one_operand_p)
20068 switch (GET_MODE_SIZE (d->vmode))
20070 case 4:
20071 if (!TARGET_XOP)
20072 return false;
20073 vmode = V4QImode;
20074 break;
20076 case 8:
20077 if (!TARGET_XOP)
20078 return false;
20079 vmode = V8QImode;
20080 break;
20082 case 16:
20083 if (!TARGET_XOP)
20084 return false;
20085 vmode = V16QImode;
20086 break;
20088 case 32:
20089 if (!TARGET_AVX2)
20090 return false;
20092 if (valid_perm_using_mode_p (V2TImode, d))
20094 if (d->testing_p)
20095 return true;
20097 /* Use vperm2i128 insn. The pattern uses
20098 V4DImode instead of V2TImode. */
20099 target = d->target;
20100 if (d->vmode != V4DImode)
20101 target = gen_reg_rtx (V4DImode);
20102 op0 = gen_lowpart (V4DImode, d->op0);
20103 op1 = gen_lowpart (V4DImode, d->op1);
20104 rperm[0]
20105 = GEN_INT ((d->perm[0] / (nelt / 2))
20106 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
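/* The vperm2i128 immediate selects one 128-bit lane of the
   concatenated operands for each result half: bits 1:0 for the low
   half and bits 5:4 for the high half, hence the division by the
   number of elements per lane above.  */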
20107 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
20108 if (target != d->target)
20109 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20110 return true;
20112 /* FALLTHRU */
20114 default:
20115 return false;
20117 else
20118 switch (GET_MODE_SIZE (d->vmode))
20120 case 4:
20121 if (!TARGET_SSSE3)
20122 return false;
20123 vmode = V4QImode;
20124 break;
20126 case 8:
20127 if (!TARGET_SSSE3)
20128 return false;
20129 vmode = V8QImode;
20130 break;
20132 case 16:
20133 if (!TARGET_SSSE3)
20134 return false;
20135 vmode = V16QImode;
20136 break;
20138 case 32:
20139 if (!TARGET_AVX2)
20140 return false;
20142 /* V4DImode should already have been handled through
20143 expand_vselect by the vpermq instruction. */
20144 gcc_assert (d->vmode != V4DImode);
20146 vmode = V32QImode;
20147 if (d->vmode == V8SImode
20148 || d->vmode == V16HImode
20149 || d->vmode == V32QImode)
20151 /* First see if vpermq can be used for
20152 V8SImode/V16HImode/V32QImode. */
20153 if (valid_perm_using_mode_p (V4DImode, d))
20155 for (i = 0; i < 4; i++)
20156 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
20157 if (d->testing_p)
20158 return true;
20159 target = gen_reg_rtx (V4DImode);
20160 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
20161 perm, 4, false))
20163 emit_move_insn (d->target,
20164 gen_lowpart (d->vmode, target));
20165 return true;
20167 return false;
20170 /* Next see if vpermd can be used. */
20171 if (valid_perm_using_mode_p (V8SImode, d))
20172 vmode = V8SImode;
20174 /* Or if vpermps can be used. */
20175 else if (d->vmode == V8SFmode)
20176 vmode = V8SImode;
20178 if (vmode == V32QImode)
20180 /* vpshufb only works within 128-bit lanes; it is not
20181 possible to shuffle bytes across the lanes. */
20182 for (i = 0; i < nelt; ++i)
20183 if ((d->perm[i] ^ i) & (nelt / 2))
20184 return false;
20186 break;
20188 case 64:
20189 if (!TARGET_AVX512BW)
20190 return false;
20192 /* If vpermq didn't work, vpshufb won't work either. */
20193 if (d->vmode == V8DFmode || d->vmode == V8DImode)
20194 return false;
20196 vmode = V64QImode;
20197 if (d->vmode == V16SImode
20198 || d->vmode == V32HImode
20199 || d->vmode == V64QImode)
20201 /* First see if vpermq can be used for
20202 V16SImode/V32HImode/V64QImode. */
20203 if (valid_perm_using_mode_p (V8DImode, d))
20205 for (i = 0; i < 8; i++)
20206 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
20207 if (d->testing_p)
20208 return true;
20209 target = gen_reg_rtx (V8DImode);
20210 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
20211 perm, 8, false))
20213 emit_move_insn (d->target,
20214 gen_lowpart (d->vmode, target));
20215 return true;
20217 return false;
20220 /* Next see if vpermd can be used. */
20221 if (valid_perm_using_mode_p (V16SImode, d))
20222 vmode = V16SImode;
20224 /* Or if vpermps can be used. */
20225 else if (d->vmode == V16SFmode)
20226 vmode = V16SImode;
20228 if (vmode == V64QImode)
20230 /* vpshufb only works within 128-bit lanes; it is not
20231 possible to shuffle bytes across the lanes. */
20232 for (i = 0; i < nelt; ++i)
20233 if ((d->perm[i] ^ i) & (3 * nelt / 4))
20234 return false;
20236 break;
20238 default:
20239 return false;
20242 if (d->testing_p)
20243 return true;
20245 /* Try to avoid variable permutation instruction. */
20246 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20248 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20249 return true;
20252 if (vmode == V8SImode)
20253 for (i = 0; i < 8; ++i)
20254 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
20255 else if (vmode == V16SImode)
20256 for (i = 0; i < 16; ++i)
20257 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
20258 else
20260 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20261 if (!d->one_operand_p)
20262 mask = 2 * nelt - 1;
20263 else if (vmode == V64QImode)
20264 mask = nelt / 4 - 1;
20265 else if (vmode == V32QImode)
20266 mask = nelt / 2 - 1;
20267 else
20268 mask = nelt - 1;
20270 for (i = 0; i < nelt; ++i)
20272 unsigned j, e = d->perm[i] & mask;
20273 for (j = 0; j < eltsz; ++j)
20274 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
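/* Expand the element permutation into a byte permutation: each
   selected element contributes ELTSZ consecutive byte indices.  MASK
   folds indices into the range the instruction can address: the full
   two-operand range for XOP pperm, or a single operand (and, for the
   32- and 64-byte modes, a single 128-bit lane) otherwise.  */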
20278 machine_mode vpmode = vmode;
20280 nelt = GET_MODE_SIZE (vmode);
20282 /* Emulate narrow modes with V16QI instructions. */
20283 if (nelt < 16)
20285 rtx m128 = GEN_INT (-128);
20287 /* Remap elements from the second operand, as we have to
20288 account for inactive top elements from the first operand. */
20289 if (!d->one_operand_p)
20291 for (i = 0; i < nelt; ++i)
20293 unsigned ival = UINTVAL (rperm[i]);
20294 if (ival >= nelt)
20295 rperm[i] = GEN_INT (ival + 16 - nelt);
20299 /* Fill inactive elements in the top positions with zeros. */
20300 for (i = nelt; i < 16; ++i)
20301 rperm[i] = m128;
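/* pshufb zeroes a destination byte whenever bit 7 of the
   corresponding control byte is set, so -128 clears the unused upper
   bytes of the emulated narrow vector.  */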
20303 vpmode = V16QImode;
20306 vperm = gen_rtx_CONST_VECTOR (vpmode,
20307 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
20308 vperm = force_reg (vpmode, vperm);
20310 if (vmode == d->vmode)
20311 target = d->target;
20312 else
20313 target = gen_reg_rtx (vmode);
20315 op0 = gen_lowpart (vmode, d->op0);
20317 if (d->one_operand_p)
20319 rtx (*gen) (rtx, rtx, rtx);
20321 if (vmode == V4QImode)
20322 gen = gen_mmx_pshufbv4qi3;
20323 else if (vmode == V8QImode)
20324 gen = gen_mmx_pshufbv8qi3;
20325 else if (vmode == V16QImode)
20326 gen = gen_ssse3_pshufbv16qi3;
20327 else if (vmode == V32QImode)
20328 gen = gen_avx2_pshufbv32qi3;
20329 else if (vmode == V64QImode)
20330 gen = gen_avx512bw_pshufbv64qi3;
20331 else if (vmode == V8SFmode)
20332 gen = gen_avx2_permvarv8sf;
20333 else if (vmode == V8SImode)
20334 gen = gen_avx2_permvarv8si;
20335 else if (vmode == V16SFmode)
20336 gen = gen_avx512f_permvarv16sf;
20337 else if (vmode == V16SImode)
20338 gen = gen_avx512f_permvarv16si;
20339 else
20340 gcc_unreachable ();
20342 emit_insn (gen (target, op0, vperm));
20344 else
20346 rtx (*gen) (rtx, rtx, rtx, rtx);
20348 op1 = gen_lowpart (vmode, d->op1);
20350 if (vmode == V4QImode)
20351 gen = gen_mmx_ppermv32;
20352 else if (vmode == V8QImode)
20353 gen = gen_mmx_ppermv64;
20354 else if (vmode == V16QImode)
20355 gen = gen_xop_pperm;
20356 else
20357 gcc_unreachable ();
20359 emit_insn (gen (target, op0, op1, vperm));
20362 if (target != d->target)
20363 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20365 return true;
20368 /* Try to expand one-operand permutation with constant mask. */
20370 static bool
20371 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
20373 machine_mode mode = GET_MODE (d->op0);
20374 machine_mode maskmode = mode;
20375 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
20376 rtx (*gen) (rtx, rtx, rtx) = NULL;
20377 rtx target, op0, mask;
20378 rtx vec[64];
20380 if (!rtx_equal_p (d->op0, d->op1))
20381 return false;
20383 if (!TARGET_AVX512F)
20384 return false;
20386 /* Accept VNxHImode and VNxQImode now. */
20387 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
20388 return false;
20390 /* vpermw. */
20391 if (!TARGET_AVX512BW && inner_size == 2)
20392 return false;
20394 /* vpermb. */
20395 if (!TARGET_AVX512VBMI && inner_size == 1)
20396 return false;
20398 switch (mode)
20400 case E_V16SImode:
20401 gen = gen_avx512f_permvarv16si;
20402 break;
20403 case E_V16SFmode:
20404 gen = gen_avx512f_permvarv16sf;
20405 maskmode = V16SImode;
20406 break;
20407 case E_V8DImode:
20408 gen = gen_avx512f_permvarv8di;
20409 break;
20410 case E_V8DFmode:
20411 gen = gen_avx512f_permvarv8df;
20412 maskmode = V8DImode;
20413 break;
20414 case E_V32HImode:
20415 gen = gen_avx512bw_permvarv32hi;
20416 break;
20417 case E_V16HImode:
20418 gen = gen_avx512vl_permvarv16hi;
20419 break;
20420 case E_V8HImode:
20421 gen = gen_avx512vl_permvarv8hi;
20422 break;
20423 case E_V64QImode:
20424 gen = gen_avx512bw_permvarv64qi;
20425 break;
20426 case E_V32QImode:
20427 gen = gen_avx512vl_permvarv32qi;
20428 break;
20429 case E_V16QImode:
20430 gen = gen_avx512vl_permvarv16qi;
20431 break;
20433 default:
20434 return false;
20437 if (d->testing_p)
20438 return true;
20440 target = d->target;
20441 op0 = d->op0;
20442 for (int i = 0; i < d->nelt; ++i)
20443 vec[i] = GEN_INT (d->perm[i]);
20444 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
20445 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
20446 return true;
20449 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
20451 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
20452 in a single instruction. */
20454 static bool
20455 expand_vec_perm_1 (struct expand_vec_perm_d *d)
20457 unsigned i, nelt = d->nelt;
20458 struct expand_vec_perm_d nd;
20460 /* Check plain VEC_SELECT first, because AVX has instructions that could
20461 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
20462 input where SEL+CONCAT may not. */
20463 if (d->one_operand_p)
20465 int mask = nelt - 1;
20466 bool identity_perm = true;
20467 bool broadcast_perm = true;
20469 for (i = 0; i < nelt; i++)
20471 nd.perm[i] = d->perm[i] & mask;
20472 if (nd.perm[i] != i)
20473 identity_perm = false;
20474 if (nd.perm[i])
20475 broadcast_perm = false;
20478 if (identity_perm)
20480 if (!d->testing_p)
20481 emit_move_insn (d->target, d->op0);
20482 return true;
20484 else if (broadcast_perm && TARGET_AVX2)
20486 /* Use vpbroadcast{b,w,d}. */
20487 rtx (*gen) (rtx, rtx) = NULL;
20488 switch (d->vmode)
20490 case E_V64QImode:
20491 if (TARGET_AVX512BW)
20492 gen = gen_avx512bw_vec_dupv64qi_1;
20493 break;
20494 case E_V32QImode:
20495 gen = gen_avx2_pbroadcastv32qi_1;
20496 break;
20497 case E_V32HImode:
20498 if (TARGET_AVX512BW)
20499 gen = gen_avx512bw_vec_dupv32hi_1;
20500 break;
20501 case E_V16HImode:
20502 gen = gen_avx2_pbroadcastv16hi_1;
20503 break;
20504 case E_V16SImode:
20505 if (TARGET_AVX512F)
20506 gen = gen_avx512f_vec_dupv16si_1;
20507 break;
20508 case E_V8SImode:
20509 gen = gen_avx2_pbroadcastv8si_1;
20510 break;
20511 case E_V16QImode:
20512 gen = gen_avx2_pbroadcastv16qi;
20513 break;
20514 case E_V8HImode:
20515 gen = gen_avx2_pbroadcastv8hi;
20516 break;
20517 case E_V16SFmode:
20518 if (TARGET_AVX512F)
20519 gen = gen_avx512f_vec_dupv16sf_1;
20520 break;
20521 case E_V8SFmode:
20522 gen = gen_avx2_vec_dupv8sf_1;
20523 break;
20524 case E_V8DFmode:
20525 if (TARGET_AVX512F)
20526 gen = gen_avx512f_vec_dupv8df_1;
20527 break;
20528 case E_V8DImode:
20529 if (TARGET_AVX512F)
20530 gen = gen_avx512f_vec_dupv8di_1;
20531 break;
20532 /* For other modes prefer other shuffles this function creates. */
20533 default: break;
20535 if (gen != NULL)
20537 if (!d->testing_p)
20538 emit_insn (gen (d->target, d->op0));
20539 return true;
20543 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
20544 return true;
20546 /* There are plenty of patterns in sse.md that are written for
20547 SEL+CONCAT and are not replicated for a single op. Perhaps
20548 that should be changed, to avoid the nastiness here. */
20550 /* Recognize interleave style patterns, which means incrementing
20551 every other permutation operand. */
20552 for (i = 0; i < nelt; i += 2)
20554 nd.perm[i] = d->perm[i] & mask;
20555 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
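/* E.g. the V4SImode permutation {0, 0, 1, 1} becomes {0, 4, 1, 5} on
   the doubled operand, which matches the punpckldq pattern of op0
   interleaved with itself.  */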
20557 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20558 d->testing_p))
20559 return true;
20561 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
20562 if (nelt >= 4)
20564 for (i = 0; i < nelt; i += 4)
20566 nd.perm[i + 0] = d->perm[i + 0] & mask;
20567 nd.perm[i + 1] = d->perm[i + 1] & mask;
20568 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
20569 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
20572 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20573 d->testing_p))
20574 return true;
20578 /* Try the SSE4.1 blend variable merge instructions. */
20579 if (expand_vec_perm_blend (d))
20580 return true;
20582 /* Try movss/movsd instructions. */
20583 if (expand_vec_perm_movs (d))
20584 return true;
20586 /* Try the SSE4.1 insertps instruction. */
20587 if (expand_vec_perm_insertps (d))
20588 return true;
20590 /* Try the fully general two operand permute. */
20591 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
20592 d->testing_p))
20593 return true;
20595 /* Recognize interleave style patterns with reversed operands. */
20596 if (!d->one_operand_p)
20598 for (i = 0; i < nelt; ++i)
20600 unsigned e = d->perm[i];
20601 if (e >= nelt)
20602 e -= nelt;
20603 else
20604 e += nelt;
20605 nd.perm[i] = e;
20608 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
20609 d->testing_p))
20610 return true;
20613 /* Try one of the AVX vpermil variable permutations. */
20614 if (expand_vec_perm_vpermil (d))
20615 return true;
20617 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20618 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20619 if (expand_vec_perm_pshufb (d))
20620 return true;
20622 /* Try the AVX2 vpalignr instruction. */
20623 if (expand_vec_perm_palignr (d, true))
20624 return true;
20626 /* Try the AVX512F vperm{w,b,s,d} instructions */
20627 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20628 return true;
20630 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20631 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20632 return true;
20634 /* See if we can get the same permutation in different vector integer
20635 mode. */
20636 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20638 if (!d->testing_p)
20639 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20640 return true;
20642 return false;
20645 /* Canonicalize the vec_perm index so that the first index
20646 always comes from the first vector. */
20647 static void
20648 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20650 unsigned nelt = d->nelt;
20651 if (d->perm[0] < nelt)
20652 return;
20654 for (unsigned i = 0; i != nelt; i++)
20655 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20657 std::swap (d->op0, d->op1);
20658 return;
20661 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20662 in terms of a shufps followed by a shufps or pshufd instruction. */
20663 static bool
20664 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20666 unsigned char perm1[4];
20667 machine_mode vmode = d->vmode;
20668 bool ok;
20669 unsigned i, j, k, count = 0;
20671 if (d->one_operand_p
20672 || (vmode != V4SImode && vmode != V4SFmode))
20673 return false;
20675 if (d->testing_p)
20676 return true;
20678 ix86_vec_perm_index_canon (d);
20679 for (i = 0; i < 4; ++i)
20680 count += d->perm[i] > 3 ? 1 : 0;
20682 gcc_assert (count & 3);
20684 rtx tmp = gen_reg_rtx (vmode);
20685 /* 2 from op0 and 2 from op1. */
20686 if (count == 2)
20688 unsigned char perm2[4];
20689 for (i = 0, j = 0, k = 2; i < 4; ++i)
20690 if (d->perm[i] & 4)
20692 perm1[k++] = d->perm[i];
20693 perm2[i] = k - 1;
20695 else
20697 perm1[j++] = d->perm[i];
20698 perm2[i] = j - 1;
20701 /* shufps. */
20702 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20703 perm1, d->nelt, false);
20704 gcc_assert (ok);
20705 if (vmode == V4SImode && TARGET_SSE2)
20706 /* pshufd. */
20707 ok = expand_vselect (d->target, tmp,
20708 perm2, d->nelt, false);
20709 else
20711 /* shufps. */
20712 perm2[2] += 4;
20713 perm2[3] += 4;
20714 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20715 perm2, d->nelt, false);
20717 gcc_assert (ok);
20719 /* 3 from one op and 1 from another. */
20720 else
20722 unsigned pair_idx = 8, lone_idx = 8, shift;
20724 /* Find the lone index. */
20725 for (i = 0; i < 4; ++i)
20726 if ((d->perm[i] > 3 && count == 1)
20727 || (d->perm[i] < 4 && count == 3))
20728 lone_idx = i;
20730 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20731 gcc_assert (count == (lone_idx ? 1 : 3));
20733 /* Find the pair index that sits in the same half as the lone index. */
20734 shift = lone_idx & 2;
20735 pair_idx = 1 - lone_idx + 2 * shift;
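/* E.g. lone_idx 0 pairs with 1, lone_idx 1 with 0, lone_idx 2 with 3
   and lone_idx 3 with 2, i.e. the other element of the same half of
   the vector.  */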
20737 /* First permute the lone index and the pair index into the same vector as
20738 [ lone, lone, pair, pair ]. */
20739 perm1[1] = perm1[0]
20740 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20741 perm1[3] = perm1[2]
20742 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20744 /* Always put the vector that contains the lone index first. */
20745 if (count == 1)
20746 std::swap (d->op0, d->op1);
20748 /* shufps. */
20749 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20750 perm1, d->nelt, false);
20751 gcc_assert (ok);
20753 /* Refine lone and pair index to original order. */
20754 perm1[shift] = lone_idx << 1;
20755 perm1[shift + 1] = pair_idx << 1;
20757 /* Select the remaining 2 elements in another vector. */
20758 for (i = 2 - shift; i < 4 - shift; ++i)
20759 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20761 /* Adjust to original selector. */
20762 if (lone_idx > 1)
20763 std::swap (tmp, d->op1);
20765 /* shufps. */
20766 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20767 perm1, d->nelt, false);
20769 gcc_assert (ok);
20772 return true;
20775 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20776 in terms of a pair of pshuflw + pshufhw instructions. */
20778 static bool
20779 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20781 unsigned char perm2[MAX_VECT_LEN];
20782 unsigned i;
20783 bool ok;
20785 if (d->vmode != V8HImode || !d->one_operand_p)
20786 return false;
20788 /* The two permutations only operate in 64-bit lanes. */
20789 for (i = 0; i < 4; ++i)
20790 if (d->perm[i] >= 4)
20791 return false;
20792 for (i = 4; i < 8; ++i)
20793 if (d->perm[i] < 4)
20794 return false;
20796 if (d->testing_p)
20797 return true;
20799 /* Emit the pshuflw. */
20800 memcpy (perm2, d->perm, 4);
20801 for (i = 4; i < 8; ++i)
20802 perm2[i] = i;
20803 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20804 gcc_assert (ok);
20806 /* Emit the pshufhw. */
20807 memcpy (perm2 + 4, d->perm + 4, 4);
20808 for (i = 0; i < 4; ++i)
20809 perm2[i] = i;
20810 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20811 gcc_assert (ok);
20813 return true;
20816 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20817 the permutation using the SSSE3 palignr instruction. This succeeds
20818 when all of the elements in PERM fit within one vector and we merely
20819 need to shift them down so that a single vector permutation has a
20820 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
20821 the vpalignr instruction itself can perform the requested permutation. */
20823 static bool
20824 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20826 unsigned i, nelt = d->nelt;
20827 unsigned min, max, minswap, maxswap;
20828 bool in_order, ok, swap = false;
20829 rtx shift, target;
20830 struct expand_vec_perm_d dcopy;
20832 /* Even with AVX, palignr only operates on 128-bit vectors;
20833 with AVX2, palignr operates on both 128-bit lanes. */
20834 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20835 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20836 return false;
20838 min = 2 * nelt;
20839 max = 0;
20840 minswap = 2 * nelt;
20841 maxswap = 0;
20842 for (i = 0; i < nelt; ++i)
20844 unsigned e = d->perm[i];
20845 unsigned eswap = d->perm[i] ^ nelt;
20846 if (GET_MODE_SIZE (d->vmode) == 32)
20848 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20849 eswap = e ^ (nelt / 2);
20851 if (e < min)
20852 min = e;
20853 if (e > max)
20854 max = e;
20855 if (eswap < minswap)
20856 minswap = eswap;
20857 if (eswap > maxswap)
20858 maxswap = eswap;
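/* MIN/MAX now bound the source elements that are actually used;
   palignr can only help when that range fits within a single vector
   (or a single 128-bit lane for the 32-byte case), possibly after
   swapping the two operands.  */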
20860 if (min == 0
20861 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20863 if (d->one_operand_p
20864 || minswap == 0
20865 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20866 ? nelt / 2 : nelt))
20867 return false;
20868 swap = true;
20869 min = minswap;
20870 max = maxswap;
20873 /* Given that we have SSSE3, we know we'll be able to implement the
20874 single operand permutation after the palignr with pshufb for
20875 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20876 first. */
20877 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20878 return true;
20880 dcopy = *d;
20881 if (swap)
20883 dcopy.op0 = d->op1;
20884 dcopy.op1 = d->op0;
20885 for (i = 0; i < nelt; ++i)
20886 dcopy.perm[i] ^= nelt;
20889 in_order = true;
20890 for (i = 0; i < nelt; ++i)
20892 unsigned e = dcopy.perm[i];
20893 if (GET_MODE_SIZE (d->vmode) == 32
20894 && e >= nelt
20895 && (e & (nelt / 2 - 1)) < min)
20896 e = e - min - (nelt / 2);
20897 else
20898 e = e - min;
20899 if (e != i)
20900 in_order = false;
20901 dcopy.perm[i] = e;
20903 dcopy.one_operand_p = true;
20905 if (single_insn_only_p && !in_order)
20906 return false;
20908 /* For AVX2, test whether we can permute the result in one instruction. */
20909 if (d->testing_p)
20911 if (in_order)
20912 return true;
20913 dcopy.op1 = dcopy.op0;
20914 return expand_vec_perm_1 (&dcopy);
20917 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20918 if (GET_MODE_SIZE (d->vmode) == 16)
20920 target = gen_reg_rtx (V1TImode);
20921 emit_insn (gen_ssse3_palignrv1ti (target,
20922 gen_lowpart (V1TImode, dcopy.op1),
20923 gen_lowpart (V1TImode, dcopy.op0),
20924 shift));
20926 else
20928 target = gen_reg_rtx (V2TImode);
20929 emit_insn (gen_avx2_palignrv2ti (target,
20930 gen_lowpart (V2TImode, dcopy.op1),
20931 gen_lowpart (V2TImode, dcopy.op0),
20932 shift));
20935 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20937 /* Test for the degenerate case where the alignment by itself
20938 produces the desired permutation. */
20939 if (in_order)
20941 emit_move_insn (d->target, dcopy.op0);
20942 return true;
20945 ok = expand_vec_perm_1 (&dcopy);
20946 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20948 return ok;
20951 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20952 the permutation using the SSE4_1 pblendv instruction. Potentially
20953 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
20955 static bool
20956 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20958 unsigned i, which, nelt = d->nelt;
20959 struct expand_vec_perm_d dcopy, dcopy1;
20960 machine_mode vmode = d->vmode;
20961 bool ok;
20963 /* Use the same checks as in expand_vec_perm_blend. */
20964 if (d->one_operand_p)
20965 return false;
20966 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20968 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20970 else if (TARGET_SSE4_1
20971 && (GET_MODE_SIZE (vmode) == 16
20972 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20973 || GET_MODE_SIZE (vmode) == 4))
20975 else
20976 return false;
20978 /* Figure out which permutation elements do not stay in their
20979 original positions. */
20980 for (i = 0, which = 0; i < nelt; ++i)
20982 unsigned e = d->perm[i];
20983 if (e != i)
20984 which |= (e < nelt ? 1 : 2);
20986 /* We can pblend the part where elements do not stay in their
20987 original positions only when these elements all come from the
20988 same operand of the permutation.
20989 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their original
20990 positions, but both 8 and 9 >= 8.
20991 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their original
20992 positions, and 8 >= 8 but 2 is not. */
20993 if (which != 1 && which != 2)
20994 return false;
20995 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20996 return true;
20998 /* First we apply one operand permutation to the part where
20999 elements stay not in their respective lanes. */
21000 dcopy = *d;
21001 if (which == 2)
21002 dcopy.op0 = dcopy.op1 = d->op1;
21003 else
21004 dcopy.op0 = dcopy.op1 = d->op0;
21005 if (!d->testing_p)
21006 dcopy.target = gen_reg_rtx (vmode);
21007 dcopy.one_operand_p = true;
21009 for (i = 0; i < nelt; ++i)
21010 dcopy.perm[i] = d->perm[i] & (nelt - 1);
21012 ok = expand_vec_perm_1 (&dcopy);
21013 if (GET_MODE_SIZE (vmode) != 16 && !ok)
21014 return false;
21015 else
21016 gcc_assert (ok);
21017 if (d->testing_p)
21018 return true;
21020 /* Next we put permuted elements into their positions. */
21021 dcopy1 = *d;
21022 if (which == 2)
21023 dcopy1.op1 = dcopy.target;
21024 else
21025 dcopy1.op0 = dcopy.target;
21027 for (i = 0; i < nelt; ++i)
21028 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
21030 ok = expand_vec_perm_blend (&dcopy1);
21031 gcc_assert (ok);
21033 return true;
21036 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
21038 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21039 a two vector permutation into a single vector permutation by using
21040 an interleave operation to merge the vectors. */
21042 static bool
21043 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
21045 struct expand_vec_perm_d dremap, dfinal;
21046 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
21047 unsigned HOST_WIDE_INT contents;
21048 unsigned char remap[2 * MAX_VECT_LEN];
21049 rtx_insn *seq;
21050 bool ok, same_halves = false;
21052 if (GET_MODE_SIZE (d->vmode) == 4
21053 || GET_MODE_SIZE (d->vmode) == 8
21054 || GET_MODE_SIZE (d->vmode) == 16)
21056 if (d->one_operand_p)
21057 return false;
21059 else if (GET_MODE_SIZE (d->vmode) == 32)
21061 if (!TARGET_AVX)
21062 return false;
21063 /* For 32-byte modes allow even d->one_operand_p.
21064 The lack of cross-lane shuffling in some instructions
21065 might prevent a single insn shuffle. */
21066 dfinal = *d;
21067 dfinal.testing_p = true;
21068 /* If expand_vec_perm_interleave3 can expand this into
21069 a 3 insn sequence, give up and let it be expanded as
21070 a 3 insn sequence. While that is one insn longer,
21071 it doesn't need a memory operand, and in the common
21072 case that both the interleave low and interleave high
21073 permutations with the same operands are adjacent, the
21074 pair needs only 4 insns after CSE. */
21075 if (expand_vec_perm_interleave3 (&dfinal))
21076 return false;
21078 else
21079 return false;
21081 /* Examine from whence the elements come. */
21082 contents = 0;
21083 for (i = 0; i < nelt; ++i)
21084 contents |= HOST_WIDE_INT_1U << d->perm[i];
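/* CONTENTS is a bitmask over the 2 * nelt possible source elements;
   the checks below test whether every referenced element falls into
   particular halves (or quarters) of the two inputs.  */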
21086 memset (remap, 0xff, sizeof (remap));
21087 dremap = *d;
21089 if (GET_MODE_SIZE (d->vmode) == 4
21090 || GET_MODE_SIZE (d->vmode) == 8)
21092 unsigned HOST_WIDE_INT h1, h2, h3, h4;
21094 /* Split the two input vectors into 4 halves. */
21095 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
21096 h2 = h1 << nelt2;
21097 h3 = h2 << nelt2;
21098 h4 = h3 << nelt2;
21100 /* If all elements come from the low halves, use interleave low;
21101 similarly, use interleave high for the high halves. */
21102 if ((contents & (h1 | h3)) == contents)
21104 /* punpckl* */
21105 for (i = 0; i < nelt2; ++i)
21107 remap[i] = i * 2;
21108 remap[i + nelt] = i * 2 + 1;
21109 dremap.perm[i * 2] = i;
21110 dremap.perm[i * 2 + 1] = i + nelt;
21113 else if ((contents & (h2 | h4)) == contents)
21115 /* punpckh* */
21116 for (i = 0; i < nelt2; ++i)
21118 remap[i + nelt2] = i * 2;
21119 remap[i + nelt + nelt2] = i * 2 + 1;
21120 dremap.perm[i * 2] = i + nelt2;
21121 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
21124 else
21125 return false;
21127 else if (GET_MODE_SIZE (d->vmode) == 16)
21129 unsigned HOST_WIDE_INT h1, h2, h3, h4;
21131 /* Split the two input vectors into 4 halves. */
21132 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
21133 h2 = h1 << nelt2;
21134 h3 = h2 << nelt2;
21135 h4 = h3 << nelt2;
21137 /* If all elements come from the low halves, use interleave low; similarly
21138 use interleave high for the high halves. If the elements are from mis-matched
21139 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
21140 if ((contents & (h1 | h3)) == contents)
21142 /* punpckl* */
21143 for (i = 0; i < nelt2; ++i)
21145 remap[i] = i * 2;
21146 remap[i + nelt] = i * 2 + 1;
21147 dremap.perm[i * 2] = i;
21148 dremap.perm[i * 2 + 1] = i + nelt;
21150 if (!TARGET_SSE2 && d->vmode == V4SImode)
21151 dremap.vmode = V4SFmode;
21153 else if ((contents & (h2 | h4)) == contents)
21155 /* punpckh* */
21156 for (i = 0; i < nelt2; ++i)
21158 remap[i + nelt2] = i * 2;
21159 remap[i + nelt + nelt2] = i * 2 + 1;
21160 dremap.perm[i * 2] = i + nelt2;
21161 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
21163 if (!TARGET_SSE2 && d->vmode == V4SImode)
21164 dremap.vmode = V4SFmode;
21166 else if ((contents & (h1 | h4)) == contents)
21168 /* shufps */
21169 for (i = 0; i < nelt2; ++i)
21171 remap[i] = i;
21172 remap[i + nelt + nelt2] = i + nelt2;
21173 dremap.perm[i] = i;
21174 dremap.perm[i + nelt2] = i + nelt + nelt2;
21176 if (nelt != 4)
21178 /* shufpd */
21179 dremap.vmode = V2DImode;
21180 dremap.nelt = 2;
21181 dremap.perm[0] = 0;
21182 dremap.perm[1] = 3;
21185 else if ((contents & (h2 | h3)) == contents)
21187 /* shufps */
21188 for (i = 0; i < nelt2; ++i)
21190 remap[i + nelt2] = i;
21191 remap[i + nelt] = i + nelt2;
21192 dremap.perm[i] = i + nelt2;
21193 dremap.perm[i + nelt2] = i + nelt;
21195 if (nelt != 4)
21197 /* shufpd */
21198 dremap.vmode = V2DImode;
21199 dremap.nelt = 2;
21200 dremap.perm[0] = 1;
21201 dremap.perm[1] = 2;
21204 else
21205 return false;
21207 else
21209 unsigned int nelt4 = nelt / 4, nzcnt = 0;
21210 unsigned HOST_WIDE_INT q[8];
21211 unsigned int nonzero_halves[4];
21213 /* Split the two input vectors into 8 quarters. */
21214 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
21215 for (i = 1; i < 8; ++i)
21216 q[i] = q[0] << (nelt4 * i);
21217 for (i = 0; i < 4; ++i)
21218 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
21220 nonzero_halves[nzcnt] = i;
21221 ++nzcnt;
21224 if (nzcnt == 1)
21226 gcc_assert (d->one_operand_p);
21227 nonzero_halves[1] = nonzero_halves[0];
21228 same_halves = true;
21230 else if (d->one_operand_p)
21232 gcc_assert (nonzero_halves[0] == 0);
21233 gcc_assert (nonzero_halves[1] == 1);
21236 if (nzcnt <= 2)
21238 if (d->perm[0] / nelt2 == nonzero_halves[1])
21240 /* Attempt to increase the likelihood that dfinal
21241 shuffle will be intra-lane. */
21242 std::swap (nonzero_halves[0], nonzero_halves[1]);
21245 /* vperm2f128 or vperm2i128. */
21246 for (i = 0; i < nelt2; ++i)
21248 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
21249 remap[i + nonzero_halves[0] * nelt2] = i;
21250 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
21251 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
21254 if (d->vmode != V8SFmode
21255 && d->vmode != V4DFmode
21256 && d->vmode != V8SImode)
21258 dremap.vmode = V8SImode;
21259 dremap.nelt = 8;
21260 for (i = 0; i < 4; ++i)
21262 dremap.perm[i] = i + nonzero_halves[0] * 4;
21263 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
21267 else if (d->one_operand_p)
21268 return false;
21269 else if (TARGET_AVX2
21270 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
21272 /* vpunpckl* */
21273 for (i = 0; i < nelt4; ++i)
21275 remap[i] = i * 2;
21276 remap[i + nelt] = i * 2 + 1;
21277 remap[i + nelt2] = i * 2 + nelt2;
21278 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
21279 dremap.perm[i * 2] = i;
21280 dremap.perm[i * 2 + 1] = i + nelt;
21281 dremap.perm[i * 2 + nelt2] = i + nelt2;
21282 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
21285 else if (TARGET_AVX2
21286 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
21288 /* vpunpckh* */
21289 for (i = 0; i < nelt4; ++i)
21291 remap[i + nelt4] = i * 2;
21292 remap[i + nelt + nelt4] = i * 2 + 1;
21293 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
21294 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
21295 dremap.perm[i * 2] = i + nelt4;
21296 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
21297 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
21298 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
21301 else
21302 return false;
21305 /* Use the remapping array set up above to move the elements from their
21306 swizzled locations into their final destinations. */
21307 dfinal = *d;
21308 for (i = 0; i < nelt; ++i)
21310 unsigned e = remap[d->perm[i]];
21311 gcc_assert (e < nelt);
21312 /* If same_halves is true, both halves of the remapped vector are the
21313 same. Avoid cross-lane accesses if possible. */
21314 if (same_halves && i >= nelt2)
21316 gcc_assert (e < nelt2);
21317 dfinal.perm[i] = e + nelt2;
21319 else
21320 dfinal.perm[i] = e;
21322 if (!d->testing_p)
21324 dremap.target = gen_reg_rtx (dremap.vmode);
21325 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
21327 dfinal.op1 = dfinal.op0;
21328 dfinal.one_operand_p = true;
21330 /* Test if the final remap can be done with a single insn. For V4SFmode or
21331 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
21332 start_sequence ();
21333 ok = expand_vec_perm_1 (&dfinal);
21334 seq = get_insns ();
21335 end_sequence ();
21337 if (!ok)
21338 return false;
21340 if (d->testing_p)
21341 return true;
21343 if (dremap.vmode != dfinal.vmode)
21345 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
21346 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
21349 ok = expand_vec_perm_1 (&dremap);
21350 gcc_assert (ok);
21352 emit_insn (seq);
21353 return true;
21356 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21357 a single vector cross-lane permutation into vpermq followed
21358 by any of the single insn permutations. */
21360 static bool
21361 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
21363 struct expand_vec_perm_d dremap, dfinal;
21364 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
21365 unsigned contents[2];
21366 bool ok;
21368 if (!(TARGET_AVX2
21369 && (d->vmode == V32QImode || d->vmode == V16HImode)
21370 && d->one_operand_p))
21371 return false;
21373 contents[0] = 0;
21374 contents[1] = 0;
21375 for (i = 0; i < nelt2; ++i)
21377 contents[0] |= 1u << (d->perm[i] / nelt4);
21378 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
21381 for (i = 0; i < 2; ++i)
21383 unsigned int cnt = 0;
21384 for (j = 0; j < 4; ++j)
21385 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
21386 return false;
21389 if (d->testing_p)
21390 return true;
21392 dremap = *d;
21393 dremap.vmode = V4DImode;
21394 dremap.nelt = 4;
21395 dremap.target = gen_reg_rtx (V4DImode);
21396 dremap.op0 = gen_lowpart (V4DImode, d->op0);
21397 dremap.op1 = dremap.op0;
21398 dremap.one_operand_p = true;
21399 for (i = 0; i < 2; ++i)
21401 unsigned int cnt = 0;
21402 for (j = 0; j < 4; ++j)
21403 if ((contents[i] & (1u << j)) != 0)
21404 dremap.perm[2 * i + cnt++] = j;
21405 for (; cnt < 2; ++cnt)
21406 dremap.perm[2 * i + cnt] = 0;
21409 dfinal = *d;
21410 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
21411 dfinal.op1 = dfinal.op0;
21412 dfinal.one_operand_p = true;
21413 for (i = 0, j = 0; i < nelt; ++i)
21415 if (i == nelt2)
21416 j = 2;
21417 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
21418 if ((d->perm[i] / nelt4) == dremap.perm[j])
21420 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
21421 dfinal.perm[i] |= nelt4;
21422 else
21423 gcc_unreachable ();
21426 ok = expand_vec_perm_1 (&dremap);
21427 gcc_assert (ok);
21429 ok = expand_vec_perm_1 (&dfinal);
21430 gcc_assert (ok);
21432 return true;
21435 static bool canonicalize_perm (struct expand_vec_perm_d *d);
21437 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
21438 a vector permutation using two instructions, vperm2f128 resp.
21439 vperm2i128 followed by any single in-lane permutation. */
21441 static bool
21442 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
21444 struct expand_vec_perm_d dfirst, dsecond;
21445 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
21446 bool ok;
21448 if (!TARGET_AVX
21449 || GET_MODE_SIZE (d->vmode) != 32
21450 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
21451 return false;
21453 dsecond = *d;
21454 dsecond.one_operand_p = false;
21455 dsecond.testing_p = true;
21457 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
21458 immediate. For perm < 16 the second permutation uses
21459 d->op0 as first operand, for perm >= 16 it uses d->op1
21460 as first operand. The second operand is the result of
21461 vperm2[fi]128. */
21462 for (perm = 0; perm < 32; perm++)
21464 /* Ignore permutations which do not move anything cross-lane. */
21465 if (perm < 16)
21467 /* The second shuffle for e.g. V4DFmode has
21468 0123 and ABCD operands.
21469 Ignore AB23, as 23 is already in the second lane
21470 of the first operand. */
21471 if ((perm & 0xc) == (1 << 2)) continue;
21472 /* And 01CD, as 01 is in the first lane of the first
21473 operand. */
21474 if ((perm & 3) == 0) continue;
21475 /* And 4567, as then the vperm2[fi]128 doesn't change
21476 anything on the original 4567 second operand. */
21477 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
21479 else
21481 /* The second shuffle for e.g. V4DFmode has
21482 4567 and ABCD operands.
21483 Ignore AB67, as 67 is already in the second lane
21484 of the first operand. */
21485 if ((perm & 0xc) == (3 << 2)) continue;
21486 /* And 45CD, as 45 is in the first lane of the first
21487 operand. */
21488 if ((perm & 3) == 2) continue;
21489 /* And 0123, as then the vperm2[fi]128 doesn't change
21490 anything on the original 0123 first operand. */
21491 if ((perm & 0xf) == (1 << 2)) continue;
21494 for (i = 0; i < nelt; i++)
21496 j = d->perm[i] / nelt2;
21497 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
21498 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
21499 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
21500 dsecond.perm[i] = d->perm[i] & (nelt - 1);
21501 else
21502 break;
21505 if (i == nelt)
21507 start_sequence ();
21508 ok = expand_vec_perm_1 (&dsecond);
21509 end_sequence ();
21511 else
21512 ok = false;
21514 if (ok)
21516 if (d->testing_p)
21517 return true;
21519 /* Found a usable second shuffle. dfirst will be
21520 vperm2f128 on d->op0 and d->op1. */
21521 dsecond.testing_p = false;
21522 dfirst = *d;
21523 dfirst.target = gen_reg_rtx (d->vmode);
21524 for (i = 0; i < nelt; i++)
21525 dfirst.perm[i] = (i & (nelt2 - 1))
21526 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
21528 canonicalize_perm (&dfirst);
21529 ok = expand_vec_perm_1 (&dfirst);
21530 gcc_assert (ok);
21532 /* And dsecond is some single insn shuffle, taking
21533 d->op0 and result of vperm2f128 (if perm < 16) or
21534 d->op1 and result of vperm2f128 (otherwise). */
21535 if (perm >= 16)
21536 dsecond.op0 = dsecond.op1;
21537 dsecond.op1 = dfirst.target;
21539 ok = expand_vec_perm_1 (&dsecond);
21540 gcc_assert (ok);
21542 return true;
21545 /* For one operand, the only useful vperm2f128 permutation is 0x01,
21546 i.e. a swap of the two 128-bit lanes. */
21547 if (d->one_operand_p)
21548 return false;
21551 return false;
21554 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21555 a two vector permutation using 2 intra-lane interleave insns
21556 and cross-lane shuffle for 32-byte vectors. */
21558 static bool
21559 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
21561 unsigned i, nelt;
21562 rtx (*gen) (rtx, rtx, rtx);
21564 if (d->one_operand_p)
21565 return false;
21566 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
21568 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
21570 else
21571 return false;
21573 nelt = d->nelt;
21574 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
21575 return false;
21576 for (i = 0; i < nelt; i += 2)
21577 if (d->perm[i] != d->perm[0] + i / 2
21578 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
21579 return false;
21581 if (d->testing_p)
21582 return true;
21584 switch (d->vmode)
21586 case E_V32QImode:
21587 if (d->perm[0])
21588 gen = gen_vec_interleave_highv32qi;
21589 else
21590 gen = gen_vec_interleave_lowv32qi;
21591 break;
21592 case E_V16HImode:
21593 if (d->perm[0])
21594 gen = gen_vec_interleave_highv16hi;
21595 else
21596 gen = gen_vec_interleave_lowv16hi;
21597 break;
21598 case E_V8SImode:
21599 if (d->perm[0])
21600 gen = gen_vec_interleave_highv8si;
21601 else
21602 gen = gen_vec_interleave_lowv8si;
21603 break;
21604 case E_V4DImode:
21605 if (d->perm[0])
21606 gen = gen_vec_interleave_highv4di;
21607 else
21608 gen = gen_vec_interleave_lowv4di;
21609 break;
21610 case E_V8SFmode:
21611 if (d->perm[0])
21612 gen = gen_vec_interleave_highv8sf;
21613 else
21614 gen = gen_vec_interleave_lowv8sf;
21615 break;
21616 case E_V4DFmode:
21617 if (d->perm[0])
21618 gen = gen_vec_interleave_highv4df;
21619 else
21620 gen = gen_vec_interleave_lowv4df;
21621 break;
21622 default:
21623 gcc_unreachable ();
21626 emit_insn (gen (d->target, d->op0, d->op1));
21627 return true;
21630 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21631 a single vector permutation using a single intra-lane vector
21632 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21633 the non-swapped and swapped vectors together. */
21635 static bool
21636 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21638 struct expand_vec_perm_d dfirst, dsecond;
21639 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21640 rtx_insn *seq;
21641 bool ok;
21642 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21644 if (!TARGET_AVX
21645 || TARGET_AVX2
21646 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21647 || !d->one_operand_p)
21648 return false;
21650 dfirst = *d;
21651 for (i = 0; i < nelt; i++)
21652 dfirst.perm[i] = 0xff;
21653 for (i = 0, msk = 0; i < nelt; i++)
21655 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21656 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21657 return false;
21658 dfirst.perm[j] = d->perm[i];
21659 if (j != i)
21660 msk |= (1 << i);
21662 for (i = 0; i < nelt; i++)
21663 if (dfirst.perm[i] == 0xff)
21664 dfirst.perm[i] = i;
21666 if (!d->testing_p)
21667 dfirst.target = gen_reg_rtx (dfirst.vmode);
21669 start_sequence ();
21670 ok = expand_vec_perm_1 (&dfirst);
21671 seq = get_insns ();
21672 end_sequence ();
21674 if (!ok)
21675 return false;
21677 if (d->testing_p)
21678 return true;
21680 emit_insn (seq);
21682 dsecond = *d;
21683 dsecond.op0 = dfirst.target;
21684 dsecond.op1 = dfirst.target;
21685 dsecond.one_operand_p = true;
21686 dsecond.target = gen_reg_rtx (dsecond.vmode);
21687 for (i = 0; i < nelt; i++)
21688 dsecond.perm[i] = i ^ nelt2;
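/* XORing each index with nelt2 swaps the two 128-bit lanes, i.e. the
   vperm2f128 0x01 lane swap applied to dfirst's result.  */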
21690 ok = expand_vec_perm_1 (&dsecond);
21691 gcc_assert (ok);
21693 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21694 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21695 return true;
21698 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21699 a two vector permutation using two single vector permutations and
21700 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21701 of dfirst or dsecond is an identity permutation. */
21703 static bool
21704 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21706 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21707 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21708 bool ident1 = true, ident2 = true;
21710 if (d->one_operand_p)
21711 return false;
21713 if (GET_MODE_SIZE (d->vmode) == 16)
21715 if (!TARGET_SSE)
21716 return false;
21717 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21718 return false;
21720 else if (GET_MODE_SIZE (d->vmode) == 32)
21722 if (!TARGET_AVX)
21723 return false;
21724 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21725 return false;
21726 lane = nelt2;
21728 else
21729 return false;
21731 for (i = 1; i < nelt; i++)
21732 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21733 return false;
21735 dfirst = *d;
21736 dsecond = *d;
21737 dfinal = *d;
21738 dfirst.op1 = dfirst.op0;
21739 dfirst.one_operand_p = true;
21740 dsecond.op0 = dsecond.op1;
21741 dsecond.one_operand_p = true;
21743 for (i = 0; i < nelt; i++)
21744 if (d->perm[i] >= nelt)
21746 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21747 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21748 ident2 = false;
21749 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21750 = d->perm[i] - nelt;
21752 else
21754 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21755 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21756 ident1 = false;
21757 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21760 if (two_insn && !ident1 && !ident2)
21761 return false;
21763 if (!d->testing_p)
21765 if (!ident1)
21766 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21767 if (!ident2)
21768 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21769 if (d->perm[0] >= nelt)
21770 std::swap (dfinal.op0, dfinal.op1);
21773 bool ok;
21774 rtx_insn *seq1 = NULL, *seq2 = NULL;
21776 if (!ident1)
21778 start_sequence ();
21779 ok = expand_vec_perm_1 (&dfirst);
21780 seq1 = get_insns ();
21781 end_sequence ();
21783 if (!ok)
21784 return false;
21787 if (!ident2)
21789 start_sequence ();
21790 ok = expand_vec_perm_1 (&dsecond);
21791 seq2 = get_insns ();
21792 end_sequence ();
21794 if (!ok)
21795 return false;
21798 if (d->testing_p)
21799 return true;
21801 for (i = 0; i < nelt; i++)
21803 dfinal.perm[i] = i / 2;
21804 if (i >= lane)
21805 dfinal.perm[i] += lane / 2;
21806 if ((i & 1) != 0)
21807 dfinal.perm[i] += nelt;
21809 emit_insn (seq1);
21810 emit_insn (seq2);
21811 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21812 dfinal.perm, dfinal.nelt, false);
21813 gcc_assert (ok);
21814 return true;
21817 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21818 the permutation using two single vector permutations and the SSE4_1 pblendv
21819 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21820 an identity permutation. */
21822 static bool
21823 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21825 unsigned i, nelt = d->nelt;
21826 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21827 machine_mode vmode = d->vmode;
21828 bool ident1 = true, ident2 = true;
21830 /* Use the same checks as in expand_vec_perm_blend. */
21831 if (d->one_operand_p)
21832 return false;
21833 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21835 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21837 else if (TARGET_SSE4_1
21838 && (GET_MODE_SIZE (vmode) == 16
21839 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21840 || GET_MODE_SIZE (vmode) == 4))
21842 else
21843 return false;
21845 dfirst = *d;
21846 dsecond = *d;
21847 dfinal = *d;
21848 dfirst.op1 = dfirst.op0;
21849 dfirst.one_operand_p = true;
21850 dsecond.op0 = dsecond.op1;
21851 dsecond.one_operand_p = true;
21853 for (i = 0; i < nelt; ++i)
21854 if (d->perm[i] >= nelt)
21856 dfirst.perm[i] = 0xff;
21857 dsecond.perm[i] = d->perm[i] - nelt;
21858 if (d->perm[i] != i + nelt)
21859 ident2 = false;
21861 else
21863 dsecond.perm[i] = 0xff;
21864 dfirst.perm[i] = d->perm[i];
21865 if (d->perm[i] != i)
21866 ident1 = false;
21869 if (two_insn && !ident1 && !ident2)
21870 return false;
21872 /* For now. Ideally treat 0xff as a wildcard. */
21873 for (i = 0; i < nelt; ++i)
21874 if (dfirst.perm[i] == 0xff)
21876 if (GET_MODE_SIZE (vmode) == 32
21877 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21878 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21879 else
21880 dfirst.perm[i] = i;
21882 else
21884 if (GET_MODE_SIZE (vmode) == 32
21885 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21886 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21887 else
21888 dsecond.perm[i] = i;
21891 if (!d->testing_p)
21893 if (!ident1)
21894 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21895 if (!ident2)
21896 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21899 bool ok;
21900 rtx_insn *seq1 = NULL, *seq2 = NULL;
21902 if (!ident1)
21904 start_sequence ();
21905 ok = expand_vec_perm_1 (&dfirst);
21906 seq1 = get_insns ();
21907 end_sequence ();
21909 if (!ok)
21910 return false;
21913 if (!ident2)
21915 start_sequence ();
21916 ok = expand_vec_perm_1 (&dsecond);
21917 seq2 = get_insns ();
21918 end_sequence ();
21920 if (!ok)
21921 return false;
21924 if (d->testing_p)
21925 return true;
21927 for (i = 0; i < nelt; ++i)
21928 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21930 emit_insn (seq1);
21931 emit_insn (seq2);
21932 ok = expand_vec_perm_blend (&dfinal);
21933 gcc_assert (ok);
21934 return true;
21937 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21938 permutation using two vperm2f128, followed by a vshufpd insn blending
21939 the two vectors together. */
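/* For instance, the permutation { 2, 5, 1, 7 } is decomposed into
   vperm2f128 copies selecting { 2, 3, 0, 1 } and { 4, 5, 6, 7 } and a
   final vshufpd picking { 0, 5, 3, 7 } from the two intermediate
   vectors.  */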
21941 static bool
21942 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21944 struct expand_vec_perm_d dfirst, dsecond, dthird;
21945 bool ok;
21947 if (!TARGET_AVX || (d->vmode != V4DFmode))
21948 return false;
21950 if (d->testing_p)
21951 return true;
21953 dfirst = *d;
21954 dsecond = *d;
21955 dthird = *d;
21957 dfirst.perm[0] = (d->perm[0] & ~1);
21958 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21959 dfirst.perm[2] = (d->perm[2] & ~1);
21960 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21961 dsecond.perm[0] = (d->perm[1] & ~1);
21962 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21963 dsecond.perm[2] = (d->perm[3] & ~1);
21964 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21965 dthird.perm[0] = (d->perm[0] % 2);
21966 dthird.perm[1] = (d->perm[1] % 2) + 4;
21967 dthird.perm[2] = (d->perm[2] % 2) + 2;
21968 dthird.perm[3] = (d->perm[3] % 2) + 6;
21970 dfirst.target = gen_reg_rtx (dfirst.vmode);
21971 dsecond.target = gen_reg_rtx (dsecond.vmode);
21972 dthird.op0 = dfirst.target;
21973 dthird.op1 = dsecond.target;
21974 dthird.one_operand_p = false;
21976 canonicalize_perm (&dfirst);
21977 canonicalize_perm (&dsecond);
21979 ok = expand_vec_perm_1 (&dfirst)
21980 && expand_vec_perm_1 (&dsecond)
21981 && expand_vec_perm_1 (&dthird);
21983 gcc_assert (ok);
21985 return true;
21988 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21990 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21991 a two vector permutation using two intra-lane vector
21992 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21993 the non-swapped and swapped vectors together. */
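/* Roughly, for the V8SFmode permutation { 4, 5, 2, 3, 8, 9, 14, 15 } the
   elements already destined for their own 128-bit lane ({ 2, 3, 14, 15 })
   are gathered by dfirst, the cross-lane elements are gathered by dsecond
   into the opposite lane, vperm2f128 then swaps the lanes of dsecond's
   result and vblendps with mask 0x33 merges the two vectors.  */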
21995 static bool
21996 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21998 struct expand_vec_perm_d dfirst, dsecond, dthird;
21999 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
22000 rtx_insn *seq1, *seq2;
22001 bool ok;
22002 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
22004 if (!TARGET_AVX
22005 || TARGET_AVX2
22006 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
22007 || d->one_operand_p)
22008 return false;
22010 dfirst = *d;
22011 dsecond = *d;
22012 for (i = 0; i < nelt; i++)
22014 dfirst.perm[i] = 0xff;
22015 dsecond.perm[i] = 0xff;
22017 for (i = 0, msk = 0; i < nelt; i++)
22019 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
22020 if (j == i)
22022 dfirst.perm[j] = d->perm[i];
22023 which1 |= (d->perm[i] < nelt ? 1 : 2);
22025 else
22027 dsecond.perm[j] = d->perm[i];
22028 which2 |= (d->perm[i] < nelt ? 1 : 2);
22029 msk |= (1U << i);
22032 if (msk == 0 || msk == (1U << nelt) - 1)
22033 return false;
22035 if (!d->testing_p)
22037 dfirst.target = gen_reg_rtx (dfirst.vmode);
22038 dsecond.target = gen_reg_rtx (dsecond.vmode);
22041 for (i = 0; i < nelt; i++)
22043 if (dfirst.perm[i] == 0xff)
22044 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
22045 if (dsecond.perm[i] == 0xff)
22046 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
22048 canonicalize_perm (&dfirst);
22049 start_sequence ();
22050 ok = ix86_expand_vec_perm_const_1 (&dfirst);
22051 seq1 = get_insns ();
22052 end_sequence ();
22054 if (!ok)
22055 return false;
22057 canonicalize_perm (&dsecond);
22058 start_sequence ();
22059 ok = ix86_expand_vec_perm_const_1 (&dsecond);
22060 seq2 = get_insns ();
22061 end_sequence ();
22063 if (!ok)
22064 return false;
22066 if (d->testing_p)
22067 return true;
22069 emit_insn (seq1);
22070 emit_insn (seq2);
22072 dthird = *d;
22073 dthird.op0 = dsecond.target;
22074 dthird.op1 = dsecond.target;
22075 dthird.one_operand_p = true;
22076 dthird.target = gen_reg_rtx (dthird.vmode);
22077 for (i = 0; i < nelt; i++)
22078 dthird.perm[i] = i ^ nelt2;
22080 ok = expand_vec_perm_1 (&dthird);
22081 gcc_assert (ok);
22083 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
22084 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
22085 return true;
22088 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
22089 permutation with two pshufb insns and an ior. We should have already
22090 failed all two instruction sequences. */
22092 static bool
22093 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
22095 rtx rperm[2][16], vperm, l, h, op, m128;
22096 unsigned int i, nelt, eltsz;
22097 machine_mode mode;
22098 rtx (*gen) (rtx, rtx, rtx);
22100 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
22101 && GET_MODE_SIZE (d->vmode) != 8
22102 && GET_MODE_SIZE (d->vmode) != 4))
22103 return false;
22104 gcc_assert (!d->one_operand_p);
22106 if (d->testing_p)
22107 return true;
22109 switch (GET_MODE_SIZE (d->vmode))
22111 case 4:
22112 mode = V4QImode;
22113 gen = gen_mmx_pshufbv4qi3;
22114 break;
22115 case 8:
22116 mode = V8QImode;
22117 gen = gen_mmx_pshufbv8qi3;
22118 break;
22119 case 16:
22120 mode = V16QImode;
22121 gen = gen_ssse3_pshufbv16qi3;
22122 break;
22123 default:
22124 gcc_unreachable ();
22127 nelt = d->nelt;
22128 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22130 /* Generate two permutation masks. If the required element is within
22131 the given vector it is shuffled into the proper lane. If the required
22132 element is in the other vector, force a zero into the lane by setting
22133 bit 7 in the permutation mask. */
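  /* E.g. for a V8HImode even-element selection { 0 2 4 6 8 10 12 14 } the
     two byte masks are roughly
	0 1 4 5 8 9 12 13 -128 ... -128	  (selects from d->op0)
	-128 ... -128 0 1 4 5 8 9 12 13	  (selects from d->op1)
     and the ior of the two pshufb results yields the requested vector.  */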
22134 m128 = GEN_INT (-128);
22135 for (i = 0; i < nelt; ++i)
22137 unsigned j, k, e = d->perm[i];
22138 unsigned which = (e >= nelt);
22139 if (e >= nelt)
22140 e -= nelt;
22142 for (j = 0; j < eltsz; ++j)
22144 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
22145 rperm[1-which][i*eltsz + j] = m128;
22148 for (k = i*eltsz + j; k < 16; ++k)
22149 rperm[0][k] = rperm[1][k] = m128;
22152 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
22153 vperm = force_reg (V16QImode, vperm);
22155 l = gen_reg_rtx (mode);
22156 op = gen_lowpart (mode, d->op0);
22157 emit_insn (gen (l, op, vperm));
22159 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
22160 vperm = force_reg (V16QImode, vperm);
22162 h = gen_reg_rtx (mode);
22163 op = gen_lowpart (mode, d->op1);
22164 emit_insn (gen (h, op, vperm));
22166 op = d->target;
22167 if (d->vmode != mode)
22168 op = gen_reg_rtx (mode);
22169 ix86_emit_vec_binop (IOR, mode, op, l, h);
22170 if (op != d->target)
22171 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22173 return true;
22176 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
22177 with two vpshufb insns, vpermq and vpor. We should have already failed
22178 all two or three instruction sequences. */
22180 static bool
22181 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
22183 rtx rperm[2][32], vperm, l, h, hp, op, m128;
22184 unsigned int i, nelt, eltsz;
22186 if (!TARGET_AVX2
22187 || !d->one_operand_p
22188 || (d->vmode != V32QImode && d->vmode != V16HImode))
22189 return false;
22191 if (d->testing_p)
22192 return true;
22194 nelt = d->nelt;
22195 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22197 /* Generate two permutation masks. If the required element is within
22198    the same lane, it is shuffled in.  If the required element is in the
22199 other lane, force a zero by setting bit 7 in the permutation mask.
22200    In the other mask, the elements are non-negative if the element
22201    is requested from the other lane, but it is also moved to the other lane,
22202 so that the result of vpshufb can have the two V2TImode halves
22203 swapped. */
22204 m128 = GEN_INT (-128);
22205 for (i = 0; i < nelt; ++i)
22207 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22208 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22210 for (j = 0; j < eltsz; ++j)
22212 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
22213 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
22217 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
22218 vperm = force_reg (V32QImode, vperm);
22220 h = gen_reg_rtx (V32QImode);
22221 op = gen_lowpart (V32QImode, d->op0);
22222 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
22224   /* Swap the two 128-bit lanes of h into hp.  */
22225 hp = gen_reg_rtx (V4DImode);
22226 op = gen_lowpart (V4DImode, h);
22227 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
22228 const1_rtx));
22230 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
22231 vperm = force_reg (V32QImode, vperm);
22233 l = gen_reg_rtx (V32QImode);
22234 op = gen_lowpart (V32QImode, d->op0);
22235 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
22237 op = d->target;
22238 if (d->vmode != V32QImode)
22239 op = gen_reg_rtx (V32QImode);
22240 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
22241 if (op != d->target)
22242 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22244 return true;
22247 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22248    and extract-odd permutations of two V32QImode or V16HImode operands
22249 with two vpshufb insns, vpor and vpermq. We should have already
22250 failed all two or three instruction sequences. */
22252 static bool
22253 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
22255 rtx rperm[2][32], vperm, l, h, ior, op, m128;
22256 unsigned int i, nelt, eltsz;
22258 if (!TARGET_AVX2
22259 || d->one_operand_p
22260 || (d->vmode != V32QImode && d->vmode != V16HImode))
22261 return false;
22263 for (i = 0; i < d->nelt; ++i)
22264 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
22265 return false;
22267 if (d->testing_p)
22268 return true;
22270 nelt = d->nelt;
22271 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22273 /* Generate two permutation masks. In the first permutation mask
22274 the first quarter will contain indexes for the first half
22275 of the op0, the second quarter will contain bit 7 set, third quarter
22276 will contain indexes for the second half of the op0 and the
22277 last quarter bit 7 set. In the second permutation mask
22278 the first quarter will contain bit 7 set, the second quarter
22279 indexes for the first half of the op1, the third quarter bit 7 set
22280 and last quarter indexes for the second half of the op1.
22281 I.e. the first mask e.g. for V32QImode extract even will be:
22282 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
22283 (all values masked with 0xf except for -128) and second mask
22284 for extract even will be
22285 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
22286 m128 = GEN_INT (-128);
22287 for (i = 0; i < nelt; ++i)
22289 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22290 unsigned which = d->perm[i] >= nelt;
22291 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
22293 for (j = 0; j < eltsz; ++j)
22295 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
22296 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
22300 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
22301 vperm = force_reg (V32QImode, vperm);
22303 l = gen_reg_rtx (V32QImode);
22304 op = gen_lowpart (V32QImode, d->op0);
22305 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
22307 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
22308 vperm = force_reg (V32QImode, vperm);
22310 h = gen_reg_rtx (V32QImode);
22311 op = gen_lowpart (V32QImode, d->op1);
22312 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
22314 ior = gen_reg_rtx (V32QImode);
22315 emit_insn (gen_iorv32qi3 (ior, l, h));
22317 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
22318 op = gen_reg_rtx (V4DImode);
22319 ior = gen_lowpart (V4DImode, ior);
22320 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
22321 const1_rtx, GEN_INT (3)));
22322 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22324 return true;
22327 /* Implement permutation with pslldq + psrldq + por when pshufb is not
22328 available. */
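/* For instance, the V16QImode permutation { 2, 3, ..., 15, 16, 17 } is
   emitted as psrldq $2 on op0, pslldq $14 on op1 and a por; when the two
   selected runs do not exactly fill the vector, an extra pand/pandn with
   a constant mask clears the stray elements (only tried when PANDN).  */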
22329 static bool
22330 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
22332 unsigned i, nelt = d->nelt;
22333 unsigned start1, end1 = -1;
22334 machine_mode vmode = d->vmode, imode;
22335 int start2 = -1;
22336 bool clear_op0, clear_op1;
22337 unsigned inner_size;
22338 rtx op0, op1, dop1;
22339 rtx (*gen_vec_shr) (rtx, rtx, rtx);
22340 rtx (*gen_vec_shl) (rtx, rtx, rtx);
22342 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
22343 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
22344 return false;
22346 start1 = d->perm[0];
22347 for (i = 1; i < nelt; i++)
22349 if (d->perm[i] != d->perm[i-1] + 1
22350 || d->perm[i] == nelt)
22352 if (start2 == -1)
22354 start2 = d->perm[i];
22355 end1 = d->perm[i-1];
22357 else
22358 return false;
22362 clear_op0 = end1 != nelt - 1;
22363 clear_op1 = start2 % nelt != 0;
22364 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
22365 if (!pandn && (clear_op0 || clear_op1))
22366 return false;
22368 if (d->testing_p)
22369 return true;
22371 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
22372 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
22373 imode = GET_MODE_INNER (vmode);
22374 inner_size = GET_MODE_BITSIZE (imode);
22375 op0 = gen_reg_rtx (vmode);
22376 op1 = gen_reg_rtx (vmode);
22378 if (start1)
22379 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
22380 else
22381 emit_move_insn (op0, d->op0);
22383 dop1 = d->op1;
22384 if (d->one_operand_p)
22385 dop1 = d->op0;
22387 int shl_offset = end1 - start1 + 1 - start2 % nelt;
22388 if (shl_offset)
22389 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
22390 else
22391 emit_move_insn (op1, dop1);
22393 /* Clear lower/upper bits for op0/op1. */
22394 if (clear_op0 || clear_op1)
22396 rtx vec[16];
22397 rtx const_vec;
22398 rtx clear;
22399 for (i = 0; i != nelt; i++)
22401 if (i < (end1 - start1 + 1))
22402 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
22403 else
22404 vec[i] = CONST0_RTX (imode);
22406 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
22407 const_vec = validize_mem (force_const_mem (vmode, const_vec));
22408 clear = force_reg (vmode, const_vec);
22410 if (clear_op0)
22411 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
22412 if (clear_op1)
22413 emit_move_insn (op1, gen_rtx_AND (vmode,
22414 gen_rtx_NOT (vmode, clear),
22415 op1));
22418 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
22419 return true;
22422 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22423 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
22424 operands with two "and" and "pack" or two "shift" and "pack" insns.
22425 We should have already failed all two instruction sequences. */
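/* E.g. for a V16QImode extract-even both operands are masked with pand
   against 0x00ff words and combined with packuswb; for extract-odd each
   operand is instead shifted right by 8 with psrlw so that the odd bytes
   land in the low byte of every word before the pack.  */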
22427 static bool
22428 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
22430 rtx op, dop0, dop1, t;
22431 unsigned i, odd, c, s, nelt = d->nelt;
22432 bool end_perm = false;
22433 machine_mode half_mode;
22434 rtx (*gen_and) (rtx, rtx, rtx);
22435 rtx (*gen_pack) (rtx, rtx, rtx);
22436 rtx (*gen_shift) (rtx, rtx, rtx);
22438 if (d->one_operand_p)
22439 return false;
22441 switch (d->vmode)
22443 case E_V4HImode:
22444 /* Required for "pack". */
22445 if (!TARGET_SSE4_1)
22446 return false;
22447 c = 0xffff;
22448 s = 16;
22449 half_mode = V2SImode;
22450 gen_and = gen_andv2si3;
22451 gen_pack = gen_mmx_packusdw;
22452 gen_shift = gen_lshrv2si3;
22453 break;
22454 case E_V8HImode:
22455 /* Required for "pack". */
22456 if (!TARGET_SSE4_1)
22457 return false;
22458 c = 0xffff;
22459 s = 16;
22460 half_mode = V4SImode;
22461 gen_and = gen_andv4si3;
22462 gen_pack = gen_sse4_1_packusdw;
22463 gen_shift = gen_lshrv4si3;
22464 break;
22465 case E_V8QImode:
22466 /* No check as all instructions are SSE2. */
22467 c = 0xff;
22468 s = 8;
22469 half_mode = V4HImode;
22470 gen_and = gen_andv4hi3;
22471 gen_pack = gen_mmx_packuswb;
22472 gen_shift = gen_lshrv4hi3;
22473 break;
22474 case E_V16QImode:
22475 /* No check as all instructions are SSE2. */
22476 c = 0xff;
22477 s = 8;
22478 half_mode = V8HImode;
22479 gen_and = gen_andv8hi3;
22480 gen_pack = gen_sse2_packuswb;
22481 gen_shift = gen_lshrv8hi3;
22482 break;
22483 case E_V16HImode:
22484 if (!TARGET_AVX2)
22485 return false;
22486 c = 0xffff;
22487 s = 16;
22488 half_mode = V8SImode;
22489 gen_and = gen_andv8si3;
22490 gen_pack = gen_avx2_packusdw;
22491 gen_shift = gen_lshrv8si3;
22492 end_perm = true;
22493 break;
22494 case E_V32QImode:
22495 if (!TARGET_AVX2)
22496 return false;
22497 c = 0xff;
22498 s = 8;
22499 half_mode = V16HImode;
22500 gen_and = gen_andv16hi3;
22501 gen_pack = gen_avx2_packuswb;
22502 gen_shift = gen_lshrv16hi3;
22503 end_perm = true;
22504 break;
22505 default:
22506 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
22507 are more profitable than general shuffles. */
22508 return false;
22511 /* Check that permutation is even or odd. */
22512 odd = d->perm[0];
22513 if (odd > 1)
22514 return false;
22516 for (i = 1; i < nelt; ++i)
22517 if (d->perm[i] != 2 * i + odd)
22518 return false;
22520 if (d->testing_p)
22521 return true;
22523 dop0 = gen_reg_rtx (half_mode);
22524 dop1 = gen_reg_rtx (half_mode);
22525 if (odd == 0)
22527 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
22528 t = force_reg (half_mode, t);
22529 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
22530 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
22532 else
22534 emit_insn (gen_shift (dop0,
22535 gen_lowpart (half_mode, d->op0),
22536 GEN_INT (s)));
22537 emit_insn (gen_shift (dop1,
22538 gen_lowpart (half_mode, d->op1),
22539 GEN_INT (s)));
22541   /* In the AVX2 256-bit case we need to permute the pack result.  */
22542 if (TARGET_AVX2 && end_perm)
22544 op = gen_reg_rtx (d->vmode);
22545 t = gen_reg_rtx (V4DImode);
22546 emit_insn (gen_pack (op, dop0, dop1));
22547 emit_insn (gen_avx2_permv4di_1 (t,
22548 gen_lowpart (V4DImode, op),
22549 const0_rtx,
22550 const2_rtx,
22551 const1_rtx,
22552 GEN_INT (3)));
22553 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
22555 else
22556 emit_insn (gen_pack (d->target, dop0, dop1));
22558 return true;
22561 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22562 and extract-odd permutations of two V64QI operands
22563 with two "shifts", two "truncs" and one "concat" insns for "odd"
22564    and two "truncs" and one "concat" insn for "even".
22565    We should have already failed all two instruction sequences.  */
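/* I.e. for the odd permutation both operands are viewed as V32HImode and
   shifted right by 8 with vpsrlw, each result is then narrowed to
   V32QImode with vpmovwb and the two halves are concatenated into the
   V64QImode result; the even permutation needs only the truncations and
   the concat.  */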
22567 static bool
22568 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
22570 rtx t1, t2, t3, t4;
22571 unsigned i, odd, nelt = d->nelt;
22573 if (!TARGET_AVX512BW
22574 || d->one_operand_p
22575 || d->vmode != V64QImode)
22576 return false;
22578 /* Check that permutation is even or odd. */
22579 odd = d->perm[0];
22580 if (odd > 1)
22581 return false;
22583 for (i = 1; i < nelt; ++i)
22584 if (d->perm[i] != 2 * i + odd)
22585 return false;
22587 if (d->testing_p)
22588 return true;
22591 if (odd)
22593 t1 = gen_reg_rtx (V32HImode);
22594 t2 = gen_reg_rtx (V32HImode);
22595 emit_insn (gen_lshrv32hi3 (t1,
22596 gen_lowpart (V32HImode, d->op0),
22597 GEN_INT (8)));
22598 emit_insn (gen_lshrv32hi3 (t2,
22599 gen_lowpart (V32HImode, d->op1),
22600 GEN_INT (8)));
22602 else
22604 t1 = gen_lowpart (V32HImode, d->op0);
22605 t2 = gen_lowpart (V32HImode, d->op1);
22608 t3 = gen_reg_rtx (V32QImode);
22609 t4 = gen_reg_rtx (V32QImode);
22610 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
22611 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
22612 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
22614 return true;
22617 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22618 and extract-odd permutations. */
22620 static bool
22621 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22623 rtx t1, t2, t3, t4, t5;
22625 switch (d->vmode)
22627 case E_V4DFmode:
22628 if (d->testing_p)
22629 break;
22630 t1 = gen_reg_rtx (V4DFmode);
22631 t2 = gen_reg_rtx (V4DFmode);
22633 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22634 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22635 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22637 /* Now an unpck[lh]pd will produce the result required. */
22638 if (odd)
22639 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22640 else
22641 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22642 emit_insn (t3);
22643 break;
22645 case E_V8SFmode:
22647 int mask = odd ? 0xdd : 0x88;
22649 if (d->testing_p)
22650 break;
22651 t1 = gen_reg_rtx (V8SFmode);
22652 t2 = gen_reg_rtx (V8SFmode);
22653 t3 = gen_reg_rtx (V8SFmode);
22655 /* Shuffle within the 128-bit lanes to produce:
22656 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22657 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22658 GEN_INT (mask)));
22660 /* Shuffle the lanes around to produce:
22661 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22662 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22663 GEN_INT (0x3)));
22665 /* Shuffle within the 128-bit lanes to produce:
22666 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22667 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22669 /* Shuffle within the 128-bit lanes to produce:
22670 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22671 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22673 /* Shuffle the lanes around to produce:
22674 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22675 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22676 GEN_INT (0x20)));
22678 break;
22680 case E_V2DFmode:
22681 case E_V4SFmode:
22682 case E_V2DImode:
22683 case E_V2SImode:
22684 case E_V4SImode:
22685 case E_V2HImode:
22686 /* These are always directly implementable by expand_vec_perm_1. */
22687 gcc_unreachable ();
22689 case E_V2SFmode:
22690 gcc_assert (TARGET_MMX_WITH_SSE);
22691 /* We have no suitable instructions. */
22692 if (d->testing_p)
22693 return false;
22694 break;
22696 case E_V4QImode:
22697 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22698 return expand_vec_perm_pshufb2 (d);
22699 else
22701 if (d->testing_p)
22702 break;
22703 /* We need 2*log2(N)-1 operations to achieve odd/even
22704 with interleave. */
22705 t1 = gen_reg_rtx (V4QImode);
22706 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22707 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22708 if (odd)
22709 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22710 else
22711 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22712 emit_insn (t2);
22714 break;
22716 case E_V4HImode:
22717 if (TARGET_SSE4_1)
22718 return expand_vec_perm_even_odd_pack (d);
22719 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22720 return expand_vec_perm_pshufb2 (d);
22721 else
22723 if (d->testing_p)
22724 break;
22725 /* We need 2*log2(N)-1 operations to achieve odd/even
22726 with interleave. */
22727 t1 = gen_reg_rtx (V4HImode);
22728 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22729 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22730 if (odd)
22731 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22732 else
22733 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22734 emit_insn (t2);
22736 break;
22738 case E_V8HImode:
22739 if (TARGET_SSE4_1)
22740 return expand_vec_perm_even_odd_pack (d);
22741 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22742 return expand_vec_perm_pshufb2 (d);
22743 else
22745 if (d->testing_p)
22746 break;
22747 /* We need 2*log2(N)-1 operations to achieve odd/even
22748 with interleave. */
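	  /* For V8HImode these are the five punpck[lh]wd instructions
	     emitted below (2 * log2 (8) - 1 = 5).  */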
22749 t1 = gen_reg_rtx (V8HImode);
22750 t2 = gen_reg_rtx (V8HImode);
22751 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22752 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22753 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22754 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22755 if (odd)
22756 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22757 else
22758 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22759 emit_insn (t3);
22761 break;
22763 case E_V8QImode:
22764 case E_V16QImode:
22765 return expand_vec_perm_even_odd_pack (d);
22767 case E_V16HImode:
22768 case E_V32QImode:
22769 return expand_vec_perm_even_odd_pack (d);
22771 case E_V64QImode:
22772 return expand_vec_perm_even_odd_trunc (d);
22774 case E_V4DImode:
22775 if (!TARGET_AVX2)
22777 struct expand_vec_perm_d d_copy = *d;
22778 d_copy.vmode = V4DFmode;
22779 if (d->testing_p)
22780 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22781 else
22782 d_copy.target = gen_reg_rtx (V4DFmode);
22783 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22784 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22785 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22787 if (!d->testing_p)
22788 emit_move_insn (d->target,
22789 gen_lowpart (V4DImode, d_copy.target));
22790 return true;
22792 return false;
22795 if (d->testing_p)
22796 break;
22798 t1 = gen_reg_rtx (V4DImode);
22799 t2 = gen_reg_rtx (V4DImode);
22801 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22802 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22803 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22805 /* Now an vpunpck[lh]qdq will produce the result required. */
22806 if (odd)
22807 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22808 else
22809 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22810 emit_insn (t3);
22811 break;
22813 case E_V8SImode:
22814 if (!TARGET_AVX2)
22816 struct expand_vec_perm_d d_copy = *d;
22817 d_copy.vmode = V8SFmode;
22818 if (d->testing_p)
22819 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22820 else
22821 d_copy.target = gen_reg_rtx (V8SFmode);
22822 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22823 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22824 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22826 if (!d->testing_p)
22827 emit_move_insn (d->target,
22828 gen_lowpart (V8SImode, d_copy.target));
22829 return true;
22831 return false;
22834 if (d->testing_p)
22835 break;
22837 t1 = gen_reg_rtx (V8SImode);
22838 t2 = gen_reg_rtx (V8SImode);
22839 t3 = gen_reg_rtx (V4DImode);
22840 t4 = gen_reg_rtx (V4DImode);
22841 t5 = gen_reg_rtx (V4DImode);
22843 /* Shuffle the lanes around into
22844 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22845 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22846 gen_lowpart (V4DImode, d->op1),
22847 GEN_INT (0x20)));
22848 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22849 gen_lowpart (V4DImode, d->op1),
22850 GEN_INT (0x31)));
22852 /* Swap the 2nd and 3rd position in each lane into
22853 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22854 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22855 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22856 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22857 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22859 /* Now an vpunpck[lh]qdq will produce
22860 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22861 if (odd)
22862 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22863 gen_lowpart (V4DImode, t2));
22864 else
22865 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22866 gen_lowpart (V4DImode, t2));
22867 emit_insn (t3);
22868 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22869 break;
22871 default:
22872 gcc_unreachable ();
22875 return true;
22878 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22879 extract-even and extract-odd permutations. */
22881 static bool
22882 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22884 unsigned i, odd, nelt = d->nelt;
22886 odd = d->perm[0];
22887 if (odd != 0 && odd != 1)
22888 return false;
22890 for (i = 1; i < nelt; ++i)
22891 if (d->perm[i] != 2 * i + odd)
22892 return false;
22894 if (d->vmode == E_V32HImode
22895 && d->testing_p
22896 && !TARGET_AVX512BW)
22897 return false;
22899 return expand_vec_perm_even_odd_1 (d, odd);
22902 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22903 permutations. We assume that expand_vec_perm_1 has already failed. */
22905 static bool
22906 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22908 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22909 machine_mode vmode = d->vmode;
22910 rtx (*gen) (rtx, rtx, rtx);
22911 unsigned char perm2[4];
22912 rtx op0 = d->op0, dest;
22913 bool ok;
22915 switch (vmode)
22917 case E_V4DFmode:
22918 case E_V8SFmode:
22919 /* These are special-cased in sse.md so that we can optionally
22920 use the vbroadcast instruction. They expand to two insns
22921 if the input happens to be in a register. */
22922 gcc_unreachable ();
22924 case E_V2DFmode:
22925 case E_V2SFmode:
22926 case E_V4SFmode:
22927 case E_V2DImode:
22928 case E_V2SImode:
22929 case E_V4SImode:
22930 case E_V2HImode:
22931 case E_V4HImode:
22932 /* These are always implementable using standard shuffle patterns. */
22933 gcc_unreachable ();
22935 case E_V4QImode:
22936 /* This can be implemented via interleave and pshuflw. */
22937 if (d->testing_p)
22938 return true;
22940 if (elt >= nelt2)
22942 gen = gen_mmx_punpckhbw_low;
22943 elt -= nelt2;
22945 else
22946 gen = gen_mmx_punpcklbw_low;
22948 dest = gen_reg_rtx (vmode);
22949 emit_insn (gen (dest, op0, op0));
22950 vmode = get_mode_wider_vector (vmode);
22951 op0 = gen_lowpart (vmode, dest);
22953 memset (perm2, elt, 2);
22954 dest = gen_reg_rtx (vmode);
22955 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22956 gcc_assert (ok);
22958 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22959 return true;
22961 case E_V8QImode:
22962 /* This can be implemented via interleave. We save one insn by
22963 stopping once we have promoted to V2SImode and then use pshufd. */
22964 if (d->testing_p)
22965 return true;
22968 if (elt >= nelt2)
22970 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22971 : gen_mmx_punpckhwd;
22972 elt -= nelt2;
22974 else
22975 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22976 : gen_mmx_punpcklwd;
22977 nelt2 /= 2;
22979 dest = gen_reg_rtx (vmode);
22980 emit_insn (gen (dest, op0, op0));
22981 vmode = get_mode_wider_vector (vmode);
22982 op0 = gen_lowpart (vmode, dest);
22984 while (vmode != V2SImode);
22986 memset (perm2, elt, 2);
22987 dest = gen_reg_rtx (vmode);
22988 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22989 gcc_assert (ok);
22991 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22992 return true;
22994 case E_V8HImode:
22995 case E_V16QImode:
22996 /* These can be implemented via interleave. We save one insn by
22997 stopping once we have promoted to V4SImode and then use pshufd. */
22998 if (d->testing_p)
22999 return true;
23002 if (elt >= nelt2)
23004 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
23005 : gen_vec_interleave_highv8hi;
23006 elt -= nelt2;
23008 else
23009 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
23010 : gen_vec_interleave_lowv8hi;
23011 nelt2 /= 2;
23013 dest = gen_reg_rtx (vmode);
23014 emit_insn (gen (dest, op0, op0));
23015 vmode = get_mode_wider_vector (vmode);
23016 op0 = gen_lowpart (vmode, dest);
23018 while (vmode != V4SImode);
23020 memset (perm2, elt, 4);
23021 dest = gen_reg_rtx (vmode);
23022 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
23023 gcc_assert (ok);
23025 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
23026 return true;
23028 case E_V8HFmode:
23029 case E_V8BFmode:
23030 /* This can be implemented via interleave and pshufd. */
23031 if (d->testing_p)
23032 return true;
23034 rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
23035 if (elt >= nelt2)
23037 gen_interleave = gen_vec_interleave_high;
23038 elt -= nelt2;
23040 else
23041 gen_interleave = gen_vec_interleave_low;
23042 nelt2 /= 2;
23044 dest = gen_reg_rtx (vmode);
23045 emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
23047 vmode = V4SImode;
23048 op0 = gen_lowpart (vmode, dest);
23050 memset (perm2, elt, 4);
23051 dest = gen_reg_rtx (vmode);
23052 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
23053 gcc_assert (ok);
23055 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
23056 return true;
23058 case E_V32QImode:
23059 case E_V16HImode:
23060 case E_V8SImode:
23061 case E_V4DImode:
23062 /* For AVX2 broadcasts of the first element vpbroadcast* or
23063 vpermq should be used by expand_vec_perm_1. */
23064 gcc_assert (!TARGET_AVX2 || d->perm[0]);
23065 return false;
23067 case E_V64QImode:
23068 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
23069 return false;
23071 case E_V32HImode:
23072 gcc_assert (!TARGET_AVX512BW);
23073 return false;
23075 default:
23076 gcc_unreachable ();
23080 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
23081 broadcast permutations. */
23083 static bool
23084 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
23086 unsigned i, elt, nelt = d->nelt;
23088 if (!d->one_operand_p)
23089 return false;
23091 elt = d->perm[0];
23092 for (i = 1; i < nelt; ++i)
23093 if (d->perm[i] != elt)
23094 return false;
23096 return expand_vec_perm_broadcast_1 (d);
23099 /* Implement arbitrary permutations of two V64QImode operands
23100 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
23101 static bool
23102 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
23104 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
23105 return false;
23107 if (d->testing_p)
23108 return true;
23110 struct expand_vec_perm_d ds[2];
23111 rtx rperm[128], vperm, target0, target1;
23112 unsigned int i, nelt;
23113 machine_mode vmode;
23115 nelt = d->nelt;
23116 vmode = V64QImode;
23118 for (i = 0; i < 2; i++)
23120 ds[i] = *d;
23121 ds[i].vmode = V32HImode;
23122 ds[i].nelt = 32;
23123 ds[i].target = gen_reg_rtx (V32HImode);
23124 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
23125 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
23128 /* Prepare permutations such that the first one takes care of
23129 putting the even bytes into the right positions or one higher
23130 positions (ds[0]) and the second one takes care of
23131 putting the odd bytes into the right positions or one below
23132 (ds[1]). */
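  /* In other words, ds[0] is a V32HImode permutation that moves the word
     containing each even destination byte into word i / 2 (ds[1] does the
     same for the odd bytes); the two vpshufb masks then select the low or
     high byte of that word via the in-lane index (i & 14) + (d->perm[i] & 1)
     and zero every other byte position (the -1 entries) before the final
     vpor.  */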
23134 for (i = 0; i < nelt; i++)
23136 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
23137 if (i & 1)
23139 rperm[i] = constm1_rtx;
23140 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
23142 else
23144 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
23145 rperm[i + 64] = constm1_rtx;
23149 bool ok = expand_vec_perm_1 (&ds[0]);
23150 gcc_assert (ok);
23151 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
23153 ok = expand_vec_perm_1 (&ds[1]);
23154 gcc_assert (ok);
23155 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
23157 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
23158 vperm = force_reg (vmode, vperm);
23159 target0 = gen_reg_rtx (V64QImode);
23160 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
23162 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
23163 vperm = force_reg (vmode, vperm);
23164 target1 = gen_reg_rtx (V64QImode);
23165 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
23167 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
23168 return true;
23171 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
23172 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
23173 all the shorter instruction sequences. */
23175 static bool
23176 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
23178 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
23179 unsigned int i, nelt, eltsz;
23180 bool used[4];
23182 if (!TARGET_AVX2
23183 || d->one_operand_p
23184 || (d->vmode != V32QImode && d->vmode != V16HImode))
23185 return false;
23187 if (d->testing_p)
23188 return true;
23190 nelt = d->nelt;
23191 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
23193 /* Generate 4 permutation masks. If the required element is within
23194    the same lane, it is shuffled in.  If the required element is in the
23195 other lane, force a zero by setting bit 7 in the permutation mask.
23196    In the other mask, the elements are non-negative if the element
23197    is requested from the other lane, but it is also moved to the other lane,
23198 so that the result of vpshufb can have the two V2TImode halves
23199 swapped. */
23200 m128 = GEN_INT (-128);
23201 for (i = 0; i < 32; ++i)
23203 rperm[0][i] = m128;
23204 rperm[1][i] = m128;
23205 rperm[2][i] = m128;
23206 rperm[3][i] = m128;
23208 used[0] = false;
23209 used[1] = false;
23210 used[2] = false;
23211 used[3] = false;
23212 for (i = 0; i < nelt; ++i)
23214 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
23215 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
23216 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
23218 for (j = 0; j < eltsz; ++j)
23219 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
23220 used[which] = true;
23223 for (i = 0; i < 2; ++i)
23225 if (!used[2 * i + 1])
23227 h[i] = NULL_RTX;
23228 continue;
23230 vperm = gen_rtx_CONST_VECTOR (V32QImode,
23231 gen_rtvec_v (32, rperm[2 * i + 1]));
23232 vperm = force_reg (V32QImode, vperm);
23233 h[i] = gen_reg_rtx (V32QImode);
23234 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
23235 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
23238   /* Swap the 128-bit lanes of h[X].  */
23239 for (i = 0; i < 2; ++i)
23241 if (h[i] == NULL_RTX)
23242 continue;
23243 op = gen_reg_rtx (V4DImode);
23244 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
23245 const2_rtx, GEN_INT (3), const0_rtx,
23246 const1_rtx));
23247 h[i] = gen_lowpart (V32QImode, op);
23250 for (i = 0; i < 2; ++i)
23252 if (!used[2 * i])
23254 l[i] = NULL_RTX;
23255 continue;
23257 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
23258 vperm = force_reg (V32QImode, vperm);
23259 l[i] = gen_reg_rtx (V32QImode);
23260 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
23261 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
23264 for (i = 0; i < 2; ++i)
23266 if (h[i] && l[i])
23268 op = gen_reg_rtx (V32QImode);
23269 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
23270 l[i] = op;
23272 else if (h[i])
23273 l[i] = h[i];
23276 gcc_assert (l[0] && l[1]);
23277 op = d->target;
23278 if (d->vmode != V32QImode)
23279 op = gen_reg_rtx (V32QImode);
23280 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
23281 if (op != d->target)
23282 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
23283 return true;
23286 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
23287 taken care of, perform the expansion in D and return true on success. */
23289 static bool
23290 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
23292 /* Try a single instruction expansion. */
23293 if (expand_vec_perm_1 (d))
23294 return true;
23296 /* Try sequences of two instructions. */
23298 if (expand_vec_perm_pshuflw_pshufhw (d))
23299 return true;
23301 if (expand_vec_perm_palignr (d, false))
23302 return true;
23304 if (expand_vec_perm_interleave2 (d))
23305 return true;
23307 if (expand_vec_perm_broadcast (d))
23308 return true;
23310 if (expand_vec_perm_vpermq_perm_1 (d))
23311 return true;
23313 if (expand_vec_perm_vperm2f128 (d))
23314 return true;
23316 if (expand_vec_perm_pblendv (d))
23317 return true;
23319 if (expand_vec_perm_2perm_interleave (d, true))
23320 return true;
23322 if (expand_vec_perm_2perm_pblendv (d, true))
23323 return true;
23325 if (expand_vec_perm_shufps_shufps (d))
23326 return true;
23328 /* Try sequences of three instructions. */
23330 if (expand_vec_perm_even_odd_pack (d))
23331 return true;
23333 if (expand_vec_perm_2vperm2f128_vshuf (d))
23334 return true;
23336 if (expand_vec_perm_pshufb2 (d))
23337 return true;
23339 if (expand_vec_perm_pslldq_psrldq_por (d, false))
23340 return true;
23342 if (expand_vec_perm_interleave3 (d))
23343 return true;
23345 if (expand_vec_perm_vperm2f128_vblend (d))
23346 return true;
23348 if (expand_vec_perm_2perm_interleave (d, false))
23349 return true;
23351 if (expand_vec_perm_2perm_pblendv (d, false))
23352 return true;
23354 /* Try sequences of four instructions. */
23356 if (expand_vec_perm_even_odd_trunc (d))
23357 return true;
23358 if (expand_vec_perm_vpshufb2_vpermq (d))
23359 return true;
23361 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
23362 return true;
23364 if (expand_vec_perm_vpermt2_vpshub2 (d))
23365 return true;
23367 /* ??? Look for narrow permutations whose element orderings would
23368 allow the promotion to a wider mode. */
23370 /* ??? Look for sequences of interleave or a wider permute that place
23371 the data into the correct lanes for a half-vector shuffle like
23372 pshuf[lh]w or vpermilps. */
23374 /* ??? Look for sequences of interleave that produce the desired results.
23375 The combinatorics of punpck[lh] get pretty ugly... */
23377 if (expand_vec_perm_even_odd (d))
23378 return true;
23380 /* Generate four or five instructions. */
23381 if (expand_vec_perm_pslldq_psrldq_por (d, true))
23382 return true;
23384 /* Even longer sequences. */
23385 if (expand_vec_perm_vpshufb4_vpermq2 (d))
23386 return true;
23388 /* See if we can get the same permutation in different vector integer
23389 mode. */
23390 struct expand_vec_perm_d nd;
23391 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
23393 if (!d->testing_p)
23394 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
23395 return true;
23398 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
23399 if (expand_vec_perm2_vperm2f128_vblend (d))
23400 return true;
23402 return false;
23405 /* If a permutation only uses one operand, make it clear. Returns true
23406 if the permutation references both operands. */
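/* E.g. the two-operand selector { 4, 5, 6, 7 } on V4SImode only uses op1,
   so it is folded to the single-operand selector { 0, 1, 2, 3 } with op0
   replaced by op1, and false is returned.  */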
23408 static bool
23409 canonicalize_perm (struct expand_vec_perm_d *d)
23411 int i, which, nelt = d->nelt;
23413 for (i = which = 0; i < nelt; ++i)
23414 which |= (d->perm[i] < nelt ? 1 : 2);
23416 d->one_operand_p = true;
23417 switch (which)
23419 default:
23420 gcc_unreachable();
23422 case 3:
23423 if (!rtx_equal_p (d->op0, d->op1))
23425 d->one_operand_p = false;
23426 break;
23428 /* The elements of PERM do not suggest that only the first operand
23429 is used, but both operands are identical. Allow easier matching
23430 of the permutation by folding the permutation into the single
23431 input vector. */
23432 /* FALLTHRU */
23434 case 2:
23435 for (i = 0; i < nelt; ++i)
23436 d->perm[i] &= nelt - 1;
23437 d->op0 = d->op1;
23438 break;
23440 case 1:
23441 d->op1 = d->op0;
23442 break;
23445 return (which == 3);
23448 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
23450 bool
23451 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
23452 rtx target, rtx op0, rtx op1,
23453 const vec_perm_indices &sel)
23455 if (vmode != op_mode)
23456 return false;
23458 struct expand_vec_perm_d d;
23459 unsigned char perm[MAX_VECT_LEN];
23460 unsigned int i, nelt, which;
23461 bool two_args;
23463 if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512)
23464 return false;
23466 /* For HF mode vector, convert it to HI using subreg. */
23467 if (GET_MODE_INNER (vmode) == HFmode)
23469 machine_mode orig_mode = vmode;
23470 vmode = mode_for_vector (HImode,
23471 GET_MODE_NUNITS (vmode)).require ();
23472 if (target)
23473 target = lowpart_subreg (vmode, target, orig_mode);
23474 if (op0)
23475 op0 = lowpart_subreg (vmode, op0, orig_mode);
23476 if (op1)
23477 op1 = lowpart_subreg (vmode, op1, orig_mode);
23480 d.target = target;
23481 d.op0 = op0;
23482 d.op1 = op1;
23484 d.vmode = vmode;
23485 gcc_assert (VECTOR_MODE_P (d.vmode));
23486 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23487 d.testing_p = !target;
23489 gcc_assert (sel.length () == nelt);
23490 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
23492 /* Given sufficient ISA support we can just return true here
23493 for selected vector modes. */
23494 switch (d.vmode)
23496 case E_V16SFmode:
23497 case E_V16SImode:
23498 case E_V8DImode:
23499 case E_V8DFmode:
23500 if (!TARGET_AVX512F)
23501 return false;
23502 /* All implementable with a single vperm[it]2 insn. */
23503 if (d.testing_p)
23504 return true;
23505 break;
23506 case E_V32HImode:
23507 if (!TARGET_AVX512F)
23508 return false;
23509 if (d.testing_p && TARGET_AVX512BW)
23510 /* All implementable with a single vperm[it]2 insn. */
23511 return true;
23512 break;
23513 case E_V64QImode:
23514 if (!TARGET_AVX512F)
23515 return false;
23516 if (d.testing_p && TARGET_AVX512BW)
23517 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
23518 return true;
23519 break;
23520 case E_V8SImode:
23521 case E_V8SFmode:
23522 case E_V4DFmode:
23523 case E_V4DImode:
23524 if (!TARGET_AVX)
23525 return false;
23526 if (d.testing_p && TARGET_AVX512VL)
23527 /* All implementable with a single vperm[it]2 insn. */
23528 return true;
23529 break;
23530 case E_V16HImode:
23531 if (!TARGET_SSE2)
23532 return false;
23533 if (d.testing_p && TARGET_AVX2)
23534 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23535 return true;
23536 break;
23537 case E_V32QImode:
23538 if (!TARGET_SSE2)
23539 return false;
23540 if (d.testing_p && TARGET_AVX2)
23541 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23542 return true;
23543 break;
23544 case E_V8HImode:
23545 case E_V16QImode:
23546 if (!TARGET_SSE2)
23547 return false;
23548 /* Fall through. */
23549 case E_V4SImode:
23550 case E_V4SFmode:
23551 if (!TARGET_SSE)
23552 return false;
23553 /* All implementable with a single vpperm insn. */
23554 if (d.testing_p && TARGET_XOP)
23555 return true;
23556 /* All implementable with 2 pshufb + 1 ior. */
23557 if (d.testing_p && TARGET_SSSE3)
23558 return true;
23559 break;
23560 case E_V2SFmode:
23561 case E_V2SImode:
23562 case E_V4HImode:
23563 case E_V8QImode:
23564 if (!TARGET_MMX_WITH_SSE)
23565 return false;
23566 break;
23567 case E_V2HImode:
23568 if (!TARGET_SSE2)
23569 return false;
23570 /* All implementable with *punpckwd. */
23571 if (d.testing_p)
23572 return true;
23573 break;
23574 case E_V4QImode:
23575 if (!TARGET_SSE2)
23576 return false;
23577 break;
23578 case E_V2DImode:
23579 case E_V2DFmode:
23580 if (!TARGET_SSE)
23581 return false;
23582 /* All implementable with shufpd or unpck[lh]pd. */
23583 if (d.testing_p)
23584 return true;
23585 break;
23586 default:
23587 return false;
23590 for (i = which = 0; i < nelt; ++i)
23592 unsigned char e = sel[i];
23593 gcc_assert (e < 2 * nelt);
23594 d.perm[i] = e;
23595 perm[i] = e;
23596 which |= (e < nelt ? 1 : 2);
23599 if (d.testing_p)
23601 /* For all elements from second vector, fold the elements to first. */
23602 if (which == 2)
23603 for (i = 0; i < nelt; ++i)
23604 d.perm[i] -= nelt;
23606 /* Check whether the mask can be applied to the vector type. */
23607 d.one_operand_p = (which != 3);
23609 /* Implementable with shufps, pshufd or pshuflw. */
23610 if (d.one_operand_p
23611 && (d.vmode == V4SFmode || d.vmode == V2SFmode
23612 || d.vmode == V4SImode || d.vmode == V2SImode
23613 || d.vmode == V4HImode || d.vmode == V2HImode))
23614 return true;
23616 /* Otherwise we have to go through the motions and see if we can
23617 figure out how to generate the requested permutation. */
23618 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23619 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23620 if (!d.one_operand_p)
23621 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23623 start_sequence ();
23624 bool ret = ix86_expand_vec_perm_const_1 (&d);
23625 end_sequence ();
23627 return ret;
23630 two_args = canonicalize_perm (&d);
23632 /* If one of the operands is a zero vector, try to match pmovzx. */
23633 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23635 struct expand_vec_perm_d dzero = d;
23636 if (d.op0 == CONST0_RTX (vmode))
23638 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23639 std::swap (dzero.op0, dzero.op1);
23640 for (i = 0; i < nelt; ++i)
23641 dzero.perm[i] ^= nelt;
23643 else
23644 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23646 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23647 dzero.perm, nelt, dzero.testing_p))
23648 return true;
23651 /* Force operands into registers. */
23652 rtx nop0 = force_reg (vmode, d.op0);
23653 if (d.op0 == d.op1)
23654 d.op1 = nop0;
23655 d.op0 = nop0;
23656 d.op1 = force_reg (vmode, d.op1);
23658 if (ix86_expand_vec_perm_const_1 (&d))
23659 return true;
23661 /* If the selector says both arguments are needed, but the operands are the
23662 same, the above tried to expand with one_operand_p and flattened selector.
23663 If that didn't work, retry without one_operand_p; we succeeded with that
23664 during testing. */
23665 if (two_args && d.one_operand_p)
23667 d.one_operand_p = false;
23668 memcpy (d.perm, perm, sizeof (perm));
23669 return ix86_expand_vec_perm_const_1 (&d);
23672 return false;
23675 void
23676 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23678 struct expand_vec_perm_d d;
23679 unsigned i, nelt;
23681 d.target = targ;
23682 d.op0 = op0;
23683 d.op1 = op1;
23684 d.vmode = GET_MODE (targ);
23685 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23686 d.one_operand_p = false;
23687 d.testing_p = false;
23689 for (i = 0; i < nelt; ++i)
23690 d.perm[i] = i * 2 + odd;
23692 /* We'll either be able to implement the permutation directly... */
23693 if (expand_vec_perm_1 (&d))
23694 return;
23696 /* ... or we use the special-case patterns. */
23697 expand_vec_perm_even_odd_1 (&d, odd);
23700 static void
23701 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23703 struct expand_vec_perm_d d;
23704 unsigned i, nelt, base;
23705 bool ok;
23707 d.target = targ;
23708 d.op0 = op0;
23709 d.op1 = op1;
23710 d.vmode = GET_MODE (targ);
23711 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23712 d.one_operand_p = false;
23713 d.testing_p = false;
23715 base = high_p ? nelt / 2 : 0;
23716 for (i = 0; i < nelt / 2; ++i)
23718 d.perm[i * 2] = i + base;
23719 d.perm[i * 2 + 1] = i + base + nelt;
23722 /* Note that for AVX this isn't one instruction. */
23723 ok = ix86_expand_vec_perm_const_1 (&d);
23724 gcc_assert (ok);
23727 /* Expand a vector operation shift by constant for a V*QImode in terms of the
23728    same operation on V*HImode.  Return true on success.  */
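/* E.g. an arithmetic right shift of a V16QImode vector by 3 becomes a
   psraw $3 on the value viewed as V8HImode, a pand with a vector of 0x1f
   dropping the bits shifted in from the neighbouring byte, and a
   pxor/psubb pair with a vector of 0x10 sign-extending the 5-bit
   result.  */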
23729 static bool
23730 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23731 rtx dest, rtx op1, rtx op2)
23733 machine_mode qimode, himode;
23734 HOST_WIDE_INT and_constant, xor_constant;
23735 HOST_WIDE_INT shift_amount;
23736 rtx vec_const_and, vec_const_xor;
23737 rtx tmp, op1_subreg;
23738 rtx (*gen_shift) (rtx, rtx, rtx);
23739 rtx (*gen_and) (rtx, rtx, rtx);
23740 rtx (*gen_xor) (rtx, rtx, rtx);
23741 rtx (*gen_sub) (rtx, rtx, rtx);
23743 /* Only optimize shift by constant. */
23744 if (!CONST_INT_P (op2))
23745 return false;
23747 qimode = GET_MODE (dest);
23748 shift_amount = INTVAL (op2);
23749   /* Do nothing when the shift amount is greater than or equal to 8.  */
23750 if (shift_amount > 7)
23751 return false;
23753 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23754 /* Record sign bit. */
23755 xor_constant = 1 << (8 - shift_amount - 1);
23757   /* Zero the upper/lower bits that are shifted in from the adjacent element.  */
23758 and_constant
23759 = (code == ASHIFT ? 256 - (1 << shift_amount)
23760 : (1 << (8 - shift_amount)) - 1);
23762 switch (qimode)
23764 case V16QImode:
23765 himode = V8HImode;
23766 gen_shift =
23767 ((code == ASHIFT)
23768 ? gen_ashlv8hi3
23769 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23770 gen_and = gen_andv16qi3;
23771 gen_xor = gen_xorv16qi3;
23772 gen_sub = gen_subv16qi3;
23773 break;
23774 case V32QImode:
23775 himode = V16HImode;
23776 gen_shift =
23777 ((code == ASHIFT)
23778 ? gen_ashlv16hi3
23779 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23780 gen_and = gen_andv32qi3;
23781 gen_xor = gen_xorv32qi3;
23782 gen_sub = gen_subv32qi3;
23783 break;
23784 case V64QImode:
23785 himode = V32HImode;
23786 gen_shift =
23787 ((code == ASHIFT)
23788 ? gen_ashlv32hi3
23789 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23790 gen_and = gen_andv64qi3;
23791 gen_xor = gen_xorv64qi3;
23792 gen_sub = gen_subv64qi3;
23793 break;
23794 default:
23795 gcc_unreachable ();
23798 tmp = gen_reg_rtx (himode);
23799 vec_const_and = gen_reg_rtx (qimode);
23800 op1_subreg = lowpart_subreg (himode, op1, qimode);
23802 /* For ASHIFT and LSHIFTRT, perform operation like
23803 vpsllw/vpsrlw $shift_amount, %op1, %dest.
23804 vpand %vec_const_and, %dest. */
23805 emit_insn (gen_shift (tmp, op1_subreg, op2));
23806 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23807 emit_move_insn (vec_const_and,
23808 ix86_build_const_vector (qimode, true,
23809 gen_int_mode (and_constant, QImode)));
23810 emit_insn (gen_and (dest, dest, vec_const_and));
23812 /* For ASHIFTRT, perform extra operation like
23813 vpxor %vec_const_xor, %dest, %dest
23814 vpsubb %vec_const_xor, %dest, %dest */
23815 if (code == ASHIFTRT)
23817 vec_const_xor = gen_reg_rtx (qimode);
23818 emit_move_insn (vec_const_xor,
23819 ix86_build_const_vector (qimode, true,
23820 gen_int_mode (xor_constant, QImode)));
23821 emit_insn (gen_xor (dest, dest, vec_const_xor));
23822 emit_insn (gen_sub (dest, dest, vec_const_xor));
23824 return true;
23827 void
23828 ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23830 machine_mode qimode = GET_MODE (dest);
23831 rtx qop1, qop2, hop1, hop2, qdest, hdest;
23832 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23833 bool uns_p = code != ASHIFTRT;
23835 switch (qimode)
23837 case E_V4QImode:
23838 case E_V8QImode:
23839 break;
23840 default:
23841 gcc_unreachable ();
23844 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23846 if (op2vec)
23847 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23848 else
23849 qop2 = op2;
23851 qdest = gen_reg_rtx (V16QImode);
23853 if (CONST_INT_P (op2)
23854 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23855 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
23857 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23858 return;
23861 switch (code)
23863 case MULT:
23864 gcc_assert (op2vec);
23865 if (!TARGET_SSE4_1)
23867 /* Unpack data such that we've got a source byte in each low byte
23868 of each word. We don't care what goes into the high byte of
23869 	     each word.  Rather than trying to get zero in there, it is most
23870 	     convenient to let it be a copy of the low byte.  */
23871 hop1 = copy_to_reg (qop1);
23872 hop2 = copy_to_reg (qop2);
23873 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23874 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23875 break;
23877 /* FALLTHRU */
23878 case ASHIFT:
23879 case ASHIFTRT:
23880 case LSHIFTRT:
23881 hop1 = gen_reg_rtx (V8HImode);
23882 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23883 /* mult/vashr/vlshr/vashl */
23884 if (op2vec)
23886 hop2 = gen_reg_rtx (V8HImode);
23887 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23889 else
23890 hop2 = qop2;
23892 break;
23893 default:
23894 gcc_unreachable ();
23897 if (code != MULT && op2vec)
23899 /* Expand vashr/vlshr/vashl. */
23900 hdest = gen_reg_rtx (V8HImode);
23901 emit_insn (gen_rtx_SET (hdest,
23902 simplify_gen_binary (code, V8HImode,
23903 hop1, hop2)));
23905 else
23906 /* Expand mult/ashr/lshr/ashl. */
23907 hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
23908 NULL_RTX, 1, OPTAB_DIRECT);
23910 if (TARGET_AVX512BW && TARGET_AVX512VL)
23912 if (qimode == V8QImode)
23913 qdest = dest;
23914 else
23915 qdest = gen_reg_rtx (V8QImode);
23917 emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
23919 else
23921 struct expand_vec_perm_d d;
23922 rtx qres = gen_lowpart (V16QImode, hdest);
23923 bool ok;
23924 int i;
23926 /* Merge the data back into the right place. */
23927 d.target = qdest;
23928 d.op0 = d.op1 = qres;
23929 d.vmode = V16QImode;
23930 d.nelt = 16;
23931 d.one_operand_p = false;
23932 d.testing_p = false;
23934 for (i = 0; i < d.nelt; ++i)
23935 d.perm[i] = i * 2;
23937 ok = ix86_expand_vec_perm_const_1 (&d);
23938 gcc_assert (ok);
23941 if (qdest != dest)
23942 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23945 /* Emit instruction in 2x wider mode. For example, optimize
23946 vector MUL generation like
23948 vpmovzxbw ymm2, xmm0
23949 vpmovzxbw ymm3, xmm1
23950 vpmullw ymm4, ymm2, ymm3
23951 vpmovwb xmm0, ymm4
23953 which takes fewer instructions than ix86_expand_vecop_qihi.
23954 Return true on success. */
23956 static bool
23957 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23959 machine_mode himode, qimode = GET_MODE (dest);
23960 machine_mode wqimode;
23961 rtx qop1, qop2, hop1, hop2, hdest;
23962 rtx (*gen_truncate)(rtx, rtx) = NULL;
23963 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23964 bool uns_p = code != ASHIFTRT;
23966 if ((qimode == V16QImode && !TARGET_AVX2)
23967 || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
23968 /* There are no V64HImode instructions. */
23969 || qimode == V64QImode)
23970 return false;
23972 /* Do not generate ymm/zmm instructions when
23973 target prefers 128/256 bit vector width. */
23974 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23975 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23976 return false;
23978 switch (qimode)
23980 case E_V16QImode:
23981 himode = V16HImode;
23982 if (TARGET_AVX512VL && TARGET_AVX512BW)
23983 gen_truncate = gen_truncv16hiv16qi2;
23984 break;
23985 case E_V32QImode:
23986 himode = V32HImode;
23987 gen_truncate = gen_truncv32hiv32qi2;
23988 break;
23989 default:
23990 gcc_unreachable ();
23993 wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
23994 qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
23996 if (op2vec)
23997 qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
23998 else
23999 qop2 = op2;
24001 hop1 = gen_reg_rtx (himode);
24002 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
24004 if (op2vec)
24006 hop2 = gen_reg_rtx (himode);
24007 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
24009 else
24010 hop2 = qop2;
24012 if (code != MULT && op2vec)
24014 /* Expand vashr/vlshr/vashl. */
24015 hdest = gen_reg_rtx (himode);
24016 emit_insn (gen_rtx_SET (hdest,
24017 simplify_gen_binary (code, himode,
24018 hop1, hop2)));
24020 else
24021 /* Expand mult/ashr/lshr/ashl. */
24022 hdest = expand_simple_binop (himode, code, hop1, hop2,
24023 NULL_RTX, 1, OPTAB_DIRECT);
24025 if (gen_truncate)
24026 emit_insn (gen_truncate (dest, hdest));
24027 else
24029 struct expand_vec_perm_d d;
24030 rtx wqdest = gen_reg_rtx (wqimode);
24031 rtx wqres = gen_lowpart (wqimode, hdest);
24032 bool ok;
24033 int i;
24035 /* Merge the data back into the right place. */
24036 d.target = wqdest;
24037 d.op0 = d.op1 = wqres;
24038 d.vmode = wqimode;
24039 d.nelt = GET_MODE_NUNITS (wqimode);
24040 d.one_operand_p = false;
24041 d.testing_p = false;
24043 for (i = 0; i < d.nelt; ++i)
24044 d.perm[i] = i * 2;
24046 ok = ix86_expand_vec_perm_const_1 (&d);
24047 gcc_assert (ok);
24049 emit_move_insn (dest, gen_lowpart (qimode, wqdest));
24052 return true;
24055 /* Expand a vector operation CODE for a V*QImode in terms of the
24056 same operation on V*HImode. */
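/* Roughly: widen the QImode operands (by interleaving or sign/zero
   unpacking) into low and high V*HImode halves, perform the operation in
   HImode on both halves, and use a constant permutation to gather the
   even bytes of the two results back into the QImode destination.  */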
24058 void
24059 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
24061 machine_mode qimode = GET_MODE (dest);
24062 machine_mode himode;
24063 rtx (*gen_il) (rtx, rtx, rtx);
24064 rtx (*gen_ih) (rtx, rtx, rtx);
24065 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
24066 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
24067 struct expand_vec_perm_d d;
24068 bool full_interleave = true;
24069 bool uns_p = code != ASHIFTRT;
24070 bool ok;
24071 int i;
24073 if (CONST_INT_P (op2)
24074 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
24075 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
24076 return;
24078 if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
24079 return;
24081 switch (qimode)
24083 case E_V16QImode:
24084 himode = V8HImode;
24085 break;
24086 case E_V32QImode:
24087 himode = V16HImode;
24088 break;
24089 case E_V64QImode:
24090 himode = V32HImode;
24091 break;
24092 default:
24093 gcc_unreachable ();
24096 switch (code)
24098 case MULT:
24099 gcc_assert (op2vec);
24100 /* Unpack data such that we've got a source byte in each low byte of
24101 each word. We don't care what goes into the high byte of each word.
24102 Rather than trying to get zero in there, most convenient is to let
24103 it be a copy of the low byte. */
24104 switch (qimode)
24106 case E_V16QImode:
24107 gen_il = gen_vec_interleave_lowv16qi;
24108 gen_ih = gen_vec_interleave_highv16qi;
24109 break;
24110 case E_V32QImode:
24111 gen_il = gen_avx2_interleave_lowv32qi;
24112 gen_ih = gen_avx2_interleave_highv32qi;
24113 full_interleave = false;
24114 break;
24115 case E_V64QImode:
24116 gen_il = gen_avx512bw_interleave_lowv64qi;
24117 gen_ih = gen_avx512bw_interleave_highv64qi;
24118 full_interleave = false;
24119 break;
24120 default:
24121 gcc_unreachable ();
24124 op2_l = gen_reg_rtx (qimode);
24125 op2_h = gen_reg_rtx (qimode);
24126 emit_insn (gen_il (op2_l, op2, op2));
24127 emit_insn (gen_ih (op2_h, op2, op2));
24129 op1_l = gen_reg_rtx (qimode);
24130 op1_h = gen_reg_rtx (qimode);
24131 emit_insn (gen_il (op1_l, op1, op1));
24132 emit_insn (gen_ih (op1_h, op1, op1));
24133 break;
24135 case ASHIFT:
24136 case ASHIFTRT:
24137 case LSHIFTRT:
24138 op1_l = gen_reg_rtx (himode);
24139 op1_h = gen_reg_rtx (himode);
24140 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
24141 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
24142 /* vashr/vlshr/vashl */
24143 if (op2vec)
24145 rtx tmp = force_reg (qimode, op2);
24146 op2_l = gen_reg_rtx (himode);
24147 op2_h = gen_reg_rtx (himode);
24148 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
24149 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
24151 else
24152 op2_l = op2_h = op2;
24154 break;
24155 default:
24156 gcc_unreachable ();
24159 if (code != MULT && op2vec)
24161 /* Expand vashr/vlshr/vashl. */
24162 res_l = gen_reg_rtx (himode);
24163 res_h = gen_reg_rtx (himode);
24164 emit_insn (gen_rtx_SET (res_l,
24165 simplify_gen_binary (code, himode,
24166 op1_l, op2_l)));
24167 emit_insn (gen_rtx_SET (res_h,
24168 simplify_gen_binary (code, himode,
24169 op1_h, op2_h)));
24171 else
24173 /* Expand mult/ashr/lshr/ashl. */
24174 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
24175 1, OPTAB_DIRECT);
24176 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
24177 1, OPTAB_DIRECT);
24180 gcc_assert (res_l && res_h);
24182 /* Merge the data back into the right place. */
24183 d.target = dest;
24184 d.op0 = gen_lowpart (qimode, res_l);
24185 d.op1 = gen_lowpart (qimode, res_h);
24186 d.vmode = qimode;
24187 d.nelt = GET_MODE_NUNITS (qimode);
24188 d.one_operand_p = false;
24189 d.testing_p = false;
24191 if (full_interleave)
24193 /* We used the full interleave, so the desired
24194 results are in the even elements. */
24195 for (i = 0; i < d.nelt; ++i)
24196 d.perm[i] = i * 2;
24198 else
24200 /* For AVX, the interleave used above was not cross-lane. So the
24201 extraction is evens but with the second and third quarter swapped.
24202 Happily, that is even one insn shorter than even extraction.
24203 For AVX512BW we have 4 lanes. We extract evens from within a lane,
24204 always first from the first and then from the second source operand,
24205 the index bits above the low 4 bits remain the same.
24206 Thus, for d.nelt == 32 we want permutation
24207 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
24208 and for d.nelt == 64 we want permutation
24209 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
24210 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
24211 for (i = 0; i < d.nelt; ++i)
24212 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
24215 ok = ix86_expand_vec_perm_const_1 (&d);
24216 gcc_assert (ok);
24219 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
24220 if op is CONST_VECTOR with all odd elements equal to their
24221 preceding element. */
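/* For instance, a V4SImode constant { 5, 5, 9, 9 } satisfies this, while
   { 5, 6, 9, 9 } does not.  */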
24223 static bool
24224 const_vector_equal_evenodd_p (rtx op)
24226 machine_mode mode = GET_MODE (op);
24227 int i, nunits = GET_MODE_NUNITS (mode);
24228 if (GET_CODE (op) != CONST_VECTOR
24229 || nunits != CONST_VECTOR_NUNITS (op))
24230 return false;
24231 for (i = 0; i < nunits; i += 2)
24232 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
24233 return false;
24234 return true;
24237 void
24238 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
24239 bool uns_p, bool odd_p)
24241 machine_mode mode = GET_MODE (op1);
24242 machine_mode wmode = GET_MODE (dest);
24243 rtx x;
24244 rtx orig_op1 = op1, orig_op2 = op2;
24246 if (!nonimmediate_operand (op1, mode))
24247 op1 = force_reg (mode, op1);
24248 if (!nonimmediate_operand (op2, mode))
24249 op2 = force_reg (mode, op2);
24251 /* We only play even/odd games with vectors of SImode. */
24252 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
24254 /* If we're looking for the odd results, shift those members down to
24255 the even slots. For some cpus this is faster than a PSHUFD. */
24256 if (odd_p)
24258 /* For XOP use vpmacsdqh, but only for smult, as it is only
24259 signed. */
24260 if (TARGET_XOP && mode == V4SImode && !uns_p)
24262 x = force_reg (wmode, CONST0_RTX (wmode));
24263 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
24264 return;
24267 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
24268 if (!const_vector_equal_evenodd_p (orig_op1))
24269 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
24270 x, NULL, 1, OPTAB_DIRECT);
24271 if (!const_vector_equal_evenodd_p (orig_op2))
24272 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
24273 x, NULL, 1, OPTAB_DIRECT);
24274 op1 = gen_lowpart (mode, op1);
24275 op2 = gen_lowpart (mode, op2);
24278 if (mode == V16SImode)
24280 if (uns_p)
24281 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
24282 else
24283 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
24285 else if (mode == V8SImode)
24287 if (uns_p)
24288 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
24289 else
24290 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
24292 else if (uns_p)
24293 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
24294 else if (TARGET_SSE4_1)
24295 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
24296 else
24298 rtx s1, s2, t0, t1, t2;
24300 /* The easiest way to implement this without PMULDQ is to go through
24301 the motions as if we are performing a full 64-bit multiply, except
24302 that we need to do less shuffling of the elements. */
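/* The correction applied below follows from the identity (mod 2^64)
   (s64) A * (s64) B
     == (u64) A * (u64) B - (((A < 0 ? B : 0) + (B < 0 ? A : 0)) << 32),
   so on top of the unsigned widening multiply only a highpart adjustment
   built from the operands' sign masks is needed.  */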
24304 /* Compute the sign-extension, aka highparts, of the two operands. */
24305 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
24306 op1, pc_rtx, pc_rtx);
24307 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
24308 op2, pc_rtx, pc_rtx);
24310 /* Multiply LO(A) * HI(B), and vice-versa. */
24311 t1 = gen_reg_rtx (wmode);
24312 t2 = gen_reg_rtx (wmode);
24313 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
24314 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
24316 /* Multiply LO(A) * LO(B). */
24317 t0 = gen_reg_rtx (wmode);
24318 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
24320 /* Combine and shift the highparts into place. */
24321 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
24322 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
24323 1, OPTAB_DIRECT);
24325 /* Combine high and low parts. */
24326 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
24327 return;
24329 emit_insn (x);
24332 void
24333 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
24334 bool uns_p, bool high_p)
24336 machine_mode wmode = GET_MODE (dest);
24337 machine_mode mode = GET_MODE (op1);
24338 rtx t1, t2, t3, t4, mask;
24340 switch (mode)
24342 case E_V4SImode:
24343 t1 = gen_reg_rtx (mode);
24344 t2 = gen_reg_rtx (mode);
24345 if (TARGET_XOP && !uns_p)
24347 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
24348 shuffle the elements once so that all elements are in the right
24349 place for immediate use: { A C B D }. */
24350 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
24351 const1_rtx, GEN_INT (3)));
24352 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
24353 const1_rtx, GEN_INT (3)));
24355 else
24357 /* Put the elements into place for the multiply. */
24358 ix86_expand_vec_interleave (t1, op1, op1, high_p);
24359 ix86_expand_vec_interleave (t2, op2, op2, high_p);
24360 high_p = false;
24362 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
24363 break;
24365 case E_V8SImode:
24366 /* Shuffle the elements between the lanes. After this we
24367 have { A B E F | C D G H } for each operand. */
24368 t1 = gen_reg_rtx (V4DImode);
24369 t2 = gen_reg_rtx (V4DImode);
24370 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
24371 const0_rtx, const2_rtx,
24372 const1_rtx, GEN_INT (3)));
24373 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
24374 const0_rtx, const2_rtx,
24375 const1_rtx, GEN_INT (3)));
24377 /* Shuffle the elements within the lanes. After this we
24378 have { A A B B | C C D D } or { E E F F | G G H H }. */
24379 t3 = gen_reg_rtx (V8SImode);
24380 t4 = gen_reg_rtx (V8SImode);
24381 mask = GEN_INT (high_p
24382 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
24383 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
24384 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
24385 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
24387 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
24388 break;
24390 case E_V8HImode:
24391 case E_V16HImode:
24392 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
24393 uns_p, OPTAB_DIRECT);
24394 t2 = expand_binop (mode,
24395 uns_p ? umul_highpart_optab : smul_highpart_optab,
24396 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
24397 gcc_assert (t1 && t2);
24399 t3 = gen_reg_rtx (mode);
24400 ix86_expand_vec_interleave (t3, t1, t2, high_p);
24401 emit_move_insn (dest, gen_lowpart (wmode, t3));
24402 break;
24404 case E_V16QImode:
24405 case E_V32QImode:
24406 case E_V32HImode:
24407 case E_V16SImode:
24408 case E_V64QImode:
24409 t1 = gen_reg_rtx (wmode);
24410 t2 = gen_reg_rtx (wmode);
24411 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
24412 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
24414 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
24415 break;
24417 default:
24418 gcc_unreachable ();
24422 void
24423 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
24425 rtx res_1, res_2, res_3, res_4;
24427 res_1 = gen_reg_rtx (V4SImode);
24428 res_2 = gen_reg_rtx (V4SImode);
24429 res_3 = gen_reg_rtx (V2DImode);
24430 res_4 = gen_reg_rtx (V2DImode);
24431 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
24432 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
24434 /* Move the results in element 2 down to element 1; we don't care
24435 what goes in elements 2 and 3. Then we can merge the parts
24436 back together with an interleave.
24438 Note that two other sequences were tried:
24439 (1) Use interleaves at the start instead of psrldq, which allows
24440 us to use a single shufps to merge things back at the end.
24441 (2) Use shufps here to combine the two vectors, then pshufd to
24442 put the elements in the correct order.
24443 In both cases the cost of the reformatting stall was too high
24444 and the overall sequence slower. */
24446 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
24447 const0_rtx, const2_rtx,
24448 const0_rtx, const0_rtx));
24449 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
24450 const0_rtx, const2_rtx,
24451 const0_rtx, const0_rtx));
24452 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
24454 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
24457 void
24458 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
24460 machine_mode mode = GET_MODE (op0);
24461 rtx t1, t2, t3, t4, t5, t6;
24463 if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode)
24464 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
24465 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
24466 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
24467 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
24468 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
24469 else if (TARGET_XOP && mode == V2DImode)
24471 /* op1: A,B,C,D, op2: E,F,G,H */
24472 op1 = gen_lowpart (V4SImode, op1);
24473 op2 = gen_lowpart (V4SImode, op2);
24475 t1 = gen_reg_rtx (V4SImode);
24476 t2 = gen_reg_rtx (V4SImode);
24477 t3 = gen_reg_rtx (V2DImode);
24478 t4 = gen_reg_rtx (V2DImode);
24480 /* t1: B,A,D,C */
24481 emit_insn (gen_sse2_pshufd_1 (t1, op1,
24482 GEN_INT (1),
24483 GEN_INT (0),
24484 GEN_INT (3),
24485 GEN_INT (2)));
24487 /* t2: (B*E),(A*F),(D*G),(C*H) */
24488 emit_insn (gen_mulv4si3 (t2, t1, op2));
24490 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
24491 emit_insn (gen_xop_phadddq (t3, t2));
24493 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
24494 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
24496 /* Multiply lower parts and add all */
24497 t5 = gen_reg_rtx (V2DImode);
24498 emit_insn (gen_vec_widen_umult_even_v4si (t5,
24499 gen_lowpart (V4SImode, op1),
24500 gen_lowpart (V4SImode, op2)));
24501 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
24503 else
24505 machine_mode nmode;
24506 rtx (*umul) (rtx, rtx, rtx);
24508 if (mode == V2DImode)
24510 umul = gen_vec_widen_umult_even_v4si;
24511 nmode = V4SImode;
24513 else if (mode == V4DImode)
24515 umul = gen_vec_widen_umult_even_v8si;
24516 nmode = V8SImode;
24518 else if (mode == V8DImode)
24520 umul = gen_vec_widen_umult_even_v16si;
24521 nmode = V16SImode;
24523 else
24524 gcc_unreachable ();
24527 /* Multiply low parts. */
24528 t1 = gen_reg_rtx (mode);
24529 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
24531 /* Shift input vectors right 32 bits so we can multiply high parts. */
24532 t6 = GEN_INT (32);
24533 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
24534 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
24536 /* Multiply high parts by low parts. */
24537 t4 = gen_reg_rtx (mode);
24538 t5 = gen_reg_rtx (mode);
24539 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
24540 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
24542 /* Combine and shift the highparts back. */
24543 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
24544 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
24546 /* Combine high and low parts. */
24547 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
24550 set_unique_reg_note (get_last_insn (), REG_EQUAL,
24551 gen_rtx_MULT (mode, op1, op2));
24554 /* Return 1 if control transfer instruction INSN
24555 should be encoded with notrack prefix. */
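/* In practice that means an indirect call carrying a REG_CALL_NOCF_CHECK
   note (e.g. a call through a nocf_check function pointer), or the
   indirect jump of a switch jump table when flag_cet_switch is not set;
   direct calls and returns are never prefixed.  */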
24557 bool
24558 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
24560 if (!insn || !((flag_cf_protection & CF_BRANCH)))
24561 return false;
24563 if (CALL_P (insn))
24565 rtx call = get_call_rtx_from (insn);
24566 gcc_assert (call != NULL_RTX);
24567 rtx addr = XEXP (call, 0);
24569 /* Do not emit 'notrack' if it's not an indirect call. */
24570 if (MEM_P (addr)
24571 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
24572 return false;
24573 else
24574 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
24577 if (JUMP_P (insn) && !flag_cet_switch)
24579 rtx target = JUMP_LABEL (insn);
24580 if (target == NULL_RTX || ANY_RETURN_P (target))
24581 return false;
24583 /* Check whether the jump is a jump to a switch table. */
24584 rtx_insn *label = as_a<rtx_insn *> (target);
24585 rtx_insn *table = next_insn (label);
24586 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
24587 return false;
24588 else
24589 return true;
24591 return false;
24594 /* Calculate integer abs() using only SSE2 instructions. */
24596 void
24597 ix86_expand_sse2_abs (rtx target, rtx input)
24599 machine_mode mode = GET_MODE (target);
24600 rtx tmp0, tmp1, x;
24602 switch (mode)
24604 case E_V2DImode:
24605 case E_V4DImode:
24606 /* For 64-bit signed integer X, with SSE4.2 use
24607 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
24608 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
24609 32, and since the arithmetic right shift is unimplemented there, form the
24610 sign mask with a logical right shift followed by a negation. */
24611 if (TARGET_SSE4_2)
24613 tmp0 = gen_reg_rtx (mode);
24614 tmp1 = gen_reg_rtx (mode);
24615 emit_move_insn (tmp1, CONST0_RTX (mode));
24616 if (mode == E_V2DImode)
24617 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
24618 else
24619 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
24621 else
24623 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
24624 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
24625 - 1), NULL, 0, OPTAB_DIRECT);
24626 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
24629 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24630 NULL, 0, OPTAB_DIRECT);
24631 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24632 target, 0, OPTAB_DIRECT);
24633 break;
24635 case E_V4SImode:
24636 /* For 32-bit signed integer X, the best way to calculate the absolute
24637 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
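/* E.g. for X = -5: X >> 31 is -1, -5 ^ -1 is 4, and 4 - (-1) = 5; for
   non-negative X both the XOR and the subtraction are no-ops.  */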
24638 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
24639 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
24640 NULL, 0, OPTAB_DIRECT);
24641 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24642 NULL, 0, OPTAB_DIRECT);
24643 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24644 target, 0, OPTAB_DIRECT);
24645 break;
24647 case E_V8HImode:
24648 /* For 16-bit signed integer X, the best way to calculate the absolute
24649 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
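/* E.g. abs (-7) = max (-7, 7) = 7; INT16_MIN negates to itself and is
   therefore returned unchanged, as in the other expansions here.  */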
24650 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24652 x = expand_simple_binop (mode, SMAX, tmp0, input,
24653 target, 0, OPTAB_DIRECT);
24654 break;
24656 case E_V16QImode:
24657 /* For 8-bit signed integer X, the best way to calculate the absolute
24658 value of X is min ((unsigned char) X, (unsigned char) (-X)),
24659 as SSE2 provides the PMINUB insn. */
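/* E.g. for X = -3 the two unsigned views are 253 and 3, and the minimum,
   3, is the absolute value; for non-negative X the roles are swapped.  */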
24660 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24662 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
24663 target, 0, OPTAB_DIRECT);
24664 break;
24666 default:
24667 gcc_unreachable ();
24670 if (x != target)
24671 emit_move_insn (target, x);
24674 /* Expand an extract from a vector register through pextr insn.
24675 Return true if successful. */
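/* E.g. a 32-bit extraction at bit position 32 from a V4SImode source
   becomes a VEC_SELECT of element POS / SIZE = 1, i.e. a pextrd $1 style
   instruction on SSE4.1 targets.  */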
24677 bool
24678 ix86_expand_pextr (rtx *operands)
24680 rtx dst = operands[0];
24681 rtx src = operands[1];
24683 unsigned int size = INTVAL (operands[2]);
24684 unsigned int pos = INTVAL (operands[3]);
24686 if (SUBREG_P (dst))
24688 /* Reject non-lowpart subregs. */
24689 if (SUBREG_BYTE (dst) > 0)
24690 return false;
24691 dst = SUBREG_REG (dst);
24694 if (SUBREG_P (src))
24696 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
24697 src = SUBREG_REG (src);
24700 switch (GET_MODE (src))
24702 case E_V16QImode:
24703 case E_V8HImode:
24704 case E_V4SImode:
24705 case E_V2DImode:
24706 case E_V1TImode:
24708 machine_mode srcmode, dstmode;
24709 rtx d, pat;
24711 if (!int_mode_for_size (size, 0).exists (&dstmode))
24712 return false;
24714 switch (dstmode)
24716 case E_QImode:
24717 if (!TARGET_SSE4_1)
24718 return false;
24719 srcmode = V16QImode;
24720 break;
24722 case E_HImode:
24723 if (!TARGET_SSE2)
24724 return false;
24725 srcmode = V8HImode;
24726 break;
24728 case E_SImode:
24729 if (!TARGET_SSE4_1)
24730 return false;
24731 srcmode = V4SImode;
24732 break;
24734 case E_DImode:
24735 gcc_assert (TARGET_64BIT);
24736 if (!TARGET_SSE4_1)
24737 return false;
24738 srcmode = V2DImode;
24739 break;
24741 default:
24742 return false;
24745 /* Reject extractions from misaligned positions. */
24746 if (pos & (size-1))
24747 return false;
24749 if (GET_MODE (dst) == dstmode)
24750 d = dst;
24751 else
24752 d = gen_reg_rtx (dstmode);
24754 /* Construct insn pattern. */
24755 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
24756 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
24758 /* Let the rtl optimizers know about the zero extension performed. */
24759 if (dstmode == QImode || dstmode == HImode)
24761 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
24762 d = gen_lowpart (SImode, d);
24765 emit_insn (gen_rtx_SET (d, pat));
24767 if (d != dst)
24768 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24769 return true;
24772 default:
24773 return false;
24777 /* Expand an insert into a vector register through pinsr insn.
24778 Return true if successful. */
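/* E.g. a 16-bit insert at bit position 16 into a V8HImode destination
   uses gen_sse2_pinsrw with selector 1 << (POS / SIZE), here bit 1, to
   replace element 1.  */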
24780 bool
24781 ix86_expand_pinsr (rtx *operands)
24783 rtx dst = operands[0];
24784 rtx src = operands[3];
24786 unsigned int size = INTVAL (operands[1]);
24787 unsigned int pos = INTVAL (operands[2]);
24789 if (SUBREG_P (dst))
24791 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
24792 dst = SUBREG_REG (dst);
24795 switch (GET_MODE (dst))
24797 case E_V16QImode:
24798 case E_V8HImode:
24799 case E_V4SImode:
24800 case E_V2DImode:
24801 case E_V1TImode:
24803 machine_mode srcmode, dstmode;
24804 rtx (*pinsr)(rtx, rtx, rtx, rtx);
24805 rtx d;
24807 if (!int_mode_for_size (size, 0).exists (&srcmode))
24808 return false;
24810 switch (srcmode)
24812 case E_QImode:
24813 if (!TARGET_SSE4_1)
24814 return false;
24815 dstmode = V16QImode;
24816 pinsr = gen_sse4_1_pinsrb;
24817 break;
24819 case E_HImode:
24820 if (!TARGET_SSE2)
24821 return false;
24822 dstmode = V8HImode;
24823 pinsr = gen_sse2_pinsrw;
24824 break;
24826 case E_SImode:
24827 if (!TARGET_SSE4_1)
24828 return false;
24829 dstmode = V4SImode;
24830 pinsr = gen_sse4_1_pinsrd;
24831 break;
24833 case E_DImode:
24834 gcc_assert (TARGET_64BIT);
24835 if (!TARGET_SSE4_1)
24836 return false;
24837 dstmode = V2DImode;
24838 pinsr = gen_sse4_1_pinsrq;
24839 break;
24841 default:
24842 return false;
24845 /* Reject insertions to misaligned positions. */
24846 if (pos & (size-1))
24847 return false;
24849 if (SUBREG_P (src))
24851 unsigned int srcpos = SUBREG_BYTE (src);
24853 if (srcpos > 0)
24855 rtx extr_ops[4];
24857 extr_ops[0] = gen_reg_rtx (srcmode);
24858 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
24859 extr_ops[2] = GEN_INT (size);
24860 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
24862 if (!ix86_expand_pextr (extr_ops))
24863 return false;
24865 src = extr_ops[0];
24867 else
24868 src = gen_lowpart (srcmode, SUBREG_REG (src));
24871 if (GET_MODE (dst) == dstmode)
24872 d = dst;
24873 else
24874 d = gen_reg_rtx (dstmode);
24876 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
24877 gen_lowpart (srcmode, src),
24878 GEN_INT (1 << (pos / size))));
24879 if (d != dst)
24880 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24881 return true;
24884 default:
24885 return false;
24889 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
24890 of the upper halves against the lower halves until SSE register size is reached. */
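/* E.g. a V8SImode reduction is first split to V4SImode: the upper 128-bit
   half is combined with the lower half before any further reduction steps
   within a single SSE register.  */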
24892 machine_mode
24893 ix86_split_reduction (machine_mode mode)
24895 /* Reduce lowpart against highpart until we reach SSE reg width to
24896 avoid cross-lane operations. */
24897 switch (mode)
24899 case E_V8DImode:
24900 case E_V4DImode:
24901 return V2DImode;
24902 case E_V16SImode:
24903 case E_V8SImode:
24904 return V4SImode;
24905 case E_V32HImode:
24906 case E_V16HImode:
24907 return V8HImode;
24908 case E_V64QImode:
24909 case E_V32QImode:
24910 return V16QImode;
24911 case E_V16SFmode:
24912 case E_V8SFmode:
24913 return V4SFmode;
24914 case E_V8DFmode:
24915 case E_V4DFmode:
24916 return V2DFmode;
24917 default:
24918 return mode;
24922 /* Generate call to __divmoddi4. */
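/* The quotient is the libcall's return value; the remainder is written
   through the address of a stack temporary passed as the last argument.  */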
24924 void
24925 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24926 rtx op0, rtx op1,
24927 rtx *quot_p, rtx *rem_p)
24929 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24931 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24932 mode, op0, mode, op1, mode,
24933 XEXP (rem, 0), Pmode);
24934 *quot_p = quot;
24935 *rem_p = rem;
24938 void
24939 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24940 enum rtx_code code, bool after,
24941 bool doubleword)
24943 rtx old_reg, new_reg, old_mem, success;
24944 machine_mode mode = GET_MODE (target);
24945 rtx_code_label *loop_label = NULL;
24947 old_reg = gen_reg_rtx (mode);
24948 new_reg = old_reg;
24949 old_mem = copy_to_reg (mem);
24950 loop_label = gen_label_rtx ();
24951 emit_label (loop_label);
24952 emit_move_insn (old_reg, old_mem);
24954 /* return value for atomic_fetch_op. */
24955 if (!after)
24956 emit_move_insn (target, old_reg);
24958 if (code == NOT)
24960 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24961 true, OPTAB_LIB_WIDEN);
24962 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24964 else
24965 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24966 true, OPTAB_LIB_WIDEN);
24968 /* return value for atomic_op_fetch. */
24969 if (after)
24970 emit_move_insn (target, new_reg);
24972 success = NULL_RTX;
24974 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24975 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24976 SImode),
24977 doubleword, loop_label);
24980 /* Relax the cmpxchg instruction. The parameter LOOP_LABEL indicates
24981 whether the instruction should be relaxed with a pause loop; if not,
24982 it is relaxed to an atomic load + compare, and the cmpxchg instruction
24983 is skipped when MEM != EXP_INPUT. */
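/* Rough shape of the emitted sequence: atomically load MEM, compare the
   loaded value with EXP_INPUT, and only on equality issue the real
   cmpxchg; on mismatch either skip it and hand back the loaded value
   (no LOOP_LABEL) or pause and branch back to LOOP_LABEL.  */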
24985 void
24986 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24987 rtx mem, rtx exp_input, rtx new_input,
24988 rtx mem_model, bool doubleword,
24989 rtx_code_label *loop_label)
24991 rtx_code_label *cmp_label = NULL;
24992 rtx_code_label *done_label = NULL;
24993 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24994 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24995 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24996 machine_mode mode = GET_MODE (target_val), hmode = mode;
24998 if (*ptarget_bool == NULL)
24999 target_bool = gen_reg_rtx (QImode);
25000 else
25001 target_bool = *ptarget_bool;
25003 cmp_label = gen_label_rtx ();
25004 done_label = gen_label_rtx ();
25006 new_mem = gen_reg_rtx (mode);
25007 /* Load memory first. */
25008 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
25010 switch (mode)
25012 case E_TImode:
25013 gendw = gen_atomic_compare_and_swapti_doubleword;
25014 hmode = DImode;
25015 break;
25016 case E_DImode:
25017 if (doubleword)
25019 gendw = gen_atomic_compare_and_swapdi_doubleword;
25020 hmode = SImode;
25022 else
25023 gen = gen_atomic_compare_and_swapdi_1;
25024 break;
25025 case E_SImode:
25026 gen = gen_atomic_compare_and_swapsi_1;
25027 break;
25028 case E_HImode:
25029 gen = gen_atomic_compare_and_swaphi_1;
25030 break;
25031 case E_QImode:
25032 gen = gen_atomic_compare_and_swapqi_1;
25033 break;
25034 default:
25035 gcc_unreachable ();
25038 /* Compare mem value with expected value. */
25039 if (doubleword)
25041 rtx low_new_mem = gen_lowpart (hmode, new_mem);
25042 rtx low_exp_input = gen_lowpart (hmode, exp_input);
25043 rtx high_new_mem = gen_highpart (hmode, new_mem);
25044 rtx high_exp_input = gen_highpart (hmode, exp_input);
25045 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
25046 hmode, 1, cmp_label,
25047 profile_probability::guessed_never ());
25048 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
25049 hmode, 1, cmp_label,
25050 profile_probability::guessed_never ());
25052 else
25053 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
25054 GET_MODE (exp_input), 1, cmp_label,
25055 profile_probability::guessed_never ());
25057 /* Directly emit cmpxchg here. */
25058 if (doubleword)
25059 emit_insn (gendw (target_val, mem, exp_input,
25060 gen_lowpart (hmode, new_input),
25061 gen_highpart (hmode, new_input),
25062 mem_model));
25063 else
25064 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
25066 if (!loop_label)
25068 emit_jump_insn (gen_jump (done_label));
25069 emit_barrier ();
25070 emit_label (cmp_label);
25071 emit_move_insn (target_val, new_mem);
25072 emit_label (done_label);
25073 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
25074 const0_rtx);
25076 else
25078 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
25079 const0_rtx);
25080 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
25081 GET_MODE (target_bool), 1, loop_label,
25082 profile_probability::guessed_never ());
25083 emit_jump_insn (gen_jump (done_label));
25084 emit_barrier ();
25086 /* If mem is not expected, pause and loop back. */
25087 emit_label (cmp_label);
25088 emit_move_insn (target_val, new_mem);
25089 emit_insn (gen_pause ());
25090 emit_jump_insn (gen_jump (loop_label));
25091 emit_barrier ();
25092 emit_label (done_label);
25095 *ptarget_bool = target_bool;
25098 /* Convert a BFmode VAL to SFmode without signaling sNaNs.
25099 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
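/* E.g. the BFmode bit pattern 0x3f80 (1.0bf16) becomes the SFmode bit
   pattern 0x3f800000, i.e. 1.0f, with the low 16 mantissa bits
   zero-filled.  */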
25101 rtx
25102 ix86_expand_fast_convert_bf_to_sf (rtx val)
25104 rtx op = gen_lowpart (HImode, val), ret;
25105 if (CONST_INT_P (op))
25107 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
25108 val, BFmode);
25109 if (ret)
25110 return ret;
25111 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
25112 ret = gen_reg_rtx (SImode);
25113 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
25114 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
25115 return gen_lowpart (SFmode, ret);
25118 ret = gen_reg_rtx (SFmode);
25119 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
25120 return ret;
25123 #include "gt-i386-expand.h"